In [None]:
import os
import holidays 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

# Cutoff date: later cells keep only rows whose 'date' is strictly after this value
# (applied to both the macro and the Fed data frames).
data = '2012-12-30'

# Directory holding the per-ticker stock CSVs and the macro-indicator CSV.
path_stock = "../data/stock"
# Directory holding the combined economic-indicator (Fed) CSV.
path_fed = "../data/fed"

# Top 10 Tech Companies based on Market Cap

In [None]:
# Read one raw CSV per ticker; the files are named "<TICKER>_stock.csv" and are
# loaded in the same order as the names on the left-hand side.
(
    AAPL_df,
    MSFT_df,
    GOOGL_df,
    NVDA_df,
    AMZN_df,
    META_df,
    TSLA_df,
    AVGO_df,
    AMD_df,
    CRM_df,
) = (
    pd.read_csv(f"{path_stock}/{ticker}_stock.csv")
    for ticker in (
        "AAPL", "MSFT", "GOOGL", "NVDA", "AMZN",
        "META", "TSLA", "AVGO", "AMD", "CRM",
    )
)

## Tech companies stock Data Frame processing
- Remove the null / header rows
- Engineer some features
- Rename the columns
- Convert the date column to a datetime type

In [None]:
import pandas as pd

def process_stock_data(df, ticker_symbol):
    """
    Clean one ticker's raw price table and derive per-day features.

    The input frame is left untouched: every step works on a new DataFrame, so
    the function can be called repeatedly on the same raw frame (for example
    when a notebook cell is re-run) and always produces the same result.

    Args:
        df (pd.DataFrame): Raw stock data with the columns 'Price' (holding the
            date values), 'Close', 'High', 'Low', 'Open', 'Volume' and,
            optionally, 'Ticker'.
        ticker_symbol (str): Stock ticker symbol (e.g., 'AAPL'); used as the
            suffix of every value column in the result.

    Returns:
        pd.DataFrame: New DataFrame, indexed 0..n-1, with a datetime 'date'
        column, the price/volume columns renamed to
        'close_/high_/low_/open_/volume_<ticker>' and the derived columns
        'delta_price_/avg_price_/price_ratio_/invest_<ticker>'.
    """
    # dropna() and astype() both return new objects, so the caller's frame is
    # never modified (the in-place variant broke re-runs with a KeyError).
    cleaned = df.dropna()

    numeric_columns = ['Close', 'High', 'Low', 'Open', 'Volume']
    cleaned = cleaned.astype({column: float for column in numeric_columns})

    # Derived features (kept in this order so the output column order is stable):
    cleaned["delta_price"] = cleaned["High"] - cleaned["Low"]  # daily high-low range
    cleaned["avg_price"] = (
        cleaned["Close"] + cleaned["High"] + cleaned["Low"] + cleaned["Open"]
    ) / 4  # mean of the four daily prices
    cleaned["price_ratio"] = cleaned["delta_price"] / cleaned["avg_price"]  # range relative to the mean price
    cleaned["invest"] = cleaned["Volume"] * cleaned["avg_price"]  # volume multiplied by the mean price

    # The raw file stores the dates under 'Price'; every other column gets the
    # ticker symbol as a suffix.
    cleaned = cleaned.rename(columns={
        "Price": "date",
        "Close": f"close_{ticker_symbol}",
        "High": f"high_{ticker_symbol}",
        "Low": f"low_{ticker_symbol}",
        "Open": f"open_{ticker_symbol}",
        "delta_price": f"delta_price_{ticker_symbol}",
        "avg_price": f"avg_price_{ticker_symbol}",
        "invest": f"invest_{ticker_symbol}",
        "price_ratio": f"price_ratio_{ticker_symbol}",
        "Volume": f"volume_{ticker_symbol}",
    })

    cleaned['date'] = pd.to_datetime(cleaned['date'])  # proper datetime values for time-series handling

    cleaned = cleaned.reset_index(drop=True)  # rows were dropped above; renumber from 0

    # The ticker is now embedded in the column names, so the 'Ticker' column
    # (when present) is redundant.
    cleaned = cleaned.drop(columns='Ticker', errors='ignore')

    return cleaned

### Tech companies stock clean Data Frame 

In [None]:
# Run the shared cleaning routine on every raw ticker frame, in the same order
# as the names on the left-hand side.
(
    AAPL_clean_df,
    MSFT_clean_df,
    GOOGL_clean_df,
    NVDA_clean_df,
    AMZN_clean_df,
    META_clean_df,
    TSLA_clean_df,
    AVGO_clean_df,
    AMD_clean_df,
    CRM_clean_df,
) = (
    process_stock_data(raw_frame, symbol)
    for symbol, raw_frame in (
        ('AAPL', AAPL_df),
        ('MSFT', MSFT_df),
        ('GOOGL', GOOGL_df),
        ('NVDA', NVDA_df),
        ('AMZN', AMZN_df),
        ('META', META_df),
        ('TSLA', TSLA_df),
        ('AVGO', AVGO_df),
        ('AMD', AMD_df),
        ('CRM', CRM_df),
    )
)

### Find the max and min of the date column in each company's stock Data Frame

In [None]:
# Cleaned frame per ticker, so the date coverage of each can be compared.
dataframes = {
    "AAPL": AAPL_clean_df,
    "MSFT": MSFT_clean_df,
    "GOOGL": GOOGL_clean_df,
    "NVDA": NVDA_clean_df,
    "AMZN": AMZN_clean_df,
    "META": META_clean_df,
    "TSLA": TSLA_clean_df,
    "AVGO": AVGO_clean_df,
    "AMD": AMD_clean_df,
    "CRM": CRM_clean_df,
}

# Earliest and latest date per ticker. The loop variable is deliberately not
# called `df`, so it does not overwrite a generic notebook-level name.
stock_data_ranges = {}
for name, frame in dataframes.items():
    if 'date' not in frame.columns:
        print(f"Warning: 'date' column not found in {name}_clean_df")
        continue
    stock_data_ranges[name] = {
        'min_date': frame['date'].min(),
        'max_date': frame['date'].max(),
    }

# One row per ticker, indexed by the ticker symbol.
date_range_df = pd.DataFrame.from_dict(stock_data_ranges, orient='index')
date_range_df.index.name = 'Stock'

# Bare last expression: rendered as a rich table instead of print()'s plain text.
date_range_df

In the result above, the META data starts in 2012, while almost all of the others start in 2000.

## Macro Indicators from Yahoo Finance:
- Indices
- Commodities
- Sector ETFs (Proxies)
- Other Market Metrics

In [None]:
# Load the macro indicators, parse the 'Date' column into datetime values and
# rename it to 'date' to match the stock frames.
macro_df = (
    pd.read_csv(f"{path_stock}/macro_indicators_full.csv")
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .rename(columns={"Date": "date"})
)

In [None]:
# Number of missing values per column in the full macro table.
macro_df.isna().sum()

Data Frame `macro_df` has some missing values that need to be checked over time.

Let's keep only the rows after the cutoff date stored in `data` ('2012-12-30'), i.e. after the point where the META stock data frame begins (2012).

In [None]:
# Keep only the rows strictly after the cutoff date, then record the covered
# date span and count the remaining missing values per column.
macro_df_filter = macro_df.loc[macro_df['date'] > data]
min_date_macro_df_filter, max_date_macro_df_filter = (
    macro_df_filter['date'].min(),
    macro_df_filter['date'].max(),
)
macro_df_filter.isna().sum()

In [None]:
#macro_df_filter = macro_df_filter.drop('Brent_Crude_Futures',axis=1)

In [None]:
df = macro_df_filter
data_name = 'macro_df_filter'
# Matrix plot: visualize the pattern of missingness.
# msno.matrix opens its own figure when no `ax` is given, so the size is passed
# through `figsize`; a separate plt.figure() call would only emit an extra,
# empty figure and its size would be ignored.
ax = msno.matrix(df, figsize=(10, 6))
ax.set_title(f'Missing Value Matrix - {data_name}')
plt.show()

In [None]:
# Drop every row that still has a missing value, then confirm that no missing
# values remain (all counts should be zero).
macro_clean_df = macro_df_filter.dropna()
macro_clean_df.isna().sum()

# Fed Data frame

In [None]:
# Load the combined economic indicators. The dates sit in the CSV column that
# pandas labels 'Unnamed: 0', so it is renamed to 'date' and parsed into
# datetime values.
fed_df = (
    pd.read_csv(f"{path_fed}/combined_economic_indicators.csv")
    .rename(columns={'Unnamed: 0': 'date'})
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)

In [None]:
# Number of missing values per column in the full Fed table.
fed_df.isna().sum()

In [None]:
# Keep only the rows strictly after the cutoff date, then record the covered
# date span and count the remaining missing values per column.
fed_df_filter = fed_df.loc[fed_df['date'] > data]
min_date_fed_df_filter, max_date_fed_df_filter = (
    fed_df_filter['date'].min(),
    fed_df_filter['date'].max(),
)
fed_df_filter.isna().sum()

In [None]:
df = fed_df_filter
data_name = 'fed_df_filter'
# Matrix plot: visualize the pattern of missingness.
# msno.matrix opens its own figure when no `ax` is given, so the size is passed
# through `figsize`; a separate plt.figure() call would only emit an extra,
# empty figure and its size would be ignored.
ax = msno.matrix(df, figsize=(10, 6))
ax.set_title(f'Missing Value Matrix - {data_name}')
plt.show()

In [None]:
# Restrict the Fed table to the indicators used downstream and drop every row
# with a missing value in any of them; the final line confirms none remain.
fed_clean_df = fed_df_filter.loc[
    :,
    [
        'date',
        'cpi',
        'fed_rate',
        'consumer_confidence',
        'vix',
        'oil',
        'nonfarm_payrolls',
        'treasury_yield',
        'industrial_production',
        'retail_sales',
        'pmi',
        'day_of_week',
        'is_holiday',
        'is_working_day',
    ],
].dropna()
fed_clean_df.isna().sum()

# Clean Data Frame
- Take care of the date column and its maximum when joining

In [None]:
# Summary statistics of the cleaned Fed indicators (rows after the cutoff date, no missing values).
fed_clean_df.describe()

In [None]:
# Summary statistics of the cleaned macro indicators (rows after the cutoff date, no missing values).
macro_clean_df.describe()

In [None]:
# A cell renders only its last bare expression, so listing the frames one per
# line would show CRM_clean_df alone (and in full). display() previews the first
# rows of every cleaned frame instead.
display(
    AAPL_clean_df.head(),
    MSFT_clean_df.head(),
    GOOGL_clean_df.head(),
    NVDA_clean_df.head(),
    AMZN_clean_df.head(),
    META_clean_df.head(),
    TSLA_clean_df.head(),
    AVGO_clean_df.head(),
    AMD_clean_df.head(),
    CRM_clean_df.head(),
)