In [4]:
import pandas as pd

# Locations of the combined (pre-cleaning) datasets
sp500_data_path = '../data/processed/combined_data_sp500.csv'
nasdaq_data_path = '../data/processed/combined_data_nasdaq.csv'

# Load the datasets to inspect their contents
sp500_data = pd.read_csv(sp500_data_path)
nasdaq_data = pd.read_csv(nasdaq_data_path)

# Summarise the structure and missing values. DataFrame.info() prints its
# report and returns None, so it is called for the side effect only rather
# than having its (empty) result stored.
sp500_data.info()
nasdaq_data.info()

# Show the first few rows. A bare expression is only rendered when it is the
# last statement of a notebook cell, so the previews are printed explicitly.
sp500_head = sp500_data.head()
nasdaq_head = nasdaq_data.head()
print(sp500_head)
print(nasdaq_head)

# Convert 'Date' to datetime for both datasets
sp500_data['Date'] = pd.to_datetime(sp500_data['Date'])
nasdaq_data['Date'] = pd.to_datetime(nasdaq_data['Date'])

# Drop the leftover index column if present; errors='ignore' makes this a
# no-op when the CSV has no 'Unnamed: 0' column.
sp500_data = sp500_data.drop(columns=['Unnamed: 0'], errors='ignore')
nasdaq_data = nasdaq_data.drop(columns=['Unnamed: 0'], errors='ignore')

# Handle missing values
def fill_missing_values(df):
    """Return a copy of ``df`` with missing values filled in.

    Gaps are first filled forward (each gap takes the last valid value above
    it) and then backward (leading gaps take the first valid value below
    them). As a final fallback, NaNs still present in ``float64``/``int64``
    columns are replaced with that column's mean.

    NOTE(review): after the forward/backward pass a column should only still
    contain NaN if it had no valid values at all, and then its mean is NaN as
    well -- so the mean fallback looks like a no-op in practice; confirm the
    intent. Backward filling also copies later observations into earlier
    rows, which matters if the rows are time-ordered and feed a model.

    Args:
        df: DataFrame to clean. It is not modified.

    Returns:
        A new DataFrame with the same columns and missing values filled.
    """
    filled = df.ffill().bfill()
    numeric_cols = filled.select_dtypes(include=['float64', 'int64']).columns
    for name in numeric_cols:
        column = filled[name]
        filled[name] = column.fillna(column.mean())
    return filled

# Run both datasets through the shared missing-value routine
sp500_cleaned = fill_missing_values(sp500_data)
nasdaq_cleaned = fill_missing_values(nasdaq_data)

# Destinations for the cleaned CSVs (kept for verification and further use)
sp500_cleaned_path = '../data/processed/fill_combined_data_sp500.csv'
nasdaq_cleaned_path = '../data/processed/fill_combined_data_nasdaq.csv'

# Write each cleaned frame to its destination without the row index
for _frame, _path in (
    (sp500_cleaned, sp500_cleaned_path),
    (nasdaq_cleaned, nasdaq_cleaned_path),
):
    _frame.to_csv(_path, index=False)

# Preview the first rows of each cleaned dataset
for _frame in (sp500_cleaned, nasdaq_cleaned):
    print(_frame.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8033 entries, 0 to 8032
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Date               8033 non-null   object 
 1   Market_Label       8033 non-null   int64  
 2   GDP Growth         66 non-null     float64
 3   CPI                249 non-null    float64
 4   Interest Rate      7968 non-null   float64
 5   M2 Money Supply    1329 non-null   float64
 6   PPI                249 non-null    float64
 7   Unemployment Rate  249 non-null    float64
 8   VIX_Close          8033 non-null   float64
dtypes: float64(7), int64(1), object(1)
memory usage: 564.9+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6491 entries, 0 to 6490
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Date               6491 non-null   object 
 1   Market_Label       6491 non-null   int64  
 2  