In [11]:
import pandas as pd

# Paths of the combined S&P 500 and NASDAQ datasets, relative to this notebook.
sp500_data_path = '../data/processed/combined_data_sp500.csv'
nasdaq_data_path = '../data/processed/combined_data_nasdaq.csv'

# Read both CSV files into DataFrames.
sp500_data = pd.read_csv(sp500_data_path)
nasdaq_data = pd.read_csv(nasdaq_data_path)

# Print column dtypes and non-null counts for each frame. DataFrame.info()
# writes its report to stdout and returns None, so it is called purely for
# its output; binding the result to a name would only store None.
sp500_data.info()
nasdaq_data.info()

# Keep the first rows of each raw frame around for inspection.
sp500_head = sp500_data.head()
nasdaq_head = nasdaq_data.head()

# Parse the 'Date' column into datetime values for both datasets.
sp500_data['Date'] = pd.to_datetime(sp500_data['Date'])
nasdaq_data['Date'] = pd.to_datetime(nasdaq_data['Date'])

# Remove the 'Unnamed: 0' column when it exists; errors='ignore' turns the
# drop into a no-op for a frame that does not have it.
sp500_data = sp500_data.drop(columns=['Unnamed: 0'], errors='ignore')
nasdaq_data = nasdaq_data.drop(columns=['Unnamed: 0'], errors='ignore')
# Handle missing values
def fill_missing_values(df):
    """Fill gaps in ``df`` by forward-filling, then backward-filling.

    Forward fill carries the last observed value of each column down into
    the following missing rows. Backward fill then covers whatever is still
    missing, i.e. the rows before a column's first observation, by copying
    that first observation upwards. Note that this step places later values
    into earlier rows.

    A column that contains no value at all has nothing to propagate and is
    returned unchanged (all NaN). The previous mean-based fallback could not
    help in that case either, because the mean of an all-NaN column is NaN,
    and every other column is already complete after the two fills, so the
    fallback never altered the result and has been removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to fill. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same shape, index and columns as ``df``.
    """
    # ffill() and bfill() each return a new frame, so the caller's data is
    # left untouched.
    return df.ffill().bfill()

# Fill the gaps in both datasets.
sp500_cleaned = fill_missing_values(sp500_data)
nasdaq_cleaned = fill_missing_values(nasdaq_data)

# Destination files for the filled datasets.
sp500_cleaned_path = '../data/processed/fill_combined_data_sp500.csv'
nasdaq_cleaned_path = '../data/processed/fill_combined_data_nasdaq.csv'

# Write the filled frames to disk; index=False keeps the row index out of
# the CSV files.
sp500_cleaned.to_csv(sp500_cleaned_path, index=False)
nasdaq_cleaned.to_csv(nasdaq_cleaned_path, index=False)

# Show the first rows of each filled frame as a quick visual check.
print(sp500_cleaned.head())
print(nasdaq_cleaned.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8032 entries, 0 to 8031
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         8032 non-null   int64  
 1   Date               8032 non-null   object 
 2   Close              8032 non-null   float64
 3   High               8032 non-null   float64
 4   Low                8032 non-null   float64
 5   Open               8032 non-null   float64
 6   Volume             8032 non-null   int64  
 7   Market_Label       8032 non-null   int64  
 8   GDP Growth         66 non-null     float64
 9   CPI                249 non-null    float64
 10  Interest Rate      7968 non-null   float64
 11  M2 Money Supply    1329 non-null   float64
 12  PPI                249 non-null    float64
 13  Unemployment Rate  249 non-null    float64
dtypes: float64(10), int64(3), object(1)
memory usage: 878.6+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6490 entri