In [1]:
from pathlib import Path

import pandas as pd

# Paths to the processed combined datasets, relative to the notebook's working directory
sp500_data_path = '../data/processed/fill_combined_data_sp500.csv'
nasdaq_data_path = '../data/processed/fill_combined_data_nasdaq.csv'

# Read each CSV into its own DataFrame
sp500_data = pd.read_csv(sp500_data_path)
nasdaq_data = pd.read_csv(nasdaq_data_path)

# Report column dtypes and non-null counts. DataFrame.info() prints its summary
# and returns None, so it is called for its output only; binding the result to
# a variable would just store None.
sp500_data.info()
nasdaq_data.info()

# Keep the first rows of each frame and print them. A bare tuple expression in
# the middle of a cell is never rendered, so the preview is printed explicitly.
sp500_head = sp500_data.head()
nasdaq_head = nasdaq_data.head()

print(sp500_head)
print(nasdaq_head)

from sklearn.preprocessing import MinMaxScaler

# Function to scale data using MinMaxScaler
def minmax_scale_data(df, exclude_columns=()):
    """Return a copy of ``df`` with the non-excluded columns min-max scaled.

    Every column whose name is not in ``exclude_columns`` is passed through a
    freshly fitted ``MinMaxScaler`` created with its default settings; the
    excluded columns are carried over unchanged. ``df`` itself is not modified.

    Args:
        df: DataFrame to scale. All non-excluded columns are handed to the
            scaler together, so they must hold values the scaler accepts.
        exclude_columns: Column names to leave untouched (e.g. 'Date',
            'Market_Label'). Names that are not columns of ``df`` have no
            effect. Defaults to excluding nothing.

    Returns:
        A new DataFrame with the same columns as ``df``. If there is nothing
        to scale (no rows, or every column excluded) it is a plain copy.
    """
    scaled_data = df.copy()
    cols_to_scale = [col for col in df.columns if col not in exclude_columns]

    # Fitting a scaler on zero rows or zero columns fails, so when there is
    # nothing to scale hand back the untouched copy instead.
    if df.empty or not cols_to_scale:
        return scaled_data

    # A new scaler per call: the fit depends only on the frame passed in.
    scaler = MinMaxScaler()
    scaled_data[cols_to_scale] = scaler.fit_transform(df[cols_to_scale])
    return scaled_data

# 'Date' and 'Market_Label' are passed through unscaled
exclude_columns = ['Date', 'Market_Label']

# Min-max scale every remaining column of each dataset
sp500_minmax_scaled = minmax_scale_data(sp500_data, exclude_columns)
nasdaq_minmax_scaled = minmax_scale_data(nasdaq_data, exclude_columns)

# Output locations for the scaled datasets
sp500_minmax_scaled_path = '../data/min_max_scaling/minmax_scaled_combined_data_sp500.csv'
nasdaq_minmax_scaled_path = '../data/min_max_scaling/minmax_scaled_combined_data_nasdaq.csv'

# to_csv does not create missing directories, so make sure the output folder
# exists before writing (a fresh checkout may not have it yet).
Path(sp500_minmax_scaled_path).parent.mkdir(parents=True, exist_ok=True)
Path(nasdaq_minmax_scaled_path).parent.mkdir(parents=True, exist_ok=True)

sp500_minmax_scaled.to_csv(sp500_minmax_scaled_path, index=False)
nasdaq_minmax_scaled.to_csv(nasdaq_minmax_scaled_path, index=False)

# A bare tuple in the middle of a cell is never rendered, so report the
# written paths explicitly.
print(sp500_minmax_scaled_path)
print(nasdaq_minmax_scaled_path)

print(sp500_minmax_scaled.head())
print(nasdaq_minmax_scaled.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8033 entries, 0 to 8032
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Date               8033 non-null   object 
 1   Market_Label       8033 non-null   int64  
 2   GDP Growth         8033 non-null   float64
 3   CPI                8033 non-null   float64
 4   Interest Rate      8033 non-null   float64
 5   M2 Money Supply    8033 non-null   float64
 6   PPI                8033 non-null   float64
 7   Unemployment Rate  8033 non-null   float64
 8   VIX_Close          8033 non-null   float64
dtypes: float64(7), int64(1), object(1)
memory usage: 564.9+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6491 entries, 0 to 6490
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Date               6491 non-null   object 
 1   Market_Label       6491 non-null   int64  
 2  