# 3. Feature Selection and Data Preparation

In [1]:
import pandas as pd
import numpy as np
# Load the processed dataset and print its column / dtype summary.
processed_csv = '../data/processed/processed_data.csv'
df = pd.read_csv(processed_csv)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4660 entries, 0 to 4659
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   date               4660 non-null   object 
 1   Open               4660 non-null   float64
 2   High               4660 non-null   float64
 3   Low                4660 non-null   float64
 4   Close              4660 non-null   float64
 5   Volume             4660 non-null   float64
 6   sentiment_scores   4660 non-null   float64
 7   is_real_sentiment  4660 non-null   bool   
dtypes: bool(1), float64(6), object(1)
memory usage: 259.5+ KB


In [2]:
# Feature selection: the target column and the columns used as model inputs.
target = 'Close'
selected_features = [
    'Open',
    'Volume',
    'sentiment_scores',
    'is_real_sentiment',
]

In [3]:
# Pull the feature columns (DataFrame) and the target column (Series) out of df.
X, y = df[selected_features], df[target]

In [4]:
# Data Normalization 
from sklearn.preprocessing import MinMaxScaler
import joblib

# Separate min-max scalers: one for the feature columns, one for the target.
scaler_X, scaler_y = MinMaxScaler(), MinMaxScaler()

# NOTE(review): both scalers are fitted on every row of X / y. If a train/test
# split happens downstream, the held-out rows have already influenced the
# scaling ranges -- confirm whether that is intended.

# Scale the features, then wrap the array back into a frame with X's labels.
features_scaled = scaler_X.fit_transform(X)
X_normalized = pd.DataFrame(features_scaled, columns=X.columns, index=X.index)

# The target is reshaped into a single column before it is handed to its scaler.
y_2d = y.to_numpy().reshape(-1, 1)
target_scaled = scaler_y.fit_transform(y_2d)
y_normalized = pd.DataFrame(target_scaled, columns=[target], index=y.index)


In [5]:
# Preview the first rows of both normalized frames, each under its own caption.
print("Normalized X head:", X_normalized.head(), sep="\n")
print("\nNormalized y head:", y_normalized.head(), sep="\n")

Normalized X head:
       Open  Volume  sentiment_scores  is_real_sentiment
0  0.000006     0.0          0.382675                0.0
1  0.000008     0.0          0.382675                0.0
2  0.000012     0.0          0.382675                0.0
3  0.000016     0.0          0.382675                0.0
4  0.000028     0.0          0.382675                0.0

Normalized y head:
      Close
0  0.000006
1  0.000008
2  0.000012
3  0.000016
4  0.000028


In [7]:
# Persist the fitted scalers and the normalized frames under ../models.
# Imported here so this cell works on its own after a kernel restart.
from pathlib import Path

try:
    # joblib.dump writes to the given path but does not create missing parent
    # directories, so make sure the output folder exists first.
    Path('../models').mkdir(parents=True, exist_ok=True)

    # Save scalers for future use
    joblib.dump(scaler_X, '../models/scaler_X.pkl')
    joblib.dump(scaler_y, '../models/scaler_y.pkl')

    # Save normalized data as a dictionary
    normalized_data = {
        'X_normalized': X_normalized,
        'y_normalized': y_normalized
    }
    joblib.dump(normalized_data, '../models/normalized_data.pkl')

except Exception as e:
    print(f"Error saving data: {e}")
    # Re-raise: a failed save must stop "Run All" instead of letting the
    # notebook finish as if the artifacts had been written.
    raise

else:
    # Only reached when every dump above succeeded.
    print("Data saved successfully!")
    print(f"X_normalized shape: {X_normalized.shape}")
    print(f"y_normalized shape: {y_normalized.shape}")

Data saved successfully!
X_normalized shape: (4660, 4)
y_normalized shape: (4660, 1)
