# Section 1
#### Import necessary packages for feature engineering

In [57]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from feature_engine.encoding import OneHotEncoder
from feature_engine.imputation import MeanMedianImputer
from feature_engine.selection import DropConstantFeatures
from scipy.stats import zscore

#### Load cleaned dataset

In [59]:
clean_stock_df = pd.read_csv('/Users/jotech/Milestone Repo/milestone-project-market-prediction-model/data/^GSPC.csv')
clean_stock_df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2023-06-22,4355.399902,4382.25,4351.819824,4381.890137,4381.890137,3511000000
1,2023-06-21,4380.009766,4386.220215,4360.140137,4365.689941,4365.689941,3709330000
2,2023-06-20,4396.109863,4400.149902,4367.189941,4388.709961,4388.709961,4055790000
3,2023-06-16,4440.950195,4448.470215,4407.439941,4409.589844,4409.589844,6848600000
4,2023-06-15,4365.330078,4439.200195,4362.600098,4425.839844,4425.839844,4176690000


#### Extract Date and Time features

In [60]:
clean_stock_df['Date'] = pd.to_datetime(clean_stock_df['Date'])
clean_stock_df['DayOfWeek'] = clean_stock_df['Date'].dt.dayofweek
clean_stock_df['Month'] = clean_stock_df['Date'].dt.month
clean_stock_df['Year'] = clean_stock_df['Date'].dt.year

#### Extract lagging feature columns for additional pattern recognition

In [61]:
clean_stock_df['PreviousClose'] = clean_stock_df['Close'].shift(1)
clean_stock_df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,DayOfWeek,Month,Year,PreviousClose
0,2023-06-22,4355.399902,4382.25,4351.819824,4381.890137,4381.890137,3511000000,3,6,2023,
1,2023-06-21,4380.009766,4386.220215,4360.140137,4365.689941,4365.689941,3709330000,2,6,2023,4381.890137
2,2023-06-20,4396.109863,4400.149902,4367.189941,4388.709961,4388.709961,4055790000,1,6,2023,4365.689941
3,2023-06-16,4440.950195,4448.470215,4407.439941,4409.589844,4409.589844,6848600000,4,6,2023,4388.709961
4,2023-06-15,4365.330078,4439.200195,4362.600098,4425.839844,4425.839844,4176690000,3,6,2023,4409.589844


#### Create rolling mean and standard deviation for additional pattern recognition

In [62]:
clean_stock_df['RollingMean_7'] = clean_stock_df['Close'].rolling(window=7).mean()

clean_stock_df.head(20)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,DayOfWeek,Month,Year,PreviousClose,RollingMean_7
0,2023-06-22,4355.399902,4382.25,4351.819824,4381.890137,4381.890137,3511000000,3,6,2023,,
1,2023-06-21,4380.009766,4386.220215,4360.140137,4365.689941,4365.689941,3709330000,2,6,2023,4381.890137,
2,2023-06-20,4396.109863,4400.149902,4367.189941,4388.709961,4388.709961,4055790000,1,6,2023,4365.689941,
3,2023-06-16,4440.950195,4448.470215,4407.439941,4409.589844,4409.589844,6848600000,4,6,2023,4388.709961,
4,2023-06-15,4365.330078,4439.200195,4362.600098,4425.839844,4425.839844,4176690000,3,6,2023,4409.589844,
5,2023-06-14,4366.290039,4391.819824,4337.850098,4372.589844,4372.589844,4252110000,2,6,2023,4425.839844,
6,2023-06-13,4352.609863,4375.370117,4349.310059,4369.009766,4369.009766,4275400000,1,6,2023,4372.589844,4387.617048
7,2023-06-12,4308.319824,4340.129883,4304.370117,4338.930176,4338.930176,3945670000,0,6,2023,4369.009766,4381.479911
8,2023-06-09,4304.879883,4322.620117,4291.700195,4298.859863,4298.859863,3786510000,4,6,2023,4338.930176,4371.932757
9,2023-06-08,4268.689941,4298.009766,4261.069824,4293.930176,4293.930176,3826740000,3,6,2023,4298.859863,4358.392787


#### Analyze feature correlations to target

In [63]:
corr_matrix = clean_stock_df.corr()['Close'].sort_values(ascending=False)
corr_matrix

Close            1.000000
Adj Close        1.000000
Low              0.989684
High             0.988422
Open             0.972304
PreviousClose    0.960820
RollingMean_7    0.930990
Date             0.507618
Year             0.504129
DayOfWeek        0.005318
Volume          -0.211129
Month           -0.359199
Name: Close, dtype: float64

In [64]:
clean_stock_df.drop(['Adj Close', 'Open', 'High', 'Low', 'Month'], axis=1, inplace=True)

In [65]:
clean_stock_df.head()

Unnamed: 0,Date,Close,Volume,DayOfWeek,Year,PreviousClose,RollingMean_7
0,2023-06-22,4381.890137,3511000000,3,2023,,
1,2023-06-21,4365.689941,3709330000,2,2023,4381.890137,
2,2023-06-20,4388.709961,4055790000,1,2023,4365.689941,
3,2023-06-16,4409.589844,6848600000,4,2023,4388.709961,
4,2023-06-15,4425.839844,4176690000,3,2023,4409.589844,


#### Imputation for missing values

In [46]:
median_imputer = MeanMedianImputer(imputation_method='median')
clean_stock_df = median_imputer.fit_transform(clean_stock_df)

# Feature selection and Transformation

In [34]:
constant_features = DropConstantFeatures(tol=0.998)
clean_stock_df = constant_features.fit_transform(clean_stock_df)

ValueError: Some of the variables in the dataset contain NaN. Check and remove those before using this transformer.