In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import warnings
warnings.filterwarnings("ignore")

In [2]:
os.chdir("C:/Users/levi0/Downloads")

In [4]:
df = pd.read_csv("Preprocessed.csv")

In [6]:
df['publish_date'] = pd.to_datetime(df['publish_date']) 
df["day_of_week"] = df['publish_date'].dt.weekday
df["week_of_year"] = df['publish_date'].dt.week
df["month"] = df['publish_date'].dt.month
df['quarter'] = df['publish_date'].dt.quarter

In [7]:
df.head(3)

Unnamed: 0,Close,publish_date,polarity,day_of_week,week_of_year,month,quarter
0,4060.02002,2001-01-03,0.0,2,1,1,1
1,4115.370117,2001-01-04,0.4767,3,1,1,1
2,4183.72998,2001-01-05,0.0,4,1,1,1


**Percentage change and related features**

In [8]:
df['percent_change'] = df.Close.pct_change().fillna(0)

In [10]:
group = df.groupby(['quarter']).agg({'percent_change': ['mean']})
group.columns = ['pct_quarter']
group.reset_index(inplace=True)

df = pd.merge(df, group, on=['quarter'], how='left')

In [11]:
group = df.groupby(['week_of_year']).agg({'percent_change': ['mean']})
group.columns = ['pct_week_of_year']
group.reset_index(inplace=True)

df = pd.merge(df, group, on=['week_of_year'], how='left')

In [12]:
group = df.groupby(['month']).agg({'percent_change': ['mean']})
group.columns = ['pct_month']
group.reset_index(inplace=True)

df = pd.merge(df, group, on=['month'], how='left')

In [13]:
df.head(3)

Unnamed: 0,Close,publish_date,polarity,day_of_week,week_of_year,month,quarter,percent_change,pct_quarter,pct_week_of_year,pct_month
0,4060.02002,2001-01-03,0.0,2,1,1,1,0.0,-0.000218,0.002471,5.6e-05
1,4115.370117,2001-01-04,0.4767,3,1,1,1,0.013633,-0.000218,0.002471,5.6e-05
2,4183.72998,2001-01-05,0.0,4,1,1,1,0.016611,-0.000218,0.002471,5.6e-05


**Polarity related features**

In [16]:
df['polarity_shift_2'] = df.polarity.shift(2)
df['polarity_shift_3'] = df.polarity.shift(3)
df['polarity_shift_4'] = df.polarity.shift(4)

In [18]:
df['pol_s2_rw3'] = df.polarity_shift_2.rolling(3).mean()
df['pol_s2_rw7'] = df.polarity_shift_2.rolling(7).mean()
df['pol_s2_rw14'] = df.polarity_shift_2.rolling(14).mean()

In [19]:
df['pol_s3_rw3'] = df.polarity_shift_3.rolling(3).mean()
df['pol_s3_rw7'] = df.polarity_shift_3.rolling(7).mean()
df['pol_s3_rw14'] = df.polarity_shift_3.rolling(14).mean()

In [20]:
df['pol_s4_rw3'] = df.polarity_shift_4.rolling(3).mean()
df['pol_s4_rw7'] = df.polarity_shift_4.rolling(7).mean()
df['pol_s4_rw14'] = df.polarity_shift_4.rolling(14).mean()

**Features extracted from target variable**

In [25]:
df['lag1_Close'] = df.Close.shift(-1)

In [26]:
df['lag_close_rw3'] = df.lag1_Close.rolling(3).mean()
df['lag_close_rw7'] = df.lag1_Close.rolling(7).mean()
df['lag_close_rw30'] = df.lag1_Close.rolling(30).mean()
df['lag_close_rw120'] = df.lag1_Close.rolling(120).mean()

In [27]:
df['lag_close_ew'] = df.lag1_Close.expanding().mean()

In [28]:
df['lag_close_diff'] = df.lag1_Close.diff()

In [30]:
df

Unnamed: 0,Close,publish_date,polarity,day_of_week,week_of_year,month,quarter,percent_change,pct_quarter,pct_week_of_year,...,pol_s4_rw3,pol_s4_rw7,pol_s4_rw14,lag1_Close,lag_close_rw3,lag_close_rw7,lag_close_rw30,lag_close_rw120,lag_close_ew,lag_close_diff
0,4060.020020,2001-01-03,0.0000,2,1,1,1,0.000000,-0.000218,0.002471,...,,,,4115.370117,,,,,4115.370117,
1,4115.370117,2001-01-04,0.4767,3,1,1,1,0.013633,-0.000218,0.002471,...,,,,4183.729980,,,,,4149.550049,68.359863
2,4183.729980,2001-01-05,0.0000,4,1,1,1,0.016611,-0.000218,0.002471,...,,,,4120.430176,4139.843424,,,,4139.843424,-63.299804
3,4120.430176,2001-01-08,0.0000,0,2,1,1,-0.015130,-0.000218,-0.001488,...,,,,4125.310059,4143.156738,,,,4136.210083,4.879883
4,4125.310059,2001-01-09,0.0000,1,2,1,1,0.001184,-0.000218,-0.001488,...,,,,4047.639893,4097.793376,,,,4118.496045,-77.670166
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4862,35430.429688,2020-06-23,0.6369,1,26,6,2,0.014869,0.000766,0.001859,...,0.381000,0.446743,0.351986,34868.980469,35070.243490,34466.236049,32920.341797,35425.225960,17890.048576,-561.449219
4863,34868.980469,2020-06-24,0.0000,2,26,6,2,-0.015847,0.000766,0.001859,...,0.578333,0.520471,0.368329,34842.101563,35047.170573,34642.933594,33014.791537,35369.260140,17893.533784,-26.878906
4864,34842.101563,2020-06-25,0.5106,3,26,6,2,-0.000771,0.000766,0.001859,...,0.528200,0.464600,0.398436,35171.269531,34960.783854,34880.554688,33149.737500,35318.572900,17897.085220,329.167968
4865,35171.269531,2020-06-26,0.5423,4,26,6,2,0.009447,0.000766,0.001859,...,0.544333,0.496614,0.437171,34961.519531,34991.630208,34988.193081,33278.530469,35266.137744,17900.592091,-209.750000


In [31]:
df.dropna()

Unnamed: 0,Close,publish_date,polarity,day_of_week,week_of_year,month,quarter,percent_change,pct_quarter,pct_week_of_year,...,pol_s4_rw3,pol_s4_rw7,pol_s4_rw14,lag1_Close,lag_close_rw3,lag_close_rw7,lag_close_rw30,lag_close_rw120,lag_close_ew,lag_close_diff
119,3410.949951,2001-06-19,0.0000,1,25,6,2,0.017250,0.000766,0.000819,...,0.000000,0.000000,0.034050,3406.050049,3390.036702,3428.117153,3555.926343,3818.973250,3818.973250,-4.899902
120,3406.050049,2001-06-20,0.4404,2,25,6,2,-0.001437,0.000766,0.000819,...,0.000000,0.000000,0.034050,3405.639893,3407.546631,3414.867153,3550.505339,3813.058832,3815.557272,-0.410156
121,3405.639893,2001-06-21,0.0000,3,25,6,2,-0.000120,0.000766,0.000819,...,0.000000,0.000000,0.034050,3381.760010,3397.816651,3397.745710,3544.571672,3806.375749,3812.001557,-23.879883
122,3381.760010,2001-06-22,0.6369,4,25,6,2,-0.007012,0.000766,0.000819,...,0.000000,0.000000,0.000000,3318.669922,3368.689942,3378.445696,3536.229671,3799.694413,3807.990731,-63.090088
123,3318.669922,2001-06-25,0.0000,0,26,6,2,-0.018656,0.000766,0.001859,...,0.000000,0.000000,0.000000,3407.320068,3369.250000,3383.357143,3530.575008,3793.711163,3804.759516,88.650146
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4861,34911.320313,2020-06-22,0.5423,0,26,6,2,0.005171,0.000766,0.001859,...,0.449800,0.408514,0.350757,35430.429688,35024.493490,34231.924665,32803.746419,35481.110628,17886.556405,519.109375
4862,35430.429688,2020-06-23,0.6369,1,26,6,2,0.014869,0.000766,0.001859,...,0.381000,0.446743,0.351986,34868.980469,35070.243490,34466.236049,32920.341797,35425.225960,17890.048576,-561.449219
4863,34868.980469,2020-06-24,0.0000,2,26,6,2,-0.015847,0.000766,0.001859,...,0.578333,0.520471,0.368329,34842.101563,35047.170573,34642.933594,33014.791537,35369.260140,17893.533784,-26.878906
4864,34842.101563,2020-06-25,0.5106,3,26,6,2,-0.000771,0.000766,0.001859,...,0.528200,0.464600,0.398436,35171.269531,34960.783854,34880.554688,33149.737500,35318.572900,17897.085220,329.167968


In [36]:
4865-119

4746

In [34]:
df.dropna(inplace = True)

In [35]:
df.to_csv("FeaturesEngineered.csv", index = False)