# Preprocessing Data with Technical Analysis Indicators

It is necessary to view the data so that I can handle NULL values and scale correctly

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

df = pd.read_csv('./data/AAPL_technical_indicators.csv')

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6314 entries, 0 to 6313
Columns: 224 entries, time_stamp to ZS_30
dtypes: float64(202), int64(21), object(1)
memory usage: 10.8+ MB


In [3]:
df.head()

Unnamed: 0,time_stamp,open,high,low,close,volume,ABER_ZG_5_15,ABER_SG_5_15,ABER_XG_5_15,ABER_ATR_5_15,...,VIDYA_14,VTXP_14,VTXM_14,VWAP_D,VWMA_10,WCP,WILLR_14,WMA_10,ZL_EMA_10,ZS_30
0,1999-11-01,0.714286,0.720446,0.690804,0.693036,2487300,,,,,...,,,,0.701429,,0.69933,,,,
1,1999-11-02,0.696429,0.729375,0.690268,0.716518,3564600,,,,,...,,,,0.712054,,0.71317,,,,
2,1999-11-03,0.72875,0.743304,0.723214,0.727679,2932700,,,,,...,,,,0.731399,,0.730469,,,,
3,1999-11-04,0.732679,0.762232,0.719821,0.746607,3384700,,,,,...,,,,0.742887,,0.743817,,,,
4,1999-11-05,0.755536,0.789018,0.75,0.788482,3721500,0.73272,,,,...,,,,0.775833,,0.778996,,,,


In [4]:
df.shape

(6314, 224)

In [5]:
# Find null columns

null_columns = df.isna()

print(null_columns)

      time_stamp   open   high    low  close  volume  ABER_ZG_5_15  \
0          False  False  False  False  False   False          True   
1          False  False  False  False  False   False          True   
2          False  False  False  False  False   False          True   
3          False  False  False  False  False   False          True   
4          False  False  False  False  False   False         False   
...          ...    ...    ...    ...    ...     ...           ...   
6309       False  False  False  False  False   False         False   
6310       False  False  False  False  False   False         False   
6311       False  False  False  False  False   False         False   
6312       False  False  False  False  False   False         False   
6313       False  False  False  False  False   False         False   

      ABER_SG_5_15  ABER_XG_5_15  ABER_ATR_5_15  ...  VIDYA_14  VTXP_14  \
0             True          True           True  ...      True     True   
1        

In [6]:
drop_nulls = df.dropna(inplace=False)

drop_nulls.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 0 entries
Columns: 224 entries, time_stamp to ZS_30
dtypes: float64(202), int64(21), object(1)
memory usage: 0.0+ bytes


With the new technical indicators, every row now has a null value. Since there are 224 features, I have two options: impute null values with another value, or remove features with lots of null values. Since my dataset is not very large, I would like all the features I can get. 

# Imputation

In [7]:
# Time series interpolation method
df.set_index(pd.DatetimeIndex(df["time_stamp"]), inplace=True)

for column in df.columns:
    df[f'{column}'] = df[f'{column}'].interpolate(method='time')

print(df.isna().sum())

time_stamp     0
open           0
high           0
low            0
close          0
              ..
WCP            0
WILLR_14      13
WMA_10         9
ZL_EMA_10      9
ZS_30         29
Length: 224, dtype: int64


In [8]:
# Use backfill for edge cases

df = df.ffill().bfill()

print(df.isna().sum())

print(f"Total missing values: {df.isna().sum().sum()}")

time_stamp    0
open          0
high          0
low           0
close         0
             ..
WCP           0
WILLR_14      0
WMA_10        0
ZL_EMA_10     0
ZS_30         0
Length: 224, dtype: int64
Total missing values: 0


### Okay the data should be good to go! Lets save it and use it in our `ta_model.py` script

In [9]:
df.to_csv(f'./data/AAPL_preprocessed_data.csv', index=False)
