## 1. Relevant imports



In [1]:
import yfinance as yf
import pandas as pd
import numpy as np
from sklearn import preprocessing

## 2. Getting the data

In [2]:
# Create a yfinance Ticker object for the VIX symbol.
vix = yf.Ticker("^vix")

# Request all available historical data; the result is a DataFrame object.
historical_price_data = vix.history(period="max")

#### Let's have a look at the raw data that we found

In [3]:
# Leave the frame as the cell's last expression so Jupyter renders it as a rich table
# instead of wrapped plain text (pandas still truncates long frames to head and tail).
historical_price_data

                 Open       High        Low      Close  Volume  Dividends  \
Date                                                                        
1990-01-02  17.240000  17.240000  17.240000  17.240000       0          0   
1990-01-03  18.190001  18.190001  18.190001  18.190001       0          0   
1990-01-04  19.219999  19.219999  19.219999  19.219999       0          0   
1990-01-05  20.110001  20.110001  20.110001  20.110001       0          0   
1990-01-08  20.260000  20.260000  20.260000  20.260000       0          0   
...               ...        ...        ...        ...     ...        ...   
2021-11-16  16.860001  17.080000  16.030001  16.370001       0          0   
2021-11-17  16.360001  17.190001  16.280001  17.110001       0          0   
2021-11-18  16.809999  18.150000  16.379999  17.590000       0          0   
2021-11-19  17.360001  19.010000  17.230000  17.910000       0          0   
2021-11-22  18.200001  19.590000  17.350000  19.170000       0          0   

## 3. Deal with missing data
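- We see that the "Volume", "Dividends" and "Stock Splits" columns are empty (the rows shown above contain only zeros). Why? The VIX is an index, not a traded stock, so it has no traded volume, dividends or splits to report.
- Either fill in the missing data if possible, or simply don't use those columns.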

#### We proceed to discard the empty columns

In [4]:
# Build a new frame without the three uninformative columns; drop() returns a new
# DataFrame here, so the raw frame keeps all of its columns.
historical_price_data_cleaned = historical_price_data.drop(
    columns=["Volume", "Dividends", "Stock Splits"]
)

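#### OR drop the columns in place (this modifies `historical_price_data` itself instead of creating a new dataframe)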

In [5]:
# historical_price_data.drop(["Volume", "Dividends", "Stock Splits"], axis = 1, inplace = True)

#### Let's check out our data now

In [6]:
# Leave the frame as the cell's last expression so Jupyter renders it as a rich table
# instead of plain text (pandas still truncates long frames to head and tail).
historical_price_data_cleaned

                 Open       High        Low      Close
Date                                                  
1990-01-02  17.240000  17.240000  17.240000  17.240000
1990-01-03  18.190001  18.190001  18.190001  18.190001
1990-01-04  19.219999  19.219999  19.219999  19.219999
1990-01-05  20.110001  20.110001  20.110001  20.110001
1990-01-08  20.260000  20.260000  20.260000  20.260000
...               ...        ...        ...        ...
2021-11-16  16.860001  17.080000  16.030001  16.370001
2021-11-17  16.360001  17.190001  16.280001  17.110001
2021-11-18  16.809999  18.150000  16.379999  17.590000
2021-11-19  17.360001  19.010000  17.230000  17.910000
2021-11-22  18.200001  19.590000  17.350000  19.170000

[8037 rows x 4 columns]


## 4. Nominal Values? Depends. In most cases, no
#### Let's now change the nominal values to the percentage change between consecutive days (daily is not a requirement; we could also use a rolling average of the last $x$ trading sessions)

In [7]:
# Day-over-day relative change of every price column.
# The first row has no predecessor, so pct_change() yields NaN there; drop it by position.
# .copy() makes the result an independent frame, so assigning to its columns in later
# cells does not trigger pandas' SettingWithCopyWarning.
historical_price_data_cleaned_pct_change = (
    historical_price_data_cleaned
    .pct_change()
    .iloc[1:]
    .copy()
)

#### Let's look at our data again

In [8]:
# Leave the frame as the cell's last expression so Jupyter renders it as a rich table
# instead of plain text (pandas still truncates long frames to head and tail).
historical_price_data_cleaned_pct_change

                Open      High       Low     Close
Date                                              
1990-01-03  0.055104  0.055104  0.055104  0.055104
1990-01-04  0.056624  0.056624  0.056624  0.056624
1990-01-05  0.046306  0.046306  0.046306  0.046306
1990-01-08  0.007459  0.007459  0.007459  0.007459
1990-01-09  0.095755  0.095755  0.095755  0.095755
...              ...       ...       ...       ...
2021-11-16 -0.009982 -0.021764 -0.027896 -0.007277
2021-11-17 -0.029656  0.006440  0.015596  0.045205
2021-11-18  0.027506  0.055846  0.006142  0.028054
2021-11-19  0.032719  0.047383  0.051893  0.018192
2021-11-22  0.048387  0.030510  0.006965  0.070352

[8036 rows x 4 columns]


#### Let's take a peek at the summary statistics of our data

In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) of the daily percentage changes.
historical_price_data_cleaned_pct_change.describe()

Unnamed: 0,Open,High,Low,Close
count,8036.0,8036.0,8036.0,8036.0
mean,0.002475,0.002381,0.001804,0.002309
std,0.071982,0.070942,0.060921,0.069947
min,-0.358284,-0.370974,-0.400595,-0.295727
25%,-0.039045,-0.03653,-0.033168,-0.036927
50%,-0.002739,-0.002709,-0.003089,-0.003653
75%,0.036071,0.033297,0.032016,0.032757
max,1.023861,1.172452,0.601269,1.155979


## 5. Scale the input data to small values, e.g. 0 to 1 or -1 to 1. They work better.
- standardize data
- normalize data

#### Standardise every column of our dataframe (subtract the column mean, then divide by the column standard deviation)

In [10]:
# Standardise (z-score) every column in one vectorised step: subtract the column mean
# and divide by the column standard deviation.
# ddof=0 (population standard deviation) matches what np.std computes by default.
# The result is bound to the same name as a fresh frame rather than written back column
# by column, so no loop temporaries are left in the notebook namespace and nothing is
# assigned into a sliced frame.
historical_price_data_cleaned_pct_change = (
    historical_price_data_cleaned_pct_change
    - historical_price_data_cleaned_pct_change.mean()
) / historical_price_data_cleaned_pct_change.std(ddof=0)

In [11]:
# Display the frame after standardisation (same variable name, now holding z-scores).
historical_price_data_cleaned_pct_change

Unnamed: 0_level_0,Open,High,Low,Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1990-01-03,0.731197,0.743233,0.874962,0.754839
1990-01-04,0.752315,0.764660,0.899914,0.776571
1990-01-05,0.608959,0.619202,0.730529,0.629043
1990-01-08,0.069249,0.071581,0.092824,0.073627
1990-01-09,1.295966,1.316280,1.542276,1.336043
...,...,...,...,...
2021-11-16,-0.173067,-0.340370,-0.487549,-0.137062
2021-11-17,-0.446396,0.057222,0.226396,0.613296
2021-11-18,0.347767,0.753691,0.071212,0.368081
2021-11-19,0.420188,0.634384,0.822237,0.227084


In [12]:
# Re-check the summary statistics: after standardisation each column should have
# a mean of approximately 0 and a standard deviation of approximately 1.
historical_price_data_cleaned_pct_change.describe()

Unnamed: 0,Open,High,Low,Close
count,8036.0,8036.0,8036.0,8036.0
mean,0.0,-1.149459e-17,7.957796e-18,9.726195000000001e-18
std,1.000062,1.000062,1.000062,1.000062
min,-5.012087,-5.263132,-6.605708,-4.261171
25%,-0.576846,-0.5485195,-0.5741006,-0.5609797
50%,-0.07243,-0.07174916,-0.08033302,-0.08524439
75%,0.466765,0.4358103,0.4959436,0.4353319
max,14.190315,16.49431,9.840685,16.49461


#### We see that the extreme values are still really big. This may cause trouble in training. We can now normalize our data so it is between 0 and 1.

In [15]:
# Create the min-max scaler; with its default settings it rescales each column to the [0, 1] range.
scaler = preprocessing.MinMaxScaler()

# Work on a real, independent copy. Plain assignment would only create a second name for
# the same DataFrame, so scaling it would also overwrite the standardised data and there
# would be nothing to go back to if something went wrong.
historical_price_data_cleaned_pct_change_copy = historical_price_data_cleaned_pct_change.copy()

# Pick the columns that we want to normalise. In our case, all.
columns_to_scale = ["Open", "High", "Low", "Close"]
historical_price_data_cleaned_pct_change_copy[columns_to_scale] = scaler.fit_transform(
    historical_price_data_cleaned_pct_change_copy[columns_to_scale]
)

#### Let's check out our data again

In [16]:
# Display the frame after min-max scaling.
historical_price_data_cleaned_pct_change_copy

Unnamed: 0_level_0,Open,High,Low,Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1990-01-03,0.299092,0.276060,0.454852,0.241668
1990-01-04,0.300192,0.277045,0.456369,0.242715
1990-01-05,0.292726,0.270360,0.446070,0.235607
1990-01-08,0.264620,0.245190,0.407295,0.208848
1990-01-09,0.328503,0.302398,0.495427,0.269670
...,...,...,...,...
2021-11-16,0.252001,0.226256,0.372006,0.198697
2021-11-17,0.237767,0.244530,0.415417,0.234849
2021-11-18,0.279124,0.276541,0.405981,0.223034
2021-11-19,0.282896,0.271057,0.451646,0.216241


In [17]:
# Summary statistics after min-max scaling: every column should now have min 0 and max 1.
historical_price_data_cleaned_pct_change_copy.describe()

Unnamed: 0,Open,High,Low,Close
count,8036.0,8036.0,8036.0,8036.0
mean,0.261014,0.2419,0.401651,0.2053
std,0.05208,0.045964,0.060807,0.048182
min,0.0,0.0,0.0,0.0
25%,0.230973,0.21669,0.366743,0.178273
50%,0.257242,0.238603,0.396766,0.201193
75%,0.285321,0.261931,0.431806,0.226274
max,1.0,1.0,1.0,1.0


# DONE