# <u> Creating Time Series Object</u>

In [2]:
from datetime import date
from nsepy import get_history
import pandas as pd
import numpy as np

### Import Data

In [3]:
# Index tickers whose daily closing levels we want to compare.
symbols = ["NIFTY 500", "NIFTY 100", "NIFTY 200", "NIFTY NEXT 50"]

# Download window; both bounds are handed to get_history unchanged.
start_date = date(2019, 1, 1)
end_date = date(2021, 10, 31)

# One 'Close' series per index, keyed by ticker name.
data = {
    ticker: get_history(symbol=ticker,
                        start=start_date,
                        end=end_date,
                        index=True)['Close']
    for ticker in symbols
}

# Combine the series into a single frame with one column per ticker.
df_comp = pd.DataFrame(data)
df_comp


Unnamed: 0_level_0,NIFTY 500,NIFTY 100,NIFTY 200,NIFTY NEXT 50
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-01-01,9197.90,11148.80,5773.70,28111.65
2019-01-02,9103.40,11032.00,5712.10,27876.95
2019-01-03,9014.80,10914.80,5652.95,27674.65
2019-01-04,9054.90,10966.15,5680.50,27719.50
2019-01-07,9081.15,11005.10,5699.10,27706.60
...,...,...,...,...
2021-10-25,15334.35,18272.40,9519.10,42032.60
2021-10-26,15499.65,18425.15,9610.55,42528.95
2021-10-27,15484.35,18390.05,9595.60,42748.65
2021-10-28,15190.85,18028.10,9406.80,41842.65


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for each index column.
df_comp.describe()

Unnamed: 0,NIFTY 500,NIFTY 100,NIFTY 200,NIFTY NEXT 50
count,702.0,702.0,702.0,702.0
mean,10456.493447,12741.036111,6558.207835,30130.929487
std,2080.641761,2328.570637,1249.659623,5526.968722
min,6243.0,7719.1,3944.8,18524.65
25%,9128.375,11233.2375,5753.05,26645.7625
50%,9689.675,11947.925,6119.65,27938.825
75%,12167.7625,14689.175,7585.2875,34136.0
max,15886.15,18734.9,9823.15,44708.1


###  Setting the desired Frequency

The asfreq() function is used to convert TimeSeries to specified frequency.

Optionally provide filling method to pad/backfill missing values.

Returns the original data conformed to a new index with the specified frequency. resample is more appropriate if an operation, such as summarization, is necessary to represent the data at the new frequency.



'h' - hourly,<br/> 'w' - weekly,<br/> 'd' - daily,<br/> 'm' - monthly


Data measured once per year is annual: 'a'.

In [5]:
# Conform the index to calendar-day frequency. Dates that were absent from
# the original index are inserted as rows of NaN.
# 'D' is the canonical calendar-day alias; the lowercase spelling is
# deprecated in recent pandas releases.
df_comp = df_comp.asfreq('D')

df_comp.head()

# 2019-01-05 is a Saturday, so its row is all-NaN: there is no trading data on weekends.

Unnamed: 0_level_0,NIFTY 500,NIFTY 100,NIFTY 200,NIFTY NEXT 50
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-01-01,9197.9,11148.8,5773.7,28111.65
2019-01-02,9103.4,11032.0,5712.1,27876.95
2019-01-03,9014.8,10914.8,5652.95,27674.65
2019-01-04,9054.9,10966.15,5680.5,27719.5
2019-01-05,,,,


In [6]:
# Switch to business-day frequency ('B', the canonical alias): weekend rows
# are dropped, which avoids carrying time periods that can never hold data.
df_comp = df_comp.asfreq('B')

df_comp.head()

Unnamed: 0_level_0,NIFTY 500,NIFTY 100,NIFTY 200,NIFTY NEXT 50
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-01-01,9197.9,11148.8,5773.7,28111.65
2019-01-02,9103.4,11032.0,5712.1,27876.95
2019-01-03,9014.8,10914.8,5652.95,27674.65
2019-01-04,9054.9,10966.15,5680.5,27719.5
2019-01-07,9081.15,11005.1,5699.1,27706.6


###  Handling Missing Values

In [7]:
# Element-wise missing-value mask: True wherever a closing value is NaN.
df_comp.isna()

Unnamed: 0_level_0,NIFTY 500,NIFTY 100,NIFTY 200,NIFTY NEXT 50
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-01-01,False,False,False,False
2019-01-02,False,False,False,False
2019-01-03,False,False,False,False
2019-01-04,False,False,False,False
2019-01-07,False,False,False,False
...,...,...,...,...
2021-10-25,False,False,False,False
2021-10-26,False,False,False,False
2021-10-27,False,False,False,False
2021-10-28,False,False,False,False


<font color='red'>Setting the frequency to business days generated <b>40</b> dates for which we have no data available (weekdays with no recorded close, presumably market holidays). </font>

In [8]:
# Count the missing values in each column.
df_comp.isna().sum()

NIFTY 500        40
NIFTY 100        40
NIFTY 200        40
NIFTY NEXT 50    40
dtype: int64

### Filling out missing values in several different ways.

The <u>first one</u> is called <b>Forward Filling</b> (also known as front filling), and it assigns the value of the previous period.

For instance, if we have no data available for the 15th of July 2004, we assign to it the value recorded on the 14th of July, 2004

The <u>second way</u> we can tackle missing values is called <b>Backfilling.</b>

As you can probably guess, it assigns to each empty entry the value of the next period. For instance, if we have no data for July 7th, 2004, we pass it the same value as the one we have recorded for July 8th of the same year.

A <u>third way</u> we deal with missing data is by assigning the same value to all of the missing time periods.

For instance, it is common practice <b>to assign the average value of the Time series to all the missing values</b>.


#### NOTE:

Usually filling missing values with the mean is a bad approach when it comes to Time series because there are underlying time variant patterns in the data.

This approach is appropriate only when the data heavily fluctuates around the mean from the first to the last day.

In [10]:
# Forward-fill: every missing close takes the most recent earlier value.
# Series.ffill() replaces fillna(method='ffill'); the `method` argument of
# fillna is deprecated in recent pandas releases.
df_comp['NIFTY 500'] = df_comp['NIFTY 500'].ffill()

# df_comp['NIFTY 500'] = df_comp['NIFTY 500'].bfill()    <-- Backfilling


#df_comp['NIFTY 100'] = df_comp['NIFTY 100'].fillna(value = df_comp['NIFTY 100'].mean())   <-- Filling by mean values

In [20]:
# Re-count missing values per column to see the effect of the fill.
df_comp.isna().sum()

NIFTY 500         0
NIFTY 100        40
NIFTY 200        40
NIFTY NEXT 50    40
dtype: int64

In [21]:
# Forward-fill the three remaining index columns in one step instead of
# repeating the same statement per column. DataFrame.ffill() replaces the
# deprecated fillna(method='ffill') spelling.
remaining_cols = ['NIFTY 100', 'NIFTY 200', 'NIFTY NEXT 50']
df_comp[remaining_cols] = df_comp[remaining_cols].ffill()

In [22]:
# Final check: missing-value count per column.
df_comp.isna().sum()

NIFTY 500        0
NIFTY 100        0
NIFTY 200        0
NIFTY NEXT 50    0
dtype: int64

### Adding and removing columns from the Dataset

In [23]:
# Add a new column by plain assignment; here 'test' holds the 'NIFTY 100' values.
df_comp['test'] = df_comp['NIFTY 100']

In [24]:
# Display the frame to inspect its current columns.
df_comp

Unnamed: 0_level_0,NIFTY 500,NIFTY 100,NIFTY 200,NIFTY NEXT 50,test
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-01-01,9197.90,11148.80,5773.70,28111.65,11148.80
2019-01-02,9103.40,11032.00,5712.10,27876.95,11032.00
2019-01-03,9014.80,10914.80,5652.95,27674.65,10914.80
2019-01-04,9054.90,10966.15,5680.50,27719.50,10966.15
2019-01-07,9081.15,11005.10,5699.10,27706.60,11005.10
...,...,...,...,...,...
2021-10-25,15334.35,18272.40,9519.10,42032.60,18272.40
2021-10-26,15499.65,18425.15,9610.55,42528.95,18425.15
2021-10-27,15484.35,18390.05,9595.60,42748.65,18390.05
2021-10-28,15190.85,18028.10,9406.80,41842.65,18028.10


In [25]:
# Remove the 'test' column from the frame in place.
del df_comp['test']   # general form for several columns: del df['col1'], df['col2'], ...

In [26]:
# Display the frame again to inspect its columns after the deletion.
df_comp

Unnamed: 0_level_0,NIFTY 500,NIFTY 100,NIFTY 200,NIFTY NEXT 50
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-01-01,9197.90,11148.80,5773.70,28111.65
2019-01-02,9103.40,11032.00,5712.10,27876.95
2019-01-03,9014.80,10914.80,5652.95,27674.65
2019-01-04,9054.90,10966.15,5680.50,27719.50
2019-01-07,9081.15,11005.10,5699.10,27706.60
...,...,...,...,...
2021-10-25,15334.35,18272.40,9519.10,42032.60
2021-10-26,15499.65,18425.15,9610.55,42528.95
2021-10-27,15484.35,18390.05,9595.60,42748.65
2021-10-28,15190.85,18028.10,9406.80,41842.65


### Splitting up the data

We need to split our available data into two sets, a training set and a testing set.

The goal is to have the option of feeding new information into the model and comparing its predictions to actual values.

The closer the forecasts match the actual values, the better our model performs.

For many different methods, we would shuffle the data before splitting it to make both sets equally representative.

<font color='blue'>Time series data relies on keeping the chronological order of the values within the set.

This, unfortunately, makes shuffling impossible.</font>

<font color='green'>Therefore, the training set should include all values from the beginning of the data up to a specific point in time, while the testing set should contain the rest.</font>

<i>The appropriate size of the training set is debatable. If it's too large, the model will fit the training set too well and will perform poorly with new data.

If it's too small, we won't be able to create a model accurate enough for the purposes of this course.</i>

An <font color='red'><b>80-20</b></font> split between the training and testing sets is reasonable.

In [31]:
# Row position that separates training from testing data: 80 percent of the
# total row count, truncated to an integer.
train_fraction = 0.8
n_rows = len(df_comp)
size = int(n_rows * train_fraction)
size

591

In [29]:
# Training set: the first `size` rows, kept in their original order (no shuffling).
df  = df_comp.iloc[:size]

In [32]:
# Testing set: every row from position `size` to the end of the frame.
df_test = df_comp.iloc[size:]

In [33]:
# Last rows of `df`, to see where that slice ends.
df.tail()

Unnamed: 0_level_0,NIFTY 500,NIFTY 100,NIFTY 200,NIFTY NEXT 50
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-03-31,12313.7,14807.45,7671.6,34176.25
2021-04-01,12479.0,14993.05,7772.85,34689.55
2021-04-02,12479.0,14993.05,7772.85,34689.55
2021-04-05,12298.6,14775.7,7661.35,34370.65
2021-04-06,12358.1,14836.25,7697.55,34714.05


In [34]:
# First rows of `df_test`, to see where that slice begins.
df_test.head()

Unnamed: 0_level_0,NIFTY 500,NIFTY 100,NIFTY 200,NIFTY NEXT 50
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-04-07,12481.3,14973.95,7772.7,35057.25
2021-04-08,12541.65,15036.15,7806.1,35321.15
2021-04-09,12527.4,15004.15,7793.3,35358.9
2021-04-12,12024.1,14450.05,7486.15,33713.95
2021-04-13,12186.7,14639.4,7588.2,34063.5
