# Decision Trees

In [1]:
# necessary imports
import os
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.model_selection import train_test_split

In [2]:
hdfc_dataset = pd.read_csv("/Users/janhavi/Desktop/HDFC.csv", parse_dates=['Date'])
hdfc_dataset

Unnamed: 0,Date,Symbol,Series,Prev Close,Open,High,Low,Last,Close,VWAP,Volume,Turnover,Trades,Deliverable Volume,%Deliverble
0,2000-01-03,HDFC,EQ,271.75,293.50,293.50,293.50,293.50,293.50,293.50,22744,6.675364e+11,,,
1,2000-01-04,HDFC,EQ,293.50,317.00,317.00,297.00,304.00,304.05,303.62,255251,7.749972e+12,,,
2,2000-01-05,HDFC,EQ,304.05,290.00,303.90,285.00,295.00,292.80,294.53,269087,7.925368e+12,,,
3,2000-01-06,HDFC,EQ,292.80,301.00,314.00,295.00,296.00,296.45,300.14,305916,9.181669e+12,,,
4,2000-01-07,HDFC,EQ,296.45,290.00,296.35,281.00,287.10,286.55,288.80,197039,5.690480e+12,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5301,2021-04-26,HDFC,EQ,2497.35,2500.00,2534.10,2483.20,2502.00,2509.80,2508.07,3916088,9.821805e+14,121028.0,2440395.0,0.6232
5302,2021-04-27,HDFC,EQ,2509.80,2494.15,2526.80,2486.25,2514.00,2518.40,2509.18,2040799,5.120730e+14,102250.0,1040749.0,0.5100
5303,2021-04-28,HDFC,EQ,2518.40,2516.10,2609.00,2508.30,2575.00,2577.00,2574.21,3407461,8.771527e+14,117425.0,1815110.0,0.5327
5304,2021-04-29,HDFC,EQ,2577.00,2590.90,2628.00,2533.00,2539.70,2538.85,2569.65,3005468,7.722995e+14,132826.0,1472924.0,0.4901


In [3]:
# Replacing space in columns name with a _ since pandas cannot handle it
hdfc_dataset.columns = [x.replace(' ', '_') for x in hdfc_dataset.columns]

In [4]:
# Replacing % in columns name with a _ 
hdfc_dataset.columns = [x.replace('%', '_') for x in hdfc_dataset.columns]
hdfc_dataset.head(5)

Unnamed: 0,Date,Symbol,Series,Prev_Close,Open,High,Low,Last,Close,VWAP,Volume,Turnover,Trades,Deliverable_Volume,_Deliverble
0,2000-01-03,HDFC,EQ,271.75,293.5,293.5,293.5,293.5,293.5,293.5,22744,667536400000.0,,,
1,2000-01-04,HDFC,EQ,293.5,317.0,317.0,297.0,304.0,304.05,303.62,255251,7749972000000.0,,,
2,2000-01-05,HDFC,EQ,304.05,290.0,303.9,285.0,295.0,292.8,294.53,269087,7925368000000.0,,,
3,2000-01-06,HDFC,EQ,292.8,301.0,314.0,295.0,296.0,296.45,300.14,305916,9181669000000.0,,,
4,2000-01-07,HDFC,EQ,296.45,290.0,296.35,281.0,287.1,286.55,288.8,197039,5690480000000.0,,,


In [5]:
# Doing data cleaning by removing unnecessary columns which wont be used for predictions
drop_column = ['Date', 'Symbol', 'Series', 'Trades', 'Deliverable_Volume', '_Deliverble']
hdfc_dataset.drop(drop_column, axis = 1, inplace = True)

In [6]:
# remove the last observation (last row) sincethere is no tomorrow for the last row to predict
X = hdfc_dataset.iloc[0:-1]
X

y = hdfc_dataset.Close.shift(-1).dropna()
y

0        304.05
1        292.80
2        296.45
3        286.55
4        287.20
         ...   
5300    2509.80
5301    2518.40
5302    2577.00
5303    2538.85
5304    2420.10
Name: Close, Length: 5305, dtype: float64

In [7]:
# predict for next 250 days for HDFC dataset
L = hdfc_dataset.iloc[0:-250]
L

m = hdfc_dataset.Close.shift(-250).dropna()
m

0        538.25
1        542.15
2        552.15
3        550.60
4        583.30
         ...   
5051    2509.80
5052    2518.40
5053    2577.00
5054    2538.85
5055    2420.10
Name: Close, Length: 5056, dtype: float64

In [8]:
X.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5295,5296,5297,5298,5299,5300,5301,5302,5303,5304
Prev_Close,271.75,293.5,304.05,292.8,296.45,286.55,287.2,283.85,285.6,283.85,...,2512.95,2547.15,2574.05,2492.35,2415.9,2479.7,2497.35,2509.8,2518.4,2577.0
Open,293.5,317.0,290.0,301.0,290.0,292.0,290.0,287.0,288.0,284.0,...,2540.0,2550.0,2474.0,2542.15,2400.0,2455.05,2500.0,2494.15,2516.1,2590.9
High,293.5,317.0,303.9,314.0,296.35,296.0,292.0,293.0,290.5,294.0,...,2555.0,2589.8,2510.0,2544.0,2485.0,2504.15,2534.1,2526.8,2609.0,2628.0
Low,293.5,297.0,285.0,295.0,281.0,285.0,273.25,284.5,283.0,276.25,...,2493.0,2546.85,2452.0,2406.2,2373.0,2437.5,2483.2,2486.25,2508.3,2533.0
Last,293.5,304.0,295.0,296.0,287.1,288.4,282.85,285.25,284.0,291.0,...,2543.95,2569.0,2501.05,2409.0,2478.05,2491.0,2502.0,2514.0,2575.0,2539.7
Close,293.5,304.05,292.8,296.45,286.55,287.2,283.85,285.6,283.85,286.55,...,2547.15,2574.05,2492.35,2415.9,2479.7,2497.35,2509.8,2518.4,2577.0,2538.85
VWAP,293.5,303.62,294.53,300.14,288.8,289.42,284.54,287.6,285.84,283.6,...,2527.23,2572.39,2484.26,2454.68,2437.46,2479.65,2508.07,2509.18,2574.21,2569.65
Volume,22744.0,255251.0,269087.0,305916.0,197039.0,133363.0,337411.0,222537.0,113238.0,152322.0,...,4201633.0,3133311.0,3550890.0,7365765.0,4514740.0,3143779.0,3916088.0,2040799.0,3407461.0,3005468.0
Turnover,667536400000.0,7749972000000.0,7925368000000.0,9181669000000.0,5690480000000.0,3859779000000.0,9600617000000.0,6400217000000.0,3236741000000.0,4319905000000.0,...,1061850000000000.0,806009300000000.0,882134300000000.0,1808059000000000.0,1100450000000000.0,779546500000000.0,982180500000000.0,512073000000000.0,877152700000000.0,772299500000000.0


In [9]:
L.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5046,5047,5048,5049,5050,5051,5052,5053,5054,5055
Prev_Close,271.75,293.5,304.05,292.8,296.45,286.55,287.2,283.85,285.6,283.85,...,1625.75,1680.6,1727.7,1669.7,1664.15,1663.45,1580.3,1591.45,1715.8,1836.75
Open,293.5,317.0,290.0,301.0,290.0,292.0,290.0,287.0,288.0,284.0,...,1690.0,1709.8,1670.0,1669.7,1674.9,1603.0,1618.1,1615.8,1734.0,1857.0
High,293.5,317.0,303.9,314.0,296.35,296.0,292.0,293.0,290.5,294.0,...,1738.0,1738.0,1694.8,1679.0,1687.0,1624.95,1652.15,1725.0,1845.0,1927.0
Low,293.5,297.0,285.0,295.0,281.0,285.0,273.25,284.5,283.0,276.25,...,1652.4,1673.4,1651.1,1626.15,1651.6,1569.1,1585.35,1612.65,1726.0,1855.5
Last,293.5,304.0,295.0,296.0,287.1,288.4,282.85,285.25,284.0,291.0,...,1680.0,1734.0,1665.0,1670.0,1656.0,1588.0,1600.6,1724.0,1827.95,1916.15
Close,293.5,304.05,292.8,296.45,286.55,287.2,283.85,285.6,283.85,286.55,...,1680.6,1727.7,1669.7,1664.15,1663.45,1580.3,1591.45,1715.8,1836.75,1916.0
VWAP,293.5,303.62,294.53,300.14,288.8,289.42,284.54,287.6,285.84,283.6,...,1688.58,1709.85,1676.04,1659.27,1666.79,1599.71,1620.72,1671.93,1803.34,1894.76
Volume,22744.0,255251.0,269087.0,305916.0,197039.0,133363.0,337411.0,222537.0,113238.0,152322.0,...,8954559.0,7372380.0,6604554.0,5301750.0,4419421.0,7166440.0,5719079.0,7691600.0,11881510.0,8237079.0
Turnover,667536400000.0,7749972000000.0,7925368000000.0,9181669000000.0,5690480000000.0,3859779000000.0,9600617000000.0,6400217000000.0,3236741000000.0,4319905000000.0,...,1512048000000000.0,1260565000000000.0,1106948000000000.0,879705300000000.0,736626400000000.0,1146423000000000.0,926901700000000.0,1285981000000000.0,2142640000000000.0,1560732000000000.0


In [10]:
# Split the data into train and test data using the train)test_split API 
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [11]:
# Split the data into train and test data using the train)test_split API 
L_train, L_test, m_train, m_test = train_test_split(L, m, random_state=42)

In [12]:
# Create a model 
rf_model = RandomForestRegressor()

# Train the model
rf_model.fit(X_train, y_train)

RandomForestRegressor()

In [13]:
# Create a model 
rf_model = RandomForestRegressor()

# Train the model
rf_model.fit(L_train, m_train)

RandomForestRegressor()

In [14]:
# Get R2 measure (indicator of accuracy 1 is perfect, 0 is bad)
rf_model.score(X_test, y_test)

0.3690580184690099

In [15]:
# Get R2 measure (indicator of accuracy 1 is perfect, 0 is bad)
rf_model.score(L_test, m_test)

0.8492164026062246