In [12]:
# Import packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt     
import yfinance as yf
import datetime 
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import pickle


In [13]:
# import dataset
start = datetime.date(year = 2010,month=1,day=1)
end = datetime.date(year = 2025,month=7,day=31)

df = yf.download(tickers='^NSEI' , start=start , end=end, auto_adjust=True)
df.columns = ['Close','High','Low','Open','Volume']
df = df.drop('Volume',axis=1)
df

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Close,High,Low,Open
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01-04,5232.200195,5238.450195,5167.100098,5200.899902
2010-01-05,5277.899902,5288.350098,5242.399902,5277.149902
2010-01-06,5281.799805,5310.850098,5260.049805,5278.149902
2010-01-07,5263.100098,5302.549805,5244.750000,5281.799805
2010-01-08,5244.750000,5276.750000,5234.700195,5264.250000
...,...,...,...,...
2025-07-24,25062.099609,25246.250000,25018.699219,25243.300781
2025-07-25,24837.000000,25010.349609,24806.349609,25010.349609
2025-07-28,24680.900391,24889.199219,24646.599609,24782.449219
2025-07-29,24821.099609,24847.150391,24598.599609,24609.650391


In [14]:
# Differencing 
for i in df.columns:
    df[f'{i}_change'] = round(df[i].diff(1)/df[i]*100,2)

df = df.drop(['Open','High','Low','Close'],axis=1)

df = df.dropna()
df
    

Unnamed: 0_level_0,Close_change,High_change,Low_change,Open_change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01-05,0.87,0.94,1.44,1.44
2010-01-06,0.07,0.42,0.34,0.02
2010-01-07,-0.36,-0.16,-0.29,0.07
2010-01-08,-0.35,-0.49,-0.19,-0.33
2010-01-11,0.09,0.20,-0.13,-0.01
...,...,...,...,...
2025-07-24,-0.63,0.05,-0.27,0.41
2025-07-25,-0.91,-0.94,-0.86,-0.93
2025-07-28,-0.63,-0.49,-0.65,-0.92
2025-07-29,0.56,-0.17,-0.20,-0.70


In [15]:
# Correlation analysis
df.corr()

Unnamed: 0,Close_change,High_change,Low_change,Open_change
Close_change,1.0,0.690951,0.671896,0.298383
High_change,0.690951,1.0,0.65028,0.725362
Low_change,0.671896,0.65028,1.0,0.655197
Open_change,0.298383,0.725362,0.655197,1.0


# Step 1
Predict High_change from Open_change

In [16]:
X = pd.DataFrame(df['Low_change'])
y = pd.DataFrame(df['High_change'])


In [17]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
# modeling
model = LinearRegression()
model_name = 'Linear Regression'

model.fit(X_train,y_train)

y_pred = model.predict(X_test)
rmse = root_mean_squared_error(y_test,y_pred)
rmse

0.6671396215259876

In [19]:
# modeling
model = DecisionTreeRegressor()
model_name = 'Decision Tree Regressor'

model.fit(X_train,y_train)

y_pred = model.predict(X_test)
rmse = root_mean_squared_error(y_test,y_pred)
rmse

0.7236172122787464

In [20]:
# modeling
model = RandomForestRegressor()
model_name = 'Random Forest Regressor'

model.fit(X_train,y_train)

y_pred = model.predict(X_test)
rmse = root_mean_squared_error(y_test,y_pred)
rmse

  return fit_method(estimator, *args, **kwargs)


0.6967568036226025

In [21]:
# modeling
model = XGBRegressor()
model_name = 'XGBRegressor Regressor'

model.fit(X_train,y_train)

y_pred = model.predict(X_test)
rmse = root_mean_squared_error(y_test,y_pred)
rmse

0.6544550061225891

In [22]:
# modeling
model = LGBMRegressor()
model_name = 'LightGBM Regressor'

model.fit(X_train,y_train)

y_pred = model.predict(X_test)
rmse = root_mean_squared_error(y_test,y_pred)
rmse

# This is with least RMSE hence we will use this model
# Pickle this model
with open("../models/pred_low.pkl","wb") as file:
    pickle.dump(obj=model,file=file)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000039 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 3059, number of used features: 1
[LightGBM] [Info] Start training from score 0.031389
