In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDRegressor,LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score,accuracy_score, recall_score, precision_score, classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from joblib import dump

In [2]:
# Load the Reliance news-sentiment scores (CSV hosted in the project repository).
df_news = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/gitsim02/FoundationProject-1/main/data/Reliance_sentiment.csv"
)

In [3]:
# Load the Reliance price history with its precomputed indicators (CSV hosted in the project repository).
df_price = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/gitsim02/FoundationProject-1/main/data/reliance_prices.csv"
)

In [4]:
df_price.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1384 entries, 0 to 1383
Data columns (total 21 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Date         1384 non-null   object 
 1   Open         1384 non-null   float64
 2   High         1384 non-null   float64
 3   Low          1384 non-null   float64
 4   Close        1384 non-null   float64
 5   Adj Close    1384 non-null   float64
 6   Volume       1384 non-null   int64  
 7   1d_diff      1384 non-null   float64
 8   5d_diff      1384 non-null   float64
 9   10d_diff     1384 non-null   float64
 10  SMA3         1384 non-null   float64
 11  SMA5         1384 non-null   float64
 12  SMA9         1384 non-null   float64
 13  SMA15        1384 non-null   float64
 14  SMA30        1384 non-null   float64
 15  EMA3         1384 non-null   float64
 16  EMA5         1384 non-null   float64
 17  EMA9         1384 non-null   float64
 18  EMA15        1384 non-null   float64
 19  EMA30 

In [5]:
df_news.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 654 entries, 0 to 653
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   date              654 non-null    object 
 1   title_senti_comp  654 non-null    float64
 2   news_senti_comp   654 non-null    float64
 3   avg_senti_comp    654 non-null    float64
dtypes: float64(3), object(1)
memory usage: 20.6+ KB


In [6]:
# Both date columns are read as plain strings; convert them to datetimes
# so the two frames can be joined on the calendar day.
df_news['date'] = pd.to_datetime(df_news['date'])
df_price['Date'] = pd.to_datetime(df_price['Date'])

In [7]:
# Left-join sentiment onto prices by date: every price row is kept, and
# rows with no matching news get NaN in the sentiment columns.
df_combined = df_price.merge(
    df_news,
    how="left",
    left_on="Date",
    right_on="date",
)

In [8]:
# The sentiment-side join key is not needed once the frames are merged.
df_combined.drop(columns=['date'], inplace=True)

In [9]:
df_combined.tail()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,1d_diff,5d_diff,10d_diff,...,SMA30,EMA3,EMA5,EMA9,EMA15,EMA30,Volume_100k,title_senti_comp,news_senti_comp,avg_senti_comp
1379,2022-07-11,2376.5,2428.800049,2370.0,2423.899902,2423.899902,6390604,3.449951,1.649902,3.5,...,2574.979997,2411.709763,2417.68741,2439.967735,2472.975022,2523.815943,63.906,,,
1380,2022-07-12,2404.0,2439.699951,2404.0,2420.449951,2420.449951,4974502,42.899902,-16.650146,-1.050049,...,2567.906665,2416.079857,2418.608257,2436.064178,2466.409388,2517.147169,49.745,,,
1381,2022-07-13,2427.300049,2434.0,2373.0,2377.550049,2377.550049,6564435,-19.599854,-125.449951,-41.649902,...,2559.375,2396.814953,2404.922187,2424.361352,2455.301971,2508.140903,65.644,0.0,-0.0772,-0.04
1382,2022-07-14,2388.0,2433.949951,2376.949951,2397.149902,2397.149902,7831798,-4.650146,-89.150146,-59.950195,...,2548.469995,2396.982428,2402.331426,2418.919062,2448.032962,2500.980193,78.318,0.0,0.4703,0.24
1383,2022-07-15,2415.0,2415.0,2383.100098,2401.800049,2401.800049,4431880,-20.449951,-101.300049,-107.649902,...,2535.879997,2399.391238,2402.1543,2415.49526,2442.253848,2494.581474,44.319,,,


In [10]:
df_combined[['Date','Close','1d_diff']].head()

Unnamed: 0,Date,Close,1d_diff
0,2016-12-13,515.018127,-9.757507
1,2016-12-14,524.775635,3.244202
2,2016-12-15,521.531433,-2.080261
3,2016-12-16,523.611694,-2.674622
4,2016-12-19,526.286316,2.303162


In [11]:
# 846 trading days have no Reliance news over the 5+ year span
df_combined.isna().sum()

Date                  0
Open                  0
High                  0
Low                   0
Close                 0
Adj Close             0
Volume                0
1d_diff               0
5d_diff               0
10d_diff              0
SMA3                  0
SMA5                  0
SMA9                  0
SMA15                 0
SMA30                 0
EMA3                  0
EMA5                  0
EMA9                  0
EMA15                 0
EMA30                 0
Volume_100k           0
title_senti_comp    846
news_senti_comp     846
avg_senti_comp      846
dtype: int64

In [12]:
# Days without news carry no sentiment signal: impute 0.0 for them.
# Only the sentiment columns are filled, so a missing price or indicator
# value would still surface as NaN instead of silently becoming 0.
sentiment_cols = ['title_senti_comp', 'news_senti_comp', 'avg_senti_comp']
df_combined[sentiment_cols] = df_combined[sentiment_cols].fillna(0.0)

In [13]:
# In the rows displayed in this notebook, 1d_diff equals Close(t) - Close(t+1),
# so subtracting it from today's close gives the next trading day's close
# (the 1-day-ahead regression target).
df_combined['next_day_close'] = df_combined['Close'].sub(df_combined['1d_diff'])

In [14]:
df_combined[['Date','Close','next_day_close','1d_diff']].tail()

Unnamed: 0,Date,Close,next_day_close,1d_diff
1379,2022-07-11,2423.899902,2420.449951,3.449951
1380,2022-07-12,2420.449951,2377.550049,42.899902
1381,2022-07-13,2377.550049,2397.149902,-19.599854
1382,2022-07-14,2397.149902,2401.800049,-4.650146
1383,2022-07-15,2401.800049,2422.25,-20.449951


### model saving

In [15]:
class StockPredictionModel:
    """Container that bundles a model with the metadata saved alongside it.

    The constructor stores its arguments unchanged (no copy, no validation):

    model    -- the model object (a fitted pipeline where this notebook uses it)
    features -- the feature names the model was built on
    rmse     -- the error score recorded for the model
    """

    def __init__(self, model, features, rmse):
        self.model = model
        self.features = features
        self.rmse = rmse

## model creation

In [16]:
def linear_model_build_eval(X,Y,regressor_obj,train_size=0.7):
    """Fit a scaled linear model on an ordered split and score it on the hold-out tail.

    The rows are split without shuffling: the first ``train_size`` share is
    used for fitting and the remaining rows for evaluation. Features are
    standardised and the regressor is fitted inside a single Pipeline.

    Parameters
    ----------
    X : feature table (a DataFrame in this notebook; array-likes also work).
    Y : target values aligned row-for-row with ``X``.
    regressor_obj : scikit-learn regressor exposing ``intercept_`` and
        ``coef_`` after fitting. A clone of it is fitted, so the object
        passed in is never modified and can be reused across calls.
    train_size : forwarded to ``train_test_split``.

    Returns
    -------
    tuple ``(y_pred, y_test, X_test, rmse, r2, lreg)`` where ``lreg`` is the
    fitted Pipeline and ``rmse`` / ``r2`` are computed on the test rows.
    """
    # Imported here so this cell does not depend on the notebook's import cell.
    from sklearn.base import clone

    ## train test split
    # shuffle=False keeps the row order, so the test set is the tail of the
    # data; no random_state is passed because nothing is randomised here.
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                    Y,
                                                    shuffle=False,
                                                    train_size = train_size)

    print(f"Train size : {X_train.shape}; Test Size : {X_test.shape}")

    # Pipeline fits the estimator instance it is handed. Cloning gives every
    # call its own regressor, so pipelines returned by earlier calls are not
    # refitted when the same regressor_obj is passed again.
    lreg = Pipeline(steps=[('scaler', StandardScaler()),
                          ('regressor', clone(regressor_obj))])

    #fit train set
    lreg.fit(X_train,y_train)

    fitted_regressor = lreg['regressor']
    print(f"Intercept : {fitted_regressor.intercept_}")

    # Label coefficients with column names when X has them, otherwise with
    # positional indices.
    feature_names = getattr(X, "columns", None)
    if feature_names is None:
        feature_names = range(np.shape(X)[1])
    print(f"Coefficient : {dict(zip(feature_names, np.round(fitted_regressor.coef_, 2)))}")

    y_pred = lreg.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test,y_pred))
    r2 = r2_score(y_test,y_pred)

    return y_pred,y_test,X_test,rmse,r2,lreg

## 1d prediction

In [17]:
# Predictors for the 1-day-ahead model.
x_features_1d = [
    'Volume_100k',
    'EMA3',
    'EMA5',
    'news_senti_comp',
]

In [18]:
df_combined[x_features_1d].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1384 entries, 0 to 1383
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Volume_100k      1384 non-null   float64
 1   EMA3             1384 non-null   float64
 2   EMA5             1384 non-null   float64
 3   news_senti_comp  1384 non-null   float64
dtypes: float64(4)
memory usage: 54.1 KB


In [19]:
# df_combined[x_features_1d]

In [20]:
# Feature matrix and target for the 1-day-ahead model.
X1d = df_combined.loc[:, x_features_1d]
y1d = df_combined.loc[:, 'next_day_close']

In [21]:
# SGD shuffles the training rows on every epoch by default; a fixed seed makes
# the fitted coefficients and the reported RMSE / R2 reproducible across runs.
sgdregressor = SGDRegressor(max_iter=1000, eta0=0.01, random_state=80)

In [22]:
# Train and evaluate the 1-day-ahead model: first 70% of rows to fit, the rest to test.
(y_pred1, y_test1, X_test1,
 rmse_1, r2_1, linear_1d) = linear_model_build_eval(
    X=X1d,
    Y=y1d,
    regressor_obj=sgdregressor,
    train_size=0.7,
)

Train size : (968, 4); Test Size : (416, 4)
Intercept : [1169.72720974]
Coefficient : {'Volume_100k': 5.63, 'EMA3': 209.77, 'EMA5': 203.97, 'news_senti_comp': 1.07}


In [23]:
rmse_1

49.66879086934358

In [24]:
r2_1

0.9622538456668742

In [25]:
# Actual vs predicted next-day close on the test rows, with the residual (actual - predicted).
pd.DataFrame({"y_actual": y_test1, "y_pred": y_pred1}).assign(
    residual=lambda frame: frame["y_actual"] - frame["y_pred"]
)

Unnamed: 0,y_actual,y_pred,residual
968,1996.400024,2009.226906,-12.826882
969,2002.300049,2011.244623,-8.944574
970,1993.250000,1996.354089,-3.104089
971,1987.199951,2009.022988,-21.823036
972,1973.150024,1996.653077,-23.503053
...,...,...,...
1379,2420.449951,2411.202513,9.247438
1380,2377.550049,2412.885642,-35.335593
1381,2397.149902,2397.283892,-0.133990
1382,2401.800049,2398.644108,3.155941


In [26]:
import os

os.getcwd()

'C:\\Users\\Siddharth Maheshwari\\OneDrive\\Desktop\\ISB AMPBA\\Term 2\\FP1\\FoundationProject-1\\regression_code'

In [27]:
## save model and dump
# Bundle the fitted 1-day pipeline with its feature list and test RMSE.
# NOTE: the wrapper class is defined in this notebook, so whatever loads the
# pickle must be able to resolve the same class by name.
linear_model_save = StockPredictionModel(linear_1d,list(X1d.columns),rmse_1)

# Forward-slash relative path: resolves on Windows and POSIX alike
# (backslash separators are only understood on Windows).
dump(linear_model_save,"../linear_models/lm_1d.pkl")

['.\\..\\linear_models\\lm_1d.pkl']

## 5d Prediction

##### Monday-to-Monday prediction (5 trading days ahead)

In [28]:
# In the rows displayed in this notebook, 5d_diff equals Close(t) - Close(t+5 rows),
# so subtracting it from today's close gives the close five trading days ahead.
df_combined['next5_day_close'] = df_combined['Close'].sub(df_combined['5d_diff'])

In [29]:
df_combined[['Date','Close','next5_day_close','5d_diff']].head(7)

Unnamed: 0,Date,Close,next5_day_close,5d_diff
0,2016-12-13,515.018127,523.983154,-8.965027
1,2016-12-14,524.775635,526.633057,-1.857422
2,2016-12-15,521.531433,521.729553,-0.19812
3,2016-12-16,523.611694,523.685974,-0.07428
4,2016-12-19,526.286316,518.807251,7.479065
5,2016-12-20,523.983154,527.351257,-3.368103
6,2016-12-21,526.633057,519.921692,6.711365


In [30]:
# Predictors for the 5-day-ahead model.
x_features_5d = [
    'Volume_100k',
    'EMA3',
    'EMA5',
    'EMA9',
    'news_senti_comp',
]

In [31]:
df_combined[x_features_5d].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1384 entries, 0 to 1383
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Volume_100k      1384 non-null   float64
 1   EMA3             1384 non-null   float64
 2   EMA5             1384 non-null   float64
 3   EMA9             1384 non-null   float64
 4   news_senti_comp  1384 non-null   float64
dtypes: float64(5)
memory usage: 64.9 KB


In [32]:
# Feature matrix and target for the 5-day-ahead model.
X5d = df_combined.loc[:, x_features_5d]
y5d = df_combined.loc[:, 'next5_day_close']

In [33]:
# Train and evaluate the 5-day-ahead model: first 70% of rows to fit, the rest to test.
(y_pred5, y_test5, X_test5,
 rmse_5, r2_5, linear_5d) = linear_model_build_eval(
    X=X5d,
    Y=y5d,
    regressor_obj=sgdregressor,
    train_size=0.7,
)

Train size : (968, 5); Test Size : (416, 5)
Intercept : [1176.04533616]
Coefficient : {'Volume_100k': 12.77, 'EMA3': 148.67, 'EMA5': 137.69, 'EMA9': 122.77, 'news_senti_comp': 1.46}


In [34]:
rmse_5

94.49103095224903

In [35]:
r2_5

0.8624666450205959

In [36]:
# Actual vs predicted 5-day-ahead close on the test rows, with the residual (actual - predicted).
pd.DataFrame({"y_actual": y_test5, "y_pred": y_pred5}).assign(
    residual=lambda frame: frame["y_actual"] - frame["y_pred"]
)

Unnamed: 0,y_actual,y_pred,residual
968,1973.150024,2015.972525,-42.822501
969,1899.500000,2021.463519,-121.963519
970,1950.699951,1988.904851,-38.204900
971,1964.050049,2019.645735,-55.595686
972,1947.800049,2000.236886,-52.436837
...,...,...,...
1379,2422.250000,2409.957192,12.292808
1380,2437.100098,2408.450836,28.649262
1381,2503.000000,2395.698649,107.301351
1382,2486.300049,2397.538164,88.761885


In [37]:
## save model and dump
# Bundle the fitted 5-day pipeline with its feature list and test RMSE.
linear_model_save = StockPredictionModel(linear_5d,list(X5d.columns),rmse_5)

# Forward-slash relative path: resolves on Windows and POSIX alike
# (backslash separators are only understood on Windows).
dump(linear_model_save,"../linear_models/lm_5d.pkl")

['.\\..\\linear_models\\lm_5d.pkl']

## 10d Prediction

##### Monday to Monday (14 calendar days, including weekends)

In [38]:
# Same construction as the shorter horizons: today's close minus 10d_diff.
# Presumably 10d_diff is Close(t) - Close(t+10 rows) — TODO confirm against the price file.
df_combined['next10_day_close'] = df_combined['Close'].sub(df_combined['10d_diff'])

In [39]:
df_combined[['Date','Close','next10_day_close','10d_diff']].head(15)

Unnamed: 0,Date,Close,next10_day_close,10d_diff
0,2016-12-13,515.018127,527.351257,-12.33313
1,2016-12-14,524.775635,519.921692,4.853943
2,2016-12-15,521.531433,527.722717,-6.191284
3,2016-12-16,523.611694,536.118164,-12.50647
4,2016-12-19,526.286316,537.455444,-11.169128
5,2016-12-20,523.983154,539.585266,-15.602112
6,2016-12-21,526.633057,528.564758,-1.931702
7,2016-12-22,521.729553,533.641602,-11.912048
8,2016-12-23,523.685974,532.428101,-8.742126
9,2016-12-26,518.807251,533.790222,-14.982971


In [40]:
# Predictors for the 10-day-ahead model.
x_features_10d = [
    'Volume_100k',
    'EMA5',
    'EMA9',
    'EMA15',
    'EMA30',
    'news_senti_comp',
]

In [41]:
df_combined[x_features_10d].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1384 entries, 0 to 1383
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Volume_100k      1384 non-null   float64
 1   EMA5             1384 non-null   float64
 2   EMA9             1384 non-null   float64
 3   EMA15            1384 non-null   float64
 4   EMA30            1384 non-null   float64
 5   news_senti_comp  1384 non-null   float64
dtypes: float64(6)
memory usage: 75.7 KB


In [42]:
# Feature matrix and target for the 10-day-ahead model.
X10d = df_combined.loc[:, x_features_10d]
y10d = df_combined.loc[:, 'next10_day_close']

In [43]:
# Train and evaluate the 10-day-ahead model: first 70% of rows to fit, the rest to test.
(y_pred10, y_test10, X_test10,
 rmse_10, r2_10, linear_10d) = linear_model_build_eval(
    X=X10d,
    Y=y10d,
    regressor_obj=sgdregressor,
    train_size=0.7,
)

Train size : (968, 6); Test Size : (416, 6)
Intercept : [1182.89443391]
Coefficient : {'Volume_100k': 22.0, 'EMA5': 276.09, 'EMA9': 162.89, 'EMA15': 55.01, 'EMA30': -93.17, 'news_senti_comp': -1.43}


In [44]:
rmse_10

131.21119133889692

In [45]:
r2_10

0.7300947474809683

In [46]:
## save model and dump
# Bundle the fitted 10-day pipeline with its feature list and test RMSE.
linear_model_save = StockPredictionModel(linear_10d,list(X10d.columns),rmse_10)

# Forward-slash relative path: resolves on Windows and POSIX alike
# (backslash separators are only understood on Windows).
dump(linear_model_save,"../linear_models/lm_10d.pkl")

['.\\..\\linear_models\\lm_10d.pkl']

In [47]:
df_combined.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,1d_diff,5d_diff,10d_diff,...,EMA9,EMA15,EMA30,Volume_100k,title_senti_comp,news_senti_comp,avg_senti_comp,next_day_close,next5_day_close,next10_day_close
0,2016-12-13,509.173553,515.612488,508.950653,515.018127,501.832367,4783826,-9.757507,-8.965027,-12.33313,...,503.683619,500.98471,501.394308,47.838,0.0,0.0,0.0,524.775635,523.983154,527.351257
1,2016-12-14,515.018127,527.227417,514.225647,524.775635,511.339996,11140265,3.244202,-1.857422,4.853943,...,507.902022,503.958576,502.902781,111.403,0.0,0.730367,0.366667,521.531433,526.633057,519.921692
2,2016-12-15,521.258972,528.787598,519.401611,521.531433,508.178925,8921708,-2.080261,-0.19812,-6.191284,...,510.627905,506.155183,504.104629,89.217,0.0,0.0,0.0,523.611694,521.729553,527.722717
3,2016-12-16,521.060852,526.013916,521.060852,523.611694,510.205872,6895825,-2.674622,-0.07428,-12.50647,...,513.224663,508.337247,505.36315,68.958,0.0,0.0,0.0,526.286316,523.685974,536.118164
4,2016-12-19,522.546753,528.465698,522.224854,526.286316,512.812012,6926943,2.303162,7.479065,-11.169128,...,515.836993,510.58088,506.713031,69.269,0.0,0.0,0.0,523.983154,518.807251,537.455444
