In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# News sentiment table: one `date` column plus title/news/average compound
# sentiment scores (see the .info() output further down).
df_news = pd.read_csv("Reliance_sentiment.csv")

In [3]:
# Price table: `Date`, OHLC/volume columns and precomputed n-day differences
# and SMA/EMA indicators (see the .info() output below).
df_price = pd.read_csv("reliance_prices.csv")

In [4]:
df_price.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1384 entries, 0 to 1383
Data columns (total 21 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Date         1384 non-null   object 
 1   Open         1384 non-null   float64
 2   High         1384 non-null   float64
 3   Low          1384 non-null   float64
 4   Close        1384 non-null   float64
 5   Adj Close    1384 non-null   float64
 6   Volume       1384 non-null   int64  
 7   1d_diff      1384 non-null   float64
 8   5d_diff      1384 non-null   float64
 9   10d_diff     1384 non-null   float64
 10  SMA3         1384 non-null   float64
 11  SMA5         1384 non-null   float64
 12  SMA9         1384 non-null   float64
 13  SMA15        1384 non-null   float64
 14  SMA30        1384 non-null   float64
 15  EMA3         1384 non-null   float64
 16  EMA5         1384 non-null   float64
 17  EMA9         1384 non-null   float64
 18  EMA15        1384 non-null   float64
 19  EMA30 

In [5]:
df_news.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 654 entries, 0 to 653
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   date              654 non-null    object 
 1   title_senti_comp  654 non-null    float64
 2   news_senti_comp   654 non-null    float64
 3   avg_senti_comp    654 non-null    float64
dtypes: float64(3), object(1)
memory usage: 20.6+ KB


In [6]:
# Both frames are read with their date columns as plain strings (dtype object);
# convert them to datetimes so the join keys share one dtype.
df_price['Date'] = pd.to_datetime(df_price['Date'])
df_news['date'] = pd.to_datetime(df_news['date'])

In [7]:
# Left join: keep every price row and attach sentiment where a news row
# exists for the same date; rows without a match get NaN sentiment.
df_combined = df_price.merge(
    df_news,
    how="left",
    left_on="Date",
    right_on="date",
)

In [8]:
# The news-side join key is redundant once merged; keep only the price-side `Date`.
df_combined = df_combined.drop(columns=['date'])

In [9]:
df_combined.tail()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,1d_diff,5d_diff,10d_diff,...,SMA30,EMA3,EMA5,EMA9,EMA15,EMA30,Volume_100k,title_senti_comp,news_senti_comp,avg_senti_comp
1379,2022-07-11,2376.5,2428.800049,2370.0,2423.899902,2423.899902,6390604,3.449951,1.649902,3.5,...,2574.979997,2411.709763,2417.68741,2439.967735,2472.975022,2523.815943,63.906,,,
1380,2022-07-12,2404.0,2439.699951,2404.0,2420.449951,2420.449951,4974502,42.899902,-16.650146,-1.050049,...,2567.906665,2416.079857,2418.608257,2436.064178,2466.409388,2517.147169,49.745,,,
1381,2022-07-13,2427.300049,2434.0,2373.0,2377.550049,2377.550049,6564435,-19.599854,-125.449951,-41.649902,...,2559.375,2396.814953,2404.922187,2424.361352,2455.301971,2508.140903,65.644,0.0,-0.0772,-0.04
1382,2022-07-14,2388.0,2433.949951,2376.949951,2397.149902,2397.149902,7831798,-4.650146,-89.150146,-59.950195,...,2548.469995,2396.982428,2402.331426,2418.919062,2448.032962,2500.980193,78.318,0.0,0.4703,0.24
1383,2022-07-15,2415.0,2415.0,2383.100098,2401.800049,2401.800049,4431880,-20.449951,-101.300049,-107.649902,...,2535.879997,2399.391238,2402.1543,2415.49526,2442.253848,2494.581474,44.319,,,


In [10]:
df_combined[['Date','Close','1d_diff']].head()

Unnamed: 0,Date,Close,1d_diff
0,2016-12-13,515.018127,-9.757507
1,2016-12-14,524.775635,3.244202
2,2016-12-15,521.531433,-2.080261
3,2016-12-16,523.611694,-2.674622
4,2016-12-19,526.286316,2.303162


In [11]:
df_combined.isna().sum()

Date                  0
Open                  0
High                  0
Low                   0
Close                 0
Adj Close             0
Volume                0
1d_diff               0
5d_diff               0
10d_diff              0
SMA3                  0
SMA5                  0
SMA9                  0
SMA15                 0
SMA30                 0
EMA3                  0
EMA5                  0
EMA9                  0
EMA15                 0
EMA30                 0
Volume_100k           0
title_senti_comp    846
news_senti_comp     846
avg_senti_comp      846
dtype: int64

In [12]:
# Only the three sentiment columns contain NaN (price rows without a matching
# news row); replace those gaps with a score of 0.0.
df_combined = df_combined.fillna(0.0)

In [13]:
# In the rows displayed above, 1d_diff equals Close minus the following row's
# Close, so subtracting it from Close recovers the next session's close.
df_combined['next_day_close'] = df_combined['Close'].sub(df_combined['1d_diff'])

In [14]:
df_combined[['Date','Close','next_day_close','1d_diff']].tail()

Unnamed: 0,Date,Close,next_day_close,1d_diff
1379,2022-07-11,2423.899902,2420.449951,3.449951
1380,2022-07-12,2420.449951,2377.550049,42.899902
1381,2022-07-13,2377.550049,2397.149902,-19.599854
1382,2022-07-14,2397.149902,2401.800049,-4.650146
1383,2022-07-15,2401.800049,2422.25,-20.449951


## Model creation

In [72]:
def linear_model_build_eval(X,Y,regressor_obj,train_size=0.7):
    """Fit a standardised regression pipeline on an ordered split and score it.

    The rows are split without shuffling, so the leading part of ``X``/``Y``
    becomes the training set and the trailing part the test set. Features are
    standardised by a ``StandardScaler`` that lives inside the pipeline and is
    therefore fitted on the training rows only.

    Parameters
    ----------
    X : pandas.DataFrame or 2-D numpy array
        Feature matrix. When it has a ``columns`` attribute the column labels
        are used to name the printed coefficients; otherwise coefficients are
        labelled by position.
    Y : array-like
        Target values aligned row-for-row with ``X``.
    regressor_obj : estimator
        Regressor exposing ``intercept_`` and ``coef_`` after fitting. The
        pipeline is built around this very object (no copy is made), so the
        passed estimator is the one that gets fitted.
    train_size : float or int, default 0.7
        Forwarded unchanged to ``train_test_split``.

    Returns
    -------
    tuple
        ``(y_pred, y_test, X_test, rmse, r2)`` where ``y_pred`` are the
        predictions for ``X_test``, ``rmse`` is the root mean squared error
        and ``r2`` the coefficient of determination on the test split.
    """
    # Ordered split. No random_state is passed: it only seeds the shuffling
    # step, which is disabled here, so it would have no effect.
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                    Y,
                                                    shuffle=False,
                                                    train_size = train_size)

    print(f"Train size : {X_train.shape}; Test Size : {X_test.shape}")

    # Scaling and regression are chained so the scaler statistics come from
    # the training rows and are re-applied to the test rows at predict time.
    lreg = Pipeline(steps=[('scaler', StandardScaler()),
                          ('regressor', regressor_obj )])

    # fit train set
    lreg.fit(X_train,y_train)

    fitted_regressor = lreg['regressor']
    print(f"Intercept : {fitted_regressor.intercept_}")

    # Label coefficients by column name when X is a DataFrame; fall back to
    # positional labels so array inputs do not break the report.
    rounded_coefs = np.round(fitted_regressor.coef_, 2)
    feature_names = X.columns if hasattr(X, 'columns') else range(len(rounded_coefs))
    print(f"Coefficient : {dict(zip(feature_names, rounded_coefs))}")

    # Evaluate on the held-out trailing rows.
    y_pred = lreg.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test,y_pred))
    r2 = r2_score(y_test,y_pred)

    return y_pred,y_test,X_test,rmse,r2

## 1d Prediction

In [15]:
x_features_1d = ['Volume_100k','EMA3','EMA5','avg_senti_comp']

In [16]:
df_combined[x_features_1d].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1384 entries, 0 to 1383
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Volume_100k     1384 non-null   float64
 1   EMA3            1384 non-null   float64
 2   EMA5            1384 non-null   float64
 3   avg_senti_comp  1384 non-null   float64
dtypes: float64(4)
memory usage: 54.1 KB


In [17]:
X1d = df_combined[x_features_1d]
y1d= df_combined['next_day_close']

In [64]:
# SGD shuffles the training rows each epoch; fixing random_state makes the
# fitted coefficients and the reported RMSE / R^2 reproducible across runs.
sgdregressor = SGDRegressor(max_iter=100, eta0=0.01, random_state=80)

In [73]:
y_pred1,y_test1,X_test1,rmse_1,r2_1 = linear_model_build_eval(X=X1d,Y=y1d,
                                                     regressor_obj=sgdregressor,
                                                     train_size=0.7)

Train size : (968, 4); Test Size : (416, 4)
Intercept : [1169.6232917]
Coefficient : {'Volume_100k': 4.98, 'EMA3': 209.81, 'EMA5': 203.49, 'avg_senti_comp': 1.19}


In [74]:
rmse_1

49.6933187968476

In [75]:
r2_1

0.9622165561124777

## 5d Prediction

##### Monday to Monday prediction

In [37]:
# 5d_diff follows the same sign convention as 1d_diff (Close minus the close
# five rows later, per the displayed rows), so subtracting it gives the
# close five sessions ahead.
df_combined['next5_day_close'] = df_combined['Close'].sub(df_combined['5d_diff'])

In [40]:
df_combined[['Date','Close','next5_day_close','5d_diff']].head(7)

Unnamed: 0,Date,Close,next5_day_close,5d_diff
0,2016-12-13,515.018127,523.983154,-8.965027
1,2016-12-14,524.775635,526.633057,-1.857422
2,2016-12-15,521.531433,521.729553,-0.19812
3,2016-12-16,523.611694,523.685974,-0.07428
4,2016-12-19,526.286316,518.807251,7.479065
5,2016-12-20,523.983154,527.351257,-3.368103
6,2016-12-21,526.633057,519.921692,6.711365


In [41]:
x_features_5d = ['Volume_100k','EMA3','EMA5','EMA9','avg_senti_comp']

In [43]:
df_combined[x_features_5d].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1384 entries, 0 to 1383
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Volume_100k     1384 non-null   float64
 1   EMA3            1384 non-null   float64
 2   EMA5            1384 non-null   float64
 3   EMA9            1384 non-null   float64
 4   avg_senti_comp  1384 non-null   float64
dtypes: float64(5)
memory usage: 64.9 KB


In [44]:
X5d = df_combined[x_features_5d]
y5d= df_combined['next5_day_close']

In [76]:
y_pred5,y_test5,X_test5,rmse_5,r2_5 = linear_model_build_eval(X=X5d,Y=y5d,
                                                     regressor_obj=sgdregressor,
                                                     train_size=0.7)

Train size : (968, 5); Test Size : (416, 5)
Intercept : [1175.82710931]
Coefficient : {'Volume_100k': 12.93, 'EMA3': 164.0, 'EMA5': 138.3, 'EMA9': 104.93, 'avg_senti_comp': 0.88}


In [77]:
rmse_5

95.09310932217095

In [78]:
r2_5

0.8607083898993245

## 10d Prediction

##### Monday to Monday two weeks later (14 calendar days, including weekends)

In [80]:
# 10d_diff appears to be Close minus the close ten rows later (consistent with
# the 5-day targets shown earlier), so subtracting it gives the close ten
# sessions ahead.
df_combined['next10_day_close'] = df_combined['Close'].sub(df_combined['10d_diff'])

In [82]:
df_combined[['Date','Close','next10_day_close','10d_diff']].head(15)

Unnamed: 0,Date,Close,next10_day_close,10d_diff
0,2016-12-13,515.018127,527.351257,-12.33313
1,2016-12-14,524.775635,519.921692,4.853943
2,2016-12-15,521.531433,527.722717,-6.191284
3,2016-12-16,523.611694,536.118164,-12.50647
4,2016-12-19,526.286316,537.455444,-11.169128
5,2016-12-20,523.983154,539.585266,-15.602112
6,2016-12-21,526.633057,528.564758,-1.931702
7,2016-12-22,521.729553,533.641602,-11.912048
8,2016-12-23,523.685974,532.428101,-8.742126
9,2016-12-26,518.807251,533.790222,-14.982971


In [83]:
x_features_10d = ['Volume_100k','EMA5','EMA9','EMA15','avg_senti_comp']

In [84]:
df_combined[x_features_10d].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1384 entries, 0 to 1383
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Volume_100k     1384 non-null   float64
 1   EMA5            1384 non-null   float64
 2   EMA9            1384 non-null   float64
 3   EMA15           1384 non-null   float64
 4   avg_senti_comp  1384 non-null   float64
dtypes: float64(5)
memory usage: 64.9 KB


In [85]:
X10d = df_combined[x_features_10d]
y10d= df_combined['next10_day_close']

In [86]:
y_pred10,y_test10,X_test10,rmse_10,r2_10 = linear_model_build_eval(X=X10d,Y=y10d,
                                                     regressor_obj=sgdregressor,
                                                     train_size=0.7)

Train size : (968, 5); Test Size : (416, 5)
Intercept : [1182.46540397]
Coefficient : {'Volume_100k': 22.81, 'EMA5': 191.88, 'EMA9': 133.33, 'EMA15': 75.74, 'avg_senti_comp': -2.41}


In [87]:
rmse_10

128.18309078966976

In [88]:
r2_10

0.7424087792433609