In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
df_news = pd.read_csv("Reliance_sentiment.csv")

In [3]:
df_price = pd.read_csv("reliance_prices.csv")

In [4]:
df_price.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1384 entries, 0 to 1383
Data columns (total 21 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Date         1384 non-null   object 
 1   Open         1384 non-null   float64
 2   High         1384 non-null   float64
 3   Low          1384 non-null   float64
 4   Close        1384 non-null   float64
 5   Adj Close    1384 non-null   float64
 6   Volume       1384 non-null   int64  
 7   SMA3         1384 non-null   float64
 8   SMA5         1384 non-null   float64
 9   SMA9         1384 non-null   float64
 10  SMA15        1384 non-null   float64
 11  SMA30        1384 non-null   float64
 12  EMA3         1384 non-null   float64
 13  EMA5         1384 non-null   float64
 14  EMA9         1384 non-null   float64
 15  EMA15        1384 non-null   float64
 16  EMA30        1384 non-null   float64
 17  1d_diff      1384 non-null   float64
 18  5d_diff      1384 non-null   float64
 19  10d_di

In [5]:
df_news.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 654 entries, 0 to 653
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   date              654 non-null    object 
 1   title_senti_comp  654 non-null    float64
 2   news_senti_comp   654 non-null    float64
 3   avg_senti_comp    654 non-null    float64
dtypes: float64(3), object(1)
memory usage: 20.6+ KB


In [6]:
df_combined = pd.merge(left=df_price,right=df_news,how="left",left_on = 'Date',right_on='date')

In [7]:
df_combined.drop(['date'],axis=1,inplace=True)

In [8]:
df_combined.tail()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,SMA3,SMA5,SMA9,...,EMA9,EMA15,EMA30,1d_diff,5d_diff,10d_diff,Volume_100k,title_senti_comp,news_senti_comp,avg_senti_comp
1379,2022-07-25,2467.449951,2467.449951,2403.0,2420.399902,2420.399902,10665470,2469.93335,2469.980029,2438.738905,...,2450.729008,2453.819271,2484.103313,-82.700195,-1.850098,-3.5,106.655,,,
1380,2022-07-26,2421.100098,2443.899902,2411.350098,2421.5,2421.5,5216466,2448.333333,2466.86001,2443.622233,...,2444.883206,2449.779362,2480.06439,1.100098,-15.600098,1.050049,52.165,,,
1381,2022-07-27,2419.949951,2427.0,2402.0,2419.199951,2419.199951,3994321,2420.366618,2450.1,2446.072238,...,2439.746555,2445.956936,2476.137652,-2.300049,-83.800049,41.649902,39.943,,,
1382,2022-07-28,2436.0,2467.949951,2421.0,2457.100098,2457.100098,5766936,2432.600016,2444.26001,2452.216688,...,2443.217264,2447.349831,2474.909423,37.900146,-29.199951,59.950195,57.669,,,
1383,2022-07-29,2474.699951,2517.100098,2464.0,2509.449951,2509.449951,6982808,2461.916667,2445.52998,2461.905572,...,2456.463801,2455.112346,2477.137844,52.349854,6.349854,107.649902,69.828,,,


In [13]:
df_combined[['Date','Close','1d_diff']].head()

Unnamed: 0,Date,Close,1d_diff
0,2016-12-27,527.351257,8.325287
1,2016-12-28,519.921692,-7.239319
2,2016-12-29,527.722717,7.601318
3,2016-12-30,536.118164,8.18042
4,2017-01-02,537.455444,1.303101


In [9]:
df_combined.isna().sum()

Date                  0
Open                  0
High                  0
Low                   0
Close                 0
Adj Close             0
Volume                0
SMA3                  0
SMA5                  0
SMA9                  0
SMA15                 0
SMA30                 0
EMA3                  0
EMA5                  0
EMA9                  0
EMA15                 0
EMA30                 0
1d_diff               0
5d_diff               0
10d_diff              0
Volume_100k           0
title_senti_comp    845
news_senti_comp     845
avg_senti_comp      845
dtype: int64

In [10]:
df_combined.fillna(0.0,inplace=True)

In [None]:
## model creation

## 1d prediction

In [14]:
x_features_1d = ['Volume_100k','EMA3','EMA5','avg_senti_comp']

In [15]:
df_combined[x_features_1d].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1384 entries, 0 to 1383
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Volume_100k     1384 non-null   float64
 1   EMA3            1384 non-null   float64
 2   EMA5            1384 non-null   float64
 3   avg_senti_comp  1384 non-null   float64
dtypes: float64(4)
memory usage: 54.1 KB


In [16]:
X1d = df_combined[x_features_1d]
y1d= df_combined['1d_diff']

In [17]:
## train test split
X_train, X_test, y_train, y_test = train_test_split(X1d,
                                                    y1d,
                                                    train_size = 0.7,
                                                    random_state = 80)

In [18]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((968, 4), (416, 4), (968,), (416,))

## defining transformations

In [19]:
scaler = StandardScaler()

In [20]:
lreg_v1 = Pipeline(steps=[('scaler', scaler),
                          ('regressor', SGDRegressor(max_iter=100, eta0=0.01))])

In [21]:
lreg_v1.fit(X_train, y_train)

Pipeline(steps=[('scaler', StandardScaler()),
                ('regressor', SGDRegressor(max_iter=100))])

In [22]:
lreg_v1['regressor'].intercept_

array([1.10603013])

In [23]:
lreg_v1['regressor'].coef_

array([ 3.29206105,  2.52111844, -1.92397264,  0.93147149])

In [24]:
dict(zip(x_features_1d, np.round(lreg_v1['regressor'].coef_, 2)))

{'Volume_100k': 3.29, 'EMA3': 2.52, 'EMA5': -1.92, 'avg_senti_comp': 0.93}

In [25]:
y_pred = lreg_v1.predict(X_test)

In [26]:
y_df = pd.DataFrame({"actual": y_test,
                     "predicted": y_pred,
                     "residual": y_pred - y_test})

In [27]:
y_df.sample(10, random_state = 100)

Unnamed: 0,actual,predicted,residual
1118,-14.75,-1.240395,13.509605
345,-11.181702,0.021602,11.203304
1120,-21.400146,-1.572719,19.827427
1011,-34.489136,6.883316,41.372451
789,38.759277,7.456286,-31.302991
148,-5.639465,-2.113893,3.525573
80,7.432434,4.729925,-2.70251
632,-12.571289,0.306736,12.878025
520,30.768188,0.23855,-30.529638
1350,-34.799805,0.058606,34.85841


In [28]:
mse_v1 = mean_squared_error(y_test, y_pred)

In [29]:
mse_v1

783.4970041375224

In [30]:
rmse_v1 = np.sqrt(mse_v1)

In [31]:
rmse_v1

27.991016489894083

In [32]:
r2_score(y_test, y_pred)

0.011161629818256857