# Importing Relavant Libraries

In [1]:
import numpy as np 
import pandas as pd
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
%matplotlib inline

# Reading Dataset

In [2]:
#Import Dataset 
# data = pd.read_csv('r"SP_500_Tweets.csv"', sep=',', index_col='Date', parse_dates=True)
data = pd.read_csv(r"SP500_Reddit.csv", index_col=[0], parse_dates=True)
data.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-04-01,3764.61,3769.99,3662.71,3700.65,3700.65,5006680000
2021-05-01,3698.02,3737.83,3695.07,3726.86,3726.86,4582620000
2021-06-01,3712.2,3783.04,3705.34,3748.14,3748.14,6049970000
2021-07-01,3764.71,3811.55,3764.71,3803.79,3803.79,5080870000
2021-08-01,3815.05,3826.69,3783.6,3824.68,3824.68,4764180000


# Cleaning Dataset

In [3]:
print('Dataset size:',data.shape)
print('', '-'*20, '\n\nColumns are:',data.columns)
print('', '-'*100, '\n\n',data.dtypes)

Dataset size: (145, 6)
 -------------------- 

Columns are: Index(['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'], dtype='object')
 ---------------------------------------------------------------------------------------------------- 

 Open         float64
High         float64
Low          float64
Close        float64
Adj Close    float64
Volume         int64
dtype: object


In [4]:
data.isnull().sum()

Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64

In [5]:
# Change the datatype of the volume column to float
data['Volume'] = data['Volume'].astype(float)

In [6]:
# Remove columns which you can't use as features
data.drop(['Open', 'High', 'Low', 'Adj Close'], axis=1, inplace=True)
data.head()

Unnamed: 0_level_0,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-04-01,3700.65,5006680000.0
2021-05-01,3726.86,4582620000.0
2021-06-01,3748.14,6049970000.0
2021-07-01,3803.79,5080870000.0
2021-08-01,3824.68,4764180000.0


### Transforming into Supervised Problem 

In [7]:
#Create new column for price difference 

data['Difference']= data['Close'].diff()
data.dropna(inplace = True)

data.head()

Unnamed: 0_level_0,Close,Volume,Difference
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2021-05-01,3726.86,4582620000.0,26.21
2021-06-01,3748.14,6049970000.0,21.28
2021-07-01,3803.79,5080870000.0,55.65
2021-08-01,3824.68,4764180000.0,20.89
2021-11-01,3799.61,4450500000.0,-25.07


### Finding daily Stock Trend -> Binary

In [8]:
Rise = 1 
Fall = 0 

data['Trend'] = np.where(
    data['Difference'] > 0, Rise, Fall
)

data.head()

Unnamed: 0_level_0,Close,Volume,Difference,Trend
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-05-01,3726.86,4582620000.0,26.21,1
2021-06-01,3748.14,6049970000.0,21.28,1
2021-07-01,3803.79,5080870000.0,55.65,1
2021-08-01,3824.68,4764180000.0,20.89,1
2021-11-01,3799.61,4450500000.0,-25.07,0


### Creating Target

In [9]:
#Target is next day's Trend

data['Target'] = data.Trend.shift(-1)
data.dropna(inplace = True)

data.head()

Unnamed: 0_level_0,Close,Volume,Difference,Trend,Target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-05-01,3726.86,4582620000.0,26.21,1,1.0
2021-06-01,3748.14,6049970000.0,21.28,1,1.0
2021-07-01,3803.79,5080870000.0,55.65,1,1.0
2021-08-01,3824.68,4764180000.0,20.89,1,0.0
2021-11-01,3799.61,4450500000.0,-25.07,0,1.0


### Storing Cleaned Data

In [10]:
data.to_csv("sp_reddit_cleaned_data.csv")

# Preparing data to attach Sentiments with it 

### Shifting Window

In [11]:
#Shifting window
#data.Trend.shift(1)
#data.dropna(inplace = True)
#data.head()

In [12]:
def train_test_split(data,perc):
    data = data.values
    n = int(len(data) * (1 - perc))
    return data[:n], data[n:]

In [13]:
train, test = train_test_split(data,0.2)

In [14]:
#def sliding_window(train, window_size, horizon):
#    '''
#    sliding window generator.
#    '''
#    for i in range(len(train) - window_size - horizon + 1):
#        split_train = train[i:window_size+1]
#        split_val = train[i+window_size:window_size+1+horizon]
#        yield split_train, split_val

In [15]:
def rolling(train, min_train_size, horizon):
    '''
    rolling window generator.
    '''
    for i in range(len(train) - min_train_size - horizon + 1):
        split_train = train[i:min_train_size+1]
        split_val = train[i+min_train_size:min_train_size+1+horizon]
        yield split_train, split_val

In [16]:
cv_rolling = rolling(train, min_train_size=5, horizon=1)

print('data:{0}\n'.format(train))
i = 0
for cv_train, cv_val in cv_rolling:
    print(f'CV[{i+1}]')
    print(f'Train:\t{cv_train}')
    print(f'Val:\t{cv_val}')
    print('----')
    i+=1

data:[[ 3.72686e+03  4.58262e+09  2.62100e+01  1.00000e+00  1.00000e+00]
 [ 3.74814e+03  6.04997e+09  2.12800e+01  1.00000e+00  1.00000e+00]
 [ 3.80379e+03  5.08087e+09  5.56500e+01  1.00000e+00  1.00000e+00]
 [ 3.82468e+03  4.76418e+09  2.08900e+01  1.00000e+00  0.00000e+00]
 [ 3.79961e+03  4.45050e+09 -2.50700e+01  0.00000e+00  1.00000e+00]
 [ 3.80119e+03  4.97721e+09  1.58000e+00  1.00000e+00  1.00000e+00]
 [ 3.80984e+03  4.59042e+09  8.65000e+00  1.00000e+00  0.00000e+00]
 [ 3.79554e+03  5.18014e+09 -1.43000e+01  0.00000e+00  0.00000e+00]
 [ 3.76825e+03  5.35306e+09 -2.72900e+01  0.00000e+00  1.00000e+00]
 [ 3.79891e+03  4.98294e+09  3.06600e+01  1.00000e+00  1.00000e+00]
 [ 3.85185e+03  4.55179e+09  5.29400e+01  1.00000e+00  1.00000e+00]
 [ 3.85307e+03  4.48446e+09  1.22000e+00  1.00000e+00  0.00000e+00]
 [ 3.84147e+03  5.08043e+09 -1.16000e+01  0.00000e+00  1.00000e+00]
 [ 3.85536e+03  6.95586e+09  1.38900e+01  1.00000e+00  0.00000e+00]
 [ 3.84962e+03  6.02909e+09 -5.74000e+00  0

In [17]:
#def cross_validation_score(model, train, cv, metric):
#    cv_scores = []
#    for cv_train, cv_test in cv:
#        model.fit(cv_train, cv_test)
#        preds = model.predict(horizon=len(cv_test))
#        score=metric(y_true=cv_test, y_pred=preds)
#        cv_scores.append(score)
#    return np.array(cv_scores)

In [18]:
#cv_sliding = sliding_window(train, window_size=5, horizon=1)

#cv_scores_1 = cross_validation_score(model, train=train, cv=cv_sliding, metric=accuracy_score)

### Creating Sentiment Column

In [19]:
data = data.reset_index()
data.Date = data.Date.astype("object")
data.Date = pd.to_datetime(data.Date)

data["Sentiment"] = np.zeros(data.shape[0])
data["compound"] = np.zeros(data.shape[0])
data.head()

Unnamed: 0,Date,Close,Volume,Difference,Trend,Target,Sentiment,compound
0,2021-05-01,3726.86,4582620000.0,26.21,1,1.0,0.0,0.0
1,2021-06-01,3748.14,6049970000.0,21.28,1,1.0,0.0,0.0
2,2021-07-01,3803.79,5080870000.0,55.65,1,1.0,0.0,0.0
3,2021-08-01,3824.68,4764180000.0,20.89,1,0.0,0.0,0.0
4,2021-11-01,3799.61,4450500000.0,-25.07,0,1.0,0.0,0.0


### Reading Sentiment Data

In [20]:
data1 = pd.read_csv("final_polarity_data.csv")[['created_at', 'prediction_vader', 'compound']]
data1.columns = ['Date', 'Sentiment', 'compound']
data1.Date = pd.to_datetime(data1.Date)
data1.head()

Unnamed: 0,Date,Sentiment,compound
0,2020-04-15,1,0.9972
1,2020-06-25,1,0.9993
2,2020-06-04,1,0.9989
3,2020-07-03,1,0.9881
4,2020-07-09,1,0.9994


### Attaching sentiment with data

In [21]:
# Generating copy of both datasets
d = data.copy()
d1 = data1.copy()

# Attaching sentiment
rows = []
for i in d.Date:
    for j in d1.Date:
        if i==j:
            index = d[d.Date == i].index[0]
            new_row = dict(d.iloc[index,:])
            
            sent = d1[d1.Date == i].Sentiment.values[0]
            new_row["Sentiment"] = sent
            
            comp = d1[d1.Date == i].compound.values[0]
            new_row["compound"] = comp
            
            rows.append(new_row)
            
new_data = pd.DataFrame(data=rows, columns=d.columns)
new_data.set_index('Date', inplace=True)
new_data.head()

Unnamed: 0_level_0,Close,Volume,Difference,Trend,Target,Sentiment,compound
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2021-05-01,3726.86,4582620000.0,26.21,1,1.0,1,0.998
2021-06-01,3748.14,6049970000.0,21.28,1,1.0,1,0.9603
2021-07-01,3803.79,5080870000.0,55.65,1,1.0,1,0.9997
2021-08-01,3824.68,4764180000.0,20.89,1,0.0,1,0.9996
2021-11-01,3799.61,4450500000.0,-25.07,0,1.0,1,0.9907


In [22]:
new_data.to_csv("reddit_sent_data.csv")