# Importing Relavant Libraries

In [1]:
import numpy as np 
import pandas as pd
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
%matplotlib inline

# Reading Dataset

In [2]:
#Import Dataset 
# data = pd.read_csv('r"SP_500_Tweets.csv"', sep=',', index_col='Date', parse_dates=True)
data = pd.read_csv(r"SP500_Tweets.csv", index_col=[0], parse_dates=True)
data.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-11-05,2915.46,2944.25,2903.44,2930.19,2930.19,4807320000
2020-12-05,2939.5,2945.82,2869.59,2870.12,2870.12,5107710000
2020-05-13,2865.86,2874.14,2793.15,2820.0,2820.0,6143130000
2020-05-14,2794.54,2852.8,2766.64,2852.5,2852.5,5641920000
2020-05-15,2829.95,2865.01,2816.78,2863.7,2863.7,5477040000


# Cleaning Dataset

In [3]:
print('Dataset size:',data.shape)
print('', '-'*20, '\n\nColumns are:',data.columns)
print('', '-'*100, '\n\n',data.dtypes)

Dataset size: (45, 6)
 -------------------- 

Columns are: Index(['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'], dtype='object')
 ---------------------------------------------------------------------------------------------------- 

 Open         float64
High         float64
Low          float64
Close        float64
Adj Close    float64
Volume         int64
dtype: object


In [4]:
data.isnull().sum()

Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64

In [5]:
# Change the datatype of the volume column to float
data['Volume'] = data['Volume'].astype(float)

In [6]:
# Remove columns which you can't use as features
data.drop(['Open', 'High', 'Low', 'Adj Close'], axis=1, inplace=True)
data.head()

Unnamed: 0_level_0,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-11-05,2930.19,4807320000.0
2020-12-05,2870.12,5107710000.0
2020-05-13,2820.0,6143130000.0
2020-05-14,2852.5,5641920000.0
2020-05-15,2863.7,5477040000.0


### Transforming into Supervised Problem 

In [7]:
#Create new column for price difference 

data['Difference']= data['Close'].diff()
data.dropna(inplace = True)

data.head()

Unnamed: 0_level_0,Close,Volume,Difference
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-12-05,2870.12,5107710000.0,-60.07
2020-05-13,2820.0,6143130000.0,-50.12
2020-05-14,2852.5,5641920000.0,32.5
2020-05-15,2863.7,5477040000.0,11.2
2020-05-18,2953.91,6364290000.0,90.21


### Finding daily Stock Trend -> Binary

In [8]:
Rise = 1 
Fall = 0 

data['Trend'] = np.where(
    data['Difference'] > 0, Rise, Fall
)

data.head()

Unnamed: 0_level_0,Close,Volume,Difference,Trend
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-12-05,2870.12,5107710000.0,-60.07,0
2020-05-13,2820.0,6143130000.0,-50.12,0
2020-05-14,2852.5,5641920000.0,32.5,1
2020-05-15,2863.7,5477040000.0,11.2,1
2020-05-18,2953.91,6364290000.0,90.21,1


### Creating Target

In [9]:
#Target is next day's Trend

data['Target'] = data.Trend.shift(-1)
data.dropna(inplace = True)

data.head()

Unnamed: 0_level_0,Close,Volume,Difference,Trend,Target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-12-05,2870.12,5107710000.0,-60.07,0,0.0
2020-05-13,2820.0,6143130000.0,-50.12,0,1.0
2020-05-14,2852.5,5641920000.0,32.5,1,1.0
2020-05-15,2863.7,5477040000.0,11.2,1,1.0
2020-05-18,2953.91,6364290000.0,90.21,1,0.0


### Storing Cleaned Data

In [10]:
data.to_csv("sp_tweets_cleaned_data.csv")

# Preparing data to attach Sentiments to data

### Shifting Window

In [11]:
#Shifting window
#data.Trend.shift(1)
#data.dropna(inplace = True)
#data.head()

In [12]:
def train_test_split(data,perc):
    data = data.values
    n = int(len(data) * (1 - perc))
    return data[:n], data[n:]

In [13]:
train, test = train_test_split(data,0.2)

In [14]:
#def sliding_window(train, window_size, horizon):
#    '''
#    sliding window generator.
#    '''
#    for i in range(len(train) - window_size - horizon + 1):
#        split_train = train[i:window_size+1]
#        split_val = train[i+window_size:window_size+1+horizon]
#        yield split_train, split_val

In [15]:
def rolling(train, min_train_size, horizon):
    '''
    rolling window generator.
    '''
    for i in range(len(train) - min_train_size - horizon + 1):
        split_train = train[i:min_train_size+1]
        split_val = train[i+min_train_size:min_train_size+1+horizon]
        yield split_train, split_val

In [16]:
cv_rolling = rolling(train, min_train_size=5, horizon=1)

print('data:{0}\n'.format(train))
i = 0
for cv_train, cv_val in cv_rolling:
    print(f'CV[{i+1}]')
    print(f'Train:\t{cv_train}')
    print(f'Val:\t{cv_val}')
    print('----')
    i+=1

data:[[ 2.87012e+03  5.10771e+09 -6.00700e+01  0.00000e+00  0.00000e+00]
 [ 2.82000e+03  6.14313e+09 -5.01200e+01  0.00000e+00  1.00000e+00]
 [ 2.85250e+03  5.64192e+09  3.25000e+01  1.00000e+00  1.00000e+00]
 [ 2.86370e+03  5.47704e+09  1.12000e+01  1.00000e+00  1.00000e+00]
 [ 2.95391e+03  6.36429e+09  9.02100e+01  1.00000e+00  0.00000e+00]
 [ 2.92294e+03  4.96933e+09 -3.09700e+01  0.00000e+00  1.00000e+00]
 [ 2.97161e+03  4.99297e+09  4.86700e+01  1.00000e+00  0.00000e+00]
 [ 2.94851e+03  4.96694e+09 -2.31000e+01  0.00000e+00  1.00000e+00]
 [ 2.95545e+03  3.95280e+09  6.94000e+00  1.00000e+00  1.00000e+00]
 [ 2.99177e+03  5.83706e+09  3.63200e+01  1.00000e+00  1.00000e+00]
 [ 3.03613e+03  6.37123e+09  4.43600e+01  1.00000e+00  0.00000e+00]
 [ 3.02973e+03  5.40267e+09 -6.40000e+00  0.00000e+00  1.00000e+00]
 [ 3.04431e+03  7.27508e+09  1.45800e+01  1.00000e+00  1.00000e+00]
 [ 3.05573e+03  4.67341e+09  1.14200e+01  1.00000e+00  1.00000e+00]
 [ 3.08082e+03  5.18723e+09  2.50900e+01  1

In [17]:
#def cross_validation_score(model, train, cv, metric):
#    cv_scores = []
#    for cv_train, cv_test in cv:
#        model.fit(cv_train, cv_test)
#        preds = model.predict(horizon=len(cv_test))
#        score=metric(y_true=cv_test, y_pred=preds)
#        cv_scores.append(score)
#    return np.array(cv_scores)

In [18]:
#cv_sliding = sliding_window(train, window_size=5, horizon=1)

#cv_scores_1 = cross_validation_score(model, train=train, cv=cv_sliding, metric=accuracy_score)

### Creating Sentiment Column

In [19]:
data = data.reset_index()
data.Date = data.Date.astype("object")
data.Date = pd.to_datetime(data.Date)

data["Sentiment"] = np.zeros(data.shape[0])
data["compound"] = np.zeros(data.shape[0])
data.head()

Unnamed: 0,Date,Close,Volume,Difference,Trend,Target,Sentiment,compound
0,2020-12-05,2870.12,5107710000.0,-60.07,0,0.0,0.0,0.0
1,2020-05-13,2820.0,6143130000.0,-50.12,0,1.0,0.0,0.0
2,2020-05-14,2852.5,5641920000.0,32.5,1,1.0,0.0,0.0
3,2020-05-15,2863.7,5477040000.0,11.2,1,1.0,0.0,0.0
4,2020-05-18,2953.91,6364290000.0,90.21,1,0.0,0.0,0.0


### Reading Sentiment Data

In [20]:
data1 = pd.read_csv("final_polarity_data.csv")[['created_at', 'prediction_vader', 'compound']]
data1.columns = ['Date', 'Sentiment', 'compound']
data1.Date = pd.to_datetime(data1.Date)
data1.head()

Unnamed: 0,Date,Sentiment,compound
0,2020-04-15,1,0.9972
1,2020-06-25,1,0.9993
2,2020-06-04,1,0.9989
3,2020-07-03,1,0.9881
4,2020-07-09,1,0.9994


### Attaching sentiment with data

In [21]:
# Generating copy of both datasets
d = data.copy()
d1 = data1.copy()

# Attaching sentiment
rows = []
for i in d.Date:
    for j in d1.Date:
        if i==j:
            index = d[d.Date == i].index[0]
            new_row = dict(d.iloc[index,:])
            
            sent = d1[d1.Date == i].Sentiment.values[0]
            new_row["Sentiment"] = sent
            
            comp = d1[d1.Date == i].compound.values[0]
            new_row["compound"] = comp
            
            rows.append(new_row)
            
new_data = pd.DataFrame(data=rows, columns=d.columns)
new_data.set_index('Date', inplace=True)
new_data.head()

Unnamed: 0_level_0,Close,Volume,Difference,Trend,Target,Sentiment,compound
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-05-13,2820.0,6143130000.0,-50.12,0,1.0,-1,-0.8402
2020-05-14,2852.5,5641920000.0,32.5,1,1.0,1,0.8357
2020-05-15,2863.7,5477040000.0,11.2,1,1.0,-1,-0.7506
2020-05-18,2953.91,6364290000.0,90.21,1,0.0,-1,-0.7096
2020-05-19,2922.94,4969330000.0,-30.97,0,1.0,-1,-0.8225


In [22]:
new_data.to_csv("tweets_sent_data.csv")