In [None]:
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import statsmodels.api as sm
import statsmodels.graphics.tsaplots as smt
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


In [None]:
#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))

In [None]:
# Load the daily headlines and the DJIA price table.  parse_dates=[0] parses
# the first column of each file as dates.
df = pd.read_csv("Combined_News_DJIA.csv", low_memory=False,
                 parse_dates=[0])

full_stock = pd.read_csv("DJIA_table.csv", low_memory=False,
                         parse_dates=[0])

# Attach the closing/opening prices to the headlines BY DATE.
# Assigning `full_stock.Close` directly aligns on the row number, which pairs
# prices with the wrong day whenever the two files are not in the same row
# order.  NOTE(review): the public DJIA_table.csv is usually distributed
# newest-first while the news file is oldest-first -- confirm for this copy.
news_date_col = df.columns[0]
price_date_col = full_stock.columns[0]
prices = (
    full_stock[[price_date_col, "Close", "Open"]]
    .rename(columns={price_date_col: news_date_col})
)
# validate="m:1" fails loudly if the price table has duplicate dates, which
# would otherwise silently duplicate headline rows.
df = df.merge(prices, on=news_date_col, how="left", validate="m:1")

# Later cells (next-day label, lag features) rely on chronological order and
# on a 0..n-1 integer index.
df = df.sort_values(news_date_col, kind="stable").reset_index(drop=True)

#show what the dataset looks like
df.head(5)

In [None]:
df['Close'][1988]

In [None]:
#adding new label(sentiment) based on news impact on stock prices
def news_impact_label(close=None):
    """Label each row by whether the NEXT row's close is strictly higher.

    Parameters
    ----------
    close : array-like of float, optional
        Closing prices in row order.  Defaults to the notebook-level
        ``df['Close']`` so existing zero-argument calls keep working.

    Returns
    -------
    list of int
        One label per input row: 1 if ``close[i] < close[i + 1]``, otherwise 0
        (lower, equal, or either value missing).  The last row has no
        successor and is always 0.  An empty input yields an empty list.
    """
    if close is None:
        close = df['Close']
    values = np.asarray(close, dtype=float)
    if values.size == 0:
        return []
    # Comparisons involving NaN evaluate to False, so a missing price yields a
    # 0 label instead of silently shortening the result.
    rose = values[1:] > values[:-1]
    return [int(flag) for flag in rose] + [0]

In [None]:
df['News_Impact'] = news_impact_label()

In [None]:
df.head(3)

In [None]:
#drop the label column
df = df.drop(["Label"], axis=1)

In [None]:
#check for NAN
df.isnull().sum()

In [None]:
df = df.replace(np.nan, ' ', regex=True)

#sanity check
df.isnull().sum().sum()

In [None]:
df = df.replace('b\"|b\'|\\\\|\\\"', '', regex=True)
df.head(2)

In [None]:
Anakin = SentimentIntensityAnalyzer()

Anakin.polarity_scores(" ")

In [None]:
def detect_subjectivity(text):
    """Return the subjectivity score TextBlob assigns to ``text``."""
    sentiment = TextBlob(text).sentiment
    return sentiment.subjectivity

detect_subjectivity(" ") #should return 0

In [None]:
#get the headline columns' names
cols = []
for i in range(1,26):
    col = ("Top{}".format(i))
    cols.append(col)

In [None]:
start_vect=time.time()
print("ANAKIN: 'Intializing the process..'")

#get the name of the headline columns
cols = []
for i in range(1,26):
    col = ("Top{}".format(i))
    cols.append(col)


for col in cols:
    df[col] = df[col].astype(str) # Make sure data is treated as a string
    df[col+'_comp']= df[col].apply(lambda x:Anakin.polarity_scores(x)['compound'])
    df[col+'_sub'] = df[col].apply(detect_subjectivity)
    print("{} Done".format(col))
    
print("VADER: Vaderization completed after %0.2f Minutes"%((time.time() - start_vect)/60))

In [None]:
#the text isn't required anymore
df = df.drop(cols,axis=1)
df.head(5)

In [None]:
def _row_summaries(frame, columns, weights):
    """Return (weighted mean, max, min) of ``columns`` for every row of ``frame``."""
    scores = frame[columns].to_numpy(dtype=float)
    return (
        np.average(scores, axis=1, weights=weights),
        scores.max(axis=1),
        scores.min(axis=1),
    )


# Per-headline score columns created by the sentiment cell.
comp_cols = [col + "_comp" for col in cols]
sub_cols = [col + "_sub" for col in cols]

# Rank weights: the first headline column gets the largest weight and the
# last one gets weight 1 (25, 24, ..., 1 for the 25 Top* columns).
w = list(range(len(cols), 0, -1))

# Computed positionally on the whole matrix, so this does not depend on the
# frame having a default 0..n-1 index.
comp_mean, comp_max, comp_min = _row_summaries(df, comp_cols, w)
df['compound_mean'] = comp_mean
df['compound_max'] = comp_max
df['compound_min'] = comp_min

sub_mean, sub_max, sub_min = _row_summaries(df, sub_cols, w)
df['subjectivity_mean'] = sub_mean
df['subjectivity_max'] = sub_max
df['subjectivity_min'] = sub_min

# The per-headline scores are no longer needed once aggregated.
to_drop = sub_cols+comp_cols
df = df.drop(to_drop, axis=1)

In [None]:
#df

In [None]:
def avg(avg_len, col, df):
    """Add a trailing moving average of ``df[col]`` as a new column.

    The new column is named ``'avg <avg_len>'``.  Row ``i`` holds the mean of
    the ``avg_len`` values ending at row ``i``; the first ``avg_len - 1`` rows,
    which have no complete window, are set to 0.

    Parameters
    ----------
    avg_len : int
        Window length in rows; must be at least 1.
    col : str
        Name of the numeric column to average.
    df : pandas.DataFrame
        Frame to read from and to add the new column to (modified in place).

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``avg_len`` is smaller than 1.
    """
    if avg_len < 1:
        raise ValueError('avg_len must be a positive integer')
    moving = df[col].rolling(window=avg_len).mean()
    # Only the warm-up rows are zero-filled; NaNs caused by missing source
    # values stay visible.
    moving.iloc[:avg_len - 1] = 0
    df_name = 'avg '+str(avg_len)
    df[df_name] = moving
    return

# Build sliding-window samples from the opening prices.
# NOTE(review): the previous loop appended to `new_df` / `target_df` before
# they were ever created and read df['Open'][i-1] at i == 0, so it could not
# run.  The window/horizon below are a reading of its intent (4 most recent
# opens -> the open 4 rows later) -- confirm before relying on these arrays.
WINDOW = 4    # consecutive opens per feature row: [open[i], open[i-1], ...]
HORIZON = 4   # the target is the open this many rows after the anchor row

opens = df['Open'].to_numpy(dtype=float)
# An anchor row needs WINDOW-1 rows of history and HORIZON rows of future,
# which also keeps features and targets the same length.
anchors = range(WINDOW - 1, len(opens) - HORIZON)

new_df = np.array(
    [[opens[i - back] for back in range(WINDOW)] for i in anchors],
    dtype=float,
).reshape(-1, WINDOW)
target_df = np.array(
    [[opens[i + HORIZON]] for i in anchors],
    dtype=float,
).reshape(-1, 1)

In [None]:
df['Close'][0]

In [None]:
#df

In [None]:
#avg(5, 'Close', df)

In [None]:
df.head(3)

In [None]:
list_len = [x for x in range(10)]
list_len

In [None]:
df.head(5)

Exploratory data analysis

In [None]:
import plotly.graph_objects as go
import matplotlib.pyplot as plt

In [None]:
fig1 = go.Figure()
fig1.add_trace(go.Scatter(x=df.Date, y=df.Close,
                    mode='lines'))
title = []
title.append(dict(xref='paper', yref='paper', x=0.0, y=1.05,
                              xanchor='left', yanchor='bottom',
                              text='Development of stock values from Aug, 2008 to Jun, 2016',
                              font=dict(family='Arial',
                                        size=30,
                                        color='rgb(37,37,37)'),
                              showarrow=False))
fig1.update_layout(xaxis_title='Date',
                   yaxis_title='Closing stock value (in $)',
                  annotations=title)
fig1.show()

In [None]:
#function for quick plotting and testing of stationarity
def stationary_plot(y, lags=None, figsize=(12, 7), style='bmh'):
    """
        Draw a series together with its ACF and PACF, and put the
        Dickey-Fuller p-value in the title of the series panel.

        y - timeseries (converted to a pandas Series when it is not one)
        lags - passed through to the ACF and PACF plots
        figsize - size of the matplotlib figure
        style - matplotlib style applied while plotting
    """
    series = y if isinstance(y, pd.Series) else pd.Series(y)

    with plt.style.context(style):
        plt.figure(figsize=figsize)
        grid = (2, 2)
        # Top row: the series itself; bottom row: ACF (left) and PACF (right).
        series_axis = plt.subplot2grid(grid, (0, 0), colspan=2)
        acf_axis = plt.subplot2grid(grid, (1, 0))
        pacf_axis = plt.subplot2grid(grid, (1, 1))

        series.plot(ax=series_axis)
        # Element 1 of adfuller's result is used as the p-value.
        adf_p_value = sm.tsa.stattools.adfuller(series)[1]
        heading = 'Time Series Analysis Plots\n Dickey-Fuller: p={0:.5f}'.format(adf_p_value)
        series_axis.set_title(heading)
        smt.plot_acf(series, lags=lags, ax=acf_axis)
        smt.plot_pacf(series, lags=lags, ax=pacf_axis)
        plt.tight_layout()


In [None]:
stationary_plot(df.Close)

In [None]:
diff = df.Close - df.Close.shift(7)
stationary_plot(diff[7:])

In [None]:
diff2 = diff - diff.shift(1)
stationary_plot(diff2[7+1:], lags=60)

In [None]:
fig2 = go.Figure()
fig2.add_trace(go.Scatter(x=df.Date, y=df.compound_mean,
                    mode='lines',
                    name='Mean'))
fig2.add_trace(go.Scatter(x=df.Date, y=df.compound_max,
                    mode='lines',
                    name='Maximum'))
fig2.add_trace(go.Scatter(x=df.Date, y=df.compound_min,
                    mode='lines',
                    name='Minimum'))
title = []
title.append(dict(xref='paper', yref='paper', x=0.0, y=1.05,
                              xanchor='left', yanchor='bottom',
                              text='Development of sentiment compound score',
                               font=dict(family='Arial',
                                       size=30,
                                        color='rgb(37,37,37)'),
                              showarrow=False))
fig2.update_layout(xaxis_title='Date',
                   yaxis_title='Compound score',
                  annotations=title)
fig2.show()

In [None]:
compm_hist = px.histogram(df, x="compound_mean")
compm_hist.show()

In [None]:
fig3 = go.Figure()
fig3.add_trace(go.Scatter(x=df.Date, y=df.subjectivity_mean,
                    mode='lines',
                    name='Mean'))
fig3.add_trace(go.Scatter(x=df.Date, y=df.subjectivity_min,
                    mode='lines',
                    name='Min'))
fig3.add_trace(go.Scatter(x=df.Date, y=df.subjectivity_max,
                    mode='lines',
                    name='Max'))
title = []
title.append(dict(xref='paper', yref='paper', x=0.0, y=1.05,
                              xanchor='left', yanchor='bottom',
                              text='Development of subjectivity score',
                              font=dict(family='Arial',
                                        size=30,
                                        color='rgb(37,37,37)'),
                              showarrow=False))
fig3.update_layout(xaxis_title='Date',
                   yaxis_title='Subjectivity score',
                  annotations=title)
fig3.show()

In [None]:
subm_hist = px.histogram(df, x="subjectivity_mean")
subm_hist.show()

In [None]:
df.describe()

In [None]:
df.corr()

Feature Selection

In [None]:
def unique_ratio(col):
    """Return the share of distinct values in ``col``.

    Computed as ``len(np.unique(col)) / len(col)``.  An empty input returns
    0.0 instead of raising ZeroDivisionError.
    """
    total = len(col)
    if total == 0:
        return 0.0
    return len(np.unique(col)) / total

cols = ['Close', 'News_Impact', 'compound_mean', 'compound_max', 'compound_min', 'subjectivity_mean', 'subjectivity_max', 'subjectivity_min']

ur = []
var = []
for col in cols:
    ur.append(unique_ratio(df[col]))
    var.append(np.var(df[col]))
    
feature_sel = pd.DataFrame({'Column': cols, 
              'Unique': ur,
              'Variance': var})
feature_sel

In [None]:
sel_fig = go.Figure(data=go.Scatter(
    x=feature_sel.Column,
    y=feature_sel.Unique,
    mode='markers',
    marker=dict(size=(feature_sel.Unique*100)),
))
sel_fig.update_layout(title='Ratio of unique values', 
                      yaxis_title='Unique ratio')
sel_fig.show()

In [None]:
drop = ['subjectivity_min', 'subjectivity_max']
clean_df = df.drop(drop,axis=1)

Lag the extracted features

In [None]:
lag_df = clean_df.copy()
lag_df.head(3)

In [None]:
to_lag = list(lag_df.columns)
to_lag_4 = to_lag[1]
to_lag_1 = to_lag[2:len(to_lag)]

In [None]:
#lagging text features two days back
for col in to_lag_1:
    for i in range(1,3):
        new_name = col + ('_lag_{}'.format(i))
        lag_df[new_name] = lag_df[col].shift(i)
    
#lagging closing values 4 days back
for i in range(1, 5):
    new_name = to_lag_4 + ('_lag_{}'.format(i))
    lag_df[new_name] = lag_df[to_lag_4].shift(i)

In [None]:
#Show how many rows need to be removed
lag_df.head(10) 

In [None]:
# The longest lag created above is 4 rows (the *_lag_4 column), so the first
# four rows contain NaNs produced by shift() and are removed.  Positional
# slicing replaces `lag_df.index[[np.arange(0,4)]]`: that nested-list index is
# interpreted as 2-D indexing by recent numpy/pandas releases.
lag_df = lag_df.iloc[4:].reset_index(drop=True)

#sanity check for NaNs
lag_df.isnull().sum().sum()

In [None]:
lag_df.head(5)

Model training

In [None]:
# for time-series cross-validation set 10 folds 
tscv = TimeSeriesSplit(n_splits=10)

def mape(y_true, y_pred):
    """Mean absolute percentage error, in percent.

    Parameters
    ----------
    y_true, y_pred : array-like of numbers
        Actual and predicted values of the same length.  Lists, numpy arrays
        and pandas Series are accepted; values are compared by position, so
        Series with different index labels are not re-aligned.

    Returns
    -------
    float
        ``mean(|y_true - y_pred| / |y_true|) * 100``.  A zero in ``y_true``
        makes the corresponding term (and therefore the result) inf or nan.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
# Mean squared error is a loss: greater_is_better=False tells scikit-learn to
# negate it so that model selection prefers the SMALLER error.
scorer = make_scorer(mean_squared_error, greater_is_better=False)
# Standardising scaler; not applied anywhere in the visible part of the notebook.
scaler = StandardScaler()

In [None]:
def ts_train_test_split(X, y, test_size):
    """
        Chronological train/test split: the leading rows become the training
        set and the trailing ``test_size`` fraction becomes the test set.
        No shuffling is done; ``X`` and ``y`` are sliced positionally (.iloc).

        Returns X_train, X_test, y_train, y_test.
    """
    # position of the first test row
    cut = int(len(X)*(1-test_size))
    head, tail = slice(None, cut), slice(cut, None)

    return X.iloc[head], X.iloc[tail], y.iloc[head], y.iloc[tail]

In [None]:
X = lag_df.drop(['News_Impact'],axis=1)
#X = lag_df.drop(['Label'],axis=1)
#X.index = X["Date"]
X = X.drop(['Date'],axis=1)
y = lag_df.News_Impact

X_train, X_test, y_train, y_test = ts_train_test_split(X, y, test_size = 0.2)

#sanity check  hahaha
(len(X_train)+len(X_test))==len(X)

In [None]:
X_train

In [None]:
#len_df = int((75/100)*len(df))
#train_df = df[:len_df]
#test_df = df[len_df:]
train_df = X_train.copy()
train_df['News_Impact'] = y_train.values
#train_df['Label'] = y_train.values
test_df = X_test.copy()
test_df['News_Impact'] = y_test.values
#test_df['Label'] = y_test.values

In [None]:
train_df

In [None]:
y_train

In [None]:
#dff_i

In [None]:
#dff = df.drop(columns = ['Date', 'News_Impact'])
#dff_i = df[['News_Impact']]
#dff = dff.merge(dff_i, left_index = True, right_index = True)

In [None]:
#dff

In [None]:
#dff['News_Impact'].value_counts()

In [None]:
import torch
from torchvision import datasets
from torchvision import transforms
# Tensor transform
transform = transforms.ToTensor()

# SVHN training datasets
#svhn_train = datasets.SVHN(root='data/', split='train', download=True, transform=transform)

batch_size = 21
test_batch_size = 399
num_workers = 0

# build DataLoaders for SVHN dataset
train_loader = torch.utils.data.DataLoader(dataset=train_df.values, #dff.values,
                                          batch_size=batch_size,
                                          shuffle=True,
                                          num_workers=num_workers)
test_loader = torch.utils.data.DataLoader(dataset=test_df.values, #dff.values,
                                          batch_size=test_batch_size,
                                          shuffle=True,
                                          num_workers=num_workers)

In [None]:
#train_df

In [None]:
#next(iter(train_loader))#[:, 1]

In [None]:
for idx, batch in enumerate(train_loader):
    print(idx)
    print(batch)

In [None]:
#This funcn creates batches
def perfect_batches2(df, batch_size):
    """Split the rows of ``df`` into consecutive, equally sized batches.

    Parameters
    ----------
    df : pandas.DataFrame
        Source rows; ``df.values`` is what gets batched.
    batch_size : int
        Rows per batch; must be positive and divide ``len(df)`` exactly.

    Returns
    -------
    numpy.ndarray or None
        Array of shape ``(len(df) // batch_size, batch_size, n_columns)`` with
        the rows in their original order.  When ``len(df)`` is not a multiple
        of ``batch_size`` the usable batch sizes are printed and None is
        returned.

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError('batch_size must be a positive integer')
    n_rows = len(df)
    if n_rows % batch_size != 0:
        print('Cannot create a perfect batch size with the input. \nTry the following: ')
        # Every divisor of the row count, including the row count itself.
        for candidate in range(1, n_rows + 1):
            if n_rows % candidate == 0:
                print('This number can do: ', candidate)
        print('\n\nEnd of numbers!')
        return None
    values = df.values
    # Copy so that editing a batch can never write through to the frame.
    return values.reshape(n_rows // batch_size, batch_size, *values.shape[1:]).copy()

In [None]:
import torch
batched_train = perfect_batches2(train_df, 5)
batched_test = perfect_batches2(test_df, 3)

In [None]:
for batch in train_loader:
    print(batch[:, :-1])

In [None]:
###if above is going to give problems use Dataloaders

In [None]:
import torch
import torch.nn as nn

class BinaryClassifier(nn.Module):
    """Fully connected network: input_len -> 200 -> 100 -> 50 -> 1.

    Each hidden layer is Linear -> BatchNorm1d -> ReLU -> Dropout(0.30); the
    output layer is followed by a sigmoid.
    """

    def __init__(self, input_len):
        super().__init__()
        widths = (input_len, 200, 100, 50)
        # Registers fc1/bn1, fc2/bn2, fc3/bn3 in that order.
        for position, (fan_in, fan_out) in enumerate(zip(widths, widths[1:]), start=1):
            setattr(self, 'fc{}'.format(position), nn.Linear(fan_in, fan_out))
            setattr(self, 'bn{}'.format(position), nn.BatchNorm1d(fan_out))
        self.fc4 = nn.Linear(50, 1)
        self.sigmoid = nn.Sigmoid()
        self.dropout = nn.Dropout(0.30)

    def forward(self, x):
        # The input is cast to float64, so the module's parameters must be
        # float64 as well (the training function calls model.double()).
        hidden = x.double()
        stages = (
            (self.fc1, self.bn1),
            (self.fc2, self.bn2),
            (self.fc3, self.bn3),
        )
        for linear, norm in stages:
            hidden = self.dropout(nn.functional.relu(norm(linear(hidden))))
        return self.sigmoid(self.fc4(hidden))


In [None]:
import torch.optim as optim
def modelAndOptim(input_len, lr):
    """Create the classifier together with its optimiser and loss.

    Parameters
    ----------
    input_len : int
        Number of input features, passed to ``BinaryClassifier``.
    lr : float
        Learning rate for the SGD optimiser.

    Returns
    -------
    tuple
        ``(model, optimizer, loss_func)``.
    """
    modell = BinaryClassifier(input_len)
    optimizer = optim.SGD(modell.parameters(), lr)
    # BinaryClassifier.forward already ends in a sigmoid, so the loss has to
    # accept probabilities.  BCEWithLogitsLoss applies its own sigmoid and
    # would squash the outputs twice.
    loss_func = nn.BCELoss()
    return modell, optimizer, loss_func

In [None]:
def accuracy(pred, actual):
    """Fraction of predictions that match the labels after thresholding.

    Parameters
    ----------
    pred : tensor or array-like
        Scores; a value >= 0.5 counts as class 1, anything else as class 0.
    actual : tensor or array-like
        Labels, same number of elements as ``pred``.

    Returns
    -------
    float
        Share of positions where the thresholded prediction equals the label;
        0.0 for empty input.

    Raises
    ------
    ValueError
        If ``pred`` and ``actual`` hold a different number of elements.
    """
    # detach(): the scores usually come straight from the model and carry an
    # autograd graph that is irrelevant for a metric.
    pred = torch.as_tensor(pred).detach().reshape(-1)
    actual = torch.as_tensor(actual).detach().reshape(-1)
    if pred.numel() != actual.numel():
        raise ValueError('pred and actual must have the same number of elements')
    if pred.numel() == 0:
        return 0.0
    hard = (pred >= 0.5).to(dtype=actual.dtype, device=actual.device)
    return (hard == actual).double().mean().item()
    
#accuracy(v, a)

In [None]:
#create a train method and validation
#Theres an error i spotted i have to make sure that it loops through the batch one at a time not all at once
model, optimizer, loss = modelAndOptim(22, 0.005)
def train(epoch, train_data, model, optimizer, loss):
    """Run ``epoch`` training passes over ``train_data``.

    Every batch is split into features (all columns but the last) and label
    (last column).  After each pass the mean batch loss and the accuracy of
    the LAST batch of that pass are printed and recorded.  When training
    finishes, the mean of the recorded accuracies is printed and the loss
    history is plotted.  There is one early exit: at epoch index exactly 500,
    if the recorded accuracy is >= 0.999.

    Returns the predictions and labels of the last processed batch.
    """
    def _show_loss_curve(history):
        plt.plot(history, label = 'loss')
        plt.legend()
        plt.show()

    model.train()
    model = model.double()
    sum_loss = []
    accuracy_count = []
    for e in range(epoch):
        losses = []
        acc = None

        for batch in train_data:
            features, label = batch[:, :-1], batch[:, -1:]
            optimizer.zero_grad()

            predicted = model(features)
            # force both tensors into column shape before computing the loss
            predicted = predicted.view(len(predicted), 1)
            label = label.view(len(label), 1)
            loss_error = loss(predicted.double(), label.double())
            losses.append(loss_error.item())
            loss_error.backward()
            optimizer.step()
            # overwritten on every batch: only the last batch's value survives
            acc = accuracy(predicted.view(-1), label.view(-1))

        epoch_loss = sum(losses)/len(losses)
        print('Average error at ',e,' epoch:  ', epoch_loss)
        acr = acc
        print('current batch accurate prediction: ', acr)
        accuracy_count.append(acr)
        sum_loss.append(epoch_loss)
        if acr >= 0.999 and e == 500:
            _show_loss_curve(sum_loss)
            return predicted, label

    print('\nOverall Accuracy: ', sum(accuracy_count)/len(accuracy_count))
    print('\n\n\n\n')
    _show_loss_curve(sum_loss)
    print('end of train')
    print("")
    return predicted, label

In [None]:
pr, lr = train(250, train_loader, model, optimizer, loss)

import torch
import torch.nn as nn
import torch.optim as optim

# Define the ANN architecture
class SentimentModel(nn.Module):
    """Three-layer perceptron with a sigmoid output.

    Layer sizes: input_size -> hidden_size -> hidden_size // 2 -> output_size,
    with ReLU after the first two linear layers.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        half_hidden = hidden_size//2
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, half_hidden)
        self.fc3 = nn.Linear(half_hidden, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        # The input is cast to float64, so the parameters must be float64 too
        # (the training function calls model.double()).
        hidden = self.relu(self.fc1(x.double()))
        hidden = self.relu(self.fc2(hidden))
        return self.sigmoid(self.fc3(hidden))
model = SentimentModel(input_size = 22, hidden_size=100, output_size=1)
# Define the loss function and optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Define the training loop
def train(model, train_loader, criterion, optimizer, epochs):
    """Train ``model`` for ``epochs`` passes over ``train_loader``.

    Every batch is split into features (all columns but the last) and labels
    (last column).  After each pass the mean batch loss and the sample-weighted
    accuracy over the whole pass are printed.  When training finishes the
    per-epoch loss curve is plotted.

    Parameters
    ----------
    model : torch.nn.Module
        Network whose output is compared with the labels by ``criterion``;
        it is converted to float64.
    train_loader : iterable
        Yields 2-D tensors with the label in the last column.
    criterion : callable
        Loss taking ``(outputs, labels)``.
    optimizer : torch.optim.Optimizer
        Optimiser bound to ``model``'s parameters.
    epochs : int
        Number of passes.

    Returns
    -------
    tuple
        Outputs and labels of the last processed batch, or ``(None, None)``
        when ``epochs`` is 0.

    Raises
    ------
    ValueError
        If ``train_loader`` yields no batches.
    """
    model.train()
    model = model.double()
    sum_loss = []        # mean batch loss per epoch
    accuracy_count = []  # accuracy per epoch
    outputs, labels = None, None
    for epoch in range(epochs):
        running_loss = 0.0
        weighted_hits = 0.0
        seen = 0
        n_batches = 0
        for data in train_loader:
            features = data[:, :-1]
            labels = data[:, -1:]
            optimizer.zero_grad()
            outputs = model(features)
            loss = criterion(outputs, labels.double())
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            # Weight each batch by its size so a short final batch does not
            # distort the epoch accuracy.
            batch_len = len(labels)
            weighted_hits += accuracy(outputs.view(-1), labels.view(-1)) * batch_len
            seen += batch_len
            n_batches += 1
        if n_batches == 0:
            raise ValueError('train_loader produced no batches')
        epoch_loss = running_loss / n_batches
        acr = weighted_hits / seen
        print('[Epoch %d] loss: %.3f' % (epoch + 1, epoch_loss))
        print('Accuracy: ', acr)
        accuracy_count.append(acr)
        sum_loss.append(epoch_loss)
    if sum_loss:
        plt.plot(sum_loss, label = 'loss')
        plt.legend()
        plt.show()
    return outputs, labels
train(model, train_loader, criterion, optimizer, 100)

In [None]:
def test(epoch, test_data, model, optimizer, loss):
    """Evaluate ``model`` on ``test_data`` and collect its predictions.

    Every batch is split into features (all columns but the last) and label
    (last column).  The accuracy of each batch is printed, followed by the
    mean of all batch accuracies.

    Parameters
    ----------
    epoch : int
        Number of passes over ``test_data``.
    test_data : iterable
        Yields 2-D tensors with the label in the last column.
    model : torch.nn.Module
        Network to evaluate; switched to eval mode.
    optimizer, loss
        Unused; kept so existing calls keep working.

    Returns
    -------
    tuple of lists
        ``(pred_labels, actual_labels)``: one ``(batch, 1)`` tensor per
        evaluated batch.
    """
    model.eval()
    pred_labels = []
    actual_labels = []
    accuracy_count = []
    # Evaluation needs no gradients; this also keeps the returned predictions
    # free of autograd history.
    with torch.no_grad():
        for e in range(epoch):
            for batch in test_data:
                features = batch[:, :-1]
                label = batch[:, -1]

                predicted = model(features)
                predicted = predicted.reshape(len(predicted), 1)
                label = label.reshape(len(predicted), 1)
                pred_labels.append(predicted)
                actual_labels.append(label)

                acc = accuracy(predicted.view(-1), label.view(-1))
                print(':::percentage of correct batch prediction:- ', acc*100, '%')
                accuracy_count.append(acc)
    # No batches at all: report nan rather than dividing by zero.
    overall = sum(accuracy_count)/len(accuracy_count) if accuracy_count else float('nan')
    print('\nOverall Accuracy for validation: ', overall)
    print('\n\n\n\n')
    print('end of test')
    print("")
    return pred_labels, actual_labels

In [None]:
#p, l = train(500, test_loader, model, optimizer, loss)

In [None]:
p_t, l_t = test(1, test_loader, model, optimizer, loss)

In [None]:
p_t[0]

In [None]:
def comp(pred, actual):
    """Threshold ``pred`` at 0.5, compare it with ``actual`` and print a report.

    The report contains a 1/0 tensor (1 where the thresholded prediction
    equals the label), the number of mismatches and the resulting accuracy.

    Returns the thresholded predictions and ``actual`` unchanged.
    """
    pred = torch.tensor([1.0 if score >= 0.5 else 0.0 for score in pred])
    same = torch.tensor(
        [1 if pred[pos] == actual[pos] else 0 for pos in range(len(pred))]
    )
    # every 0 in `same` is one wrong prediction
    error = sum(1 for flag in same if flag == 0)
    print(same)
    print('out of ', len(actual), ' predicted values, the miss predicted values are: ', error)
    print('Thus making the accuracy: ', (sum(same)/len(same)).item())
    print('Where Zero appears is the data that was miss predicted')
    return pred, actual

In [None]:
p, a = comp(p_t[0], l_t[0])

In [None]:
pr

In [None]:
lr

In [None]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(a,p))

In [None]:
from sklearn.metrics import classification_report
print (classification_report(a, p))