## **Dow Jones Industrial Average (DJIA) Movement Prediction Using News Headlines**



> We have taken a dataset of top 25 news headlines for a day and combine it with Dow Jones Industrial Average (DJIA) index movement where, "0" refers to the downward movement and "1" referes to the upwoed movement. We created a classifier to predict DJIA index movement based on the given headlines. Data covers the span of 2008-2016. 



In [None]:
# Import Libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from collections import defaultdict

In [None]:
# Importing libraries (NLTK)
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn

# Note:- There are some additional functions used which need to be downloded for first time. To download uncomment the code below.

# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Import Libraries (model creation and vectorizer)
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn import svm
from sklearn.metrics import accuracy_score

In [None]:
# Read in the data
np.random.seed(500)
data = pd.read_csv('Full_Data.csv', encoding = "ISO-8859-1")
data.head(1)

Unnamed: 0,Date,Label,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,Top9,Top10,Top11,Top12,Top13,Top14,Top15,Top16,Top17,Top18,Top19,Top20,Top21,Top22,Top23,Top24,Top25
0,2000-01-03,0,A 'hindrance to operations': extracts from the...,Scorecard,Hughes' instant hit buoys Blues,Jack gets his skates on at ice-cold Alex,Chaos as Maracana builds up for United,Depleted Leicester prevail as Elliott spoils E...,Hungry Spurs sense rich pickings,Gunners so wide of an easy target,Derby raise a glass to Strupar's debut double,"Southgate strikes, Leeds pay the penalty",Hammers hand Robson a youthful lesson,Saints party like it's 1999,Wear wolves have turned into lambs,Stump mike catches testy Gough's taunt,Langer escapes to hit 167,Flintoff injury piles on woe for England,Hunters threaten Jospin with new battle of the...,Kohl's successor drawn into scandal,The difference between men and women,"Sara Denver, nurse turned solicitor",Diana's landmine crusade put Tories in a panic,Yeltsin's resignation caught opposition flat-f...,Russian roulette,Sold out,Recovering a title


In [None]:
# Combine News Headlines into one for sementic analysis
data['headlines'] = data[data.columns[2:]].apply(lambda x: ' '.join(x.astype(str)), axis=1)

# Creating copy of data (corpus)
corpus = data.copy()
corpus = corpus[['Date', 'headlines', 'Label']]
corpus.head(5)

Unnamed: 0,Date,headlines,Label
0,2000-01-03,A 'hindrance to operations': extracts from the...,0
1,2000-01-04,Scorecard The best lake scene Leader: German s...,0
2,2000-01-05,Coventry caught on counter by Flo United's riv...,0
3,2000-01-06,Pilgrim knows how to progress Thatcher facing ...,1
4,2000-01-07,Hitches and Horlocks Beckham off but United su...,1


In [None]:
# SHape of data
corpus.shape

(4101, 3)

In [None]:
corpus.isnull().sum()

Date         0
headlines    0
Label        0
dtype: int64

In [None]:
# Convert every words of headline to lowercase to eliminate the diferrence of uppercase and lowercase words 
corpus['headlines'] = [entry.lower() for entry in corpus['headlines']]

# Tokenizing headlines
corpus['headlines'] = [word_tokenize(entry) for entry in corpus['headlines']]
corpus.head(5)

Unnamed: 0,Date,headlines,Label
0,2000-01-03,"[a, 'hindrance, to, operations, ', :, extracts...",0
1,2000-01-04,"[scorecard, the, best, lake, scene, leader, :,...",0
2,2000-01-05,"[coventry, caught, on, counter, by, flo, unite...",0
3,2000-01-06,"[pilgrim, knows, how, to, progress, thatcher, ...",1
4,2000-01-07,"[hitches, and, horlocks, beckham, off, but, un...",1


In [None]:
# Remove Stop words, Numerics and perfom Word Stemming/Lemmenting.

# WordNetLemmatizer requires Pos tags to understand if the word is noun or verb or adjective
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV
tag_map

for index,entry in enumerate(corpus['headlines']):
    
    # Empty List to store the words for each headline
    Final_words = []

    # Initializing WordNetLemmatizer()
    word_Lemmatized = WordNetLemmatizer()
    
    # Function pos_tag will provide the 'tag' i.e if the word is Noun(N) or Verb
    for word, tag in pos_tag(entry):

        # Condition is to check for Stop words and consider only alphabets
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)

    # The final processed set of words for each iteration will be stored in 'headlines'
    corpus.loc[index,'headlines'] = str(Final_words)
    
corpus.head(5)

Unnamed: 0,Date,headlines,Label
0,2000-01-03,"['operation', 'extract', 'leaked', 'report', '...",0
1,2000-01-04,"['scorecard', 'best', 'lake', 'scene', 'leader...",0
2,2000-01-05,"['coventry', 'catch', 'counter', 'flo', 'unite...",0
3,2000-01-06,"['pilgrim', 'know', 'progress', 'thatcher', 'f...",1
4,2000-01-07,"['hitch', 'horlocks', 'beckham', 'united', 'su...",1


In [None]:
# Train Test split
# We have data as time series so, it's not a good approach to split train and test split randomly. 
# We used data from 2000-2014 for training and rest as testing.

train = corpus[corpus['Date'] < '20150101']
test = corpus[corpus['Date'] > '20141231']

Train_X = train['headlines'].values
Train_Y = train['Label'].values

Test_X = test['headlines'].values
Test_Y = test['Label'].values

print(Train_X.shape)
print(Test_X.shape)
print(Train_Y.shape)
print(Test_Y.shape)

(3975,)
(378,)
(3975,)
(378,)


In [None]:
# Convert words to vectors

# Function to generate TF-IDF vectors (features) for headlines
def tfidf_vectorizer(train_X, test_X, ng_range):
  
  """
  params: 
  train_X: numpy array consisting training data (X = headlines)
  test_X: numpy array consisting test data (Y = label)
  ng_range: tuple having minimum and maximum range for NGram 
  """

  Tfidf_vect = TfidfVectorizer(ngram_range=ng_range)
  Tfidf_vect.fit(train_X)
  Train_X_Tfidf = Tfidf_vect.transform(train_X)
  Test_X_Tfidf = Tfidf_vect.transform(test_X)

  # Display the shape of training and test set after tf-idf vectorizer  
  print(Train_X_Tfidf.shape)
  print(Test_X_Tfidf.shape)

  return Train_X_Tfidf, Test_X_Tfidf

In [None]:
# Model Creation and Accuracy 
# We are using SVM for classification due to it's batter performance over logistic regression, K-nn, and Trees 
# for text data. (later we have tried with Multinomial Naive Bayes which tend to perform slightly well)

def svm_model(Train_X, Test_X, Train_Y, Test_Y):
  
  # Classifier - Algorithm - SVM
  # fit the training dataset on the classifier
  SVM = svm.SVC(C=1, kernel='linear', class_weight='balanced', gamma='auto')
  SVM.fit(Train_X, Train_Y)

  # predict the labels on validation dataset
  predictions_SVM = SVM.predict(Test_X)
  # Use accuracy_score function to get the accuracy
  accuracy = accuracy_score(Test_Y, predictions_SVM)*100

  # Display accuracy just to check
  print("SVM Accuracy Score -> ",accuracy)

  return accuracy

In [None]:
# Check for which NGram range model gives best performance.

result = {}
for i in range(1, 4):
  for j in range(i, 4):
    r = (i, j)
    print("============= ", r, " ==============")
    train_X_tfidf, test_X_tfidf = tfidf_vectorizer(Train_X, Test_X, ng_range=r)
    result[r] = svm_model(train_X_tfidf, test_X_tfidf, Train_Y, Test_Y)

result

(3975, 36724)
(378, 36724)
SVM Accuracy Score ->  79.8941798941799
(3975, 570795)
(378, 570795)
SVM Accuracy Score ->  84.12698412698413
(3975, 1276656)
(378, 1276656)
SVM Accuracy Score ->  84.92063492063492
(3975, 534071)
(378, 534071)
SVM Accuracy Score ->  83.86243386243386
(3975, 1239932)
(378, 1239932)
SVM Accuracy Score ->  84.39153439153439
(3975, 705861)
(378, 705861)
SVM Accuracy Score ->  81.74603174603175


{(1, 1): 79.8941798941799,
 (1, 2): 84.12698412698413,
 (1, 3): 84.92063492063492,
 (2, 2): 83.86243386243386,
 (2, 3): 84.39153439153439,
 (3, 3): 81.74603174603175}

In [None]:
Train_X_Tfidf, Test_X_Tfidf = tfidf_vectorizer(Train_X, Test_X, (2, 2))

# Hyperparameter tuning with Grid Search for SVM

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid
param_grid = [{'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'C': [0.1, 1, 10, 100], 
                                       'gamma': [0.0001, 0.1, 1, 3]}]
print("Total length to check for best model and parameters: ",len(list(ParameterGrid(param_grid))))
gsc = GridSearchCV(svm.SVC(), param_grid, scoring='accuracy')
grid_result = gsc.fit(Train_X_Tfidf,Train_Y)
print("Best Parameters: ", grid_result.best_params_)
print("Best Estimator: ", grid_result.best_estimator_)
print("Best Score: ", grid_result.best_score_)

y_pred = grid_result.predict(Test_X_Tfidf)
print("Accuracy:", accuracy_score(Test_Y, y_pred))
# # results['rbf'] = accuracy_score(y_test, y_pred)
# print("Classification Report: ")
# print(classification_report(Test_Y, y_pred))
# print("Confusion Matrix: ")
# print(confusion_matrix(Test_Y, y_pred))

(3975, 534071)
(378, 534071)
Total length to check for best model and parameters:  64
Best Parameters:  {'C': 0.1, 'gamma': 0.0001, 'kernel': 'linear'}
Best Estimator:  SVC(C=0.1, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.0001, kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
Best Score:  0.5272955974842767
Accuracy: 0.5079365079365079


In [None]:
# Multinomial Naive Bayes for NGram range (2, 2)

Train_X_Tfidf, Test_X_Tfidf = tfidf_vectorizer(Train_X, Test_X, (2, 2))
from sklearn.naive_bayes import MultinomialNB

mnb = MultinomialNB()
mnb.fit(Train_X_Tfidf, Train_Y)

pred = mnb.predict(Test_X_Tfidf)
acc = accuracy_score(Test_Y, pred)*100
print(acc)

(3975, 534071)
(378, 534071)
85.44973544973546
