In [2]:
# Import libraries that will be used
import pandas as pd
import numpy as np
import nltk
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB,BernoulliNB 
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from statistics import mode
import random
import pickle

In [3]:
# Load the combined news/DJIA CSV (path is relative to the working directory) into a DataFrame
df_news = pd.read_csv(filepath_or_buffer="Combined_News_DJIA.csv")

In [4]:
# Preview the first two rows to check the column layout
df_news.head(2)

Unnamed: 0,Date,Label,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,...,Top16,Top17,Top18,Top19,Top20,Top21,Top22,Top23,Top24,Top25
0,2008-08-08,0,"b""Georgia 'downs two Russian warplanes' as cou...",b'BREAKING: Musharraf to be impeached.',b'Russia Today: Columns of troops roll into So...,b'Russian tanks are moving towards the capital...,"b""Afghan children raped with 'impunity,' U.N. ...",b'150 Russian tanks have entered South Ossetia...,"b""Breaking: Georgia invades South Ossetia, Rus...","b""The 'enemy combatent' trials are nothing but...",...,b'Georgia Invades South Ossetia - if Russia ge...,b'Al-Qaeda Faces Islamist Backlash',"b'Condoleezza Rice: ""The US would not act to p...",b'This is a busy day: The European Union has ...,"b""Georgia will withdraw 1,000 soldiers from Ir...",b'Why the Pentagon Thinks Attacking Iran is a ...,b'Caucasus in crisis: Georgia invades South Os...,b'Indian shoe manufactory - And again in a se...,b'Visitors Suffering from Mental Illnesses Ban...,"b""No Help for Mexico's Kidnapping Surge"""
1,2008-08-11,1,b'Why wont America and Nato help us? If they w...,b'Bush puts foot down on Georgian conflict',"b""Jewish Georgian minister: Thanks to Israeli ...",b'Georgian army flees in disarray as Russians ...,"b""Olympic opening ceremony fireworks 'faked'""",b'What were the Mossad with fraudulent New Zea...,b'Russia angered by Israeli military sale to G...,b'An American citizen living in S.Ossetia blam...,...,b'Israel and the US behind the Georgian aggres...,"b'""Do not believe TV, neither Russian nor Geor...",b'Riots are still going on in Montreal (Canada...,b'China to overtake US as largest manufacturer',b'War in South Ossetia [PICS]',b'Israeli Physicians Group Condemns State Tort...,b' Russia has just beaten the United States ov...,b'Perhaps *the* question about the Georgia - R...,b'Russia is so much better at war',"b""So this is what it's come to: trading sex fo..."


### You can see that there are multiple headlines in each row. We will reshape the data into a more readable form that consists of just the headlines and their labels.

In [5]:
# Headline columns to use: only Top1..Top10 (the remaining columns are skipped to avoid noise)
news_col = ['Top' + str(rank) for rank in range(1, 11)]

# Row masks by label:
#   1 -> the DJIA Adj Close value either rose or stayed the same
#   0 -> the DJIA Adj Close value saw a decline
positive = df_news['Label'].eq(1)
negative = df_news['Label'].eq(0)

# One dataframe per label
df_pos_news = df_news.loc[positive]
df_neg_news = df_news.loc[negative]

In [6]:
# Build one long (Headline, Label) table from the per-day headline columns.
# Rows are collected in a plain list and the DataFrame is created once at the end:
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, and calling
# it once per row copies the whole frame each time (quadratic in the number of rows).
headline_records = []

for col in news_col:
    # Same row order as before: for each column, positive rows first, then negative rows
    for label, df_labeled in ((1, df_pos_news), (0, df_neg_news)):
        for headline in df_labeled[col]:
            # str() keeps the existing behavior of coercing non-string cells (e.g. NaN) to text
            headline_records.append({"Headline": str(headline), "Label": label})

# Passing columns= keeps the two expected columns even when no rows were collected
df_data = pd.DataFrame(headline_records, columns=["Headline", "Label"])

In [7]:
# Check the (rows, columns) size of the combined headline table
df_data.shape

(19890, 2)

In [8]:
# Separate the model input (headline text) from the target (label)
df_x, df_y = df_data["Headline"], df_data["Label"]

In [9]:
# Hold out 20% of the shuffled rows for testing; the fixed random_state makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    df_x,
    df_y,
    test_size=0.2,
    shuffle=True,
    random_state=4,
)

In [10]:
# Vectorizer that converts each headline into numerical values.
# Despite the variable name, these values are TF-IDF weights rather than raw word counts.
# Text is lowercased and English stop words are removed before weighting.
count_vector = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=1,
)

In [11]:
# Converting our text/training data into numerical data.
# fit_transform both fits the vectorizer on the training headlines and returns
# their vectorized form; the fitted vectorizer is reused later for the test data.
x_traincv=count_vector.fit_transform(x_train)

In [13]:
# Saving the fitted vectorizer to count_vector.pickle in the working directory.
# The with-block guarantees the file is closed even if pickle.dump raises.
with open('count_vector.pickle', "wb") as vector_file:
    pickle.dump(count_vector, vector_file)

In [14]:
# Store this data as a dense array
# NOTE(review): x_train_array is not used by any later cell visible here (the models are
# fitted on x_traincv directly), and a dense copy of the vectorized training data can be
# large — confirm it is still needed.
x_train_array = x_traincv.toarray()

In [17]:
# One instance of each algorithm to compare, all created with default settings

# Naive Bayes variants
MNB_classifier = MultinomialNB()
BNB_classifier = BernoulliNB()

# Linear models
LR_classifier = LogisticRegression()
SGD_classifier = SGDClassifier()
LinearSVC_classifier = LinearSVC()

# Support vector classifiers
SVC_classifier = SVC()
NuSVC_classifier = NuSVC()

In [18]:
# Making sure our labels are numerical as well
# NOTE(review): this reassigns y_train from itself, and y_test is not cast the same way
# before it is used for the accuracy check — confirm that is intentional.
y_train = y_train.astype('int')

In [19]:
# Fit every model on the same vectorized training data and labels
for model in (
    MNB_classifier,
    BNB_classifier,
    LR_classifier,
    SGD_classifier,
    LinearSVC_classifier,
    NuSVC_classifier,
):
    model.fit(x_traincv, y_train)

# Kept as the last expression so the cell still displays the fitted SVC
SVC_classifier.fit(x_traincv, y_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [36]:
# Transform test data so we can make some predictions.
# transform() (not fit_transform) is used so the vectorizer fitted on the
# training data is applied to the test headlines without being refitted.
x_testcv= count_vector.transform(x_test)

In [41]:
# Quick check of the container type holding the test headlines
type(x_test)

pandas.core.series.Series

In [22]:
# Predict labels for the vectorized test data with every fitted model (same order as training)
(
    MNB_pred,
    BNB_pred,
    LR_pred,
    SGD_pred,
    linearSVC_pred,
    NuSVC_pred,
    SVC_pred,
) = (
    model.predict(x_testcv)
    for model in (
        MNB_classifier,
        BNB_classifier,
        LR_classifier,
        SGD_classifier,
        LinearSVC_classifier,
        NuSVC_classifier,
        SVC_classifier,
    )
)

In [23]:
# Store the true test labels as an array so they can be indexed by position
# alongside each model's predictions
y_test_array=np.array(y_test)

In [24]:
# Let's get a quick glimpse of the accuracy for each model
classifier_string = ['MNB', 'BNB', 'LR', 'SGD', 'Linear', 'NuSVC', 'SVC']
classifiers = [MNB_pred, BNB_pred, LR_pred, SGD_pred, linearSVC_pred, NuSVC_pred, SVC_pred]

for label, predicted in zip(classifier_string, classifiers):
    # Accuracy = share of test rows whose predicted label equals the true label
    hits = sum(1 for guess, truth in zip(predicted, y_test_array) if guess == truth)
    print(label, "accuracy: ", hits / len(predicted))

MNB accuracy:  0.49949723479135244
BNB accuracy:  0.49547511312217196
LR accuracy:  0.5108094519859225
SGD accuracy:  0.5077928607340372
Linear accuracy:  0.5
NuSVC accuracy:  0.514831573655103
SVC accuracy:  0.5175967823026647


## As you can see, each of these classifiers labels only about half of the test data correctly (accuracies range from roughly 49.5% to 52%). They will still be used because, if enough of the classifiers agree on a certain label, that agreement still gives us some confidence in the prediction. This will be explained in the code for the GUI.

In [25]:
# Now we will save each classifier for use in our GUI
classifiers = [MNB_classifier,BNB_classifier,LR_classifier,SGDC_classifier,LinearSVC_classifier,NuSVC_classifier,SVC_classifier]
classifier_string= ['MNB_classifier','BNB_classifier','LR_classifier','SGD_classifier','Linear_classifier','NuSVC_classifier','SVC_classifier']
name = 0

for classifier in classifiers:
    save_classifier = open((classifier_string[name]+'.pickle'),"wb")
    pickle.dump(classifier, save_classifier)
    save_classifier.close()
    print(classifier_string[name],"saved as pickle.")
    name +=1

MNB_classifier saved as pickle.
BNB_classifier saved as pickle.
LR_classifier saved as pickle.
SGD_classifier saved as pickle.
Linear_classifier saved as pickle.
NuSVC_classifier saved as pickle.
SVC_classifier saved as pickle.
