In [1]:
from google.colab import drive
# Attach Google Drive to the Colab VM; force_remount=True re-mounts even when
# a previous mount is still present.
DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT, force_remount=True)

Mounted at /content/drive


In [0]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC

In [0]:
import os
# Make the project folder on Drive the working directory so that the relative
# data paths used by later cells resolve against it.
PROJECT_DIR = "/content/drive/My Drive/Colab Notebooks/NLP_project"
os.chdir(PROJECT_DIR)

In [0]:
# Load the combined news/DJIA table (path is relative to the working directory).
NEWS_CSV = "./stocknews/Combined_News_DJIA.csv"
data = pd.read_csv(NEWS_CSV)

In [5]:
# Class balance of the target column over the whole dataset.
label_counts = data["Label"].value_counts()
label_counts

1    1065
0     924
Name: Label, dtype: int64

In [6]:
# Every row is one trading day; all columns except Date and Label hold headlines.
n_days, n_columns = data.shape
print("Training data has {} trading days and {} titles/day".format(n_days, n_columns - 2))

Training data has 1989 trading days and 25 titles/day


In [7]:
# Show the column labels of the loaded frame (Date, Label, then Top1..Top25).
data.columns

Index(['Date', 'Label', 'Top1', 'Top2', 'Top3', 'Top4', 'Top5', 'Top6', 'Top7',
       'Top8', 'Top9', 'Top10', 'Top11', 'Top12', 'Top13', 'Top14', 'Top15',
       'Top16', 'Top17', 'Top18', 'Top19', 'Top20', 'Top21', 'Top22', 'Top23',
       'Top24', 'Top25'],
      dtype='object')

In [0]:
# Positional hold-out split: the first TRAIN_ROWS rows train the models and the
# remaining rows are kept for evaluation.
# NOTE(review): this is a chronological split only if the CSV is sorted by Date - confirm.
TRAIN_ROWS = 1400
train, test = data.iloc[:TRAIN_ROWS], data.iloc[TRAIN_ROWS:]

In [9]:
# Class balance inside the training split.
train_label_counts = train["Label"].value_counts()
train_label_counts

1    753
0    647
Name: Label, dtype: int64

In [10]:
# Class balance inside the held-out split.
test_label_counts = test["Label"].value_counts()
test_label_counts

1    312
0    277
Name: Label, dtype: int64

In [0]:
# Creating x_train and x_test
def _join_headlines(frame, first_col=2, last_col=27):
    """Collapse each row's headline columns into one space-separated string.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table with one row per trading day.
    first_col, last_col : int
        Positional column slice ``[first_col, last_col)`` holding the headlines.
        The defaults select columns 2..26, i.e. Top1..Top25 in this dataset.

    Returns
    -------
    list of str
        One document per row, in row order. Every cell is passed through
        ``str()``, so a missing headline contributes the literal text ``nan``.
    """
    headline_cells = frame.iloc[:, first_col:last_col]
    # itertuples walks the rows once instead of doing a positional .iloc lookup
    # per row inside a Python loop.
    return [
        ' '.join(str(cell) for cell in day)
        for day in headline_cells.itertuples(index=False, name=None)
    ]


trainheadlines = _join_headlines(train)
testheadlines = _join_headlines(test)

In [0]:
# Ground-truth labels of the held-out split and the display names used for the
# two classes in the classification reports.
target_names = ["0", "1"]
y_true = test["Label"]

In [13]:
# Learn a tf-idf vocabulary from the training documents only and encode them.
basicvectorizer = TfidfVectorizer()
basictrain = basicvectorizer.fit_transform(trainheadlines)
# (n_training_documents, vocabulary_size)
print(basictrain.shape)

(1400, 29920)


In [14]:
# Baseline model: logistic regression (default settings) on the tf-idf features.
# fit() returns the estimator itself, so construction and training are chained.
basicmodel = LogisticRegression().fit(basictrain, train["Label"])



In [0]:
# Encode the held-out documents with the vocabulary fitted on the training split
# (transform only, no refit), then predict their labels with the baseline model.
basictest = basicvectorizer.transform(testheadlines)
predictions = basicmodel.predict(basictest)

In [16]:
# Score the tf-idf + logistic-regression baseline on the held-out split.
# classification_report is imported with the other dependencies in the import
# cell; y_true and target_names are already defined in the label-setup cell, so
# they are not rebuilt here.
y_pred = predictions
print(classification_report(y_true, y_pred, target_names=target_names))

              precision    recall  f1-score   support

           0       0.44      0.12      0.19       277
           1       0.53      0.87      0.65       312

    accuracy                           0.51       589
   macro avg       0.48      0.49      0.42       589
weighted avg       0.49      0.51      0.43       589



In [0]:
# Linear SVM (default settings) trained on the same tf-idf features as the
# baseline. advancedmodel1 is a second name for the fitted SVMmodel object,
# not an independent copy.
SVMmodel = LinearSVC()
SVMmodel.fit(basictrain, train["Label"])
advancedmodel1 = SVMmodel

In [0]:
# Predict the held-out labels with the tf-idf SVM.
# basictest_2 repeats the transform already stored in basictest by the
# logistic-regression cell; the two matrices hold the same encoding.
basictest_2 = basicvectorizer.transform(testheadlines)
predictions_2 = SVMmodel.predict(basictest_2)

In [20]:
# Precision/recall/F1 of the tf-idf SVM on the held-out split.
target_names = ["0", "1"]
y_true, y_pred_2 = test["Label"], predictions_2
svm_tfidf_report = classification_report(y_true, y_pred_2, target_names=target_names)
print(svm_tfidf_report)

              precision    recall  f1-score   support

           0       0.47      0.34      0.40       277
           1       0.53      0.66      0.59       312

    accuracy                           0.51       589
   macro avg       0.50      0.50      0.49       589
weighted avg       0.50      0.51      0.50       589



In [0]:
# Switch features: raw counts of word bigrams and trigrams instead of tf-idf
# weighted single words. The vocabulary is learned from the training split only.
NGRAM_RANGE = (2, 3)
countvectorizer = CountVectorizer(ngram_range=NGRAM_RANGE)
advancedtrain = countvectorizer.fit_transform(trainheadlines)

In [0]:
# Train a *fresh* LinearSVC on the n-gram counts.
# fit() returns the estimator it was called on, so refitting SVMmodel here would
# overwrite the tf-idf SVM in place and leave advancedmodel1 and advancedmodel2
# pointing at one and the same object. A new instance keeps both models usable.
advancedmodel2 = LinearSVC().fit(advancedtrain, train["Label"])

In [0]:
# Encode the held-out documents with the n-gram vocabulary fitted on the training
# split (transform only, no refit) and predict with the n-gram SVM.
advancedtest_3 = countvectorizer.transform(testheadlines)
predictions_3 = advancedmodel2.predict(advancedtest_3)

In [24]:
# Precision/recall/F1 of the n-gram SVM on the held-out split.
y_true, y_pred_3 = test["Label"], predictions_3
svm_ngram_report = classification_report(y_true, y_pred_3, target_names=target_names)
print(svm_ngram_report)

              precision    recall  f1-score   support

           0       0.56      0.36      0.44       277
           1       0.57      0.75      0.65       312

    accuracy                           0.57       589
   macro avg       0.57      0.56      0.54       589
weighted avg       0.57      0.57      0.55       589



In [25]:
# Rank the n-gram features by the weight the n-gram SVM assigned to them.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(countvectorizer, "get_feature_names_out"):
    advwords = list(countvectorizer.get_feature_names_out())
else:
    advwords = countvectorizer.get_feature_names()
# The first (and, for this two-class problem, presumably only) row of coef_
# holds one weight per n-gram, in the same order as the vocabulary.
advcoeffs = advancedmodel2.coef_[0].tolist()
advcoeffdf = pd.DataFrame({'Words': advwords, 'Coefficient': advcoeffs})
# Largest coefficients first; ties are broken alphabetically by n-gram.
advcoeffdf = advcoeffdf.sort_values(['Coefficient', 'Words'], ascending=[False, True])
advcoeffdf.head(10)


Unnamed: 0,Words,Coefficient
358225,in china,0.035421
628411,right to,0.030824
728058,the first,0.030355
56300,and other,0.029213
309536,government has,0.027263
750652,this is,0.027142
351909,if they,0.02695
30171,after the,0.026478
359577,in egypt,0.025122
328598,have to,0.024662
