In [144]:
#general
import pandas as pd
import re
import string
import contractions
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from functools import reduce

#functions
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix, classification_report

#models
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier


In [115]:
df = pd.read_csv("stock_data.csv")
df['Sentiment'].value_counts()
#3685 labelled 1, 2106 labelled 0 

 1    3685
-1    2106
Name: Sentiment, dtype: int64

In [146]:
#stopwords 
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

#stemming
nltk.download('punkt')
ps = PorterStemmer()

#Vectorisation
count_vector = CountVectorizer()
tfidf_transformer = TfidfTransformer()
vectorizer = TfidfVectorizer()


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lionsee/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/lionsee/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [117]:
#pre-processing functions
def rmv_punc(txt):
    translator = str.maketrans('','', string.punctuation)
    return txt.translate(translator)

#function to remove stopwords
def rmv_stopwords(txt):
    return " ".join([word for word in str(txt).split() if word not in stop_words])

#function to reduce words to their stem word
def stem_fun(txt):
    sentence = word_tokenize(txt)
    stemmed_sentence = reduce(lambda x,y: x + " " + ps.stem(y), sentence, "")
    return stemmed_sentence

In [118]:
#change the sentiments to 0 and 1s
df['Sentiment'] = df['Sentiment'].replace(-1, 0)

#make everything lower-case
df['Text'] = df['Text'].apply(lambda x: x.lower())

#remove URLS
df['Text'] = df['Text'].apply(lambda x: re.sub(r'https?://\S+', '', x))

#expand short-form words
df['Text'] = df['Text'].apply(lambda x: contractions.fix(x))

#remove punctuations
df['Text'] = df['Text'].apply(rmv_punc)

#remove stop-words
df['Text'] = df['Text'].apply(lambda x: rmv_stopwords(x))

#stemming the sentence 
df['Text'] = df['Text'].apply(lambda x: stem_fun(x))

#remove extra white space
df['Text'] = df['Text'].apply(lambda x: re.sub(' +', ' ', x))



In [123]:
#obtain train test split
x = df['Text']
y = df['Sentiment']

X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size = 0.2, random_state = 42)

In [164]:
#testing various models
models = [LinearSVC,
          MultinomialNB,
          LogisticRegression,
          RandomForestClassifier
          ] 
accuracy = []
precision = []
recall = []

for m in models:
    model_ = m()
    
    pipe = Pipeline([('tfidf', TfidfVectorizer()), ('classifier', model_)])
    pipe.fit(X_train, Y_train)
    pred = pipe.predict(X_test)
    print(pipe)
    
    acc = accuracy_score(Y_test, pred)
    prec = precision_score(Y_test, pred)
    rec = recall_score(Y_test, pred)
    accuracy.append(acc)
    precision.append(prec)
    recall.append(rec)
    
    print(f'Accuracy of {m} is {accuracy[-1]}')
    print(f'Precision of {m} is {precision[-1]}')
    print(f'Recall score of {m} is {recall[-1]}')
    print("-------------------------------------")



Pipeline(steps=[('tfidf', TfidfVectorizer()), ('classifier', LinearSVC())])
Accuracy of <class 'sklearn.svm._classes.LinearSVC'> is 0.8015530629853321
Precision of <class 'sklearn.svm._classes.LinearSVC'> is 0.8346666666666667
Recall score of <class 'sklearn.svm._classes.LinearSVC'> is 0.855191256830601
-------------------------------------
Pipeline(steps=[('tfidf', TfidfVectorizer()), ('classifier', MultinomialNB())])
Accuracy of <class 'sklearn.naive_bayes.MultinomialNB'> is 0.730802415875755
Precision of <class 'sklearn.naive_bayes.MultinomialNB'> is 0.7108433734939759
Recall score of <class 'sklearn.naive_bayes.MultinomialNB'> is 0.9672131147540983
-------------------------------------
Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('classifier', LogisticRegression())])
Accuracy of <class 'sklearn.linear_model._logistic.LogisticRegression'> is 0.7868852459016393
Precision of <class 'sklearn.linear_model._logistic.LogisticRegression'> is 0.7823050058207218
Recall scor

In [None]:
#LinearSVC has the best accuracy of 0.802, precision of 0.836 and recall score of 0.855