In [31]:
#Import pandas
import pandas as pd

#Import Natural Language Toolkit
import nltk

#Import Beautiful Soup
# from bs4 import BeautifulSoup

#Import string for list of punctuation
import string

#Import stop word list
from nltk.corpus import stopwords as stopwords

#Import tokenizer
from nltk.tokenize import RegexpTokenizer

#Import Lemmatizer
from nltk.stem import WordNetLemmatizer

#Import Stemmer
from nltk.stem.porter import PorterStemmer

In [32]:
df = pd.read_csv('C:/Users/ASUS/Desktop/NFSML/Project 2 (NLP Binary Classification)/stock_data.csv')
df.head()

Unnamed: 0,Text,Sentiment
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1
2,user I'd be afraid to short AMZN - they are lo...,1
3,MNTA Over 12.00,1
4,OI Over 21.37,1


In [33]:
df.shape

(5791, 2)

In [34]:
# Remove punctuation
def remove_punctuation(text):
    no_punct = "".join([c for c in text if c not in string.punctuation])
    return no_punct

df['Text'] = df['Text'].apply(lambda x: remove_punctuation(x))
df['Text'].head()

0    Kickers on my watchlist XIDE TIT SOQ PNK CPW B...
1    user AAP MOVIE 55 return for the FEAGEED indic...
2    user Id be afraid to short AMZN  they are look...
3                                     MNTA Over 1200  
4                                      OI  Over 2137  
Name: Text, dtype: object

In [35]:
#Instantiate Tokenizer
tokenizer = RegexpTokenizer(r'\w+')

In [36]:
#Tokenize
df['Text'] = df['Text'].apply(lambda x: tokenizer.tokenize(x.lower()))
df['Text'].head(20)

0     [kickers, on, my, watchlist, xide, tit, soq, p...
1     [user, aap, movie, 55, return, for, the, feage...
2     [user, id, be, afraid, to, short, amzn, they, ...
3                                    [mnta, over, 1200]
4                                      [oi, over, 2137]
5                                     [pgnx, over, 304]
6     [aap, user, if, so, then, the, current, downtr...
7     [mondays, relative, weakness, nyx, win, tie, t...
8     [goog, ower, trend, line, channel, test, volum...
9         [aap, will, watch, tomorrow, for, ong, entry]
10    [im, assuming, fcx, opens, tomorrow, above, th...
11    [it, really, worries, me, how, everyone, expec...
12    [aap, gamcos, arry, haverty, apple, is, extrem...
13    [user, maykiljil, posted, that, i, agree, that...
14    [momentum, is, coming, back, to, etfc, broke, ...
15    [ha, hitting, 3565, means, resume, targeting, ...
16    [user, gameplan, shot, for, today, but, i, lik...
17    [with, fcx, gapping, well, above, ideal, e

In [37]:
#Remove stop words
def remove_stopwords(text):
    words = [w for w in text if w not in stopwords.words('english')]
    return words

df['Text'] = df['Text'].apply(lambda x: remove_stopwords(x))
df['Text'].head()

0    [kickers, watchlist, xide, tit, soq, pnk, cpw,...
1    [user, aap, movie, 55, return, feageed, indica...
2    [user, id, afraid, short, amzn, looking, like,...
3                                         [mnta, 1200]
4                                           [oi, 2137]
Name: Text, dtype: object

In [41]:
#Instantiate Lemmatizer
lemmatizer = WordNetLemmatizer()

#Lemmatize
def word_lemmatizer(text):
    lem_text = " ".join([lemmatizer.lemmatize(i) for i in text])
    return lem_text

df['Text'] = df['Text'].apply(lambda x: word_lemmatizer(x))
df['Text'].head()

0    kicker watchlist xide tit soq pnk cpw bpz aj t...
1    user aap movie 55 return feageed indicator 15 ...
2    user id afraid short amzn looking like nearmon...
3                                            mnta 1200
4                                              oi 2137
Name: Text, dtype: object

In [19]:
#Instantiate Stemmer
stemmer = PorterStemmer()

#Stemming
def word_stemmer(text):
    stem_text = " ".join([stemmer.stem(i) for i in text])
    return stem_text

df['Text'] = df['Text'].apply(lambda x: word_stemmer(x))
df['Text'].head()

0    kicker watchlist xide tit soq pnk cpw bpz aj t...
1    user aap movi 55 return feage indic 15 trade y...
2    user id afraid short amzn look like nearmonopo...
3                                            mnta 1200
4                                              oi 2137
Name: Text, dtype: object

In [42]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df['Text'],
                                                    df['Sentiment'],
                                                    test_size = 0.3)

In [43]:
#Vectorization
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

cvec = CountVectorizer()
tfidf = TfidfVectorizer()

#Preparing the cvec & tfidf X data
X_train_cvec = cvec.fit_transform(X_train)
X_test_cvec = cvec.transform(X_test)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [44]:
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score

In [45]:
# Naive Bayes and tfidf

# fit the training dataset on the NB classifier
Naive = naive_bayes.MultinomialNB()
Naive.fit(X_train_tfidf,y_train)

# predict the labels on validation dataset
predictions_NB = Naive.predict(X_test_tfidf)
# Use accuracy_score function to get the accuracy
print("Naive Bayes(tfidf) Accuracy Score -> ",accuracy_score(predictions_NB, y_test)*100)

Naive Bayes(tfidf) Accuracy Score ->  72.78481012658227


In [46]:
# Naive Bayes and cvec

# fit the training dataset on the NB classifier
Naive = naive_bayes.MultinomialNB()
Naive.fit(X_train_cvec,y_train)

# predict the labels on validation dataset
predictions_NB = Naive.predict(X_test_cvec)
# Use accuracy_score function to get the accuracy
print("Naive Bayes(cvec) Accuracy Score -> ",accuracy_score(predictions_NB, y_test)*100)

Naive Bayes(cvec) Accuracy Score ->  76.23705408515535


In [47]:
# SVM and tfidf

# fit the training dataset on the classifier
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(X_train_tfidf,y_train)

# predict the labels on validation dataset
predictions_SVM = SVM.predict(X_test_tfidf)

# Use accuracy_score function to get the accuracy
print("SVM(tfidf) Accuracy Score -> ",accuracy_score(predictions_SVM, y_test)*100)

SVM(tfidf) Accuracy Score ->  78.25086306098964


In [48]:
# SVM and cvec

# fit the training dataset on the classifier
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(X_train_cvec,y_train)

# predict the labels on validation dataset
predictions_SVM = SVM.predict(X_test_cvec)

# Use accuracy_score function to get the accuracy
print("SVM(cvec) Accuracy Score -> ",accuracy_score(predictions_SVM, y_test)*100)

SVM(cvec) Accuracy Score ->  76.2945914844649


In [None]:
# SVM on tfidf vectorized data (without stemming) performed the best