1) Extracting data

In [305]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
import numpy as np
import string
import nltk
import re
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

tfidf_transformer = TfidfTransformer()
data = pd.read_csv(r"C:\Users\yuras\OneDrive\Документы\ML_ASSIGNMENT\archive\sentiment labelled sentences\sentiment labelled sentences\amazon_cells_labelled.txt",
                 delimiter="\t",header=None,names=["Amazon Review","Sentiment"])

data_test = pd.read_csv(r"C:\Users\yuras\OneDrive\Документы\ML_ASSIGNMENT\archive\sentiment labelled sentences\sentiment labelled sentences\amazon_cells_labelled.txt",
                 delimiter="\t",header=None,names=["Amazon Review","Sentiment"])


data.head(5)

Unnamed: 0,Amazon Review,Sentiment
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


2) Applying CountVectorizer, and then TfidTransformer to the dataset.
3) Predicting the data using Naive Bayes classifier and testing the accuracy of the prediction

CountVectorizer - Converts a collection of text documents to a matrix of token counts. 

TfidTransformer - for applying Term Frequencies and TF-IDF

In [306]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(data["Amazon Review"])

#print(X_train_counts)

X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

#print(X_train_tfidf)

print(X_train_tfidf.shape)

clf = MultinomialNB().fit(X_train_tfidf, data["Sentiment"])
predict = clf.predict(X_train_tfidf)
np.mean(predict == data["Sentiment"])

(1000, 1847)


0.964

4) The same process as before but using a pipeline:

In [307]:
text_clf = Pipeline([('vect', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),                    
                     ('clf', MultinomialNB()), ])

text_clf = text_clf.fit(data_test["Amazon Review"], data_test["Sentiment"])

predicted = text_clf.predict(data_test["Amazon Review"])
np.mean(predicted == data_test["Sentiment"])

0.964

5) Performing data preprocessing

Step 1 - removing punctuation

In [308]:
contraction_dict = {"ain't": "are not","'s":" is","aren't": "are not"}
string.punctuation
new_data = pd.read_csv(r"C:\Users\yuras\OneDrive\Документы\ML_ASSIGNMENT\archive\sentiment labelled sentences\sentiment labelled sentences\amazon_cells_labelled.txt",
                 delimiter="\t",header=None,names=["Amazon Review","Sentiment"])
def remove_punctuation(text):
    punctuationfree="".join([i for i in text if i not in string.punctuation])
    return punctuationfree
new_data["Amazon Review"]= new_data["Amazon Review"].apply(lambda x:remove_punctuation(x))

new_data.head(5)

Unnamed: 0,Amazon Review,Sentiment
0,So there is no way for me to plug it in here i...,0
1,Good case Excellent value,1
2,Great for the jawbone,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great,1


Step 2 - lowercasing

In [309]:
new_data["Amazon Review"] = new_data["Amazon Review"].apply(lambda x: x.lower())
new_data.head(5)

Unnamed: 0,Amazon Review,Sentiment
0,so there is no way for me to plug it in here i...,0
1,good case excellent value,1
2,great for the jawbone,1
3,tied to charger for conversations lasting more...,0
4,the mic is great,1


Step 3 - tokenization

In [310]:
def tokenization(text):
    text = str(text)
    tokens = re.split('W+',text)
    return tokens
new_data["Amazon Review"] = new_data.apply(lambda row: nltk.word_tokenize(row["Amazon Review"]), axis=1)
new_data.head(5)

Unnamed: 0,Amazon Review,Sentiment
0,"[so, there, is, no, way, for, me, to, plug, it...",0
1,"[good, case, excellent, value]",1
2,"[great, for, the, jawbone]",1
3,"[tied, to, charger, for, conversations, lastin...",0
4,"[the, mic, is, great]",1


Step 4 - stop words removal (Such as i, me, you..)

In [311]:
stopwords = nltk.corpus.stopwords.words('english')
#print(stopwords)
def remove_stopwords(text):
    output= [i for i in text if i not in stopwords]
    return output
new_data["Amazon Review"] = new_data["Amazon Review"].apply(lambda x:remove_stopwords(x))
new_data.head(10)

Unnamed: 0,Amazon Review,Sentiment
0,"[way, plug, us, unless, go, converter]",0
1,"[good, case, excellent, value]",1
2,"[great, jawbone]",1
3,"[tied, charger, conversations, lasting, 45, mi...",0
4,"[mic, great]",1
5,"[jiggle, plug, get, line, right, get, decent, ...",0
6,"[several, dozen, several, hundred, contacts, i...",0
7,"[razr, owneryou, must]",1
8,"[needless, say, wasted, money]",0
9,"[waste, money, time]",0


Step 5 - Lemmanization (Recover words after stemming)

In [312]:
wordnet_lemmatizer = WordNetLemmatizer()
def lemmatizer(text):
    lemm_text = [wordnet_lemmatizer.lemmatize(word) for word in text]
    return lemm_text
new_data["Amazon Review"] = new_data["Amazon Review"].apply(lambda x:lemmatizer(x))
new_data.head(5)

Unnamed: 0,Amazon Review,Sentiment
0,"[way, plug, u, unless, go, converter]",0
1,"[good, case, excellent, value]",1
2,"[great, jawbone]",1
3,"[tied, charger, conversation, lasting, 45, min...",0
4,"[mic, great]",1


I have run into problems trying to classify the data after tokenization. 

Below is another version of data preprocessing implementation.

In [324]:
def lemmatize_words(text):
    lemmatizer = WordNetLemmatizer()
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

def preprocess_data(new_data, f1, f2):
    #Step 1 - contraction
    contraction_dict = {"ain't": "are not","'s":" is","aren't": "are not"}
    #Step 2 - punctuation
    string.punctuation
    new_data[f1] = new_data[f1].apply(lambda x: re.sub('[%s]' % re.escape(string.punctuation), '' , x))
    #Step 3 - lowercasing 
    new_data[f1] = new_data[f1].apply(lambda x: x.lower())
    #Step 4 - remove digits
    new_data[f1] = new_data[f1].apply(lambda x: re.sub('W*dw*','',x))
    #Step 5 - lemmanization 
    new_data[f1] = new_data[f1].apply(lambda text: lemmatize_words(text))
    
    
    return new_data
amazon_data = pd.read_csv(r"C:\Users\yuras\OneDrive\Документы\ML_ASSIGNMENT\archive\sentiment labelled sentences\sentiment labelled sentences\amazon_cells_labelled.txt",
                 delimiter="\t",header=None,names=["Amazon Review","Sentiment"])
amazon_data.head(5)

Unnamed: 0,Amazon Review,Sentiment
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


Now we can compare the initial input and the result after preprocessing

In [325]:
text_clf = Pipeline([('vect', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),                    
                     ('clf', MultinomialNB()), ])

amazon_data = preprocess_data(amazon_data, "Amazon Review", "Sentiment")

text_clf = text_clf.fit(amazon_data["Amazon Review"], amazon_data["Sentiment"])

predicted = text_clf.predict(amazon_data["Amazon Review"])
np.mean(predicted == amazon_data["Sentiment"])

0.956

Now, let's repeat the process for the rest of the dataset:

In [327]:
imdb_data = pd.read_csv(r"C:\Users\yuras\OneDrive\Документы\ML_ASSIGNMENT\archive\sentiment labelled sentences\imdb_labelled.txt",
                 delimiter="\t",header=None,names=["Imdb Review","Sentiment"])

text_clf = Pipeline([('vect', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),                    
                     ('clf', MultinomialNB()), ])

text_clf = text_clf.fit(imdb_data["Imdb Review"], imdb_data["Sentiment"])

predicted = text_clf.predict(imdb_data["Imdb Review"])
np.mean(predicted == imdb_data["Sentiment"])


0.9812834224598931

What value to expect using preprocessing?

In [328]:
imdb_data = preprocess_data(imdb_data, "Imdb Review", "Sentiment")

text_clf = Pipeline([('vect', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),                    
                     ('clf', MultinomialNB()), ])

text_clf = text_clf.fit(imdb_data["Imdb Review"], imdb_data["Sentiment"])

predicted = text_clf.predict(imdb_data["Imdb Review"])
np.mean(predicted == imdb_data["Sentiment"])


0.9786096256684492

The same process for yelp reviews:

In [330]:
yelp_data = pd.read_csv(r"C:\Users\yuras\OneDrive\Документы\ML_ASSIGNMENT\archive\sentiment labelled sentences\yelp_labelled.txt",
                 delimiter="\t",header=None,names=["Yelp Review","Sentiment"])

text_clf = Pipeline([('vect', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),                    
                     ('clf', MultinomialNB()), ])

text_clf = text_clf.fit(yelp_data["Yelp Review"], yelp_data["Sentiment"])

predicted = text_clf.predict(yelp_data["Yelp Review"])
np.mean(predicted == yelp_data["Sentiment"])


0.964

In [331]:
yelp_data = preprocess_data(yelp_data, "Yelp Review", "Sentiment")
text_clf = Pipeline([('vect', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),                    
                     ('clf', MultinomialNB()), ])

text_clf = text_clf.fit(yelp_data["Yelp Review"], yelp_data["Sentiment"])

predicted = text_clf.predict(yelp_data["Yelp Review"])
np.mean(predicted == yelp_data["Sentiment"])


0.962