In [17]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [18]:
# Load the labelled training phrases from the tab-separated Kaggle file.
df = pd.read_csv(
    '../sentiment-analysis-on-movie-reviews/train.tsv/train.tsv',
    delimiter="\t",
)

In [19]:
df.head(10)

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2
5,6,1,of escapades demonstrating the adage that what...,2
6,7,1,of,2
7,8,1,escapades demonstrating the adage that what is...,2
8,9,1,escapades,2
9,10,1,demonstrating the adage that what is good for ...,2


In [20]:
df.isnull().sum()

PhraseId      0
SentenceId    0
Phrase        0
Sentiment     0
dtype: int64

In [21]:
# Select the model input and target by column name instead of by position, so
# this cell keeps picking the right data if the column order ever changes.
dfx = df["Phrase"].values     # raw phrase text (model input)
dfy = df["Sentiment"].values  # sentiment label of each phrase (target)

In [22]:
# Sanity check: the phrase and label arrays must each have one entry per row of df.
print(df.shape, dfx.shape, dfy.shape, sep="\n")

(156060, 4)
(156060,)
(156060,)


In [23]:
# Training labels as a plain Python list, in the same order as dfy.
y_train = list(dfy)

### Cleaning the training data

In [24]:
# English stop words used by cleaning_pipeline for filtering tokens.
# 'not' is taken out of the stop list so it survives cleaning
# (presumably because negation matters for sentiment).
sw = set(stopwords.words('english'))
sw.remove('not')

# Single stemmer instance shared by every cleaning_pipeline call.
ps = PorterStemmer()

In [25]:
def cleaning_pipeline(review):
    """Normalise one review/phrase into a space-separated string of stems.

    Steps: lower-case the text, tokenize it with ``word_tokenize``, keep only
    purely alphabetic tokens that are not in the stop-word set ``sw``, stem
    each kept token with ``ps`` and join the stems with single spaces.

    Parameters
    ----------
    review : str
        Raw text of a single review or phrase.

    Returns
    -------
    str
        The cleaned text; an empty string when no token survives filtering.
    """
    # NOTE(review): tokens such as "n't" are not alphabetic and are dropped
    # here even though 'not' is deliberately kept in the vocabulary -- confirm
    # whether contractions should be mapped to 'not'.
    stems = []
    for token in word_tokenize(review.lower()):
        if token.isalpha() and token not in sw:
            stems.append(ps.stem(token))
    return " ".join(stems)

In [26]:
# Run every training phrase through the cleaning pipeline (order preserved).
cleaned_reviews = list(map(cleaning_pipeline, dfx))

### Loading and cleaning testing data

In [27]:
# Load the test phrases from the tab-separated Kaggle file.
dftest = pd.read_csv(
    '../sentiment-analysis-on-movie-reviews/test.tsv/test.tsv',
    delimiter="\t",
)

In [28]:
dftest.shape

(66292, 3)

In [29]:
dftest.head(10)


Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine
5,156066,8545,intermittently pleasing but
6,156067,8545,intermittently pleasing
7,156068,8545,intermittently
8,156069,8545,pleasing
9,156070,8545,but


In [30]:
# NOTE: despite its name, y_test holds the test *phrases* (model inputs), not
# labels -- the test file has no Sentiment column. The name is kept because
# later cells refer to it. The column is selected by name rather than by
# position so the cell does not depend on the column order.
y_test = dftest["Phrase"]


In [31]:
y_test.shape

(66292,)

In [32]:
# Clean the test phrases with the same pipeline used for the training data.
cleaned_test_rev = list(map(cleaning_pipeline, y_test))

### Vectorization

In [33]:
# Bag-of-words counts over word 1-grams, 2-grams and 3-grams.
cv = CountVectorizer(ngram_range=(1,3))


In [34]:
#cv2 = CountVectorizer(ngram_range=(2,2))

In [35]:
# Learn the n-gram vocabulary from the cleaned training reviews and encode them.
x_train_vect = cv.fit_transform(cleaned_reviews)

In [36]:
x_train_vect.shape

(156060, 129339)

In [37]:
# Encode the test reviews with the vocabulary already fitted on the training
# data (transform only, no refit), so both matrices share the same columns.
x_test_vect = cv.transform(cleaned_test_rev)

In [38]:
x_test_vect.shape

(66292, 129339)

## Multinomial NB

In [39]:
from sklearn.naive_bayes import MultinomialNB

In [40]:
# Multinomial Naive Bayes classifier, constructed with its default settings.
mnb = MultinomialNB()

In [41]:
# Training: fit the classifier on the n-gram count matrix and the sentiment labels.
mnb.fit(x_train_vect,y_train)

MultinomialNB()

In [42]:
# Prediction: sentiment class for every phrase in the test matrix.
prediction = mnb.predict(x_test_vect)

In [43]:
# Score: accuracy on the *training* data (the same matrix the model was fitted
# on), so this is an optimistic figure, not a held-out estimate.
mnb.score(x_train_vect,y_train)

0.7147122901448161

In [54]:
def predictUserIp(text):
    """Predict the sentiment class of a single piece of raw user text.

    The text is cleaned with ``cleaning_pipeline``, encoded with the
    already-fitted vectorizer ``cv`` and classified by the fitted model
    ``mnb``.

    Parameters
    ----------
    text : str
        Raw, uncleaned input text.

    Returns
    -------
    The result of ``mnb.predict`` for a one-document batch; index it with
    ``[0]`` to get the predicted label itself.
    """
    features = cv.transform([cleaning_pipeline(text)])
    return mnb.predict(features)
    

In [66]:
ans = predictUserIp("spectacular")

In [67]:
ans[0]

3

## Multivariate Bernoulli NB

In [44]:
from sklearn.naive_bayes import BernoulliNB

In [45]:
# Bernoulli Naive Bayes classifier, constructed with its default settings.
bnb = BernoulliNB()

In [46]:
# Training: fit on the same n-gram count matrix and labels used for the multinomial model.
bnb.fit(x_train_vect,y_train)

BernoulliNB()

In [47]:
# Testing: predict a sentiment class for every phrase in the test matrix.
pred2 = bnb.predict(x_test_vect)

In [48]:
# Score: accuracy on the *training* data (the same matrix the model was fitted
# on), so this is an optimistic figure, not a held-out estimate.
bnb.score(x_train_vect,y_train)

0.6556132256824299

## Hyperparameter tuning for Multinomial NB

In [49]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV