In [1]:
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import re  # for regex
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import pickle
import nltk

# Download NLTK resources if not already downloaded
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hriti\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hriti\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [3]:
# Load data
data = pd.read_csv('IMDB-Dataset.csv')
print(data.shape)
print(data.head())
data.info()
print(data.sentiment.value_counts())
print(data.review[0])


(50000, 2)
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB
sentiment
positive    25000
negative    25000
Name: count, dtype: int64
One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unfli

In [4]:
# Data cleaning functions
def clean(text):
    cleaned = re.compile(r'<.*?>')
    return re.sub(cleaned, '', text)

data.review = data.review.apply(clean)

def is_special(text):
    rem = ''
    for i in text:
        if i.isalnum():
            rem += i
        else:
            rem += ' '
    return rem

data.review = data.review.apply(is_special)

def to_lower(text):
    return text.lower()

data.review = data.review.apply(to_lower)

def rem_stopwords(text):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    return [w for w in words if w not in stop_words]

data.review = data.review.apply(rem_stopwords)

def stem_txt(text):
    ss = SnowballStemmer('english')
    return " ".join([ss.stem(w) for w in text])

data.review = data.review.apply(stem_txt)

print(data.head())


                                              review sentiment
0  one review mention watch 1 oz episod hook righ...  positive
1  wonder littl product film techniqu unassum old...  positive
2  thought wonder way spend time hot summer weeke...  positive
3  basic famili littl boy jake think zombi closet...  negative
4  petter mattei love time money visual stun film...  positive


In [5]:
# Prepare data for model training
X = np.array(data.review)
y = np.array(data.sentiment)
cv = CountVectorizer(max_features=1000)
X = cv.fit_transform(X).toarray()
print("X.shape =", X.shape)
print("y.shape =", y.shape)

# Split the data
trainx, testx, trainy, testy = train_test_split(X, y, test_size=0.2, random_state=9)
print("Train shapes: X = {}, y = {}".format(trainx.shape, trainy.shape))
print("Test shapes: X = {}, y = {}".format(testx.shape, testy.shape))


X.shape = (50000, 1000)
y.shape = (50000,)
Train shapes: X = (40000, 1000), y = (40000,)
Test shapes: X = (10000, 1000), y = (10000,)


In [6]:
# Train the model
mnb = MultinomialNB(alpha=1.0, fit_prior=True)
mnb.fit(trainx, trainy)

# Predictions
ypm = mnb.predict(testx)
print("Multinomial = ", accuracy_score(testy, ypm))

# Save the model
pickle.dump(mnb, open('model1.pkl', 'wb'))


Multinomial =  0.831


In [7]:
# Prepare for single review prediction
rev = """Terrible. Complete trash. Brainless tripe. Insulting to anyone who isn't an 8 year old fan boy. 
Im actually pretty disgusted that this movie is making the money it is - what does it say about the people who 
brainlessly hand over the hard earned cash to be 'entertained' in this fashion and then come here to leave a 
positive 8.8 review?? Oh yes, they are morons. Its the only sensible conclusion to draw. How anyone can rate 
this movie amongst the pantheon of great titles is beyond me."""

f1 = clean(rev)
f2 = is_special(f1)
f3 = to_lower(f2)
f4 = rem_stopwords(f3)
f5 = stem_txt(f4)

# Creating the input vector
input_vector = cv.transform([f5]).toarray()
y_pred = mnb.predict(input_vector)

print("Predicted Sentiment: ", y_pred)


Predicted Sentiment:  ['negative']
