# Importing the libraries

In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Using the Natural Language Toolkit for NLP

In [2]:
import nltk
import re

nltk.download('stopwords')
from nltk.corpus import stopwords

nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


In [3]:
data = pd.read_csv('imdb.csv')
data.head()

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


In [4]:
data.shape

(40000, 2)

### Using only the 15000 text rows due to less computational power availability.

In [8]:
data = data.sample(15000).reset_index(drop=True)
data.shape

(15000, 2)

In [10]:
data.head()

Unnamed: 0,text,label
0,An excellent and accurate film... McGovern tak...,1
1,"I love this movie, Jouvet, Arletty, Blier, Car...",1
2,This film was my first acquaintance with the t...,1
3,"Simple, meaningful and delivers an emotional p...",1
4,Mary Pickford becomes the chieftain of a Scott...,0


## Splitting the Dependant and Independant Variables

In [11]:
X = data.iloc[:, 0].values
y = data.iloc[:, 1].values

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [13]:
len(X_train), len(X_test)

(12000, 3000)

# Label Encoding the Target Variable

In [14]:
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_train

array([1, 0, 1, ..., 0, 1, 0], dtype=int64)

# Cleaning the Data

In [15]:
wordnet = WordNetLemmatizer()

In [16]:
def clean_data(X):
  corpus = []

  for review in X:
    review = review.lower()
    review = re.sub('<br \/>', ' ', review)
    review = re.sub('[^a-zA-Z]', ' ', review)
    words = nltk.word_tokenize(review)
    words = [wordnet.lemmatize(word) for word in words if word not in stopwords.words('english')]
    w = ' '.join(words)
    corpus.append(w)
  return corpus

In [17]:
X_train = clean_data(X_train)

In [18]:
X_train[0]

'funny scathing critique russian society culture transition communism okno v parizh also show west unfavorable light group russian living st petersburg k peter great window west find magic portal instantly transport paris mamin film truly hilarious weird enough constantly keep even jaded film viewer toe song dream sequence deliciously disgusting fringe society culture mingle create memorable meaningful film anyone trying understand shift russian cultural sentiment since fall ussr begin'

# Encoding the test data using TF-IDF Vectorizer

In [19]:
tfidf = TfidfVectorizer()
X_train = tfidf.fit_transform(X_train).toarray()

In [20]:
X_train[0]

array([0., 0., 0., ..., 0., 0., 0.])

# Training the model

In [21]:
from sklearn.naive_bayes import MultinomialNB

mnb = MultinomialNB()
mnb.fit(X_train, y_train)

MultinomialNB()

In [22]:
X_test = clean_data(X_test)
X_test = tfidf.transform(X_test).toarray()

# Predicting the results

In [25]:
y_pred = mnb.predict(X_test)

In [26]:
# Convert y_pred to positive or negative
y_pred = le.inverse_transform(y_pred)
print(y_pred[:10])

[1 0 0 1 1 0 0 1 0 1]


In [27]:
len(y_test), len(y_pred)

(3000, 3000)

# Checking the accuracy

In [28]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, y_pred)
accuracy

0.8573333333333333