Use the following dataset - https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

In [1]:
import numpy as np
import pandas as pd


In [31]:
# Dataset: IMDB movies reviews dataset, two columns: review and sentiment of review.

from google.colab import files
uploaded = files.upload()

Saving IMDB Dataset.csv to IMDB Dataset (1).csv


In [3]:
reviews = pd.read_csv('IMDB Dataset.csv')

In [5]:
df = reviews.copy()

In [6]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [7]:
#Text Preprocessing:
#1. lowercasing
df['review'] = df['review'].str.lower()


In [8]:
#2. removing HTML tags
import re

def remove_tags(text):
  pattern = re.compile('<.*?>')
  return pattern.sub(r'',text)

df['review'] = df['review'].apply(remove_tags)

In [9]:
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


In [10]:
#3. remove punctuation marks
import string
exclude = string.punctuation
def remove_punc(text):
  return text.translate(str.maketrans('','',exclude))

for i in range(0,len(df['review'])):
  text = remove_punc(df.loc[i,'review'])
  df.loc[i,'review'] = text

In [12]:
#4. removing stop words

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')


def remove_stopwords(text):
  exclude = stopwords.words('english')
  new_text = []
  for word in text:
    if word in exclude:
      pass
    else:
      new_text.append(word)

  x = new_text[:]
  new_text.clear()
  return " ".join(x)

for i in range(0, len(df['review'])):
  df.loc[i, 'review'] = remove_stopwords(df.loc[i, 'review'].split())

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [13]:
#5. tokenization
import spacy
nlp = spacy.load('en_core_web_sm') # English small dictionary

def tokenization(overview):
  doc = nlp(overview)
  tokens = []
  for i in doc:
    tokens.append(i.text)

  return tokens

for i in range(0, len(df['review'])):
  df.at[i,'review'] =  tokenization(df.loc[i, 'review'])

In [14]:
df.head()

Unnamed: 0,review,sentiment
0,"[one, reviewers, mentioned, watching, 1, oz, e...",positive
1,"[wonderful, little, production, filming, techn...",positive
2,"[thought, wonderful, way, spend, time, hot, su...",positive
3,"[basically, there, s, family, little, boy, jak...",negative
4,"[petter, matteis, love, time, money, visually,...",positive


In [15]:
#6. stemming
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()
def stem_words(tokens):
  stem_tokens = []
  for i in tokens:
    stem_tokens.append(ps.stem(i))

  return stem_tokens

for i in range(0, len(df['review'])):
  df.at[i,'review'] =  stem_words(df.loc[i, 'review'])



In [16]:
df.head()

Unnamed: 0,review,sentiment
0,"[one, review, mention, watch, 1, oz, episod, y...",positive
1,"[wonder, littl, product, film, techniqu, unass...",positive
2,"[thought, wonder, way, spend, time, hot, summe...",positive
3,"[basic, there, s, famili, littl, boy, jake, th...",negative
4,"[petter, mattei, love, time, money, visual, st...",positive


In [20]:
for i in range(0,len(df['review'])):
  df.at[i,'review'] = " ".join(df.loc[i,'review'])

In [23]:
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

In [24]:
from sklearn.model_selection import train_test_split

X = df['review']
y = df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)
print(len(X_train))
print(len(X_test))

40000
10000


In [22]:
# Text Representation/Feature Extraction
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features = 5000)
X_train_tfidf = tfidf.fit_transform(X_train)
print(X_train_tfidf.shape)
X_test_tfidf = tfidf.transform(X_test)
print(X_test_tfidf.shape)

(40000, 5000)
(10000, 5000)


In [25]:
# Model Training
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(C = 1.0, max_iter=1000, random_state=42, solver='liblinear')
lr.fit(X_train_tfidf,y_train)

In [26]:
predictions = lr.predict(X_test_tfidf)

In [30]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

accuracy = accuracy_score(y_test, predictions)
print('Accuracy: ', accuracy)

print('\nConfusion Matrix:\n ', confusion_matrix(y_test, predictions))
print('\nClassification Report:\n ', classification_report(y_test,predictions, target_names=['Negative (0)', 'Positive (1)']))



Accuracy:  0.8843

Confusion Matrix:
  [[4302  659]
 [ 498 4541]]

Classification Report:
                precision    recall  f1-score   support

Negative (0)       0.90      0.87      0.88      4961
Positive (1)       0.87      0.90      0.89      5039

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000

