## Sentiment Analysis

This IMDB dataset has 50,000 movie reviews for Natural Language Processing or Text Analytics.
This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. Provided is a set of 25,000 highly polar movie reviews for training and 25,000 for testing. The objective is to predict the number of positive and negative reviews using either classification or deep learning algorithms.

In [1]:
import numpy as np
import pandas as pd

In [23]:
data = pd.read_csv("data/IMDB Dataset.csv")
data.head() #shows the first 5 rows of the dataset.

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [63]:
data.tail() #shows the last 5 rows of the dataset.

Unnamed: 0,review,sentiment
49995,i thought thi movi did a down right good job. ...,positive
49996,"bad plot, bad dialogue, bad acting, idiot dire...",negative
49997,i am a cathol taught in parochi elementari sch...,negative
49998,i'm go to have to disagre with the previou com...,negative
49999,no one expect the star trek movi to be high ar...,negative


In [12]:
data.columns #shows the types of columns in the dataset.

Index(['review', 'sentiment'], dtype='object')

In [14]:
data.shape #shows the number of rows and columns in the dataset

(50000, 2)

In [15]:
data.isnull().any() #checking for null values. If you have any null values, try to replace them.

review       False
sentiment    False
dtype: bool

In [18]:
data.isnull().sum() #Another way for checking the amount of null values in the dataset.

review       0
sentiment    0
dtype: int64

In [17]:
data.describe() #shows the summary statistics of the dataset

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [60]:
data['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

# Text Normalization

### Tokenization

In [62]:
# Visualization and utilities
import seaborn as sns
import matplotlib.pyplot as plt

# NLP
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tokenize.toktok import ToktokTokenizer

# Feature extraction (fixed module name)
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.preprocessing import LabelBinarizer
from wordcloud import WordCloud, STOPWORDS

# Models & metrics
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Other
import spacy
import re, string, unicodedata
from bs4 import BeautifulSoup
from textblob import TextBlob, Word

In [37]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')  # wordnet lemmatizer lexicon

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jeffreyjackson/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jeffreyjackson/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jeffreyjackson/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/jeffreyjackson/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [38]:
#Tokenization of text
tokenizer = ToktokTokenizer()
#setting English stopwords
stopwords = nltk.corpus.stopwords.words('english')

In [39]:
# Removing the noisy text
# BeautifulSoup is used to remove the html tags from the text
def noiseremoval_text(text):
    soup = BeautifulSoup(text, 'html.parser')
    text = soup.get_text()
    text = re.sub('\[[^]]*\]', '', text)
    return text

In [40]:
# Apply function on review column
data['review'] = data['review'].apply(noiseremoval_text)

In [41]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# Stemming

In [42]:
# Stemming the text
# Stemming is the process of reducing a word to its base or root form.
def stemmer(text):
    ps = nltk.porter.PorterStemmer()
    text = ' '.join([ps.stem(word) for word in text.split()])
    return text

In [43]:
# Apply function on review column
data['review'] = data['review'].apply(stemmer)

In [44]:
data.head()

Unnamed: 0,review,sentiment
0,one of the other review ha mention that after ...,positive
1,a wonder littl production. the film techniqu i...,positive
2,i thought thi wa a wonder way to spend time on...,positive
3,basic there' a famili where a littl boy (jake)...,negative
4,"petter mattei' ""love in the time of money"" is ...",positive


# Removing Stop Words

In [67]:
# Prepare a reusable stopword set (fast lookups) without shadowing the module name
from nltk.corpus import stopwords as nltk_stopwords
import string

stop_words = set(nltk_stopwords.words('english'))
punct = set(string.punctuation)

# Removing the stop words from the text
def removing_stopwords(text, is_lower_case=False):
    # Ensure text is a string (handles NaN)
    text = "" if text is None else str(text)

    # Tokenization
    tokens = tokenizer.tokenize(text)
    tokens = [t.strip() for t in tokens if t.strip()]

    if is_lower_case:
        # Optionally lower-case the tokens first
        tokens = [t.lower() for t in tokens]
        filtokens = [t for t in tokens if t not in stop_words and t not in punct]
    else:
        filtokens = [t for t in tokens if t.lower() not in stop_words and t not in punct]

    return " ".join(filtokens)

In [68]:
# Apply function on review column
data['review'] = data['review'].astype(str).apply(removing_stopwords)

In [69]:
data.head()

Unnamed: 0,review,sentiment
0,one review ha mention watch 1 oz episod hooked...,positive
1,wonder littl production. film techniqu veri un...,positive
2,thought thi wa wonder way spend time hot summe...,positive
3,basic famili littl boy jake think zombi hi clo...,negative
4,petter mattei love time money visual stun film...,positive


# Train Test Split

In [70]:
# Split the dataset
# Train dataset
train_reviews_data = data.review[:30000]

In [71]:
# Test dataset
test_reviews_data = data.review[30000:]

# Bag of Words

In [73]:
# Count vectorizer for bag of words
cv=CountVectorizer(min_df = 0.0, max_df = 1, binary = False, ngram_range = (1,3))
# Transformed trained reviews
cv_train = cv.fit_transform(train_reviews_data)
# Transformed test reviews
cv_test = cv.transform(test_reviews_data)

print('BOW_cv_train:', cv_train.shape)
print('BOW_cv_test:', cv_test.shape)


BOW_cv_train: (30000, 4954204)
BOW_cv_test: (20000, 4954204)


# TF_IDF

In [74]:
# TF_IDF vectorizer for bag of words
tf = TfidfVectorizer(min_df = 0.0, max_df = 1, binary = True, ngram_range = (1,3))
# Transformed trained reviews
tf_train = tf.fit_transform(train_reviews_data)
# Transformed test reviews
tf_test = tf.transform(test_reviews_data)

print('Tfidf_train:', tf_train.shape)
print('Tfidf_test:', tf_test.shape)

Tfidf_train: (30000, 4954204)
Tfidf_test: (20000, 4954204)


# Label Encoding

In [76]:
# Labeling the sentiment data
label = LabelBinarizer()
# Transformed sentiment data
sentiment_data = label.fit_transform(data['sentiment'])
print(sentiment_data.shape)

(50000, 1)


In [77]:
sentiment_data

array([[1],
       [1],
       [1],
       ...,
       [0],
       [0],
       [0]], shape=(50000, 1))

In [78]:
train_data = data.sentiment[:30000]

In [79]:
test_data = data.sentiment[30000:]

In [81]:
# Training the model
logistic = LogisticRegression(penalty = 'l2', max_iter = 500, C = 1, random_state = 42)
lr_bow = logistic.fit(cv_train, train_data)
print(lr_bow)

LogisticRegression(C=1, max_iter=500, random_state=42)


In [82]:
# Predicting the model for bag of words
bow_predict = logistic.predict(cv_test)
print(bow_predict)

['negative' 'negative' 'negative' ... 'negative' 'negative' 'negative']


In [87]:
# Accuracy score for bag of words
bow_score = accuracy_score(test_data, bow_predict)
print("lr_bow_score: ", bow_score)

lr_bow_score:  0.6168


In [86]:
# Fitting the model for tfidf features
lr_tfidf = logistic.fit(tf_train, train_data)
print(lr_tfidf)

LogisticRegression(C=1, max_iter=500, random_state=42)


In [88]:
# Predicting the model for tfidf features
lr_tfidf_predict = logistic.predict(tf_test)
print(lr_tfidf_predict)

['negative' 'negative' 'positive' ... 'negative' 'positive' 'positive']


In [89]:
# Accuracy score for tfidf features
lr_tfidf_score = accuracy_score(test_data, lr_tfidf_predict)
print("lr_tfidf_score: ", lr_tfidf_score)


lr_tfidf_score:  0.7421
