<a href="https://colab.research.google.com/github/keerthi5083/Sentiment_analysis/blob/main/Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [58]:
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud,STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
from bs4 import BeautifulSoup
import spacy
import re,string,unicodedata
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import LancasterStemmer,WordNetLemmatizer
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from textblob import TextBlob
from textblob import Word
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

In [59]:
df = pd.read_csv("/content/drive/MyDrive/Projects/IMDB Dataset.csv")

In [60]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [61]:
print (df.shape)

(50000, 2)


In [62]:
print (df.columns)

Index(['review', 'sentiment'], dtype='object')


In [63]:
df.describe() #summary of dataset

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [64]:
print (df.isnull().sum())

review       0
sentiment    0
dtype: int64


In [65]:
df['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,25000
negative,25000


In [66]:
#Spliting to training & testing data

train_r=df.review[:30000]
train_s=df.sentiment[:30000]

test_r=df.review[30000:]
test_s=df.sentiment[30000:]

print(train_r.shape,train_s.shape)
print(test_r.shape,test_s.shape)

(30000,) (30000,)
(20000,) (20000,)


In [67]:
nltk.download('stopwords')
#Normalisating the text

tokenizer=ToktokTokenizer()
stopword_list=nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [68]:
#Removing the html tags
def remove_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

#Removing the square brackets content
def remove_bracket_text(text):
    return re.sub('\\[[^]]*\\]', '', text) #re(regular expression), searches for a pattern in a string and replaces it with something else. pattern - ('\[[^]]), replacement - ''(blank), to clean text

#Combine cleaning steps
def remove_noise_text(text):
    text = remove_html(text)
    text = remove_bracket_text(text)
    return text

#Apply cleaning to the entire dataset
df['review']=df['review'].apply(remove_noise_text)


In [69]:
# removing special characters
def remove_special_symbol(text, remove_digits=True):
    pattern=r'[^a-zA-Z0-9\s]'
    text=re.sub(pattern,'',text)
    return text

df['review']=df['review'].apply(remove_special_symbol)

In [70]:
#Stemming the text (helps ML models treat similar words as the same)
def stemming_sentence(text):
    ps=nltk.porter.PorterStemmer()     # create a ps object
    text= ' '.join([ps.stem(word) for word in text.split()])  #Split the sentence into words(text.split()), Apply stemming to every word, and Join words back into a sentence
    return text

df['review']=df['review'].apply(stemming_sentence)

In [71]:
#Setting English stopwords
stop=set(stopwords.words('english')) #loads a list of common English stopwords
print(stop)

#removing the stopwords
def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)             #Spliting the text into individual words
    tokens = [token.strip() for token in tokens]  #Removing unnecessary spaces around words
    if is_lower_case:                                           #check if every word is in lowercase else convert it
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text

df['review']=df['review'].apply(remove_stopwords)

{'if', 'ourselves', 'because', 'hers', 'nor', "won't", 'between', 'there', 'is', 'does', 'haven', 'off', 'of', 'only', "they're", 'which', 'our', 's', 'ain', 'such', 'himself', 'all', 'so', 'until', 'it', 'against', 'not', 'how', "you'd", 'other', 'both', 'in', 'yourselves', 'do', "we'll", 't', 'above', 'an', 'doesn', 'few', 'but', 'yours', 'ours', 'on', "you're", 'be', 'then', 'wouldn', 'can', "should've", 'his', "i'd", 'didn', 'their', 'no', 'through', 'under', 'has', 'were', 'and', "wasn't", 'was', 'isn', "it's", "mightn't", 'same', 'a', 'aren', 'further', "i've", 'you', 'own', 'hadn', "that'll", 'or', 'most', "weren't", 'than', 'by', 'once', 'why', 'mightn', 'my', 'me', 'been', "they've", 'he', 'her', 'him', 'any', 'doing', "aren't", 'as', "doesn't", "haven't", "she'll", 'them', 'before', "wouldn't", "isn't", 'each', "it'll", 'i', 'up', 'having', 'those', 'very', 'd', "shouldn't", "don't", "she's", 'ma', 'yourself', 'again', 're', "i'll", "mustn't", 'y', 'shan', 'who', 'here', 'aft

In [32]:
# Displaying the pre-processed train and test reviews
preprocessed_train_r=df.review[:30000]
preprocessed_train_r[0]   #[0] displaying from very first item

'one review ha mention watch 1 oz episod youll hook right thi exactli happen meth first thing struck oz wa brutal unflinch scene violenc set right word go trust thi show faint heart timid thi show pull punch regard drug sex violenc hardcor classic use wordit call oz nicknam given oswald maximum secur state penitentari focus mainli emerald citi experiment section prison cell glass front face inward privaci high agenda em citi home manyaryan muslim gangsta latino christian italian irish moreso scuffl death stare dodgi deal shadi agreement never far awayi would say main appeal show due fact goe show wouldnt dare forget pretti pictur paint mainstream audienc forget charm forget romanceoz doesnt mess around first episod ever saw struck nasti wa surreal couldnt say wa readi watch develop tast oz got accustom high level graphic violenc violenc injustic crook guard wholl sold nickel inmat wholl kill order get away well manner middl class inmat turn prison bitch due lack street skill prison exp

In [33]:
preprocessed_test_r=df.review[30000:]
preprocessed_test_r[35005]

'thi kind movi extrem bad cant stop watch becaus keep tell cannot continu thi crappi way end cant know wors jaw 4kind badi honestli think ive onli seen one movi wa wors thi talk religi crap youd end hell lie watch footballgor inde lot well made gore wayth act beyond bad line lousi clich goe storylin onli realli consist sex blood violenc like mani gore moviesif hope mix ichi killer august underground keep look wont find live feed'

In [34]:
# ================================
# COUNT VECTORIZER (BAG OF WORDS)
# ================================

# Goal:
# Convert preprocessed movie reviews (text) into numerical features
# so that machine learning models can learn sentiment patterns.

# Why CountVectorizer:
# ML models cannot understand raw text.
# CountVectorizer represents each review as a vector of word/phrase counts
# using the Bag of Words (BoW) approach.


cv=CountVectorizer(min_df=2,max_df=0.9,binary=False,ngram_range=(1,2))

# Parameter explanation:
# min_df = 2
#   → Keep only words/phrases that appear in at least 2 reviews
#   → Removes very rare and noisy words

# max_df = 0.9
#   → Remove words that appear in more than 90% of reviews
#   → Removes extremely common, non-informative words (e.g., "movie", "film")

# binary = False
#   → Use frequency counts (how many times a word appears)
#   → Helps sentiment models learn intensity (e.g., "great great great")

# ngram_range = (1,2)
#   → Use unigrams (1 word) and bigrams (2-word phrases)
#   → Captures sentiment context like:
#       "good" vs "not good"


#transformed train reviews
cv_train_r=cv.fit_transform(preprocessed_train_r)
#transformed test reviews
cv_test_r=cv.transform(preprocessed_test_r)

print('BOW_cv_train:',cv_train_r.shape)
print('BOW_cv_test:',cv_test_r.shape)
#vocab=cv.get_feature_names()-toget feature names

BOW_cv_train: (30000, 423204)
BOW_cv_test: (20000, 423204)


In [85]:
#Term Frequency – Inverse Document Frequency
#It is a way to convert text into numbers while reducing the importance of very common words.
# TF - How often a word appears in a review.
# IDF - How rare a word is across all reviews.
       #Word appears in many reviews → low importance
       #Word appears in few reviews → high importance
#This helps the model focus on important sentiment words.

#Tfidf vectorizer
    #tv=TfidfVectorizer(min_df=0.0,max_df=1,use_idf=True,ngram_range=(1,3))
#transformed train reviews
    #tv_train_r=tv.fit_transform(preprocessed_train_r)
#transformed test reviews
    #tv_test_r=tv.transform(preprocessed_test_r)
    #print('Tfidf_train:',tv_train_r.shape)
    #print('Tfidf_test:',tv_test_r.shape)

tv=TfidfVectorizer(min_df=2,max_df=0.9,binary=False,ngram_range=(1,2))
tv_train_r=tv.fit_transform(preprocessed_train_r)
tv_test_r=tv.transform(preprocessed_test_r)
print('Tfidf_train:',tv_train_r.shape)
print('Tfidf_test:',tv_test_r.shape)

Tfidf_train: (30000, 423204)
Tfidf_test: (20000, 423204)


In [86]:
#Labeling the sentiment text to binary (+ve = 1, -ve = 0)

#labeling the sentient data
lb=LabelBinarizer()
#transformed sentiment data
sentiment_data=lb.fit_transform(df['sentiment'])
print(sentiment_data.shape)

(50000, 1)


In [87]:
#Spliting the sentiment data
train_s=sentiment_data[:30000]
test_s=sentiment_data[30000:]
print(train_s)
print(test_s)

[[1]
 [1]
 [1]
 ...
 [0]
 [1]
 [0]]
[[1]
 [0]
 [0]
 ...
 [0]
 [0]
 [0]]


In [88]:
# Logistic Regression model for both bag of words and tfidf features

#Logistic Regression learns which words indicate positive or negative sentiment and predicts the probability of each review being positive.
#Logistic Regression predicts probability

#training the model
lr = LogisticRegression(
    penalty='l2',                   #A penalty is a rule that prevents the model from becoming too complex.
    max_iter=500,                   #How long the model is allowed to learn
    C=1,                            #How strong the penalty is
    random_state=42                 #Reproducibility
)
#Fitting the model for Bag of words
lr_bow=lr.fit(cv_train_r,train_s)
print(lr_bow)
#Fitting the model for tfidf features
lr_tfidf=lr.fit(tv_train_r,train_s)
print(lr_tfidf)


#notes

#C value	Meaning
#Small C (0.1)	Strong penalty
#Large C (10)	Weak penalty
#C = 1	Balanced (default)

#We keep randomness the same so results are repeatable, comparable, and trustworthy

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1, max_iter=500, random_state=42)


  y = column_or_1d(y, warn=True)


LogisticRegression(C=1, max_iter=500, random_state=42)


In [91]:
#Predicting the model for bag of words
lr_bow_predict=lr.predict(cv_test_r)
print(lr_bow_predict)
#Predicting the model for tfidf features
lr_tfidf_predict=lr.predict(tv_test_r)
print(lr_tfidf_predict)

[1 0 0 ... 1 0 0]
[1 0 0 ... 1 0 0]


In [92]:
#Accuracy for bag of words
lr_bow_score=accuracy_score(test_s,lr_bow_predict)
print("lr_bow_score :",lr_bow_score)
#Accuracy for tfidf features
lr_tfidf_score=accuracy_score(test_s,lr_tfidf_predict)
print("lr_tfidf_score :",lr_tfidf_score)

lr_bow_score : 0.8645
lr_tfidf_score : 0.8918
