<a href="https://colab.research.google.com/github/harshadj1611/IMDB-Reviews---Sentimental-Analysis/blob/main/IMDB_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Downloading the library wordninja

In [1]:
!pip install wordninja

Collecting wordninja
  Downloading wordninja-2.0.0.tar.gz (541 kB)
[?25l[K     |▋                               | 10 kB 29.8 MB/s eta 0:00:01[K     |█▏                              | 20 kB 36.3 MB/s eta 0:00:01[K     |█▉                              | 30 kB 41.7 MB/s eta 0:00:01[K     |██▍                             | 40 kB 25.2 MB/s eta 0:00:01[K     |███                             | 51 kB 18.0 MB/s eta 0:00:01[K     |███▋                            | 61 kB 14.8 MB/s eta 0:00:01[K     |████▎                           | 71 kB 12.3 MB/s eta 0:00:01[K     |████▉                           | 81 kB 13.4 MB/s eta 0:00:01[K     |█████▌                          | 92 kB 14.8 MB/s eta 0:00:01[K     |██████                          | 102 kB 13.2 MB/s eta 0:00:01[K     |██████▋                         | 112 kB 13.2 MB/s eta 0:00:01[K     |███████▎                        | 122 kB 13.2 MB/s eta 0:00:01[K     |███████▉                        | 133 kB 13.2 MB/s eta 0:00:01

## Importing required libraries

In [2]:

import pandas as pd 
import numpy as np
import re
import string
from string import punctuation
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import PunktSentenceTokenizer
from nltk.stem import WordNetLemmatizer

## The datasets we are going to use (Downloaded from Kaggle)

In [3]:
# Load the train and test splits from CSV files under /content
# (presumably the Colab working directory; the files must already be there).
train = pd.read_csv("/content/Train.csv")
test = pd.read_csv("/content/Test.csv")

In [4]:
train.head()

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


In [5]:
test.head()

Unnamed: 0,text,label
0,I always wrote this series off as being a comp...,0
1,1st watched 12/7/2002 - 3 out of 10(Dir-Steve ...,0
2,This movie was so poorly written and directed ...,0
3,The most interesting thing about Miryang (Secr...,1
4,"when i first read about ""berlin am meer"" i did...",0


## Checking if there are null values

In [6]:
train.isnull().sum()

text     0
label    0
dtype: int64

In [7]:
test.isnull().sum()

text     0
label    0
dtype: int64

## Cleaning the data 

In [8]:
## Converting every review to lower case.
## The vectorised `.str.lower()` accessor replaces the per-row lambda and lets
## missing values pass through instead of raising a TypeError.
train['text'] = train['text'].str.lower()
test['text'] = test['text'].str.lower()

In [9]:
## Cleaning if any url present in the dataset
def clean_url(text):
    """Replace every URL-like substring of ``text`` with a single space.

    The pattern matches case-insensitively and covers ``http(s)://`` links,
    ``www.`` hosts and bare ``domain.tld/`` forms. Leading and trailing
    whitespace is stripped from the result before it is returned.
    """
    url_pattern = r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))'''
    without_urls = re.sub(url_pattern, " ", text)
    return without_urls.strip()

In [10]:
# Strip URLs from both splits; the function is passed directly, no wrapper lambda needed.
train['text'] = train['text'].apply(clean_url)
test['text'] = test['text'].apply(clean_url)

In [11]:
## Cleaning the dataset further
def clean(text):
    """Strip mentions, ``<br>`` tags, punctuation, digits and other symbols.

    Steps, in order:
      1. remove ``@mention`` style tags;
      2. replace ``<br>``, ``<br/>`` and ``<br />`` with a space;
      3. delete every character in ``string.punctuation``;
      4. delete digit runs;
      5. delete anything that is not an ASCII letter, digit, underscore,
         dot or whitespace;
      6. collapse each whitespace run (spaces, tabs, newlines) to one space.

    Whitespace is collapsed last so that the earlier removals cannot leave
    double spaces behind, and tabs/newlines are kept as word separators
    instead of being deleted (which used to glue neighbouring words together).

    Returns the cleaned string; a single leading or trailing space may remain.
    """
    # Raw strings avoid the invalid-escape warnings that '\s' and '\/' trigger
    # in ordinary string literals.
    text = re.sub(r"(@[A-Za-z0-9_]+)", "", text)  # remove mentions or tags
    text = re.sub(r"<br\s?\/>|<br>", " ", text)  # replace br tags with a space
    text = text.translate(str.maketrans('', '', string.punctuation))  # remove all punctuation
    text = re.sub(r"[0-9]+", "", text)  # remove numeric data
    text = re.sub(r"[^A-Za-z0-9_.\s]+", "", text)  # drop remaining symbols / non-ASCII characters
    text = re.sub(r"\s+", " ", text)  # collapse whitespace runs into one space
    return text

In [12]:
# Run the character-level clean-up on both splits.
train['text'] = train['text'].apply(clean)
test['text'] = test['text'].apply(clean)

In [13]:
## Removing blank spaces
def blank_space(text):
    """Trim ``text`` and squeeze every run of whitespace down to one space."""
    # A no-argument split drops leading/trailing whitespace and empty pieces.
    words = text.split()
    return " ".join(words)

In [14]:
# Normalise whitespace in both splits.
train['text'] = train['text'].apply(blank_space)
test['text'] = test['text'].apply(blank_space)

In [15]:
## If two English words were joined together by the cleaning steps above, separate them again using the wordninja package
import wordninja 
def ninja(text):
    """Re-segment ``text`` with ``wordninja.split`` and join the pieces with single spaces."""
    pieces = wordninja.split(text)
    return " ".join(pieces)


In [16]:
# Split accidentally concatenated words in both splits.
train['text'] = train['text'].apply(ninja)
test['text'] = test['text'].apply(ninja)

## Removing stop words

In [17]:
nltk.download('stopwords')
def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, reading the corpus only once."""
    cached = getattr(_english_stopwords, "_cache", None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stopwords._cache = cached
    return cached


def remove_stopwords(text):
    """Drop English stop words from ``text``.

    ``text`` is split on whitespace; every token that is not in NLTK's English
    stop-word list is kept, and the survivors are joined with single spaces.

    The stop-word list is loaded once and held as a frozenset, so the function
    no longer re-reads the corpus and scans a list for every word of every row.
    """
    stopword = _english_stopwords()
    return " ".join(word for word in text.split() if word not in stopword)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [18]:
# Filter stop words out of both splits.
train['text'] = train['text'].apply(remove_stopwords)
test['text'] = test['text'].apply(remove_stopwords)

In [19]:
## tokenizer 
nltk.download('punkt')
def tokens(text):
    """Tokenise ``text`` with ``word_tokenize`` and re-join the tokens with single spaces."""
    return " ".join(word_tokenize(text))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [20]:
# Tokenise and re-join the text of both splits.
train['text'] = train['text'].apply(tokens)
test['text'] = test['text'].apply(tokens)

In [21]:
# Fetch the NLTK resources needed by the part-of-speech and lemmatization cells below.
# WordNet data, used by WordNetLemmatizer.
nltk.download("wordnet")
# Tagger model, presumably what nltk.pos_tag loads (confirm for the installed NLTK version).
nltk.download("averaged_perceptron_tagger")
# Corpus read via state_union.raw(...) to build the Punkt sentence tokenizer.
nltk.download('state_union')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package state_union to /root/nltk_data...
[nltk_data]   Unzipping corpora/state_union.zip.


True

## Function to get Parts Of Speech of the words

In [22]:
## Creating a function to get the parts of speech of the words.
## It gives us the context in which the word has been used.
## For example: 'watch' can be used as a verb but 'watch' as in wrist watch is a noun

from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.corpus import state_union

# Raw text of one file from the state_union corpus, used as training text for the tokenizer below.
training = state_union.raw("2005-GWBush.txt")
# Sentence tokenizer built from that text; get_word_pos() calls tokenizer.tokenize() on each word.
tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer(training)

def get_word_pos(word):
    """Return the WordNet part-of-speech constant for ``word``.

    ``word`` is passed through the module-level sentence ``tokenizer``, each
    resulting sentence is word-tokenised and POS-tagged, and the tag of the
    first token of the last sentence decides the result:

      * tag starts with ``J`` -> ``wordnet.ADJ``
      * tag starts with ``V`` -> ``wordnet.VERB``
      * tag starts with ``N`` -> ``wordnet.NOUN``
      * tag starts with ``R`` -> ``wordnet.ADV``
      * anything else         -> ``wordnet.NOUN``

    Input that produces no tokens (for example an empty string) also yields
    ``wordnet.NOUN``. The previous version raised ``UnboundLocalError`` or
    returned ``None`` in that case.
    """
    # Start from an empty result so the name is always bound.
    tagged = []
    for sentence in tokenizer.tokenize(word):
        tagged = nltk.pos_tag(nltk.word_tokenize(sentence))

    if not tagged:
        return wordnet.NOUN

    # Only the first tagged token is consulted; the old loop returned on its first pass.
    tag = tagged[0][1]
    if tag.startswith('J'):
        return wordnet.ADJ
    if tag.startswith('V'):
        return wordnet.VERB
    if tag.startswith('R'):
        return wordnet.ADV
    # Nouns and every unrecognised tag map to NOUN.
    return wordnet.NOUN

## Performing process of lemmatization

In [23]:
lemmatizer = WordNetLemmatizer()
def lemmatize(text):
    """Lemmatize each token of ``text`` and return the lemmas joined by spaces.

    Tokens come from ``word_tokenize``; every token is lemmatized with the
    part of speech reported by ``get_word_pos`` for that token.
    """
    return " ".join(
        lemmatizer.lemmatize(token, get_word_pos(token))
        for token in word_tokenize(text)
    )

In [24]:
# Lemmatize the text of both splits.
train['text'] = train['text'].apply(lemmatize)
test['text'] = test['text'].apply(lemmatize)

In [25]:
# Separate each frame into model inputs (review text) and targets (label);
# copies keep later work independent of the original frames.
x_train, y_train = train['text'].copy(), train['label'].copy()
x_test, y_test = test['text'].copy(), test['label'].copy()

In [26]:
x_train.head()

0    grow b watch love thunderbird mate school watc...
1    put movie dvd player coke chip expectation hop...
2    people know particular time past like feel nee...
3    even though great interest biblical movie bore...
4    im die hard dad army fan nothing ever change g...
Name: text, dtype: object

In [27]:
print("Train: ",x_train.shape,y_train.shape,"Test: ",(x_test.shape,y_test.shape))

Train:  (40000,) (40000,) Test:  ((5000,), (5000,))


## Implementing TfidfVectorizer

In [28]:
## Term frequency-inverse document frequency is a text vectorizer that transforms the text into a usable vector
from sklearn.feature_extraction.text import TfidfVectorizer  
tfidfconverter = TfidfVectorizer(
    max_features=4000,
    min_df=1,
    max_df=0.9,
    ngram_range=(1, 1),
)
# Fit on the training text only, then reuse the fitted converter for the test text.
X_train = tfidfconverter.fit_transform(x_train)
X_test = tfidfconverter.transform(x_test)

## Implementing classification models

### Support Vector Machine

In [86]:
## Support vector machines use support vectors to train and then classify the new data point into particular category
from sklearn.svm import SVC
# Linear-kernel SVM fitted on the TF-IDF training matrix.
classifier = SVC(
    kernel="linear",
    random_state=0,
)
classifier.fit(X_train, y_train)


SVC(kernel='linear', random_state=0)

In [87]:
y_test_pred=classifier.predict(X_test)

In [88]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
 
print(confusion_matrix(y_test,y_test_pred))  
print(classification_report(y_test,y_test_pred))  
print(accuracy_score(y_test, y_test_pred))

[[2175  320]
 [ 257 2248]]
              precision    recall  f1-score   support

           0       0.89      0.87      0.88      2495
           1       0.88      0.90      0.89      2505

    accuracy                           0.88      5000
   macro avg       0.88      0.88      0.88      5000
weighted avg       0.88      0.88      0.88      5000

0.8846


### Random Forest

In [None]:
## A random forest is a meta estimator that fits a number of decision tree classifiers on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting

from sklearn.ensemble import RandomForestClassifier
# Forest of 200 trees with a fixed seed, fitted on the TF-IDF training matrix.
text_classifier = RandomForestClassifier(
    n_estimators=200,
    random_state=0,
)
text_classifier.fit(X_train, y_train)

RandomForestClassifier(n_estimators=200, random_state=0)

In [None]:
predictions = text_classifier.predict(X_test)
 
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
 
print(confusion_matrix(y_test,predictions))  
print(classification_report(y_test,predictions))  
print(accuracy_score(y_test, predictions))

[[2130  365]
 [ 364 2141]]
              precision    recall  f1-score   support

           0       0.85      0.85      0.85      2495
           1       0.85      0.85      0.85      2505

    accuracy                           0.85      5000
   macro avg       0.85      0.85      0.85      5000
weighted avg       0.85      0.85      0.85      5000

0.8542


### Logistic Regression

In [None]:
## Logistic regression is a process of modeling the probability of a discrete outcome given an input variable

from sklearn.linear_model import LogisticRegression
# SAGA solver with a raised iteration cap; fitting happens in the next cell.
clf = LogisticRegression(
    max_iter=3500,
    solver='saga',
)

In [None]:
clf.fit(X_train,y_train)

LogisticRegression(max_iter=3500, solver='saga')

In [None]:
y_test_pred=clf.predict(X_test)

In [None]:
print(confusion_matrix(y_test,y_test_pred))  
print(classification_report(y_test,y_test_pred))  
print(accuracy_score(y_test, y_test_pred))

[[2169  326]
 [ 256 2249]]
              precision    recall  f1-score   support

           0       0.89      0.87      0.88      2495
           1       0.87      0.90      0.89      2505

    accuracy                           0.88      5000
   macro avg       0.88      0.88      0.88      5000
weighted avg       0.88      0.88      0.88      5000

0.8836


### KNN classification

In [89]:
## KNN is a supervised classification algorithm that classifies new data points based on the nearest data points
from sklearn.neighbors import KNeighborsClassifier
# Minkowski distance with p=2; each prediction considers 500 neighbours.
classifier = KNeighborsClassifier(
    n_neighbors=500,
    metric="minkowski",
    p=2,
)
classifier.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=500)

In [90]:
y_test_pred=classifier.predict(X_test)

In [91]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test,y_test_pred))  
print(classification_report(y_test,y_test_pred))  
print(accuracy_score(y_test, y_test_pred))

[[2132  363]
 [ 516 1989]]
              precision    recall  f1-score   support

           0       0.81      0.85      0.83      2495
           1       0.85      0.79      0.82      2505

    accuracy                           0.82      5000
   macro avg       0.83      0.82      0.82      5000
weighted avg       0.83      0.82      0.82      5000

0.8242


## Conclusion
The Support Vector Machine achieved an accuracy of 0.8846, which is marginally better than (and practically on par with) the Logistic Regression classifier's accuracy of 0.8836.
The Random Forest classifier also performs well, with an accuracy of 0.8542.
The K-Nearest Neighbours classifier has the lowest accuracy of the four classifiers in this case (0.8242).
