In [1]:
#importing basic libraries
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup


In [2]:
%%time
df=pd.read_csv("IMDB Dataset.csv")
df.info()
#no missing values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 390.7+ KB
Wall time: 527 ms


In [3]:
#equal division of positive and negative sentiment
df['sentiment'].value_counts()

negative    25000
positive    25000
Name: sentiment, dtype: int64

# TEXT CLEANING AND TRAINING STEP BY STEP:

1)Removal of HTML contents like "< br>".

2)Removal of punctutions, special characters like '\'.

3)Removal of stopwords like is, the which do not offer much insight.

4)Stemming/Lemmatization to bring back multiple forms of same word to their common root like 'coming', 'comes' into 'come'.

5)Vectorization - Encode the numeric values once you have cleaned it.

6)Fit the data to the ML model.


In [4]:
#removing all html tags
from bs4 import BeautifulSoup
def html_remover(text):
    soup=BeautifulSoup(text,'html.parser')
    a=soup.get_text()
    return a
df['review']=df['review'].apply(html_remover)
df['review'][0]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.I would say the main appeal of the show is due to the fact that it goes where other shows wo

In [5]:
%%time
#Removal of punctuations and special characters
import re
def sp_char_remover(review):
    review = re.sub('\[[^]]*\]', ' ', review)
    review = re.sub('[^a-zA-Z]', ' ', review)
    return review
df['review']=df['review'].apply(sp_char_remover)
df['review'][1]

Wall time: 3.26 s


'A wonderful little production  The filming technique is very unassuming  very old time BBC fashion and gives a comforting  and sometimes discomforting  sense of realism to the entire piece  The actors are extremely well chosen  Michael Sheen not only  has got all the polari  but he has all the voices down pat too  You can truly see the seamless editing guided by the references to Williams  diary entries  not only is it well worth the watching but it is a terrificly written and performed piece  A masterful production about one of the great master s of comedy and his life  The realism really comes home with the little things  the fantasy of the guard which  rather than use the traditional  dream  techniques remains solid then disappears  It plays on our knowledge and our senses  particularly with the scenes concerning Orton and Halliwell and the sets  particularly of their flat with Halliwell s murals decorating every surface  are terribly well done '

In [6]:
%%time
#To lower
def lower(text):
    return text.lower()
df['review']=df['review'].apply(lower)
df['review'][2]

Wall time: 89 ms


'i thought this was a wonderful way to spend time on a too hot summer weekend  sitting in the air conditioned theater and watching a light hearted comedy  the plot is simplistic  but the dialogue is witty and the characters are likable  even the well bread suspected serial killer   while some may be disappointed when they realize this is not match point    risk addiction  i thought it was proof that woody allen is still fully in control of the style many of us have grown to love this was the most i d laughed at one of woody s comedies in years  dare i say a decade    while i ve never been impressed with scarlet johanson  in this she managed to tone down her  sexy  image and jumped right into a average  but spirited young woman this may not be the crown jewel of his career  but it was wittier than  devil wears prada  and more interesting than  superman  a great comedy to go see with friends '

In [7]:
%%time
#Removal of stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

def stopword_remover(text):
    x=[]
    text=text.split()    #splitting into individual words
    for i in text:
        if i not in stopwords.words('english'):
            x.append(i)
    return x

df['review']=df['review'].apply(stopword_remover)
df['review'][0]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Arya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Wall time: 39min 57s


['one',
 'reviewers',
 'mentioned',
 'watching',
 'oz',
 'episode',
 'hooked',
 'right',
 'exactly',
 'happened',
 'first',
 'thing',
 'struck',
 'oz',
 'brutality',
 'unflinching',
 'scenes',
 'violence',
 'set',
 'right',
 'word',
 'go',
 'trust',
 'show',
 'faint',
 'hearted',
 'timid',
 'show',
 'pulls',
 'punches',
 'regards',
 'drugs',
 'sex',
 'violence',
 'hardcore',
 'classic',
 'use',
 'word',
 'called',
 'oz',
 'nickname',
 'given',
 'oswald',
 'maximum',
 'security',
 'state',
 'penitentary',
 'focuses',
 'mainly',
 'emerald',
 'city',
 'experimental',
 'section',
 'prison',
 'cells',
 'glass',
 'fronts',
 'face',
 'inwards',
 'privacy',
 'high',
 'agenda',
 'em',
 'city',
 'home',
 'many',
 'aryans',
 'muslims',
 'gangstas',
 'latinos',
 'christians',
 'italians',
 'irish',
 'scuffles',
 'death',
 'stares',
 'dodgy',
 'dealings',
 'shady',
 'agreements',
 'never',
 'far',
 'away',
 'would',
 'say',
 'main',
 'appeal',
 'show',
 'due',
 'fact',
 'goes',
 'shows',
 'dare',
 

In [8]:
%%time
#Lemmatizing the stopwords and then joining it back
from nltk.stem import WordNetLemmatizer
lem=WordNetLemmatizer()

def temp(text):
    text=" ".join(text)
    return text

def lemma_join(text):
    text=[lem.lemmatize(word) for word  in text]
    text=temp(text)
    return text

df['review']=df['review'].apply(lemma_join)        
df['review'][0]

Wall time: 31.8 s


'one reviewer mentioned watching oz episode hooked right exactly happened first thing struck oz brutality unflinching scene violence set right word go trust show faint hearted timid show pull punch regard drug sex violence hardcore classic use word called oz nickname given oswald maximum security state penitentary focus mainly emerald city experimental section prison cell glass front face inwards privacy high agenda em city home many aryan muslim gangsta latino christian italian irish scuffle death stare dodgy dealing shady agreement never far away would say main appeal show due fact go show dare forget pretty picture painted mainstream audience forget charm forget romance oz mess around first episode ever saw struck nasty surreal say ready watched developed taste oz got accustomed high level graphic violence violence injustice crooked guard sold nickel inmate kill order get away well mannered middle class inmate turned prison bitch due lack street skill prison experience watching oz m

In [9]:
#separation into training and testing
from sklearn.model_selection import train_test_split
df_train, df_test, train_data_label, test_data_label = train_test_split(df['review'], df['sentiment'], test_size=0.20, random_state=42)

In [10]:
train_data_label=(train_data_label.replace({'positive':1,'negative':0}))
test_data_label=(test_data_label.replace({'positive':1,'negative':0}))

In [11]:
#Creating cleaned corpus from the cleaned df['review'] dataset for the purpose of training
corpus_train = []
corpus_test  = []

for i in df_train.index:
    temp=df_train[i]
    corpus_train.append(temp)

for j in df_test.index:
    temp1=df_test[j]
    corpus_test.append(temp1)
    
    

In [12]:
corpus_train2=corpus_train
corpus_test2=corpus_test

In [44]:
%%time
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer()
cv_train=cv.fit_transform(corpus_train2)
cv_test=cv.transform(corpus_test2)

Wall time: 5.08 s


In [48]:
from sklearn.svm import LinearSVC
lin_svc=LinearSVC(C=0.5,random_state=42,max_iter=10000)
lin_svc.fit(cv_train,train_data_label)

y_pred=lin_svc.predict(cv_test)

In [49]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score


print(classification_report(test_data_label,y_pred))
print("ACCURACY SCORE IS: ",accuracy_score(test_data_label,y_pred))

              precision    recall  f1-score   support

           0       0.87      0.86      0.86      4961
           1       0.86      0.87      0.87      5039

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000

ACCURACY SCORE IS:  0.8667


In [51]:
a['review']=corpus_test2

TypeError: list indices must be integers or slices, not str