In [1]:
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD as TSVD
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score, cross_val_predict

import spacy
from spacy.lang.en.stop_words import STOP_WORDS as stopwords

In [2]:
data = pd.read_csv('dataset/IMDB_Dataset.csv')

In [3]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
data.shape

(50000, 2)

In [5]:
train = data.iloc[:25000]
test = data.iloc[25000:]

In [6]:
train.shape, test.shape

((25000, 2), (25000, 2))

In [7]:
train.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [8]:
nlp = spacy.load('en_core_web_lg')

In [9]:
# Converting the text to lowercase

train['review'] = train['review'].apply(lambda x: str(x).lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['review'] = train['review'].apply(lambda x: str(x).lower())


In [10]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


### Contractions Expansion

In [11]:
!pip install contractions

Defaulting to user installation because normal site-packages is not writeable
Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Collecting textsearch>=0.0.21
  Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Collecting anyascii
  Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
     ---------------------------------------- 0.0/289.9 kB ? eta -:--:--
     - -------------------------------------- 10.2/289.9 kB ? eta -:--:--
     --------- --------------------------- 71.7/289.9 kB 975.2 kB/s eta 0:00:01
     ---------------------- --------------- 174.1/289.9 kB 1.5 MB/s eta 0:00:01
     -------------------------------------- 289.9/289.9 kB 1.8 MB/s eta 0:00:00
Collecting pyahocorasick
  Downloading pyahocorasick-2.0.0-cp311-cp311-win_amd64.whl (39 kB)
Installing collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully installed anyascii-0.3.2 contractions-0.1.73 pyahocorasick-2.0.0 textsearch-0.0.24



[notice] A new release of pip is available: 23.0.1 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [12]:
import contractions

In [13]:
contractions_dict = contractions.contractions_dict
contractions_dict

{"I'm": 'I am',
 "I'm'a": 'I am about to',
 "I'm'o": 'I am going to',
 "I've": 'I have',
 "I'll": 'I will',
 "I'll've": 'I will have',
 "I'd": 'I would',
 "I'd've": 'I would have',
 'Whatcha': 'What are you',
 "amn't": 'am not',
 "ain't": 'are not',
 "aren't": 'are not',
 "'cause": 'because',
 "can't": 'cannot',
 "can't've": 'cannot have',
 "could've": 'could have',
 "couldn't": 'could not',
 "couldn't've": 'could not have',
 "daren't": 'dare not',
 "daresn't": 'dare not',
 "dasn't": 'dare not',
 "didn't": 'did not',
 'didn’t': 'did not',
 "don't": 'do not',
 'don’t': 'do not',
 "doesn't": 'does not',
 "e'er": 'ever',
 "everyone's": 'everyone is',
 'finna': 'fixing to',
 'gimme': 'give me',
 "gon't": 'go not',
 'gonna': 'going to',
 'gotta': 'got to',
 "hadn't": 'had not',
 "hadn't've": 'had not have',
 "hasn't": 'has not',
 "haven't": 'have not',
 "he've": 'he have',
 "he's": 'he is',
 "he'll": 'he will',
 "he'll've": 'he will have',
 "he'd": 'he would',
 "he'd've": 'he would have',
 

In [15]:
def contraction_expansion(x):
    
    if type(x) is str:
        
        for key in contractions_dict:

            value = contractions_dict[key]

            x = x.replace(key, value)

    return x

In [16]:
train['review'] = train['review'].apply(lambda x: contraction_expansion(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['review'] = train['review'].apply(lambda x: contraction_expansion(x))


In [17]:
train.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there is a family where a little boy...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


### Removing Emails

In [18]:
def remove_emails(x):
    
    email_pattern = re.compile(r"(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)")
    
    return re.sub(email_pattern, '', x)

In [19]:
train['review'] = train['review'].apply(lambda x:remove_emails(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['review'] = train['review'].apply(lambda x:remove_emails(x))


In [20]:
train.sample(5)

Unnamed: 0,review,sentiment
1722,my mother worked with dennis l. raider for ele...,negative
3922,i gather from reading the previous comments th...,positive
12134,"this movie was very enjoyable, though you will...",positive
3257,"by saying that,i mean that this is not a well ...",positive
15165,"this is a feel good film, about one person's d...",positive


### Removing HTML Tags

In [21]:
train['review'] = train['review'].apply(lambda x: BeautifulSoup(x, 'lxml').get_text().strip())

  train['review'] = train['review'].apply(lambda x: BeautifulSoup(x, 'lxml').get_text().strip())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['review'] = train['review'].apply(lambda x: BeautifulSoup(x, 'lxml').get_text().strip())


In [22]:
train.iloc[6005][0]

'pretty.pretty actresses and actors. pretty bad script. pretty frequent "let us strip to our undies" scenes. pretty fair f/x. pretty jarring location decisions (the college dorm room looks like a high-end hotel room - probably because it was shot at a hotel). pretty bland storyline. pretty awful dialog. pretty locations. pretty annoying editing, unless you like the music video flash-cut style.this one is not a guilty pleasure - this is more an embarrassing one. if you must watch this, pick a good dance/techno album and turn the sound off on the movie - you will see the pretty people in their pretty black undies, and probably follow the story just fine.the cast may be able to act - i doubt that anyone could look skilled given the lines/plot that they had to deal with.'

In [23]:
train.sample(5)

Unnamed: 0,review,sentiment
7773,***comments contain spoilers*** i was barely h...,negative
13343,on this 4th of july weekend it is heartening t...,positive
2620,"los angeles, 1976. indie film brat john carpen...",negative
17671,last year was the 200th anniversary of charles...,positive
8721,a terrible movie that is amateurish on almost ...,negative


### Removing Special Characters

In [24]:
def RemoveSpecialChars(x):
    
    x = re.sub(r'[^\w ]+', "", x)
    x = ' '.join(x.split())
    return x

In [25]:
train['review'] = train['review'].apply(lambda x: RemoveSpecialChars(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['review'] = train['review'].apply(lambda x: RemoveSpecialChars(x))


In [26]:
train.sample(5)

Unnamed: 0,review,sentiment
6462,ingrid bergman is a temporarily impoverished p...,positive
16740,it is a piece of crap not funny at all during ...,negative
9323,the tattooed stranger was another of those rar...,positive
5111,i know that some films i mean european films t...,negative
8106,the title got my attention and then i wondered...,positive


In [27]:
train.iloc[6005][0]

'prettypretty actresses and actors pretty bad script pretty frequent let us strip to our undies scenes pretty fair fx pretty jarring location decisions the college dorm room looks like a highend hotel room probably because it was shot at a hotel pretty bland storyline pretty awful dialog pretty locations pretty annoying editing unless you like the music video flashcut stylethis one is not a guilty pleasure this is more an embarrassing one if you must watch this pick a good dancetechno album and turn the sound off on the movie you will see the pretty people in their pretty black undies and probably follow the story just finethe cast may be able to act i doubt that anyone could look skilled given the linesplot that they had to deal with'

### Lemmetization

In [28]:
def lemme(x):
    
    x = str(x)
    x_list = []
    doc = nlp(x)

    for token in doc:
        lemma = token.lemma_

        if lemma in ['-PRON-', 'be']:
            lemma = token.text

        x_list.append(lemma)

    return ' '.join(x_list)

In [29]:
%%time
train['review'] = train['review'].apply(lambda x: lemme(x))

CPU times: total: 12min 35s
Wall time: 16min 35s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [30]:
train.sample(5)

Unnamed: 0,review,sentiment
17296,this movie was outright painful for I to watch...,negative
23058,never ever take a film just for its good look ...,negative
17932,ok the other reviewer have pretty much cover t...,positive
9767,darkly comic serendipity about a cosmetic sale...,negative
2988,wow pretty terrible stuff the richard burtonel...,negative


### Tokenization using Text Blob

### Removing Stop Words

In [31]:
stopwords

{"'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',
 'from',
 'fron

In [32]:
len(stopwords)

326

In [33]:
def RemoveStopWords(x):
    
    return ' '.join([word for word in x.split() if word not in stopwords])

In [34]:
x = train.iloc[6005][0]

In [35]:
# EXAMPLE CODE

print(x)
print()
print("length of x: ",len(x))

prettypretty actress and actor pretty bad script pretty frequent let we strip to our undie scene pretty fair fx pretty jarring location decision the college dorm room look like a highend hotel room probably because it was shoot at a hotel pretty bland storyline pretty awful dialog pretty location pretty annoying editing unless you like the music video flashcut stylethis one is not a guilty pleasure this is more an embarrassing one if you must watch this pick a good dancetechno album and turn the sound off on the movie you will see the pretty people in their pretty black undie and probably follow the story just finethe cast may be able to act I doubt that anyone could look skilled give the linesplot that they have to deal with

length of x:  735


In [36]:
x1 = RemoveStopWords(x)
x1

'prettypretty actress actor pretty bad script pretty frequent let strip undie scene pretty fair fx pretty jarring location decision college dorm room look like highend hotel room probably shoot hotel pretty bland storyline pretty awful dialog pretty location pretty annoying editing like music video flashcut stylethis guilty pleasure embarrassing watch pick good dancetechno album turn sound movie pretty people pretty black undie probably follow story finethe cast able act I doubt look skilled linesplot deal'

In [37]:
len(x1)

510

In [38]:
%%time

train['review'] = train['review'].apply(lambda x: RemoveStopWords(x))

CPU times: total: 391 ms
Wall time: 940 ms


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [39]:
train.sample(5)

Unnamed: 0,review,sentiment
17790,good feeling movie deliver good expect dead en...,positive
4979,john thaw inspector morse fame play old tom oa...,positive
10929,develop movie base actual event involve crypto...,negative
23005,group young adult strand place vicinity museum...,positive
15211,I short film dvd ridley scotts film duellist i...,negative


### Removing Rare Words

In [40]:
text = ' '.join(train['review'])

In [40]:
#text

In [41]:
len(text)

17525809

In [42]:
# Creating Frequency

text_series = pd.Series(text.split())

In [43]:
freq_comm = text_series.value_counts()

In [44]:
freq_comm

I                    90117
movie                49624
film                 45914
like                 21488
good                 20105
                     ...  
allhell                  1
lamenessseriously        1
x1                       1
xraye                    1
tvpersonality            1
Length: 129592, dtype: int64

In [45]:
rare_words = freq_comm[-82000:-1]
'rockumentarie' in rare_words

True

In [46]:
rare_words

hourthis             2
messbut              2
packed               2
jacky                2
garbed               2
                    ..
buffalothe           1
allhell              1
lamenessseriously    1
x1                   1
xraye                1
Length: 81999, dtype: int64

In [47]:
# Removing 82000 rare occuring words 

train['review'] = train['review'].apply(lambda x: ' '.join([word for word in x.split() if word not in rare_words]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['review'] = train['review'].apply(lambda x: ' '.join([word for word in x.split() if word not in rare_words]))


In [48]:
train['review'].sample(5)

13585    possible I sorry excuse movie zero star far ba...
17970    inside movie halfhour episode twilight zone tr...
17019    view vcr find fascinating know true story I th...
9982     probably good movie director hector babenco br...
6454     viva variety unique hybrid program parody trib...
Name: review, dtype: object

### Converting the Data into Vector

In [49]:
train['sentiment'].value_counts()

negative    12526
positive    12474
Name: sentiment, dtype: int64

In [97]:
train['review'].to_csv('train_review_data.csv')

In [50]:
X = train['review']
y = train['sentiment']

In [51]:
tfidf = TfidfVectorizer()

In [52]:
X = tfidf.fit_transform(X)

In [53]:
X.shape

(25000, 47554)

In [54]:
X

<25000x47554 sparse matrix of type '<class 'numpy.float64'>'
	with 1934883 stored elements in Compressed Sparse Row format>

### Splitting Data into Training and Testing sets

In [55]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 4, stratify = y)

In [56]:
X_train.shape, X_test.shape

((20000, 47554), (5000, 47554))

### Dimensionality reduction using Truncated Singular Value Decomposition

In [57]:
%%time

#tsvd = TSVD(n_components=10000, random_state=4)
#X_train_tsvd = tsvd.fit_transform(X_train)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.44 µs


In [58]:
#sum(tsvd.explained_variance_)

### Using SVC for Classification

In [59]:
#clf_svc = SVC()

In [60]:
%%time

#scores = cross_val_score(clf_svc, X_train, y_train, cv=6, n_jobs=-1)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.44 µs


In [61]:
#scores

### Using Logistic Regression

In [57]:
from sklearn.linear_model import LogisticRegression

In [58]:
clf_lr = LogisticRegression()

In [59]:
X_train

<20000x47554 sparse matrix of type '<class 'numpy.float64'>'
	with 1549068 stored elements in Compressed Sparse Row format>

In [60]:
%%time

scores = cross_val_score(clf_lr, X_train, y_train, cv=10, n_jobs=4)

CPU times: total: 31.2 ms
Wall time: 14.9 s


In [61]:
scores

array([0.8765, 0.881 , 0.8795, 0.876 , 0.892 , 0.8945, 0.877 , 0.8745,
       0.87  , 0.8725])

In [62]:
scores.mean()

0.87935

In [63]:
clf_lr.fit(X_train, y_train)

In [64]:
y_test_pred = clf_lr.predict(X_test)

In [65]:
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

    negative       0.89      0.85      0.87      2505
    positive       0.86      0.90      0.88      2495

    accuracy                           0.88      5000
   macro avg       0.88      0.88      0.88      5000
weighted avg       0.88      0.88      0.88      5000



In [66]:
confusion_matrix(y_test, y_test_pred)

array([[2137,  368],
       [ 253, 2242]], dtype=int64)

In [67]:
clf_lr.predict(tfidf.transform(['American Psycho deserved an Oscar, they were robbed']))

array(['positive'], dtype=object)

In [68]:
y_real_pred = clf_lr.predict(tfidf.transform(test['review']))

In [69]:
print(classification_report(test['sentiment'], y_real_pred))

              precision    recall  f1-score   support

    negative       0.89      0.83      0.86     12474
    positive       0.84      0.89      0.87     12526

    accuracy                           0.86     25000
   macro avg       0.87      0.86      0.86     25000
weighted avg       0.87      0.86      0.86     25000



In [70]:
clf_lr.predict(tfidf.transform(["What hell was that, it's a masterpiece"]))

array(['positive'], dtype=object)

## Save the model weights

In [78]:
import pickle
pickle.dump(clf_lr, open('LR_model.pickle', 'wb'))
print('Logistic Regression trained Model Saved')

Logistic Regression trained Model Saved


## Load Save Model

In [73]:
filename = 'trained_model.pkl'
classifier = pickle.load(open(filename, 'rb'))

In [76]:
(classifier.predict(tfidf.transform(["What hell was that, it's a masterpiece"])))

array(['positive'], dtype=object)