

> # Deep Learning for NLP


> #### IMDB dataset



   
      


**Dheera Shaji**   
***CB.SC.I5.DAS18007*** 

Needed libraries and packages

In [28]:
!pip install num2words



In [58]:
import pandas as pd
import numpy as np
import nltk
import re

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

from num2words import num2words
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score
from sklearn import metrics

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Collecting data from IMDB dataset

In [2]:
data = pd.read_csv("/content/drive/MyDrive/NLP/IMDB Dataset.csv")
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


Convert upper case to lower case

In [3]:
def lower_case_conversion(text):
  lower_text=text.lower()
  return lower_text

In [4]:
data['review'] = data['review'].apply(lower_case_conversion)
data.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


Remove html tags

In [5]:
def remove_html_tags(text):
  html_pattern = r'<.*?>'
  without_html = re.sub(pattern = html_pattern,repl='',string=text)
  return without_html

In [6]:
data['review'] = data['review'].apply(remove_html_tags)
data.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


Expansion

In [10]:
import sys
import os

py_file_location = "/content/drive/MyDrive/NLP/"
sys.path.append(os.path.abspath(py_file_location))
from contraction import CONTRACTION_MAP

def expand_contractions(contraction):
  match = contraction.group(0)
  first_char = match[0]
  if contraction_mapping.get(match):
    expanded_contraction = contraction_mapping.get(match)
  else:
    expanded_contraction = contraction_mapping.get(match.lower())
  expanded_contraction = first_char + expanded_contraction[1:]
  return expanded_contraction

contraction_mapping = CONTRACTION_MAP # From Contraction.py
contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())),flags=re.IGNORECASE|re.DOTALL)



for i in range(len(data['review'])):

  text = data['review'][i]

  expanded_text = contractions_pattern.sub(expand_contractions,text) # expand_contractions is a function

  expanded_text = re.sub("'", "", expanded_text)

  data['review'][i] = expanded_text



data.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there is a family where a little boy...,negative
4,"petter matteis ""love in the time of money"" is ...",positive


Remove expansions

In [11]:
def remove_special(text):
  return re.sub(r'[^a-zA-Z0-9\s]+', '',text)

In [12]:
data['review'] = data['review'].apply(remove_special)

In [13]:
data.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production the filming tech...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there is a family where a little boy...,negative
4,petter matteis love in the time of money is a ...,positive


Remove single characters

In [14]:
def remove_singlechar(text):
  return re.sub(pattern=r'\s[a-zA-Z]\s', repl=" ",string=text)

In [15]:
data['review'] = data['review'].apply(remove_singlechar)
data.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production the filming tech...,positive
2,i thought this was wonderful way to spend time...,positive
3,basically there is family where little boy jak...,negative
4,petter matteis love in the time of money is vi...,positive


Convert numbers to words

In [17]:
def num_to_word(text):
  after_split = text.split()
  for idx in range(len(after_split)):
    if after_split[idx].isdigit():
      after_split[idx] = num2words(after_split[idx])
  nums_to_words = ' '.join(after_split)
  return nums_to_words

In [18]:
data['review'] = data['review'].apply(num_to_word)
data.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production the filming tech...,positive
2,i thought this was wonderful way to spend time...,positive
3,basically there is family where little boy jak...,negative
4,petter matteis love in the time of money is vi...,positive


Remove stopwords

In [19]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')
data['review'] = data['review'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [20]:
data.head()

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching one oz episod...,positive
1,wonderful little production filming technique ...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically family little boy jake thinks zombie...,negative
4,petter matteis love time money visually stunni...,positive


Lemmatization

In [27]:
wordnet = WordNetLemmatizer()
#used for tagging words with their parts of speech
for i in range(len(data['review'])):
  text = data['review'][i]
  lemm_word = []
  tokenizer = word_tokenize(text)
  for token, tag in pos_tag(tokenizer):
    pos = tag[0].lower()
    if pos not in ['a','r','v','n']:
      pos = 'n'
    lemm_word.append(wordnet.lemmatize(token, pos))
  data['review'][i]=' '.join(lemm_word)

data.head()

Unnamed: 0,review,sentiment
0,one reviewer mention watch one oz episode hook...,positive
1,wonderful little production film technique una...,positive
2,think wonderful way spend time hot summer week...,positive
3,basically family little boy jake think zombie ...,negative
4,petter matteis love time money visually stunni...,positive


x-y split

In [36]:
X= data.review
y = data.sentiment

Count vectorizer

In [37]:
vectorizer = CountVectorizer(max_df=0.5)
X = vectorizer.fit_transform(X)

Modelling

In [33]:
model = XGBRegressor()
print(model)

XGBRegressor()


In [42]:
model = LogisticRegression()
print(model)

LogisticRegression()


Train-Test split

In [54]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, random_state=0)

Model fitting

In [None]:
model.fit(X_train,y_train)
y_pred = model.predict(X_val)

Accuracy

In [60]:
print('Accuracy:', accuracy_score(y_pred, y_val))

Accuracy: 0.881


Confusion matrix

In [59]:
print(metrics.confusion_matrix(y_val, y_pred))

[[4419  616]
 [ 574 4391]]
