In [2]:
import pandas as pd
import numpy as np
import csv

In [3]:
review_data = pd.read_csv('IMDB.csv')

In [4]:
#read data and perform simple exploration to establish parameters for cleaning.
review_data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
review_data.tail()

Unnamed: 0,review,sentiment
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative
49999,No one expects the Star Trek movies to be high...,negative


In [6]:
review_data.dtypes

review       object
sentiment    object
dtype: object

In [7]:
review_data.info

<bound method DataFrame.info of                                                   review sentiment
0      One of the other reviewers has mentioned that ...  positive
1      A wonderful little production. <br /><br />The...  positive
2      I thought this was a wonderful way to spend ti...  positive
3      Basically there's a family where a little boy ...  negative
4      Petter Mattei's "Love in the Time of Money" is...  positive
...                                                  ...       ...
49995  I thought this movie did a down right good job...  positive
49996  Bad plot, bad dialogue, bad acting, idiotic di...  negative
49997  I am a Catholic taught in parochial elementary...  negative
49998  I'm going to have to disagree with the previou...  negative
49999  No one expects the Star Trek movies to be high...  negative

[50000 rows x 2 columns]>

In [8]:
review_data.shape

(50000, 2)

In [9]:
#delete html tags from review column
import re
def delete_html_tags(text):
    pattern = re.compile('<.*?>')
    return pattern.sub(r'', text)

In [10]:
review_data["review"] = review_data["review"].apply(delete_html_tags)

In [11]:
review_data["review"]

0        One of the other reviewers has mentioned that ...
1        A wonderful little production. The filming tec...
2        I thought this was a wonderful way to spend ti...
3        Basically there's a family where a little boy ...
4        Petter Mattei's "Love in the Time of Money" is...
                               ...                        
49995    I thought this movie did a down right good job...
49996    Bad plot, bad dialogue, bad acting, idiotic di...
49997    I am a Catholic taught in parochial elementary...
49998    I'm going to have to disagree with the previou...
49999    No one expects the Star Trek movies to be high...
Name: review, Length: 50000, dtype: object

In [12]:
#reduce words to their base dictionary meaning using a lemitizer
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
stem=PorterStemmer()
lem=WordNetLemmatizer()

In [13]:
#reduce words fucntion
def lematizer(text):
    sentence=re.sub(r'[^\w]',' ',text)
    words=sentence.split()
    lem_words=[lem.lemmatize(word) for word in words]
    return " ".join(lem_words)

In [14]:
#tool for processing human language data
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ajibo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [15]:
review_data1 = review_data.copy()
review_data1

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [16]:
review_data1["review"] = review_data1["review"].apply(lambda x: lematizer(x))

In [17]:
review_data

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [18]:
#remove punctuation marks and non word expressions
def steming(text):
    sentence=re.sub(r'[^\w]',' ',text)
    words=sentence.split()
    lem_words=[stem.stem(word) for word in words]
    return " ".join(lem_words)

In [19]:
review_data1["review"] = review_data1["review"].apply(lambda x: steming(x))

In [20]:
review_data1

Unnamed: 0,review,sentiment
0,one of the other review ha mention that after ...,positive
1,a wonder littl product the film techniqu is ve...,positive
2,i thought thi wa a wonder way to spend time on...,positive
3,basic there s a famili where a littl boy jake ...,negative
4,petter mattei s love in the time of money is a...,positive
...,...,...
49995,i thought thi movi did a down right good job i...,positive
49996,bad plot bad dialogu bad act idiot direct the ...,negative
49997,i am a cathol taught in parochi elementari sch...,negative
49998,i m go to have to disagre with the previou com...,negative


In [21]:
review_data1.to_csv('review_data1.csv', index = False)