In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from bs4 import BeautifulSoup
import re,string,unicodedata
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from textblob import TextBlob
from textblob import Word




# Read the IMDB review corpus from a CSV path relative to the working directory.
movie_data = pd.read_csv(filepath_or_buffer="IMDB Dataset.csv")
print(movie_data.shape)  # (rows, columns) of the loaded frame
movie_data.head(n=10)  # preview the first ten rows

(50000, 2)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [2]:
# Number of rows carrying each sentiment label.
movie_data.sentiment.value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

In [3]:
# Positional split: the first 40,000 rows are used for training.
# NOTE(review): these slices are taken before the later cleaning cells reassign
# movie_data['review'], so they presumably still hold the raw text - confirm.
training_set = movie_data["review"].iloc[0:40000]
training_sentiments = movie_data["sentiment"].iloc[0:40000]

# Rows 40,000 up to (not including) 50,000 are held out for testing.
test_set = movie_data["review"].iloc[40000:50000]
test_sentiments = movie_data["sentiment"].iloc[40000:50000]

print(training_set.shape, training_sentiments.shape)
print(test_set.shape, test_sentiments.shape)


(40000,) (40000,)
(10000,) (10000,)


In [4]:
# Shared tokenizer instance used by the stopword-removal step.
tokenizer = ToktokTokenizer()
# English stopwords from the NLTK corpus (a plain list).
stopwords_list = stopwords.words('english')
# Removing special characters
def remove_special_chars(text, remove_digits=True):
    """Return ``text`` with everything except ASCII letters, digits and whitespace removed.

    Args:
        text: The string to clean.
        remove_digits: Accepted for interface compatibility only. It has no
            effect: digits are always kept, exactly as before.
            NOTE(review): decide whether this flag should be honoured or dropped.

    Returns:
        The cleaned string (empty input yields an empty string).
    """
    # The upper-case range must be A-Z. The previous A-z range also spans the
    # ASCII characters between 'Z' and 'a' ([ \ ] ^ _ `), so those were never
    # stripped.
    pattern = r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)
# Clean every review in place.
# NOTE(review): this runs before the HTML-stripping cell, so '<', '>' and '/'
# are already gone by then and tags such as <br /> survive as the bare word
# 'br' (visible in the normalized review output further down).
movie_data['review'] = movie_data['review'].map(remove_special_chars)

In [5]:


#Removing the noisy text
def remove_noise(text):
    """Strip HTML markup and square-bracketed spans from ``text``.

    Args:
        text: The string to clean.

    Returns:
        The text content extracted by BeautifulSoup's ``html.parser``, with
        every ``[...]`` span (brackets included) removed.
    """
    # Drop HTML tags, keeping only the text nodes.
    text = BeautifulSoup(text, "html.parser").get_text()
    # Raw string: in a normal string literal '\[' and '\]' are invalid escape
    # sequences, which newer Python versions warn about.
    return re.sub(r'\[[^]]*\]', '', text)
# Run the noise removal over every review and store the result back in place.
movie_data['review'] = movie_data['review'].map(remove_noise)

In [6]:
#text stemming
# Built once and reused. Uses the PorterStemmer class imported at the top of
# the notebook instead of reaching it through the incidental nltk.porter attribute.
_PORTER_STEMMER = PorterStemmer()

def stemmer(text):
    """Return ``text`` with each whitespace-separated word replaced by its Porter stem.

    Args:
        text: The string to stem.

    Returns:
        The stems joined by single spaces (original whitespace runs are collapsed).
    """
    return ' '.join(_PORTER_STEMMER.stem(word) for word in text.split())

# Stem every review in place.
movie_data['review'] = movie_data['review'].map(stemmer)
    

In [7]:
# English stopwords as a set, so each membership test below is a hash lookup
# instead of a scan through a list.
stop_words = set(stopwords.words('english'))
print(stop_words)

def remove_stopwords(text, is_lower_case = False):
    """Drop English stopwords from ``text`` and return the remaining tokens joined by spaces.

    Args:
        text: The string to filter; it is split with the module-level ``tokenizer``.
        is_lower_case: When True the tokens are compared as they are. Otherwise
            each token is lower-cased for the comparison only, and its original
            casing is kept in the output.

    Returns:
        The surviving tokens joined with single spaces.
    """
    # NOTE(review): this step runs after punctuation stripping and stemming, so
    # forms such as 'thi', 'wa' or 'youll' no longer match the stopword list
    # (they are visible in the normalized review output further down).
    tokens = (token.strip() for token in tokenizer.tokenize(text))

    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stop_words]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stop_words]

    return ' '.join(filtered_tokens)

# Filter stopwords out of every review in place.
movie_data['review'] = movie_data['review'].map(remove_stopwords)


{'been', "you'll", 'hers', 'when', 'so', 'some', 't', 'few', "won't", "wasn't", 'myself', 've', 'them', 'aren', 'than', 'wouldn', 'to', 'shouldn', "hadn't", 'doing', 'the', 'does', 'most', 'having', 'were', 'during', 'other', 'herself', 'whom', 'because', 'had', 'into', 'which', 'm', 'needn', 'yours', 'both', "you've", 'hadn', 'did', "isn't", "shouldn't", 'we', 'me', 'mightn', "it's", 'out', 'about', 'll', 'such', 'those', 'you', 'at', 'against', 'haven', 'any', 'ourselves', 'yourself', 'its', 'how', "you're", 'ain', 'isn', 'until', 'why', 'ma', 'nor', 'very', 'doesn', "weren't", 'will', 'o', 'that', 'these', "wouldn't", 'off', "you'd", 'through', 'each', 'all', 'being', 'this', 'under', 'themselves', 'for', 'himself', 'our', "she's", 'while', "couldn't", 'before', 'there', 'up', 'ours', 'her', 'who', 'wasn', "don't", 'and', 'same', 'should', 'it', 'just', 'they', 'mustn', 'again', 'then', 'a', "should've", 'i', 'of', 'his', 'don', 'an', 'my', 'hasn', "mightn't", 're', 'have', 'from', 

In [20]:
# Cleaned reviews belonging to the training portion (first 40,000 rows).
normalized_train_reviews = movie_data['review'][:40000]

# Join the review texts themselves. Series.to_string() renders the printed
# form of the Series, which mixes the index labels into the text.
normalized_train_string = ' '.join(normalized_train_reviews)

# correct() returns a new TextBlob instead of modifying the blob in place, so
# the result has to be kept.
# NOTE(review): spell-correcting 40,000 reviews as a single blob is presumably
# very slow - consider caching the result or sampling.
normalized_train_spelling = TextBlob(normalized_train_string).correct()
normalized_train_words = normalized_train_spelling.words
normalized_train_words[:20]  # preview only; the full word list is too large to display



In [21]:
# Show the cleaned review stored under index label 0.
normalized_train_reviews.loc[0]

'one review ha mention watch 1 Oz episod youll hook right thi exactli happen mebr br first thing struck Oz wa brutal unflinch scene violenc set right word GO trust thi show faint heart timid thi show pull punch regard drug sex violenc hardcor classic use wordbr br call OZ nicknam given oswald maximum secur state penitentari focus mainli emerald citi experiment section prison cell glass front face inward privaci high agenda Em citi home manyaryan muslim gangsta latino christian italian irish moreso scuffl death stare dodgi deal shadi agreement never far awaybr br would say main appeal show due fact goe show wouldnt dare forget pretti pictur paint mainstream audienc forget charm forget romanceoz doesnt mess around first episod ever saw struck nasti wa surreal couldnt say wa readi watch develop tast Oz got accustom high level graphic violenc violenc injustic crook guard wholl sold nickel inmat wholl kill order get away well manner middl class inmat turn prison bitch due lack street skill 

In [16]:
# Cleaned reviews from position 40,000 onwards form the test portion.
normalized_test_reviews = movie_data['review'].iloc[40000:]
# Show the cleaned review stored under index label 45000.
normalized_test_reviews.loc[45000]


'enjoy thi film wa sceneri corfu greek ador countri like flatter director point view base true stori dure year greec wa struggl stand two feet war nazi hardship italian soldier greek girl fall love time hard lot sacrific make nichola cage look great uniform give passion account thi unfulfil begin love ador christian bale play mandra heroin husbandtob look veri veri good greek hi person match one greek patriot true fighter one movi would like buy keep collectionfor ever'