In [47]:
import re

import numpy as np
import pandas as pd
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
import re
import string

# df = pd.read_csv('imdb_sup.csv')
df = pd.read_csv('imdb_sup.csv', nrows=2)

In [5]:
#helper functions for cleaning
def remove_punc(word):
    punc = '''-!()[]{};:"\,<>./?@#$%^&*_~'''
    no_punc_word = word.translate(str.maketrans('', '', punc))
    return no_punc_word

def remove_BR(lst):
    words = []
    for word in lst:
        if word.endswith('br'):
            word = word[:-2]
        words.append(word)
    
    return ' '.join(words)

def make_lower(word):
    return word.lower()

def remove_numbers(lst):
    no_num_list = [x for x in lst if not any(c.isdigit() for c in x)]
    return ' '.join(no_num_list)

def remove_words(lst):
    extra_stops = ['aa', 'ab','br', 'us', 'mr', 'saw', 'until', 'no', 'when', 'with', 
            'like', 'just', 'even', 'it\'s', 'i\'m', 
            'who', 'i\'ve', 'what', 'he', 'see', 'up','get', 'been',
            'because', 'into', 'time', 'watch', 'â–','called', '2',
             '10', 'said','their', 'can','two', 'go', 'also', 'seen', 'him',
            'through', 'it', 'doesn\'t', 'you\'re', 'that\'s', 'there\'s',
            'come', 'said', 'all.', 'screen', 'person', 'i\'ll', 'is,'
            '5', 'sandra', 'them.', '3', '.', 'he\'s', 'man', 'they\'re',
            '\\\x96', '--', 'i\'d', 'is,', 'oh', 'one', 'much', 'movies',
            'say', '4', '1', 'five', 'what\'s', '15', 'ed', '...',
            'movie', 'film','', '-', 'people', 'could', 'make', 'films', 'reviews']


    stop_words = nltk.corpus.stopwords.words("english")
    stop_words = set(stop_words)
    stop_words.update(extra_stops) 

    words = [word for word in lst if word not in stop_words]

    return ' '.join(words)
    

In [6]:
def text_clean(message):
  message = remove_punc(message)   # remove punctiation except - and \'
  message = remove_words(message.split())
  message = remove_BR(message.split())
  message = remove_numbers(message.split())
  message = nltk.tag.pos_tag(message.split(), tagset='universal')
  message = [word for word,tag in message if (tag == 'ADJ' or tag == 'ADV' or tag == 'VERB')]
  message = ' '.join(message)
  message = message.lower()

  return message

In [37]:
test_str = "I loved a country of Wakanda, the culture, world, scenery and people. I think it was well played, CGIs were very good and story but, I have to be critical about some things. -2 stars because of clique and predictability. If I haven't seen almost all Marvel movies I would give it more stars but they are going on every new hero with one script and they are repeating it over and over again. I know exactly what and when it will happen. -1 star for progressive propaganda and hypocrisy. Example: if Ross tell something about skin color in the move but he didn't meant to insult somebody it would be sad. when Shuri tell Ross 'Colonizer' and she meant as insult, it is ok and no racist. when I saw that I was like are you kidding me?"
text_clean(test_str)


'loved think well played good critical clique almost would give going new repeating know exactly happen progressive tell meant would sad tell meant kidding'

In [48]:
#cleans all reviews in dataset
df['Review'] = df['Review'].apply(text_clean)

In [49]:
#shuffles data entries
dff = df.sample(frac=1).reset_index(drop=True)

In [50]:
pd.set_option('display.max_colwidth', None)
dff


Unnamed: 0,Review,Rating,Sentiment
0,extremely low looks recorded however good easy follow shooting sexually abusive father released psychiatric secretly affair ends renting first kills then see's neighbor sets acts innocent finds killed attempted hot then stops acting later kill supposedly loves find dies good lot know whole falling stopping,8,1
1,coupled filmmaking makes finest ever aired holds emotional strong enough never preserved ultimate mindblowingly moving looked quite remains top,10,1


In [51]:
#stemming program
porterstemmer = PorterStemmer()

def steming(message):
  return[porterstemmer.stem(word) for word in message.split()]

In [52]:
train_X = dff['Review'].head(25000)  
train_Y = dff['Rating'].head(25000)  
test_X = dff['Review'].tail(25000) 
test_Y = dff['Rating'].tail(25000)

In [53]:
#tf idf
tf_idf = TfidfVectorizer(tokenizer=steming)
#applying tf idf to training data
X_train_tf = tf_idf.fit_transform(train_X)
#applying tf idf to training data
X_train_tf = tf_idf.transform(train_X)

In [54]:
print("n_samples: %d, n_features: %d" % X_train_tf.shape)

n_samples: 2, n_features: 56


In [55]:
#transforming test data into tf-idf matrix
X_test_tf = tf_idf.transform(test_X)
print(test_X)
print(X_test_tf)

print("n_samples: %d, n_features: %d" % X_test_tf.shape) # the result of the test data and training data are the same

0    extremely low looks recorded however good easy follow shooting sexually abusive father released psychiatric secretly affair ends renting first kills then see's neighbor sets acts innocent finds killed attempted hot then stops acting later kill supposedly loves find dies good lot know whole falling stopping
1                                                                                                                                                                        coupled filmmaking makes finest ever aired holds emotional strong enough never preserved ultimate mindblowingly moving looked quite remains top
Name: Review, dtype: object
  (0, 55)	0.1285582354796681
  (0, 52)	0.2571164709593362
  (0, 51)	0.1285582354796681
  (0, 49)	0.2571164709593362
  (0, 48)	0.1285582354796681
  (0, 47)	0.1285582354796681
  (0, 46)	0.1285582354796681
  (0, 45)	0.1285582354796681
  (0, 44)	0.1285582354796681
  (0, 43)	0.1285582354796681
  (0, 41)	0.1285582354796681
  (0, 40)	0.128558235479668

In [31]:
#naive bayes classifier
naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(X_train_tf, train_Y)
#predicted y
y_pred = naive_bayes_classifier.predict(X_test_tf)

# print(metrics.classification_report(test_Y, y_pred, target_names=[1,2,3,4,5,6,7,8]))
print("Confusion matrix:")
print(metrics.confusion_matrix(test_Y, y_pred))

Confusion matrix:
[[4771    0    0    0    0    0    0  359]
 [1971    0    0    0    0    0    0  265]
 [2055    0    0    1    0    0    0  425]
 [1962    0    0    1    0    1    0  729]
 [ 663    0    0    0    0    1    0 1746]
 [ 518    0    0    0    0    8    0 2394]
 [ 290    0    0    0    0    0    0 2018]
 [ 601    0    0    0    0    0    0 4221]]


In [26]:
test = 'First of all it was my childhood dream to see these hero\'s in a big screen and it was full filled. The movie was quite good for me but after getting out of the theatre there were not much scenes in my mind that i can feel awesome about. Zack Snyder is a master in making comic book style of movies he has proved it by his movie the Watchmen but in the movie the Justice league the movie lost its synder touch and the whole movie looks like a rip off to marvel movie mantra that they use in there films(like the happy go jolly characters not much tension on the screen light hearted) likewise Josh wedon has made this movie into a marvel like film. The Justice league had many character moments for all but the grip was loosen up to make the fan/audience glue to there seats. The whole colour tone of the movie was changed by josh and as the Wb asked them to cut down its length to 2hrs(as the internet says Wb asked them to do so) But anyone who watched the film can easily see that many of the scenes showned in the trailer is not in the movie,even the dialogue by steppen wolf "No protector here.. .. .   .... This world will fall like all the others" that one was also cut off in the film.in justice league the expensive reshooted scenes that have no connection to the actual film plot and at first report the movies run time was nearly 3hrs and now we got a 2hrs 1min movie to watch with most of the scenes cut off..As a movie lover its just hurts know that they reduce the length of the film that has so much potential to reach for more.When Bvs released the critics and some marvel fan boys said its so dark and lengthy with not much strong plot to establish the cause of the fight between Batman and superman. But after they release the extended cut bvs we all get to know the exact chronology of the plot. Likewise this movie should also release the extended cut/directors cut version but zack was great for making something that is so gripping and his tones delivers the theme itself but here its all josh wedon who just ruined Zack\'s vision and made a marvel-istic movie with his Cuts. Overall the movie was Great and All the actors played there roles perfectly and the image of batman had be shifted abit in my view as there is a bit of difference between the Bvs batman and the JL batman, Bvs batman was more like the exact comic book Dark Knight but in this He is more Like Tony stark here and there. And those who relay on F**king tomatoes ratings don\'t watch If you are a movie lover and if this is your genre then go for it From the intro scene of Batman you will love the movie While watching the films you won\'t be able to see any flaws because of its paced 2hrs cut.For me the rating for this film would be 6.5/10. Go and watch for yourself its an entertaining Flick don\'t miss it ..the only draw back you will have will be with its running time and scenes cut off.'

processed = []
processed.append(text_clean(test))

In [35]:
test_input = tf_idf.transform(processed)
test_input.shape

#0= bad review
#1= good review
res=naive_bayes_classifier.predict(test_input)[0]
if res==1:
    print("Good Review")
elif res==0:
    print("Bad Review")