In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import train_test_split
import ast

In [2]:
df = pd.read_csv('clean_reviews.csv')
df.head()

Unnamed: 0,text,label,clean_text,total_words,total_char,total_sent
0,i grew up (b. 1965) watching and loving the th...,0,"['grew', 'b', '1965', 'watch', 'love', 'thunde...",89,622,17
1,"when i put this movie in my dvd player, and sa...",0,"['put', 'movi', 'dvd', 'player', 'sat', 'coke'...",94,649,8
2,why do people who do not know what a particula...,0,"['peopl', 'not', 'know', 'particular', 'time',...",96,623,9
3,even though i have great interest in biblical ...,0,"['even', 'though', 'great', 'interest', 'bibli...",34,216,5
4,i am a die hard dads army fan and nothing will...,1,"['die', 'hard', 'dad', 'armi', 'fan', 'noth', ...",57,335,5


In [3]:
def join(x):
    x = ast.literal_eval(x)
    return ' '.join(x)

df['clean_text'] = df['clean_text'].apply(join)

In [4]:
x = df['clean_text']
y = df['label']

In [5]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=40,stratify=y)

In [6]:
vec = TfidfVectorizer(max_features = 5000)

x_train = vec.fit_transform(x_train).toarray()
x_test = vec.transform(x_test).toarray()

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,roc_curve

In [10]:
model = RandomForestClassifier()
model.fit(x_train,y_train)

In [9]:
y_pred = model.predict(x_test)

In [10]:
accuracy = accuracy_score(y_test,y_pred)
f1= f1_score(y_test,y_pred)
precision=precision_score(y_test,y_pred)
recall = recall_score(y_test,y_pred)
accuracy,f1,precision,recall

(0.8279421019509126,
 0.8297421845808942,
 0.8230788238201137,
 0.8365143144148669)

In [8]:
model = BernoulliNB()
model.fit(x_train,y_train)

In [11]:
from sklearn.metrics import confusion_matrix

In [12]:
confusion_matrix(y_test,y_pred)

array([[3247,  716],
       [ 651, 3331]], dtype=int64)

In [45]:
review = '''Stree 2 was about 45 minutes too long. Stree 2 was funny in the first half of the movie. It had great call backs to the OG Stree. The actors did a really good job reviving their characters. One of the newer characters played by a famous actor felt out of place because it made the film less grounded. The characters from other films that were on the movie felt shoehorned in and not in any way organic and faithful to the world building of the film. The effects of Sarkata were pretty well done.

Compared to first Stree as an allegory, Stree 2 pales in comparison.

The climax was too drawn out and did not follow the rules of the works that it had established.
'''

In [46]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import sent_tokenize,word_tokenize
import contractions
from nltk.corpus import stopwords
from string import punctuation

In [47]:
review = review.lower()
review

'stree 2 was about 45 minutes too long. stree 2 was funny in the first half of the movie. it had great call backs to the og stree. the actors did a really good job reviving their characters. one of the newer characters played by a famous actor felt out of place because it made the film less grounded. the characters from other films that were on the movie felt shoehorned in and not in any way organic and faithful to the world building of the film. the effects of sarkata were pretty well done.\n\ncompared to first stree as an allegory, stree 2 pales in comparison.\n\nthe climax was too drawn out and did not follow the rules of the works that it had established.\n'

In [48]:
stop_word_list = stopwords.words('english')
stop_words = []
for i in stop_word_list:
    if i not in ["nor",'no','not',]:
        stop_words.append(i)

stop_words

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 

In [49]:
def remove_punctuations(x):
    string = ''
    for i in x:
        if i not in punctuation:
            string = string + i
    return string

def remove_stopwords(x):
    l1 = []
    for i in x.split(' '):
        if i not in stop_words:
            l1.append(i)
    return " ".join(l1)



review = remove_punctuations(review)
review = remove_stopwords(review)
review

'stree 2 45 minutes long stree 2 funny first half movie great call backs og stree actors really good job reviving characters one newer characters played famous actor felt place made film less grounded characters films movie felt shoehorned not way organic faithful world building film effects sarkata pretty well done\n\ncompared first stree allegory stree 2 pales comparison\n\nthe climax drawn not follow rules works established\n'

In [50]:
from nltk.stem import PorterStemmer
def stemming(x):
    l = []
    stemmer = PorterStemmer()
    for i in x.split(' '):
        l.append(stemmer.stem(i))
    return l

review = stemming(review)
review = " ".join(review)

In [51]:
test_1 = vec.transform([review]).toarray()

In [52]:
model.predict(test_1)

array([1], dtype=int64)

'one gutwrench aspect chhaava portray betray sacrific histori seen mani betray heartbreak happen sambhaji maharaj film not shi away showcas pain moment make power unforgett sequenc way narr build event ensur hit full forc leav audienc emot devast yet fill admir unwav courag sambhaji maharaj moment serv remind histori shape not victori also sacrific made along way'