#### Importing required libraries

In [None]:
import numpy as np
import pandas as pd



In [None]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

#### Getting the dataset

In [None]:
df=pd.read_csv("IMDB Dataset.csv")

## Data Analysis

#### Getting the Shape (Dimensions) of the dataset

In [None]:
df.shape

#### Information about the dataset

In [None]:
df.info()

#### Displaying the statistical summary of the dataset

In [None]:
df.describe()

#### Top 5 rows of the dataset

In [None]:
df.head()

#### Last 5 rows of the dataset

In [None]:
df.tail()

#### Random 5 rows of the dataset

In [None]:
df.sample(5)

#### Getting the frequency of each sentiment label in the dataset

In [None]:
df["sentiment"].value_counts()

In [None]:
df["sentiment"].value_counts().plot(kind="pie")

In [None]:
df["sentiment"].value_counts().plot(kind="bar")

In [None]:
# Down-sample to 10,000 reviews to keep the later steps fast. A fixed seed
# makes the sample, and therefore the reported accuracies, reproducible.
df=df.sample(10000,random_state=0)

In [None]:
# Encode the labels as integers: positive -> 1, negative -> 0.
# The mapped column is assigned back rather than calling an in-place method on
# the intermediate Series returned by df["sentiment"]; pandas warns about that
# chained pattern and it leaves df unchanged under copy-on-write.
# NOTE: any label other than 'positive'/'negative' would become NaN here.
df["sentiment"]=df["sentiment"].map({'positive':1,'negative':0})

In [None]:
df.head()

#### Removing all HTML tags of the reviews

In [None]:
import re 
def clean_html_tags(text):
  """Return *text* with every ``<...>`` span removed.

  The match is non-greedy, so each tag is removed on its own and the text
  between two tags is kept. Tags are replaced by the empty string.
  """
  tag_pattern=re.compile('<.*?>')
  return tag_pattern.sub('',text)

In [None]:
df["review"]=df["review"].apply(clean_html_tags)

#### Converting all reviews to lower case

In [None]:
def convert_to_lower(text):
  """Return a lower-cased copy of *text*."""
  lowered=text.lower()
  return lowered

In [None]:
df["review"]=df["review"].apply(convert_to_lower)

In [None]:
df.head()

#### Removing the special characters in all the reviews

In [None]:
def remove_special_chars(text):
  """Replace every non-alphanumeric character of *text* with a space.

  Alphanumeric characters (as judged by ``str.isalnum``) are kept as they
  are; everything else, including existing whitespace and punctuation,
  becomes a single space. The result always starts with one extra leading
  space, exactly as the original loop produced.

  The pieces are assembled with one ``str.join`` instead of repeated ``+=``,
  which avoids quadratic string building on long reviews.
  """
  return " "+"".join(ch if ch.isalnum() else " " for ch in text)


In [None]:
df["review"]=df["review"].apply(remove_special_chars)

In [None]:
df.head()

#### Removing the stop words in the reviews

In [None]:
def remove_stop_words(text):
  """Split *text* on whitespace and drop English stop words.

  Returns a list of the remaining tokens in their original order.

  The stop-word list is loaded once per call and turned into a set, so each
  token costs one hash lookup. The original re-evaluated
  ``stopwords.words('english')`` and scanned the resulting list for every
  single token.
  """
  stop_set=set(stopwords.words('english'))
  return [token for token in text.split() if token not in stop_set]

In [None]:
df["review"]=df["review"].apply(remove_stop_words)

In [None]:

df.head()

#### Perform the Stemming

In [None]:
from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()

In [None]:
def stem_words(text):
  """Return a new list holding the Porter stem of every token in *text*.

  *text* is expected to be an iterable of word strings (the token list
  produced by the stop-word removal step).

  The result is built in a fresh local list. The function no longer relies on
  a module-level accumulator named ``y``, a name the notebook later rebinds
  to the label array, which would break any re-run of this step.
  """
  return [ps.stem(token) for token in text]

In [None]:
df["review"]=df["review"].apply(stem_words)

In [None]:
df.head()

In [None]:
def join_back(list_op):
  """Join the strings in *list_op* into one string separated by single spaces."""
  separator=" "
  return separator.join(list_op)

In [None]:
df["review"]=df["review"].apply(join_back)

In [None]:
df.head()

## Model Building

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features=1000)

In [None]:
# Learn the bag-of-words vocabulary from the cleaned reviews (cv is limited to
# max_features=1000) and materialise the count matrix as a dense array.
x=cv.fit_transform(df["review"]).toarray()

In [None]:
# Labels are taken from the last column by position.
# NOTE(review): assumes "sentiment" is the last column of df — confirm against the CSV layout.
y=df.iloc[:,-1].values

In [None]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=0)

In [None]:
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
from sklearn.neighbors import KNeighborsClassifier  
from sklearn.svm import SVC 
from sklearn.ensemble import RandomForestClassifier  


# Three Naive Bayes variants, all with default settings.
clf1=GaussianNB()
clf2=MultinomialNB()
clf3=BernoulliNB()

# k-nearest neighbours with 5 neighbours and the Minkowski metric at p=2.
knn= KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2 )
# Linear-kernel support vector classifier.
svm = SVC(kernel='linear', random_state=0)
# Random forest of 10 entropy-split trees. random_state is fixed, like the
# train/test split and the SVC, so repeated runs report the same accuracy.
rf= RandomForestClassifier(n_estimators= 10, criterion="entropy", random_state=0)

In [None]:
# Train each Naive Bayes variant on the training split.
clf1.fit(x_train,y_train)
clf2.fit(x_train,y_train)
clf3.fit(x_train,y_train)


# Train the KNN, SVM and random-forest classifiers on the same split.
knn.fit(x_train,y_train)
svm.fit(x_train,y_train)
rf.fit(x_train,y_train)


In [None]:
# Predictions of the three Naive Bayes models on the held-out test split.
y_pred1=clf1.predict(x_test)
y_pred2=clf2.predict(x_test)
y_pred3=clf3.predict(x_test)


# Predictions of KNN, SVM and random forest (the numbering skips y_pred4).
y_pred5=knn.predict(x_test)
y_pred6=svm.predict(x_test)
y_pred7=rf.predict(x_test)


In [None]:
from sklearn.metrics import accuracy_score


In [None]:
print(f"Accuracy using the Gaussian Naive Bayes: {accuracy_score(y_test,y_pred1)*100}")

In [None]:
print(f"Accuracy using the Multinomial Naive Bayes: {accuracy_score(y_test,y_pred2)*100}")

In [None]:
print(f"Accuracy using the BernoulliNB Naive Bayes: {accuracy_score(y_test,y_pred3)*100}")

In [None]:
print(f"Accuracy using the KNN: {accuracy_score(y_test,y_pred5)*100}")

In [None]:
print(f"Accuracy using the SVM: {accuracy_score(y_test,y_pred6)*100}")

In [None]:
print(f"Accuracy using the Random Forest: {accuracy_score(y_test,y_pred7)*100}")

#### Here Bernoulli Naive Bayes has the highest accuracy (83.85%), so we use this algorithm to build our model and perform the prediction.

In [None]:
review=input("Enter your review about movie: ")


#### Predicting the sentiment of the entered review

In [None]:
# Put the entered review through the same cleaning steps that were applied to
# the training reviews (HTML removal, lower-casing, special-character removal,
# stop-word removal, stemming, re-joining). Without this the raw words do not
# line up with the stemmed vocabulary that cv was fitted on.
cleaned_review=join_back(stem_words(remove_stop_words(remove_special_chars(convert_to_lower(clean_html_tags(review))))))
# Vectorise with the already-fitted cv (transform only, no re-fit) and classify.
y_pred3=clf3.predict(cv.transform([cleaned_review]))

In [None]:
y_pred3

In [None]:
# Report the predicted class: label 1 is positive, anything else is negative.
verdict="Review about the movie is Positive" if y_pred3==1 else "Review about the movie is Negative"
print(verdict)