In [1]:
##Importing IMDB Dataset and cleaning reviews

#Importing libraries
import nltk
import pandas as pd
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from bs4 import BeautifulSoup



In [2]:
#Importing dataset and replacing labels with 0 and 1 for classification
df = pd.read_csv('C:/Users/NIKUNJ/Downloads/IMDB Dataset.csv', encoding = 'Latin-1')
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})


In [3]:
#Defining stop_words and lemmatizer

nltk.download('stopwords')
stop_words = set(stopwords.words("english")) 
lemmatizer = WordNetLemmatizer()



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\NIKUNJ\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
#Removing the html strips
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()



In [5]:
#Defining clean_text function
def clean_text(text):
    text = strip_html(text)
    text = re.sub(r'[^A-Za-z0-9]+',' ',text)
    text = text.lower()
    text = [lemmatizer.lemmatize(token) for token in text.split(" ")]
    text = [lemmatizer.lemmatize(token, "v") for token in text]
    text = [word for word in text if not word in stop_words]
    text = " ".join(text)
    return text



In [6]:
nltk.download('wordnet')
nltk.download('omw-1.4')
#Creating new column for processed reviews
df['Processed_Reviews'] = df.review.apply(lambda x: clean_text(x))

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\NIKUNJ\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\NIKUNJ\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
  soup = BeautifulSoup(text, "html.parser")


In [7]:
##Deploying SVM model on available data

#Importing libraries
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
import time



In [8]:
#Defining input and target variable
x = df['Processed_Reviews']
y = df['sentiment']

print(x)

0        one reviewer ha mention watch 1 oz episode hoo...
1        wonderful little production film technique una...
2        think wa wonderful way spend time hot summer w...
3        basically family little boy jake think zombie ...
4        petter mattei love time money visually stun fi...
                               ...                        
49995    think movie right good job creative original f...
49996    bad plot bad dialogue bad act idiotic direct a...
49997    catholic teach parochial elementary school nun...
49998    go disagree previous comment side maltin one s...
49999    one expect star trek movie high art fan expect...
Name: Processed_Reviews, Length: 50000, dtype: object


In [9]:
#Training and splitting
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)



In [10]:
#Vectorization and Bag of words method with default parameters
count_vect = CountVectorizer(ngram_range=(1,2), max_df=0.5).fit(df['Processed_Reviews'].values.astype('U'))
bow_train = count_vect.transform(X_train.values.astype('U'))
bow_test = count_vect.transform(X_test.values.astype('U'))

print(bow_train, y_train)

  (0, 260885)	1
  (0, 803611)	1
  (0, 804249)	1
  (0, 1004741)	1
  (0, 1006721)	1
  (0, 1012993)	1
  (0, 1013021)	1
  (0, 1023254)	1
  (0, 1023357)	1
  (0, 1073334)	1
  (0, 1074250)	1
  (0, 1146844)	1
  (0, 1146959)	1
  (0, 1239552)	1
  (0, 1239991)	1
  (0, 1241107)	1
  (0, 1241319)	1
  (0, 1429779)	1
  (0, 1431160)	1
  (0, 1589524)	1
  (0, 1718391)	1
  (0, 1718687)	1
  (0, 1819520)	1
  (0, 1821346)	1
  (0, 1822876)	1
  :	:
  (39999, 2503073)	1
  (39999, 2503452)	1
  (39999, 2539974)	1
  (39999, 2539977)	1
  (39999, 2542270)	1
  (39999, 2542767)	1
  (39999, 2572531)	3
  (39999, 2572763)	1
  (39999, 2572964)	1
  (39999, 2573736)	1
  (39999, 2574290)	1
  (39999, 2575628)	1
  (39999, 2593022)	1
  (39999, 2622456)	1
  (39999, 2622616)	1
  (39999, 2632572)	1
  (39999, 2632596)	1
  (39999, 2634454)	1
  (39999, 2634579)	1
  (39999, 2634846)	1
  (39999, 2635187)	1
  (39999, 2656953)	1
  (39999, 2656964)	1
  (39999, 2692226)	1
  (39999, 2694128)	1 20330    0
17532    0
45819    1
34807    1
318

In [11]:
#instantiate the model (using the default parameters)
SVM = LinearSVC(C = 50)



In [12]:
# fit the model with pre-processed data
SVM.fit(bow_train, y_train)




In [13]:
#perform classification and prediction on samples in tf_test
from sklearn.metrics import accuracy_score


predicted_SVM = SVM.predict(bow_test)
print(classification_report(y_test, predicted_SVM))


              precision    recall  f1-score   support

           0       0.89      0.89      0.89      5035
           1       0.89      0.89      0.89      4965

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



In [21]:
test = ['The movie was good, I could have not imagined a better ending','The movie was generally bad, the plot was boring and the characters badly interpreted']
test_1 = ['The movie was generally bad, the plot was boring and the characters badly interpreted']
test = count_vect.transform(test).toarray()
test_1 = count_vect.transform(test_1).toarray()
#Printing prediction
print(SVM.predict(test))
print(SVM.predict(test_1))

[1 0]
[0]


In [15]:
def sentiment(review):
    vec = count_vect.transform(review).toarray()
    return SVM.predict(vec)

In [16]:
sentiment(['I could have not imagined a better ending'])

array([0], dtype=int64)

In [17]:
import pickle
# pickling the vectorizer
pickle.dump(count_vect, open('vectorizer.pkl', 'wb'))
# pickling the model
pickle.dump(SVM, open('sentiment.pkl', 'wb'))

In [18]:
sentiment = pickle.load(open('sentiment.pkl','rb'))
vectorizer = pickle.load(open('vectorizer.pkl', 'rb'))

def sentimental(review):
    vec = vectorizer.transform(review)
    return sentiment.predict(vec)

print(sentimental(['good']))


[1]
