# Method 1: Text vectorization with Support Vector Classification (SVC) and Multinomial Naive Bayes classification

In [31]:
#Load the libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
import re,string,unicodedata
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import LancasterStemmer,WordNetLemmatizer
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
import warnings
warnings.filterwarnings('ignore')

In [2]:
#Read the dataset
# Expects "IMDB Dataset.csv" in the current working directory; later cells read
# its 'review' and 'sentiment' columns.
dataset = pd.read_csv("IMDB Dataset.csv")

# Step 1: data pre-processing

### The text_clean() function provides basic data cleaning for NLP:
a) remove HTML markup such as `<br />`   
b) remove punctuation and special characters such as `\` (every non-letter character is replaced by a space)  
c) remove stopwords like "is", "the", which do not offer much insight  
d) lemmatize the words to map different forms of the same word to one base form, e.g. 'comes' → 'come', 'actors' → 'actor'  

In [3]:
# Lazily filled cache for the resources text_clean() needs. The original code
# rebuilt the stop-word set for every token and a new lemmatizer for every
# review, which dominates the run time when cleaning the whole dataset.
_TEXT_CLEAN_RESOURCES = {}


def _text_clean_resources():
    """Return the (stop-word set, lemmatizer) pair, creating it on first use."""
    if not _TEXT_CLEAN_RESOURCES:
        _TEXT_CLEAN_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _TEXT_CLEAN_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _TEXT_CLEAN_RESOURCES['stop_words'], _TEXT_CLEAN_RESOURCES['lemmatizer']


def text_clean(review):
    """Normalize one raw review into a space-separated string of lemmas.

    Steps, in order: strip HTML markup, drop square-bracketed spans, replace
    every non-letter character with a space, lowercase, remove English stop
    words, and lemmatize the remaining tokens.

    Parameters
    ----------
    review : str
        Raw review text, possibly containing HTML such as ``<br />``.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces.
    """
    stop_words, lemmatizer = _text_clean_resources()

    text = BeautifulSoup(review, "html.parser").get_text()
    # Raw-string patterns: '\[' in a normal string literal is an invalid escape.
    text = re.sub(r'\[[^]]*\]', ' ', text)
    text = re.sub(r'[^a-zA-Z]', ' ', text)

    tokens = text.lower().split()
    # lemmatize() is called without a POS tag, so NLTK's default POS applies.
    lemmas = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    return ' '.join(lemmas)
    
    

### Below is an example before and after data cleaning

In [4]:
# Raw text of the review stored under index label 1, wrapped in a one-element list
review = dataset.loc[1, 'review']
corpus = [review]
corpus



['A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master\'s of comedy and his life. <br /><br />The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell\'s murals decorating every surface) are terribly well 

In [5]:
#Cleaned text
# Overwrites the only entry of `corpus` in place, so the vectorization examples
# that follow operate on the cleaned review. Re-running this cell alone passes
# the already-cleaned text through text_clean() again.
corpus[0] = text_clean(corpus[0])
corpus

['wonderful little production filming technique unassuming old time bbc fashion give comforting sometimes discomforting sense realism entire piece actor extremely well chosen michael sheen got polari voice pat truly see seamless editing guided reference williams diary entry well worth watching terrificly written performed piece masterful production one great master comedy life realism really come home little thing fantasy guard rather use traditional dream technique remains solid disappears play knowledge sens particularly scene concerning orton halliwell set particularly flat halliwell mural decorating every surface terribly well done']

# Step 2: Vectorization: encode the text to numericals
## We apply three methods to vectorize the text: 
### a) count vectorization  
### b) binary count vectorization  
### c) term frequency - inverse document frequency (tfidf) vectorization

In [6]:
# a) count vectorization: each word is encoded by the number of times it appears in the text.
#    `corpus` holds a single cleaned review here, so the result has one row
#    with one column per distinct word.
count_vec = CountVectorizer()
review_count_vec = count_vec.fit_transform(corpus)
review_count_vec.toarray()

array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1,
        1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1]])

In [7]:
# b) binary count vectorization: like a), but it only records whether a word appears,
#    so every word present in the text is encoded as 1 regardless of how often it occurs.
count_vec_bin = CountVectorizer(binary=True)
review_count_vec_bin = count_vec_bin.fit_transform(corpus)
review_count_vec_bin.toarray()

array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])

In [8]:
# c) TF : how many times a word (term) appears in a text (document).
#    IDF: log(# of documents in corpus/# of documents containing the term).
#    TF-IDF: TF * IDF.
#    Note: with its default settings TfidfVectorizer uses a smoothed IDF,
#    ln((1 + n_docs) / (1 + doc_freq)) + 1, and L2-normalizes each row. With the
#    single document in `corpus`, every IDF is 1, so the values below are just
#    the raw counts divided by the row's Euclidean norm.
tfidf_vec = TfidfVectorizer()
review_tfidf_vec = tfidf_vec.fit_transform(corpus)
review_tfidf_vec.toarray()

array([[0.09712859, 0.09712859, 0.09712859, 0.09712859, 0.09712859,
        0.09712859, 0.09712859, 0.09712859, 0.09712859, 0.09712859,
        0.09712859, 0.09712859, 0.09712859, 0.09712859, 0.09712859,
        0.09712859, 0.09712859, 0.09712859, 0.09712859, 0.09712859,
        0.09712859, 0.09712859, 0.09712859, 0.09712859, 0.09712859,
        0.09712859, 0.09712859, 0.19425717, 0.09712859, 0.09712859,
        0.09712859, 0.19425717, 0.09712859, 0.09712859, 0.09712859,
        0.09712859, 0.09712859, 0.09712859, 0.09712859, 0.19425717,
        0.09712859, 0.09712859, 0.19425717, 0.09712859, 0.09712859,
        0.19425717, 0.09712859, 0.19425717, 0.09712859, 0.09712859,
        0.09712859, 0.09712859, 0.09712859, 0.09712859, 0.09712859,
        0.09712859, 0.09712859, 0.09712859, 0.09712859, 0.09712859,
        0.09712859, 0.19425717, 0.09712859, 0.09712859, 0.09712859,
        0.09712859, 0.09712859, 0.09712859, 0.09712859, 0.09712859,
        0.09712859, 0.09712859, 0.29138576, 0.09

# Step 3: split the whole dataset into 75% training and 25% test

In [9]:
# Hold out 25% of the reviews for testing; the fixed seed keeps the split reproducible.
dataset_train, dataset_test, train_data_label, test_data_label = train_test_split(
    dataset['review'], dataset['sentiment'], test_size=0.25, random_state=42
)

# Encode the string labels as integers. .map() replaces the original
# .replace({...}) call, which relies on implicit object-to-int downcasting that
# pandas deprecates. The astype() makes an unexpected label fail loudly here
# (it would map to NaN) instead of reaching the classifiers.
label_to_int = {'positive': 1, 'negative': 0}
train_data_label = train_data_label.map(label_to_int).astype('int64').values
test_data_label = test_data_label.map(label_to_int).astype('int64').values

# Clean every review; iterating a Series yields its values in positional order,
# so the corpora stay aligned with the label arrays above.
corpus_train = [text_clean(raw_review) for raw_review in dataset_train]
corpus_test = [text_clean(raw_review) for raw_review in dataset_test]

# Step 4: fit data with Support Vector Classification

### a) using TFIDF vectorization, fit with SVC: the accuracy is 90.29%

In [10]:
# TF-IDF features over 1- to 3-grams. The vectorizer is fitted on the training
# corpus only and then applied unchanged to the test corpus.
tfidf_vec = TfidfVectorizer(ngram_range=(1, 3))

tfidf_vec_train = tfidf_vec.fit_transform(corpus_train)
tfidf_vec_test = tfidf_vec.transform(corpus_test)

In [11]:
# Linear SVM trained on the TF-IDF features; fit() returns the estimator itself,
# so construction and training are chained.
linear_svc = LinearSVC(C=0.5, random_state=42).fit(tfidf_vec_train, train_data_label)
predict = linear_svc.predict(tfidf_vec_test)

In [12]:
# Per-class precision/recall/F1, confusion matrix (rows: true class, columns:
# predicted class) and overall accuracy for the TF-IDF + SVC predictions.
print(
    "Classification Report: \n",
    classification_report(
        y_true=test_data_label,
        y_pred=predict,
        target_names=['Negative', 'Positive'],
    ),
)
print("Confusion Matrix: \n", confusion_matrix(y_true=test_data_label, y_pred=predict))
print("Accuracy: \n", accuracy_score(y_true=test_data_label, y_pred=predict))

Classification Report: 
               precision    recall  f1-score   support

    Negative       0.91      0.89      0.90      6157
    Positive       0.89      0.92      0.91      6343

    accuracy                           0.90     12500
   macro avg       0.90      0.90      0.90     12500
weighted avg       0.90      0.90      0.90     12500

Confusion Matrix: 
 [[5467  690]
 [ 524 5819]]
Accuracy: 
 0.90288


### b) using count vectorization, fit with SVC: the accuracy is 89.77%

In [13]:
# Raw occurrence counts over 1- to 3-grams (binary=False keeps the actual counts).
# Fitted on the training corpus only; the test corpus is encoded with the same vocabulary.
count_vec = CountVectorizer(ngram_range=(1, 3), binary=False)
count_vec_train = count_vec.fit_transform(corpus_train)
count_vec_test = count_vec.transform(corpus_test)

In [14]:
# Linear SVM on the count features. Unlike the TF-IDF model, this one passes an
# explicit max_iter=5000.
linear_svc_count = LinearSVC(C=0.5, random_state=42, max_iter=5000).fit(
    count_vec_train, train_data_label
)
predict_count = linear_svc_count.predict(count_vec_test)

In [15]:
# Evaluation of the count-vectorization + SVC predictions on the test set.
print(
    "Classification Report: \n",
    classification_report(
        y_true=test_data_label,
        y_pred=predict_count,
        target_names=['Negative', 'Positive'],
    ),
)
print("Confusion Matrix: \n", confusion_matrix(y_true=test_data_label, y_pred=predict_count))
print("Accuracy: \n", accuracy_score(y_true=test_data_label, y_pred=predict_count))

Classification Report: 
               precision    recall  f1-score   support

    Negative       0.90      0.89      0.90      6157
    Positive       0.90      0.90      0.90      6343

    accuracy                           0.90     12500
   macro avg       0.90      0.90      0.90     12500
weighted avg       0.90      0.90      0.90     12500

Confusion Matrix: 
 [[5489  668]
 [ 611 5732]]
Accuracy: 
 0.89768


### c) using binary count vectorization, fit with SVC: the accuracy is 89.46%

In [16]:
# Binary (presence/absence) indicators over 1- to 3-grams.
# Fitted on the training corpus only; the test corpus is encoded with the same vocabulary.
ind_vec = CountVectorizer(ngram_range=(1, 3), binary=True)
ind_vec_train = ind_vec.fit_transform(corpus_train)
ind_vec_test = ind_vec.transform(corpus_test)

In [17]:
# Linear SVM trained on the binary indicator features.
linear_svc_ind = LinearSVC(C=0.5, random_state=42).fit(ind_vec_train, train_data_label)
predict_ind = linear_svc_ind.predict(ind_vec_test)

In [18]:
# Evaluation of the binary-count + SVC predictions on the test set.
print(
    "Classification Report: \n",
    classification_report(
        y_true=test_data_label,
        y_pred=predict_ind,
        target_names=['Negative', 'Positive'],
    ),
)
print("Confusion Matrix: \n", confusion_matrix(y_true=test_data_label, y_pred=predict_ind))
print("Accuracy: \n", accuracy_score(y_true=test_data_label, y_pred=predict_ind))

Classification Report: 
               precision    recall  f1-score   support

    Negative       0.90      0.89      0.89      6157
    Positive       0.89      0.90      0.90      6343

    accuracy                           0.89     12500
   macro avg       0.89      0.89      0.89     12500
weighted avg       0.89      0.89      0.89     12500

Confusion Matrix: 
 [[5461  696]
 [ 621 5722]]
Accuracy: 
 0.89464


# Step 5: fit the data with a Multinomial Naive Bayes classifier
## A Bayesian model combines prior probabilities with the observed data to estimate posterior probabilities, which works well for classification with discrete features, such as text classification

### a) using TFIDF vectorization, fit with Multinomial Naive Bayes: the accuracy is 86.6%

In [19]:
# Unigram-only TF-IDF features for the Naive Bayes model (the SVC models use
# 1- to 3-grams). Fitted on the training corpus only.
tfidf_vec_NB = TfidfVectorizer(ngram_range=(1, 1))
tfidf_vec_train_NB = tfidf_vec_NB.fit_transform(corpus_train)

tfidf_vec_test_NB = tfidf_vec_NB.transform(corpus_test)

In [20]:
from sklearn.feature_selection import SelectKBest, chi2

# Keep the 50,000 TF-IDF columns with the highest chi-squared score against the
# training labels. The selector is fitted on the training matrix only, and both
# matrices are overwritten with their reduced versions.
ch2 = SelectKBest(score_func=chi2, k=50000)
tfidf_vec_train_NB = ch2.fit_transform(tfidf_vec_train_NB, train_data_label)
tfidf_vec_test_NB = ch2.transform(tfidf_vec_test_NB)

In [21]:
# Names of the vocabulary terms kept by the chi-squared selector.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); use the new method when it exists and fall
# back to the old one so the cell also runs on older installations.
if hasattr(tfidf_vec_NB, "get_feature_names_out"):
    all_feature_names = tfidf_vec_NB.get_feature_names_out()
else:
    all_feature_names = tfidf_vec_NB.get_feature_names()

# get_support(indices=True) gives the column positions that survived selection.
feature_names = np.asarray(
    [all_feature_names[i] for i in ch2.get_support(indices=True)]
)

In [22]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes (default settings) on the chi-squared-selected TF-IDF features.
multi_clf = MultinomialNB().fit(tfidf_vec_train_NB, train_data_label)
predict_NB = multi_clf.predict(tfidf_vec_test_NB)

In [23]:
# Evaluation of the TF-IDF + Multinomial Naive Bayes predictions on the test set.
print(
    "Classification Report: \n",
    classification_report(
        y_true=test_data_label,
        y_pred=predict_NB,
        target_names=['Negative', 'Positive'],
    ),
)
print("Confusion Matrix: \n", confusion_matrix(y_true=test_data_label, y_pred=predict_NB))
print("Accuracy: \n", accuracy_score(y_true=test_data_label, y_pred=predict_NB))

Classification Report: 
               precision    recall  f1-score   support

    Negative       0.85      0.88      0.87      6157
    Positive       0.88      0.85      0.87      6343

    accuracy                           0.87     12500
   macro avg       0.87      0.87      0.87     12500
weighted avg       0.87      0.87      0.87     12500

Confusion Matrix: 
 [[5424  733]
 [ 938 5405]]
Accuracy: 
 0.86632


### b) using count vectorization, fit with Multinomial Naive Bayes: the accuracy is 88.7%

In [24]:
# Raw 1- to 3-gram counts for the Naive Bayes model.
# NOTE(review): same vectorizer settings and corpora as `count_vec` in Step 4 b);
# the matrices built there could presumably be reused instead of refitting.
count_vec_NB = CountVectorizer(ngram_range=(1, 3), binary=False)
count_vec_train_NB = count_vec_NB.fit_transform(corpus_train)
count_vec_test_NB = count_vec_NB.transform(corpus_test)

In [25]:
# Multinomial Naive Bayes (default settings) on the raw n-gram counts.
multi_clf_count = MultinomialNB().fit(count_vec_train_NB, train_data_label)
predict_NB_count = multi_clf_count.predict(count_vec_test_NB)

In [26]:
# Evaluation of the count-vectorization + Multinomial Naive Bayes predictions.
print(
    "Classification Report: \n",
    classification_report(
        y_true=test_data_label,
        y_pred=predict_NB_count,
        target_names=['Negative', 'Positive'],
    ),
)
print("Confusion Matrix: \n", confusion_matrix(y_true=test_data_label, y_pred=predict_NB_count))
print("Accuracy: \n", accuracy_score(y_true=test_data_label, y_pred=predict_NB_count))

Classification Report: 
               precision    recall  f1-score   support

    Negative       0.88      0.89      0.89      6157
    Positive       0.90      0.88      0.89      6343

    accuracy                           0.89     12500
   macro avg       0.89      0.89      0.89     12500
weighted avg       0.89      0.89      0.89     12500

Confusion Matrix: 
 [[5503  654]
 [ 752 5591]]
Accuracy: 
 0.88752


# Best model: SVC + TFIDF vectorization
## Let's see some prediction results from the model!

In [27]:
# Test reviews as a one-column frame with a fresh 0..n-1 index (the shuffled
# original row labels are discarded).
dataset_predict = dataset_test.reset_index(drop=True).to_frame(name='review')
dataset_predict.head(10)

Unnamed: 0,review
0,I really liked this Summerslam due to the look...
1,Not many television shows appeal to quite as m...
2,The film quickly gets to a major chase scene w...
3,Jane Austen would definitely approve of this o...
4,Expectations were somewhat high for me when I ...
5,I've watched this movie on a fairly regular ba...
6,For once a story of hope highlighted over the ...
7,"Okay, I didn't get the Purgatory thing the fir..."
8,I was very disappointed with this series. It h...
9,The first 30 minutes of Tinseltown had my fing...


In [28]:
# True test labels as a one-column frame, decoded from 1/0 back to the
# 'positive'/'negative' strings.
test_actual_label = (
    pd.Series(test_data_label, name='sentiment')
    .replace({1: 'positive', 0: 'negative'})
    .to_frame()
)

In [29]:
# Predictions of the TF-IDF + SVC model as a one-column frame, decoded from 1/0
# back to the 'positive'/'negative' strings.
test_predicted_label = (
    pd.Series(predict, name='predicted_sentiment')
    .replace({1: 'positive', 0: 'negative'})
    .to_frame()
)

In [30]:
# Place review text, true sentiment and predicted sentiment side by side; the
# three frames are aligned on their index.
test_result = pd.concat(
    [dataset_predict, test_actual_label, test_predicted_label],
    axis='columns',
)
test_result.head(10)

Unnamed: 0,review,sentiment,predicted_sentiment
0,I really liked this Summerslam due to the look...,positive,negative
1,Not many television shows appeal to quite as m...,positive,positive
2,The film quickly gets to a major chase scene w...,negative,negative
3,Jane Austen would definitely approve of this o...,positive,positive
4,Expectations were somewhat high for me when I ...,negative,negative
5,I've watched this movie on a fairly regular ba...,positive,positive
6,For once a story of hope highlighted over the ...,positive,positive
7,"Okay, I didn't get the Purgatory thing the fir...",positive,negative
8,I was very disappointed with this series. It h...,negative,negative
9,The first 30 minutes of Tinseltown had my fing...,negative,negative
