<div style="text-align: right"> York University - ML1030 - Julia Mitroi </div>

# Sentiment Analysis for Movie Reviews

## Text Pre-processing/Normalization + Multinomial Naive Bayes

### Installing libraries and packages

In [1]:
import os
import numpy as np 
import pandas as pd 

from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from bs4 import BeautifulSoup

import re

import seaborn as sns

import random

from sklearn.model_selection import KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics

import keras
from keras.preprocessing.text import Tokenizer

import warnings
warnings.filterwarnings("ignore", category=UserWarning, module='bs4')

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


### Loading data into Python

In [2]:
train_reviews = pd.read_csv("train.tsv", sep="\t")

In [3]:
test_reviews = pd.read_csv("test.tsv", sep="\t")

### Data Cleaning

In [4]:
# Make reviews text lowercase
train_reviews['Phrase'] = train_reviews['Phrase'].str.lower()
test_reviews['Phrase'] = test_reviews['Phrase'].str.lower()

In [5]:
# Leaving only letters from a-z and digits
train_reviews['Phrase'] = train_reviews['Phrase'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))
test_reviews['Phrase'] = test_reviews['Phrase'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))

In [6]:
# Removing HTML content
train_reviews['Phrase'] = [BeautifulSoup(text).get_text() for text in train_reviews['Phrase']]
test_reviews['Phrase'] = [BeautifulSoup(text).get_text() for text in test_reviews['Phrase']]

In [7]:
# Tokenization using the TweetTokenizer, a Twitter-aware tokenizer which was designed to be flexible and easy to adapt to new domains and tasks
from nltk.tokenize import TweetTokenizer

tt = TweetTokenizer()

def tokenize(t):
    return ' '.join(tt.tokenize(t))

sentences = train_reviews.Phrase.apply(tokenize)

In [8]:
# NLTK's WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

def lemmatize(l):
    return ' '.join([lemmatizer.lemmatize(s) for s in l.split(' ')])    
    
sentences = sentences.apply(lemmatize)

In [9]:
# NLTK's Stemmer

stemmer = PorterStemmer()

def stem(s):
    return ' '.join([stemmer.stem(w) for w in s.split(' ')])

sentences = sentences.apply(stem)

In [10]:
# Preliminary to modeling, I will use Keras text preprocessing functions to further pre-process the sentences in the 
# train set, namely to transform the text into sequences of tokens, and pad those sequences to have the same length.

tokenizer = Tokenizer()
# fit_on_texts function creates vocabulary index based on word frequency in sentences
tokenizer.fit_on_texts(sentences)

# texts_to_sequences transforms each text in the sentences set to a sequence of integers
X = tokenizer.texts_to_sequences(sentences)

from keras.preprocessing.sequence import pad_sequences
# pad_sequences performs sequences padding
X = pad_sequences(X)

In [11]:
print (X.shape)

(156060, 48)


In [12]:
# Using k-Fold Cross-Validation, splitting the train_reviews the dataset into 10 groups, shuffling the dataset randomly
kf = KFold(n_splits=10, shuffle=True)

In [13]:
# Loading the multinomina Naive Bayes model
mnb = MultinomialNB()

In [14]:
for train_index, test_index in kf.split(X, train_reviews['Sentiment']):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = train_reviews['Sentiment'][train_index], train_reviews['Sentiment'][test_index]
    mnb = mnb.fit(X_train, y_train)

In [17]:
print("average accuracy classification score: ")
print (accuracy_score(y_test, mnb.predict(X_test)))

average accuracy classification score: 
0.4637959759067026
