## Table of Contents

## 0. Introduction

## 1. Reading the Data

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the labeled training reviews and the unlabeled test reviews.
# Both files are tab-separated and carry the column names on their first line (header=0).
# quoting=3 is csv.QUOTE_NONE: quote characters are read as ordinary text,
# so the double quotes inside the reviews are kept exactly as written.
tsv_options = dict(header=0, delimiter="\t", quoting=3)

train = pd.read_csv("labeledTrainData.tsv", **tsv_options)
test = pd.read_csv("testData.tsv", **tsv_options)

In [3]:
# Report the dimensions of both frames, then preview the first two training rows.

print('train dim:', train.shape, 'test dim:', test.shape)
train.head(2)

train dim: (25000, 3) test dim: (25000, 2)


Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."


In [4]:
# Preview the first half of the first review.

first_review = train.iloc[0]["review"]
first_review[:len(first_review) // 2]

'"With all this stuff going down at the moment with MJ i\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ\'s feeling towards the press and also the obvious message of drugs are bad m\'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally

As the review above shows, the raw text still contains HTML tags such as `<br />`. To make the data machine-learning friendly, we need to clean it.

## 2. Data Cleaning and Text Preprocessing

### 2.1. Removing HTML Markup by using BeautifulSoup Package

In [5]:
from bs4 import BeautifulSoup

In [6]:
# Parse the first review as HTML. The parser is named explicitly ("html.parser")
# because omitting the second argument makes BeautifulSoup emit a warning.
raw_first_review = train.loc[0, "review"]
example1 = BeautifulSoup(raw_first_review, "html.parser")

In [7]:
# Show the review text with the HTML markup stripped out.
print(example1.get_text())

"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.The actual feature film bit when it finally starts is only on for 20 mi

You can clearly see the effect of removing HTML markup. 

### 2.2. Removing Non-Letter Characters & Converting Reviews to Lower Case

It may be important to keep some punctuation and numbers, such as the emoticon :-). However, for simplicity, this project removes both.

In [8]:
import re

# Replace every character that is not an ASCII letter with a space, then lower-case the result.
plain_text = example1.get_text()
letters = re.sub("[^a-zA-Z]", " ", plain_text).lower()

The regular expression `[^a-zA-Z]` matches every character that is not (`^`) a letter from a to z or from A to Z, and `re.sub` replaces each such character with a space. `lower()` then converts any capital letters to lower case.

In [9]:
# Show the letters-only, lower-cased review text.
print(letters)

 with all this stuff going down at the moment with mj i ve started listening to his music  watching the odd documentary here and there  watched the wiz and watched moonwalker again  maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent  moonwalker is part biography  part feature film which i remember going to see at the cinema when it was originally released  some of it has subtle messages about mj s feeling towards the press and also the obvious message of drugs are bad m kay visually impressive but of course this is all about michael jackson so unless you remotely like mj in anyway then you are going to hate this and find it boring  some may call mj an egotist for consenting to the making of this movie but mj and most of his fans would say that he made it for the fans which if true is really nice of him the actual feature film bit when it finally starts is only on for    mi

### 2.3. Tokenization

In [10]:
# Import Natural Language Toolkit
import nltk

In [11]:
# Split the cleaned text into word tokens with NLTK's word_tokenize
# instead of a plain str.split().
word = nltk.word_tokenize(letters)

In [12]:
# Inspect the tokens produced by word_tokenize.
word

['with',
 'all',
 'this',
 'stuff',
 'going',
 'down',
 'at',
 'the',
 'moment',
 'with',
 'mj',
 'i',
 've',
 'started',
 'listening',
 'to',
 'his',
 'music',
 'watching',
 'the',
 'odd',
 'documentary',
 'here',
 'and',
 'there',
 'watched',
 'the',
 'wiz',
 'and',
 'watched',
 'moonwalker',
 'again',
 'maybe',
 'i',
 'just',
 'want',
 'to',
 'get',
 'a',
 'certain',
 'insight',
 'into',
 'this',
 'guy',
 'who',
 'i',
 'thought',
 'was',
 'really',
 'cool',
 'in',
 'the',
 'eighties',
 'just',
 'to',
 'maybe',
 'make',
 'up',
 'my',
 'mind',
 'whether',
 'he',
 'is',
 'guilty',
 'or',
 'innocent',
 'moonwalker',
 'is',
 'part',
 'biography',
 'part',
 'feature',
 'film',
 'which',
 'i',
 'remember',
 'going',
 'to',
 'see',
 'at',
 'the',
 'cinema',
 'when',
 'it',
 'was',
 'originally',
 'released',
 'some',
 'of',
 'it',
 'has',
 'subtle',
 'messages',
 'about',
 'mj',
 's',
 'feeling',
 'towards',
 'the',
 'press',
 'and',
 'also',
 'the',
 'obvious',
 'message',
 'of',
 'drugs',

### 2.4. Removing Stop words

"Stop words" are frequently occurring words that do not carry much meaning, such as "a", "and", "is", and "the". Before using the data as input for machine learning algorithms, we need to get rid of them. Fortunately, NLTK ships a built-in stop-word list in its `stopwords` corpus.

In [13]:
from nltk.corpus import stopwords

Below is the list of stopwords.

In [14]:
# Print NLTK's English stop-word list.
print(stopwords.words("english"))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [15]:
# Exclude the stop words from the original tokens.
# The stop-word set is built once up front: creating it inside the
# comprehension's condition would reload the list and rebuild the set for
# every single token.

english_stops = set(stopwords.words("english"))
word = [w for w in word if w not in english_stops]

In [16]:
# Inspect the tokens that remain after stop-word removal.
word

['stuff',
 'going',
 'moment',
 'mj',
 'started',
 'listening',
 'music',
 'watching',
 'odd',
 'documentary',
 'watched',
 'wiz',
 'watched',
 'moonwalker',
 'maybe',
 'want',
 'get',
 'certain',
 'insight',
 'guy',
 'thought',
 'really',
 'cool',
 'eighties',
 'maybe',
 'make',
 'mind',
 'whether',
 'guilty',
 'innocent',
 'moonwalker',
 'part',
 'biography',
 'part',
 'feature',
 'film',
 'remember',
 'going',
 'see',
 'cinema',
 'originally',
 'released',
 'subtle',
 'messages',
 'mj',
 'feeling',
 'towards',
 'press',
 'also',
 'obvious',
 'message',
 'drugs',
 'bad',
 'kay',
 'visually',
 'impressive',
 'course',
 'michael',
 'jackson',
 'unless',
 'remotely',
 'like',
 'mj',
 'anyway',
 'going',
 'hate',
 'find',
 'boring',
 'may',
 'call',
 'mj',
 'egotist',
 'consenting',
 'making',
 'movie',
 'mj',
 'fans',
 'would',
 'say',
 'made',
 'fans',
 'true',
 'really',
 'nice',
 'actual',
 'feature',
 'film',
 'bit',
 'finally',
 'starts',
 'minutes',
 'excluding',
 'smooth',
 'crim

### 2.5. Stemming / Lemmatization

It is important to know the difference between these two.

- __Stemming:__ Stemming algorithms work by cutting off the end of a word, and in some cases also the beginning, while looking for the root. This indiscriminate cutting can be successful on some occasions, but not always, which is why this approach has some limitations. ex) studying -> study, studied -> studi <br>
<br>
- __Lemmatization:__ Lemmatization is the process of converting the words of a sentence to their dictionary form. For example, given the words amusement, amusing, and amused, the lemma for each of them would be amuse. ex) studying -> study, studied -> study. Lemmatization also takes the meaning of a word in its context into account. For example, if "meeting" is used as a noun, its lemma is "meeting"; however, if it is used as a verb, its lemma is "meet".  
<br>

Usually only one of the two is chosen for text analysis, not both. As a side note, Lancaster is the most aggressive of the three major stemming algorithms (Porter, Snowball, Lancaster) and Porter is the least aggressive. "Aggressive" here refers to how strongly the working set of words is reduced. The more aggressive the algorithm, the faster it is; however, in certain circumstances it will trim down your working set drastically. Therefore, in this project I decided to use Snowball, since it is slightly faster than Porter and does not trim away as much information as Lancaster does.

In [17]:
# Reduce every remaining token to its stem with the English Snowball stemmer.
snow = nltk.stem.SnowballStemmer('english')
stems = list(map(snow.stem, word))

In [18]:
# Inspect the stemmed tokens.
stems

['stuff',
 'go',
 'moment',
 'mj',
 'start',
 'listen',
 'music',
 'watch',
 'odd',
 'documentari',
 'watch',
 'wiz',
 'watch',
 'moonwalk',
 'mayb',
 'want',
 'get',
 'certain',
 'insight',
 'guy',
 'thought',
 'realli',
 'cool',
 'eighti',
 'mayb',
 'make',
 'mind',
 'whether',
 'guilti',
 'innoc',
 'moonwalk',
 'part',
 'biographi',
 'part',
 'featur',
 'film',
 'rememb',
 'go',
 'see',
 'cinema',
 'origin',
 'releas',
 'subtl',
 'messag',
 'mj',
 'feel',
 'toward',
 'press',
 'also',
 'obvious',
 'messag',
 'drug',
 'bad',
 'kay',
 'visual',
 'impress',
 'cours',
 'michael',
 'jackson',
 'unless',
 'remot',
 'like',
 'mj',
 'anyway',
 'go',
 'hate',
 'find',
 'bore',
 'may',
 'call',
 'mj',
 'egotist',
 'consent',
 'make',
 'movi',
 'mj',
 'fan',
 'would',
 'say',
 'made',
 'fan',
 'true',
 'realli',
 'nice',
 'actual',
 'featur',
 'film',
 'bit',
 'final',
 'start',
 'minut',
 'exclud',
 'smooth',
 'crimin',
 'sequenc',
 'joe',
 'pesci',
 'convinc',
 'psychopath',
 'power',
 'drug

As you can see, the word "started" is converted to "start", and "listening" and "watching" are converted to "listen" and "watch".

### 2.6. Putting It All Together

So far, we have cleaned only one datapoint. Now it's time to apply all the cleaning process to all the data.<br>
To make the code reusable, we need to create a function that can be called many times.

In [19]:
# Stop-word set and stemmer shared by every cleaning() call (filled on first use).
_CLEANING_TOOLS = {}


def _get_cleaning_tools():
    """Return the (stop-word set, stemmer) pair, building it on first use only."""
    if not _CLEANING_TOOLS:
        import nltk

        _CLEANING_TOOLS["stops"] = frozenset(nltk.corpus.stopwords.words("english"))
        _CLEANING_TOOLS["stemmer"] = nltk.stem.SnowballStemmer('english')
    return _CLEANING_TOOLS["stops"], _CLEANING_TOOLS["stemmer"]


def cleaning(raw_review):
    """Turn one raw review into a space-separated string of stemmed tokens.

    The steps are: strip HTML markup, replace non-letters with spaces,
    lower-case, tokenize, drop English stop words, and stem each remaining
    token with the English Snowball stemmer.

    Parameters
    ----------
    raw_review : str
        Review text, possibly containing HTML markup.

    Returns
    -------
    str
        The stemmed, stop-word-free tokens joined by single spaces.
    """
    import nltk

    # The stop-word set and the stemmer are identical for every review, so
    # they are created once and reused instead of being rebuilt per review
    # (stop words) or per word (stemmer).
    stops, stemmer = _get_cleaning_tools()

    # 1. Remove HTML.
    html_text = BeautifulSoup(raw_review, "html.parser").get_text()

    # 2. Remove non-letters.
    letters = re.sub("[^a-zA-Z]", " ", html_text)

    # 3. Convert to lower case.
    letters = letters.lower()

    # 4. Tokenize.
    tokens = nltk.word_tokenize(letters)

    # 5. Remove stop words (set membership keeps each lookup cheap).
    words = [w for w in tokens if w not in stops]

    # 6. Stemming.
    words = [stemmer.stem(w) for w in words]

    # 7. Join the words back into one string separated by space, and return the result.
    return " ".join(words)

    

In [20]:
# Run the cleaning function over every review and store the result in a new
# 'clean' column next to the raw text, for both the training and the test frame.
for frame in (train, test):
    frame['clean'] = frame['review'].apply(cleaning)

In [21]:
# Preview the training frame, now including the 'clean' column.
train.head()

Unnamed: 0,id,sentiment,review,clean
0,"""5814_8""",1,"""With all this stuff going down at the moment ...",stuff go moment mj start listen music watch od...
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ...",classic war world timothi hine entertain film ...
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell...",film start manag nichola bell give welcom inve...
3,"""3630_4""",0,"""It must be assumed that those who praised thi...",must assum prais film greatest film opera ever...
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ...",superbl trashi wondrous unpretenti exploit hoo...


## 3. Visualization


### 3.1 WordCloud

We use WordCloud as a visualization tool based on the frequency of the words that appear in the text. Note that analyzing correlations and similarities between words can give more information and insight than word frequency alone; however, a word cloud gives you a quick and intuitive overview of what the text is about.

In [71]:
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

%matplotlib notebook

In [46]:

def cloud(data,backgroundcolor = 'white', width = 800, height = 600):
    """Draw a word cloud of `data` on a new 15x10 inch figure with the axes hidden.

    Parameters
    ----------
    data : str
        Text to build the cloud from.
    backgroundcolor : str
        Background colour handed to WordCloud.
    width, height : int
        Size handed to WordCloud.
    """
    generator = WordCloud(
        stopwords=STOPWORDS,
        background_color=backgroundcolor,
        width=width,
        height=height,
    )
    rendered = generator.generate(data)

    plt.figure(figsize=(15, 10))
    plt.imshow(rendered)
    plt.axis("off")
    plt.show()
    

In [47]:
# Word cloud over all cleaned training reviews, joined into one string.
cloud(' '.join(train['clean']))

<IPython.core.display.Javascript object>

In [48]:
# Word cloud over all cleaned test reviews, joined into one string.
cloud(' '.join(test['clean']))

<IPython.core.display.Javascript object>

### 3.2 Distribution 

In [61]:
# For each cleaned review, count its whitespace-separated tokens:
# 'freq_word' is the total number, 'unique_freq_word' the number of distinct ones.

def _n_tokens(text):
    """Number of whitespace-separated tokens in `text`."""
    return len(str(text).split())


def _n_unique_tokens(text):
    """Number of distinct whitespace-separated tokens in `text`."""
    return len(set(str(text).split()))


for frame in (train, test):
    frame['freq_word'] = frame['clean'].apply(_n_tokens)
    frame['unique_freq_word'] = frame['clean'].apply(_n_unique_tokens)

In [85]:
fig, axes = plt.subplots(ncols=2)
fig.set_size_inches(10,5)

# Raw string on purpose: "\m" and "\s" are not valid Python escape sequences,
# so a plain literal triggers an invalid-escape warning. The resulting text
# (and therefore the rendered mathtext) is unchanged.
legend_template = r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'

# NOTE(review): sns.distplot is deprecated in recent seaborn releases; it is
# kept here because it draws the fitted normal curve via `fit=`.

# Left panel: tokens per review, with a fitted normal curve and the median.
sns.distplot(train['freq_word'], bins = 90, ax=axes[0], fit = stats.norm)
(mu0, sigma0) = stats.norm.fit(train['freq_word'])
axes[0].legend([legend_template.format(mu0, sigma0)],loc='best')
axes[0].set_title("Distribution Word Frequency")
axes[0].axvline(train['freq_word'].median(), linestyle='dashed')
print("median of word frequency: ", train['freq_word'].median())


# Right panel: distinct tokens per review, same layout in red.
sns.distplot(train['unique_freq_word'], bins = 90, ax=axes[1], color = 'r', fit = stats.norm)
(mu1, sigma1) = stats.norm.fit(train['unique_freq_word'])
axes[1].set_title("Distribution Unique Word Frequency")
axes[1].legend([legend_template.format(mu1, sigma1)],loc='best')
axes[1].axvline(train['unique_freq_word'].median(), linestyle='dashed')
print("median of uniuqe word frequency: ", train['unique_freq_word'].median())

<IPython.core.display.Javascript object>

median of word frequency:  89.0
median of uniuqe word frequency:  74.0


The black curve in each plot is the normal distribution fitted to the data, i.e. what the distribution would look like if the data were normally distributed. Compared with the black curve, the actual distribution is quite skewed; the median is therefore a better representative value than the mean, since the mean is very sensitive to outliers and noise, especially when the distribution is highly skewed. As shown in the legends, the means are 119.50 (word frequency) and 94.04 (unique word frequency), meaning that on average 119.50 words and 94.04 unique words are used per review. The dashed lines mark the medians of the distributions. Another thing to notice is that the median values are located very close to the mean points of the fitted normal distributions.

## 4. Bag of Words

Even though we cleaned the data in many steps, one more step is needed to create machine-learning-friendly input. One common approach is called a Bag of Words. It is simply a matrix that counts how many times each word appears in each document. To build it, we use `CountVectorizer` from the sklearn library. As you know already, the vocabulary is very large, so it is important to limit the size of the feature vectors. In this project, we keep the 18000 most frequent terms (`max_features = 18000`). Also note that we set min_df = 2 and ngram_range = (1,3). min_df = 2 means that a term must appear in at least two documents to be included in the matrix. ngram_range = (1,3) means that the features are sequences of one, two, or three consecutive words. Take the sentence "I am a boy": its bigrams (n = 2) are ["I am", "am a", "a boy"]. The resulting accuracy can depend heavily on these parameters, so feel free to alter them and see if you can improve the score.

In [26]:
from sklearn.feature_extraction.text import CountVectorizer

In [27]:
# Bag-of-words extractor: word-level 1- to 3-grams, restricted to terms found
# in at least two documents and capped at 18000 features.
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    ngram_range=(1, 3),
    min_df=2,
    max_features=18000,
)

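As alluded to many times, the matrix is going to be huge. We wrap the vectorizer in a sklearn Pipeline so that the same fitted steps can be applied to both the training and the test data; note that a Pipeline only chains steps behind one fit/transform interface and does not by itself make the computation faster.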

In [28]:
from sklearn.pipeline import Pipeline

In [29]:
# Single-step pipeline whose only step, named 'vect', is the vectorizer.
pipe = Pipeline( [('vect', vectorizer)] )

In [30]:
# Bag-of-words matrix for the training reviews, used as input to the models below.
# fit_transform fits the pipeline on the training text and encodes it in one go.

train_bw = pipe.fit_transform(train['clean'])

# For the test reviews only transform is called: the already-fitted pipeline
# is reused instead of being re-fitted on the test text.

test_bw = pipe.transform(test['clean'])

In [31]:
# Compare the shapes of the two bag-of-words matrices.
print('train dim:', train_bw.shape, 'test dim:', test_bw.shape)

train dim: (25000, 18000) test dim: (25000, 18000)


In [32]:
# Get the names of the features.
# get_feature_names() is not available in newer scikit-learn releases, which
# provide get_feature_names_out() instead; use whichever this installation has.
# The result is converted to a plain list so `lexi` has the same type either way.

if hasattr(vectorizer, "get_feature_names_out"):
    lexi = vectorizer.get_feature_names_out().tolist()
else:
    lexi = vectorizer.get_feature_names()

In [33]:
# First five feature names.
lexi[:5]

['aag', 'aaron', 'abandon', 'abbey', 'abbi']

In [34]:
# Total number of occurrences of every feature across all training reviews
# (column sums of the bag-of-words matrix), as a one-row frame labelled by feature name.

feature_totals = train_bw.sum(axis=0)
train_sum = pd.DataFrame(feature_totals, columns = lexi)

In [35]:
# Preview the per-feature totals.
train_sum.head()

Unnamed: 0,aag,aaron,abandon,abbey,abbi,abbot,abbott,abc,abduct,abe,...,zoe,zombi,zombi film,zombi flick,zombi movi,zone,zoo,zoom,zorro,zu
0,26,48,288,24,30,29,30,125,55,24,...,27,1331,52,37,89,161,31,71,59,40


## 4. Modeling

In [90]:
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.model_selection import GridSearchCV,StratifiedKFold,learning_curve
from xgboost import XGBClassifier

In [None]:
# 20 stratified folds shared by all grid searches below.
# shuffle=True is needed for random_state to have any effect; recent
# scikit-learn versions raise a ValueError when random_state is set without it.
kfold = StratifiedKFold( n_splits = 20, shuffle = True, random_state = 2018 )

### 4.1 Support Vector Machine

In [None]:
# Support vector classifier tuned by an exhaustive grid search, scored by
# ROC AUC on the folds in `kfold`. The grid has 10 * 3 * 14 * 2 = 840
# parameter combinations, each fitted once per fold.
SVMC = SVC(probability = True, random_state = 2018)

params = dict(
    gamma=[0.01,0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.09,0.1],
    class_weight=['balanced',{1:2},{1:3}],
    C=[0.001,0.003,0.005,0.007,0.009,0.10,0.30,0.50,1,3,5,10,25,35],
    kernel=['rbf', 'sigmoid'],
)

grid = GridSearchCV(
    estimator=SVMC,
    param_grid=params,
    cv=kfold,
    scoring='roc_auc',
    verbose=1,
    n_jobs=-1,
)
grid.fit(train_bw, train['sentiment'])

# Keep and display the best estimator found by the search.
grid_best = grid.best_estimator_
print(grid_best)

In [None]:
# Predict the test labels with the search object currently bound to `grid`
# (the name `grid` is reused by the SVM and the Bernoulli Naive Bayes cells).
submission = grid.predict(test_bw)

### 4.2 Gaussian Naive Bayes Classifier

In [None]:
gnb = GaussianNB()

# GaussianNB only accepts dense input, while the bag-of-words matrix is
# sparse, so it is densified here. This needs several GB of RAM for a
# 25000 x 18000 matrix.
gnb.fit(train_bw.toarray(), train['sentiment'])

In [None]:
# GaussianNB only accepts dense input, so the sparse test matrix is densified
# before predicting (memory-heavy for a large matrix).
submission = gnb.predict(test_bw.toarray())

### 4.3 Bernoulli Naive Bayes Classifier

In [None]:
# Bernoulli Naive Bayes with the smoothing parameter alpha tuned by grid
# search over 100 evenly spaced values in [0, 10], scored by ROC AUC on `kfold`.
# NOTE(review): the grid includes alpha = 0 (no smoothing) - confirm that is intended.
bnb = BernoulliNB()

params = {'alpha': np.linspace(0,10,100)}

grid = GridSearchCV(
    estimator=bnb,
    param_grid=params,
    cv=kfold,
    scoring='roc_auc',
    verbose=1,
    n_jobs=-1,
)
grid.fit(train_bw, train['sentiment'])

# Keep and display the best estimator found by the search.
grid_best = grid.best_estimator_
print(grid_best)

In [95]:
# Predict the test labels with the search object currently bound to `grid`
# (the name `grid` is reused by the SVM and the Bernoulli Naive Bayes cells).
submission = grid.predict(test_bw)

array([  0.        ,   0.1010101 ,   0.2020202 ,   0.3030303 ,
         0.4040404 ,   0.50505051,   0.60606061,   0.70707071,
         0.80808081,   0.90909091,   1.01010101,   1.11111111,
         1.21212121,   1.31313131,   1.41414141,   1.51515152,
         1.61616162,   1.71717172,   1.81818182,   1.91919192,
         2.02020202,   2.12121212,   2.22222222,   2.32323232,
         2.42424242,   2.52525253,   2.62626263,   2.72727273,
         2.82828283,   2.92929293,   3.03030303,   3.13131313,
         3.23232323,   3.33333333,   3.43434343,   3.53535354,
         3.63636364,   3.73737374,   3.83838384,   3.93939394,
         4.04040404,   4.14141414,   4.24242424,   4.34343434,
         4.44444444,   4.54545455,   4.64646465,   4.74747475,
         4.84848485,   4.94949495,   5.05050505,   5.15151515,
         5.25252525,   5.35353535,   5.45454545,   5.55555556,
         5.65656566,   5.75757576,   5.85858586,   5.95959596,
         6.06060606,   6.16161616,   6.26262626,   6.36

### 4.4 XG Boosting

In [None]:
# Gradient-boosted trees tuned by an exhaustive grid search, scored by ROC AUC
# on the folds in `kfold`. The grid has 5 * 3 * 5 * 4 * 10 * 3 = 9000
# parameter combinations, each fitted once per fold.
XGB = XGBClassifier(random_state = 2018)

xg_param_grid = dict(
    min_child_weight=[0.1,0.2,0.25,0.3,0.35],
    n_estimators=[5000,10000,20000],
    learning_rate=[0.4,0.3,0.25,0.2,0.15],
    max_depth=[1,2,3,4],
    subsample=[0.40,0.45,0.47,0.5,0.52,0.54,0.56,0.57,0.59,0.6],
    colsample_bytree=[0.8,0.9,1],
)

gsXGB = GridSearchCV(
    estimator=XGB,
    param_grid=xg_param_grid,
    cv=kfold,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1,
)
gsXGB.fit(train_bw,train['sentiment'])

# Display the best parameter combination found by the search.
print(gsXGB.best_params_)

In [None]:
# Predict the test labels with the fitted XGBoost grid search.
submission = gsXGB.predict(test_bw)

## 5. Submission

In [37]:
# Pair every test id with its predicted label and write the submission file.
# `submission` must already exist, i.e. one of the modeling cells has to be run first.
# quoting=3 is csv.QUOTE_NONE, so no quote characters are added on output.
output = pd.DataFrame({'id': test['id'], 'sentiment': submission})
output.to_csv('submission01.csv', index=False, quoting=3)

NameError: name 'submission' is not defined