## Setup Tasks

In [0]:
!pip install kaggle

In [0]:
from google.colab import files
files.upload()

In [0]:
!mkdir ~/.kaggle
!cp /content/kaggle.json /root/.kaggle/
!chmod 600 /root/.kaggle/kaggle.json

In [0]:
!kaggle competitions download -c word2vec-nlp-tutorial

In [9]:
!ls

kaggle.json		  sample_data		testData.tsv.zip
labeledTrainData.tsv	  sampleSubmission.csv	unlabeledTrainData.tsv
labeledTrainData.tsv.zip  testData.tsv		unlabeledTrainData.tsv.zip


In [0]:
!unzip -q testData.tsv.zip
!unzip -q labeledTrainData.tsv.zip
!unzip -q unlabeledTrainData.tsv.zip

In [0]:
import pandas as pd

In [68]:
train = pd.read_csv('labeledTrainData.tsv',header=0,delimiter='\t',quoting=3)
train.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [14]:
train.shape

(25000, 3)

In [16]:
print(train.review[0])

"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally sta

In [0]:
# Removing the br and other HTML tags using BeutifulSoup
from bs4 import BeautifulSoup

In [18]:
# Using BeautifulSoup on a single review to remove the tags
example = BeautifulSoup(train.review[0])

print(train.review[0])
print('\n')
print(example.get_text())

"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally sta

In [19]:
# Depending upon requirement may need to remove punctuations and numbers
# or replace them with a placeholder
# Using re to create a regex to remove anything that is not a-zA-Z
import re

letters = re.sub("[^a-zA-Z]"," ",example.get_text())
print(letters)

 With all this stuff going down at the moment with MJ i ve started listening to his music  watching the odd documentary here and there  watched The Wiz and watched Moonwalker again  Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent  Moonwalker is part biography  part feature film which i remember going to see at the cinema when it was originally released  Some of it has subtle messages about MJ s feeling towards the press and also the obvious message of drugs are bad m kay Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring  Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him The actual feature film bit when it finally starts is only on for    mi

In [0]:
# Lowercasing and splitting the words
lower = letters.lower()
words = lower.split()

In [0]:
# Using NLTK to remove the stopwords ,a ,and , is etc.
import nltk
nltk.download()

In [0]:
from nltk.corpus import stopwords
stopwords.words('english')

In [0]:
# removing words not in stop words
words = [w for w in words if w not in  stopwords.words('english')]
words

### Data Preprocessing final steps

* Remove HTML
* Remove/Replace punctuations , numbers ( according to the req)
* Convert to lowercase and tokenize (split )
* Using ' stopwords ' remove the stopwords

In [0]:
# using a set as searching is faster in set
stop = set(stopwords.words('english'))

In [0]:
# define a function to do the above mentioned steps
def preprocessing_data(text):
  # remove html
  pre_text = BeautifulSoup(text).get_text()
  # replace punctuations
  letters = re.sub("[^a-zA-Z]"," ",pre_text)
  # lower case and tokenize
  lowers = letters.lower().split()
  # removing stop words
  words = [w for w in lowers if w not in stop]
  # concatenating and returning the dinal sentence
  final_text = " ".join(words)
  return final_text

In [28]:
print(preprocessing_data(train.review[0]))

stuff going moment mj started listening music watching odd documentary watched wiz watched moonwalker maybe want get certain insight guy thought really cool eighties maybe make mind whether guilty innocent moonwalker part biography part feature film remember going see cinema originally released subtle messages mj feeling towards press also obvious message drugs bad kay visually impressive course michael jackson unless remotely like mj anyway going hate find boring may call mj egotist consenting making movie mj fans would say made fans true really nice actual feature film bit finally starts minutes excluding smooth criminal sequence joe pesci convincing psychopathic powerful drug lord wants mj dead bad beyond mj overheard plans nah joe pesci character ranted wanted people know supplying drugs etc dunno maybe hates mj music lots cool things like mj turning car robot whole speed demon sequence also director must patience saint came filming kiddy bad sequence usually directors hate working

In [0]:
# Applying to entire dataset
all_reviews = train.review.apply(preprocessing_data)

In [31]:
all_reviews.shape

(25000,)

## Bag Of Words : Using Scikit-Learn to create features 

convert text to numbers based on their frequency in the review

In [0]:
# Using frequency of words to limit vocab size : 10000 most used words
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(analyzer='word',tokenizer=None,preprocessor=None,
                            stop_words=None,max_features=10000)

In [0]:
# vecorizer is just an object using fit_transform to transform the data
train_data_features = vectorizer.fit_transform(all_reviews.values)

In [39]:
train_data_features[0]

<1x10000 sparse matrix of type '<class 'numpy.int64'>'
	with 150 stored elements in Compressed Sparse Row format>

In [40]:
train_data_features.shape

(25000, 10000)

In [42]:
# Each review is now split into 10000 columns 
vocab = vectorizer.get_feature_names()
vocab[:10]

['aaron',
 'abandon',
 'abandoned',
 'abbott',
 'abc',
 'abducted',
 'abilities',
 'ability',
 'able',
 'aboard']

In [48]:
# each word and it's frequency in the row (review)
train_data_features[0].data

array([ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  2,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  4,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  2,  1,  1,  1,  1,  1,  1,  1,  3,  5,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  2,  3,  1,  1,  1,  1,  1,  1,  1,  2,  1,  1,
        1,  1,  2,  3,  1,  1,  1,  1,  1,  2,  1,  3,  1,  1,  2,  2,  1,
        1,  1,  1,  3,  2,  2,  1,  2,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  2,  2,  1,  2,  1,  2,  1,  1,  1,  1,  2,  2,  1,  2,  1,  1,
        1,  1,  3,  2,  1,  1,  1,  2,  1,  1, 11,  1,  3,  1])

In [0]:
from collections import Counter
# checking 
counts = Counter(preprocessing_data(train.review[0]).split())
# mj appears 11 times
counts

#### Using Random Forest for training

In [0]:
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [0]:
X=train_data_features
y=train.sentiment
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [52]:
model = RandomForestClassifier(n_estimators=100)

model.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [53]:
from sklearn.metrics import f1_score

y_preds = model.predict(X_test)

f1_score(y_preds,y_test)

0.8472019464720195

In [0]:
from sklearn.metrics import roc_auc_score,roc_curve

In [56]:
roc_auc_score(y_test,y_preds)

0.8477951364145935

In [58]:
roc_curve(y_test,y_preds,pos_label=1)

(array([0.        , 0.14445798, 1.        ]),
 array([0.        , 0.84004825, 1.        ]),
 array([2, 1, 0]))

## Word Vectors

Word2Vec

In [0]:
# Need to install cython and gensim for using word2vec
!pip install --upgrade Cython
!pip install --upgrade gensim

In [0]:
# Using Word2Vec can also use the unlabeled data 
# 50000 additional reviews in unlabeledTrain.tsv

train_unlabeled = pd.read_csv("unlabeledTrainData.tsv",header=0,delimiter='\t',quoting=3)

In [97]:
train_unlabeled.shape,train.shape

((50000, 2), (25000, 3))

#### Data Preprocessing 

Only removing the HTML and splitting the sentences . Word2Vec utilizes stop words and numbers for context 

In [0]:
def word_to_vec(text):
  remove_html = BeautifulSoup(text).get_text()
  # removing characters not in a-zA-Z0-9
  subs = re.sub("[^a-zA-Z0-9]"," ",remove_html)
  # lowercase and split
  words = subs.lower().split()
  # returning a list of words instead of sentence
  return words

In [63]:
train.review[0]

'"Watching Time Chasers, it obvious that it was made by a bunch of friends. Maybe they were sitting around one day in film school and said, \\"Hey, let\'s pool our money together and make a really bad movie!\\" Or something like that. What ever they said, they still ended up making a really bad movie--dull story, bad script, lame acting, poor cinematography, bottom of the barrel stock music, etc. All corners were cut, except the one that would have prevented this film\'s release. Life\'s like that."'

In [0]:
word_to_vec(train.review[0])

In [0]:
# Word2Vec expects single sentences each as a list of words
# Using nltk's punkt tokenizer for sentence splitting. 
# Each review multiple sentences and can end in ?,!,.etc.
# Thus tokenizing sentencces becomes important
nltk.download()

In [0]:
# loading the punkt tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [0]:
# defining the function to splt the data
def review_to_sentence(review):
  # Using the tokenizer to split paragraph into sentences
  # paragraph -> sentencelist
  raw_sentences = tokenizer.tokenize(review.strip())
  
  # Using the word to vec function above to convert each sentence into list of words
  sentences =[]
  for sentence in raw_sentences:
    if len(sentence)>0:
      # sentence -> wordlist
      sentences.append(word_to_vec(sentence))
  # returning the list of sentences
  return sentences

In [0]:
# Can also use the labeled data as well
sentences =[]

for review in train.review:
  # append will only append the first list in a list of lists
  # += will append all the lists
  sentences +=review_to_sentence(review)

In [0]:
# doing the same for unlabeled data and appending to sentences

for review in train_unlabeled.review:
  sentences +=review_to_sentence(review)

In [73]:
# total length of sentences
len(sentences),sentences[0]

(795538,
 ['with',
  'all',
  'this',
  'stuff',
  'going',
  'down',
  'at',
  'the',
  'moment',
  'with',
  'mj',
  'i',
  've',
  'started',
  'listening',
  'to',
  'his',
  'music',
  'watching',
  'the',
  'odd',
  'documentary',
  'here',
  'and',
  'there',
  'watched',
  'the',
  'wiz',
  'and',
  'watched',
  'moonwalker',
  'again'])

#### Word2Vec training

In [0]:
# Number of hyperparams to tune 
# architecture - skip-gram / cbow
# training algo -hierarchical softmax(default) or negetive sampling
# downsmpling of frequent words - b/w 0.0001 and 0.001
# word vector dimension -300
# context window size - 8/10
# worker threads - -1
# min word count - min frequency to be considered for addition in vocab 30

num_features = 400
min_word_count = 40
num_workers = -1
context =10
downsampling = 1e-3


In [0]:
from gensim.models import word2vec

model = word2vec.Word2Vec(sentences,workers=num_workers,size=num_features,
                         min_count=min_word_count,window=context,sample=downsampling)
model.init_sims(replace=True)

In [94]:
import warnings
warnings.filterwarnings('ignore')
model.most_similar("man")

[('dastardly', 0.18006286025047302),
 ('outsider', 0.1782134622335434),
 ('shocks', 0.16817712783813477),
 ('thrilling', 0.1632031500339508),
 ('walked', 0.15822070837020874),
 ('miscast', 0.15735647082328796),
 ('quicker', 0.15223057568073273),
 ('adrienne', 0.1511322259902954),
 ('cheered', 0.1479484736919403),
 ('umbrella', 0.1465863585472107)]

In [95]:
model.most_similar("queen")

[('dutch', 0.1822260320186615),
 ('fare', 0.17457765340805054),
 ('closest', 0.1706812083721161),
 ('answers', 0.16502219438552856),
 ('argues', 0.16261620819568634),
 ('gabrielle', 0.15670806169509888),
 ('valdez', 0.15394997596740723),
 ('rohmer', 0.1525345742702484),
 ('88', 0.15221452713012695),
 ('somebody', 0.1500397026538849)]

In [96]:
model.doesnt_match("france england germany berlin".split())

'france'

In [98]:
# sentiment
model.most_similar("awful")

[('protocol', 0.17013059556484222),
 ('dressing', 0.15827560424804688),
 ('categories', 0.15666204690933228),
 ('wednesday', 0.1522647738456726),
 ('belonged', 0.15128536522388458),
 ('unjustly', 0.14466138184070587),
 ('comics', 0.14405617117881775),
 ('slightest', 0.143955260515213),
 ('hottie', 0.1435181200504303),
 ('frenzy', 0.14274433255195618)]