In [23]:
import numpy as np
import pandas as pd
# BeautifulSoup is used to remove html tags from the text
from bs4 import BeautifulSoup 
import re # For regular expressions

# Stopwords can be useful for understanding the semantics of a sentence.
# Therefore stopwords are not removed while creating the word2vec model,
# but they will be removed while averaging feature vectors.
from nltk.corpus import stopwords

In [24]:
# Parser options shared by both files: the first row is the header, fields
# are tab-separated, and quoting=3 (csv.QUOTE_NONE) tells pandas to treat
# quote characters as ordinary text.
tsv_options = {"header": 0, "delimiter": "\t", "quoting": 3}

train = pd.read_csv("./labeledTrainData.tsv", **tsv_options)
test = pd.read_csv("./testData.tsv", **tsv_options)

In [25]:
# Cache for the stop-word set so the NLTK corpus is read once, not once per
# call (this function runs for every review and every sentence).
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them once."""
    if "english" not in _STOPWORD_CACHE:
        _STOPWORD_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOPWORD_CACHE["english"]


def review_wordlist(review, remove_stopwords=True):
    """Convert one raw review (or sentence) into a list of lower-case words.

    Parameters
    ----------
    review : str
        Raw text, possibly containing HTML markup.
    remove_stopwords : bool, default True
        When true, NLTK's English stop words are dropped from the result.

    Returns
    -------
    list of str
        Lower-cased, letters-only tokens in their original order.
    """
    # 1. Strip HTML tags. The parser is named explicitly so the result does
    #    not depend on which optional parsers happen to be installed.
    review_text = BeautifulSoup(review, "html.parser").get_text()
    # 2. Replace every non-letter character with a space.
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    # 3. Lower-case and split on whitespace.
    words = review_text.lower().split()
    # 4. Optionally remove stop words.
    if remove_stopwords:
        stops = _english_stopwords()
        words = [w for w in words if w not in stops]

    return words

In [26]:
import nltk.data
# Uncomment on first use to download NLTK's 'popular' resource bundle;
# presumably that is what provides the punkt model loaded below -- TODO confirm.
#nltk.download('popular')

# Load the English punkt tokenizer from the local NLTK data; it is used
# later to split each review into sentences.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [27]:
def review_sentences(review, tokenizer, remove_stopwords=False):
    """Split a review into sentences and turn each one into a word list.

    ``tokenizer`` must expose ``tokenize(text)`` returning the sentences of
    ``text``; the review is stripped of surrounding whitespace before it is
    tokenized. Empty sentences are skipped and every remaining sentence is
    passed to ``review_wordlist`` together with ``remove_stopwords``.

    Returns a list of word lists, one per non-empty sentence.
    """
    return [
        review_wordlist(sentence, remove_stopwords)
        for sentence in tokenizer.tokenize(review.strip())
        if len(sentence) > 0
    ]

In [28]:
# Build the word2vec training corpus: one word list per sentence, pooled
# over every review in the training set.
print("Parsing sentences from training set")
sentences = []
for review in train["review"]:
    sentences.extend(review_sentences(review, tokenizer))
print(sentences[0])

Parsing sentences from training set


  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup


['with', 'all', 'this', 'stuff', 'going', 'down', 'at', 'the', 'moment', 'with', 'mj', 'i', 've', 'started', 'listening', 'to', 'his', 'music', 'watching', 'the', 'odd', 'documentary', 'here', 'and', 'there', 'watched', 'the', 'wiz', 'and', 'watched', 'moonwalker', 'again']


In [29]:
import logging
# Emit INFO-level log records as "timestamp : level : message" so progress
# messages logged during model training show up in the notebook output.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [30]:
from gensim.models import word2vec

# Training hyperparameters (num_features is reused by later cells).
num_features = 300     # word vector dimensionality
min_word_count = 20    # minimum word count
num_workers = 4        # number of parallel threads
context = 10           # context window size
downsampling = 1e-3    # (0.001) downsample setting for frequent words

# Train the model on the tokenised sentences.
print("Training model....")
model = word2vec.Word2Vec(
    sentences,
    size=num_features,
    window=context,
    min_count=min_word_count,
    sample=downsampling,
    workers=num_workers,
)

# Make the model more memory efficient (replace=True normalises in place).
model.init_sims(replace=True)

# Persist the model for later use; it can be reloaded with Word2Vec.load().
# NOTE(review): the name says "40minwords" but min_word_count is 20 -- confirm
# which is intended before renaming, since other code may load this file name.
model_name = "300features_40minwords_10context"
model.save(model_name)

2019-03-09 00:54:02,929 : INFO : collecting all words and their counts
2019-03-09 00:54:02,929 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types


Training model....


2019-03-09 00:54:02,992 : INFO : PROGRESS: at sentence #10000, processed 225803 words, keeping 17776 word types
2019-03-09 00:54:03,051 : INFO : PROGRESS: at sentence #20000, processed 451892 words, keeping 24948 word types
2019-03-09 00:54:03,113 : INFO : PROGRESS: at sentence #30000, processed 671315 words, keeping 30034 word types
2019-03-09 00:54:03,174 : INFO : PROGRESS: at sentence #40000, processed 897815 words, keeping 34348 word types
2019-03-09 00:54:03,235 : INFO : PROGRESS: at sentence #50000, processed 1116963 words, keeping 37761 word types
2019-03-09 00:54:03,293 : INFO : PROGRESS: at sentence #60000, processed 1338404 words, keeping 40723 word types
2019-03-09 00:54:03,328 : INFO : PROGRESS: at sentence #70000, processed 1561580 words, keeping 43333 word types
2019-03-09 00:54:03,405 : INFO : PROGRESS: at sentence #80000, processed 1780887 words, keeping 45714 word types
2019-03-09 00:54:03,456 : INFO : PROGRESS: at sentence #90000, processed 2004996 words, keeping 4813

2019-03-09 00:54:30,465 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-03-09 00:54:30,478 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-03-09 00:54:30,487 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-03-09 00:54:30,488 : INFO : EPOCH - 4 : training on 5920724 raw words (4195290 effective words) took 6.3s, 670792 effective words/s
2019-03-09 00:54:31,492 : INFO : EPOCH 5 - PROGRESS: at 15.34% examples, 650730 words/s, in_qsize 7, out_qsize 0
2019-03-09 00:54:32,524 : INFO : EPOCH 5 - PROGRESS: at 29.44% examples, 612873 words/s, in_qsize 7, out_qsize 0
2019-03-09 00:54:33,530 : INFO : EPOCH 5 - PROGRESS: at 43.13% examples, 598764 words/s, in_qsize 7, out_qsize 0
2019-03-09 00:54:34,538 : INFO : EPOCH 5 - PROGRESS: at 58.82% examples, 612147 words/s, in_qsize 8, out_qsize 0
2019-03-09 00:54:35,554 : INFO : EPOCH 5 - PROGRESS: at 74.76% examples, 621726 words/s, in_qsize 7, out_qsize 0
2019-03-09 00:54:36,564 :

In [31]:
# Sanity-check the embedding: list the words closest to "terrible" together
# with their similarity scores.
model.wv.most_similar("terrible")
# Background reading on word vectors:
#https://cs224d.stanford.edu/reports/PouransariHadi.pdf
#https://blog.acolyer.org/2016/04/21/the-amazing-power-of-word-vectors/

  if np.issubdtype(vec.dtype, np.int):


[('horrible', 0.9171717166900635),
 ('awful', 0.8227857351303101),
 ('atrocious', 0.7224851846694946),
 ('laughable', 0.7200475931167603),
 ('dreadful', 0.7119424939155579),
 ('horrid', 0.6983188390731812),
 ('bad', 0.687454104423523),
 ('lame', 0.6834261417388916),
 ('pathetic', 0.6772339940071106),
 ('abysmal', 0.6757860779762268)]

In [32]:
# Shape of the embedding matrix. The value is printed because a bare
# expression is only echoed when it is the last line of a cell; `vectors`
# is the non-deprecated name for the old `syn0` attribute.
print('Embedding matrix shape: %s' % (model.wv.vectors.shape,))

# Words kept in the model vocabulary.
words = list(model.wv.vocab)
print('Vocabulary size: %d' % len(words))
words[:10]


Vocabulary size: 13153


  """Entry point for launching an IPython kernel.


['with', 'all', 'this', 'stuff', 'going', 'down', 'at', 'the', 'moment', 'mj']

In [33]:
# This is what the 300-dimensional word2vec vector for the word 'with'
# looks like; try other words too. The lookup goes through model.wv because
# indexing the model object directly is deprecated in gensim.
print(model.wv['with'])

[ 1.46628879e-02  1.07706273e-02  1.13535523e-01 -1.09947158e-03
  4.55297641e-02 -1.09991767e-02  8.97706859e-03  3.06147505e-02
  2.74838344e-03 -1.79987215e-02 -1.57652311e-02  3.54621448e-02
 -1.88522246e-02  8.42513330e-03  3.03037558e-02 -5.34946285e-02
 -5.84037751e-02  5.32212807e-03  5.08216396e-03 -1.71942972e-02
  4.68900427e-02  1.06043788e-02  5.38697727e-02  2.86109652e-02
 -7.74646029e-02 -2.94330381e-02 -1.32919699e-01 -3.40857692e-02
  1.08589204e-02 -9.05710608e-02  6.83899373e-02 -7.74414837e-02
  9.52017084e-02  3.23455445e-02  2.45676506e-02  1.75810251e-02
 -2.80611720e-02  8.95487294e-02  1.63848817e-01 -1.95380952e-03
  1.27040327e-01  1.66469708e-01  1.91589966e-02 -2.60981880e-02
 -2.98508047e-03  2.01526359e-02  5.97073585e-02  4.96000908e-02
  4.30821348e-03 -3.96617614e-02  3.15747336e-02 -1.00123614e-01
  2.27299724e-02  7.28075877e-02  5.82443625e-02 -8.40770304e-02
  7.34475926e-02  1.52407568e-02  1.12994071e-02  1.53573751e-02
  6.56339228e-02  3.11040

  """Entry point for launching an IPython kernel.


In [34]:
def featureVecMethod(words, model, num_features):
    """Average the word vectors of every in-vocabulary word in ``words``.

    Parameters
    ----------
    words : iterable of str
        Tokens of one review.
    model : object
        Trained word2vec model. Only ``model.wv`` is used, for membership
        tests (``word in model.wv``) and vector lookup (``model.wv[word]``).
    num_features : int
        Dimensionality of the word vectors.

    Returns
    -------
    numpy.ndarray
        Vector of length ``num_features`` holding the mean of the matching
        word vectors. It is all zeros when no word is in the vocabulary
        (the original produced NaNs from a 0/0 division in that case).
    """
    keyed_vectors = model.wv

    # Accumulator for the running sum of word vectors.
    featureVec = np.zeros(num_features, dtype="float32")
    nwords = 0

    for word in words:
        # Test membership on the keyed vectors directly instead of
        # rebuilding a set of the whole vocabulary on every call.
        if word in keyed_vectors:
            nwords += 1
            featureVec = np.add(featureVec, keyed_vectors[word])

    # Only divide when something was accumulated; otherwise keep the zeros.
    if nwords > 0:
        featureVec = np.divide(featureVec, nwords)
    return featureVec

In [35]:
def getAvgFeatureVecs(reviews, model, num_features):
    """Build the matrix of averaged word vectors, one row per review.

    Each element of ``reviews`` is handed to ``featureVecMethod`` together
    with ``model`` and ``num_features``; the result is stored in the row
    with the same index. A status line is printed on every 1000th review.

    Returns a float32 array of shape ``(len(reviews), num_features)``.
    """
    total = len(reviews)
    reviewFeatureVecs = np.zeros((total, num_features), dtype="float32")

    for row, review in enumerate(reviews):
        # Progress message for reviews 0, 1000, 2000, ...
        if row % 1000 == 0:
            print("Review %d of %d" % (row, total))
        reviewFeatureVecs[row] = featureVecMethod(review, model, num_features)

    return reviewFeatureVecs

In [36]:
import nltk

# Make sure the stop-word corpus is available before stop words are removed.
nltk.download('stopwords')

# Tokenise every training review (stop words removed), then average the
# word vectors of each review into one feature row.
clean_train_reviews = [
    review_wordlist(review, remove_stopwords=True) for review in train['review']
]
trainDataVecs = getAvgFeatureVecs(clean_train_reviews, model, num_features)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mahe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Review 0 of 25000


  if sys.path[0] == '':


Review 1000 of 25000
Review 2000 of 25000
Review 3000 of 25000
Review 4000 of 25000
Review 5000 of 25000
Review 6000 of 25000
Review 7000 of 25000
Review 8000 of 25000
Review 9000 of 25000
Review 10000 of 25000
Review 11000 of 25000
Review 12000 of 25000
Review 13000 of 25000
Review 14000 of 25000
Review 15000 of 25000
Review 16000 of 25000
Review 17000 of 25000
Review 18000 of 25000
Review 19000 of 25000
Review 20000 of 25000
Review 21000 of 25000
Review 22000 of 25000
Review 23000 of 25000
Review 24000 of 25000


In [37]:
# Same preprocessing as for the training set: tokenise each test review
# (stop words removed) and average its word vectors into one feature row.
clean_test_reviews = [
    review_wordlist(review, remove_stopwords=True) for review in test["review"]
]
testDataVecs = getAvgFeatureVecs(clean_test_reviews, model, num_features)

Review 0 of 25000


  if sys.path[0] == '':


Review 1000 of 25000
Review 2000 of 25000
Review 3000 of 25000
Review 4000 of 25000
Review 5000 of 25000
Review 6000 of 25000
Review 7000 of 25000
Review 8000 of 25000
Review 9000 of 25000
Review 10000 of 25000
Review 11000 of 25000
Review 12000 of 25000
Review 13000 of 25000
Review 14000 of 25000
Review 15000 of 25000
Review 16000 of 25000
Review 17000 of 25000
Review 18000 of 25000
Review 19000 of 25000
Review 20000 of 25000
Review 21000 of 25000
Review 22000 of 25000
Review 23000 of 25000
Review 24000 of 25000


In [63]:
# Featurise one hand-written sentence the same way as the dataset reviews.
custom_snt = 'one of the worst movie ever'
clean_custom_test_reviews = [review_wordlist(custom_snt, remove_stopwords=True)]
testDataCustomVecs = getAvgFeatureVecs(clean_custom_test_reviews, model, num_features)

Review 0 of 1


  if sys.path[0] == '':


In [39]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the labelled feature vectors for evaluation; the fixed
# random_state keeps the split the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    trainDataVecs,
    train["sentiment"],
    test_size=0.3,
    random_state=0,
)

In [68]:
from sklearn.ensemble import RandomForestClassifier

# 100-tree random forest. random_state is fixed (same value as the
# train/test split) so the fitted model and its scores are reproducible.
forest = RandomForestClassifier(n_estimators=100, random_state=0)

print("Fitting random forest to training data....")
forest = forest.fit(X_train, y_train)

Fitting random forest to training data....


In [69]:
from sklearn.metrics import accuracy_score,recall_score,confusion_matrix

# Score the forest on the held-out split: number of held-out samples first,
# then accuracy, recall and the confusion matrix.
y_pred = forest.predict(X_test)
print(len(X_test))
for metric in (accuracy_score, recall_score, confusion_matrix):
    print(metric(y_test, y_pred))

7500
0.8356
0.8552950687146321
[[3093  696]
 [ 537 3174]]


In [64]:
# Predict the sentiment of the hand-written sentence and show it next to
# the sentence text.
result1 = forest.predict(testDataCustomVecs)
custom_columns = {"review": custom_snt, "sentiment": result1}
output = pd.DataFrame(data=custom_columns)
print(output)

                        review  sentiment
0  one of the worst movie ever          0


In [86]:
# Predict a sentiment label for every review in the test set.
result = forest.predict(testDataVecs)
# Pair each original review text with its predicted label.
output = pd.DataFrame(data={"review":test["review"], "sentiment":result})
print(output)
# Uncomment to write the predictions to output.csv (no index column, quoting=3):
#output.to_csv( "output.csv", index=False, quoting=3 )

                                                  Review  sentiment
0      "Naturally in a film who's main themes are of ...          1
1      "This movie is a disaster within a disaster fi...          0
2      "All in all, this is a movie for kids. We saw ...          1
3      "Afraid of the Dark left me with the impressio...          0
4      "A very accurate depiction of small time mob l...          1
5      "...as valuable as King Tut's tomb! (OK, maybe...          1
6      "This has to be one of the biggest misfires ev...          0
7      "This is one of those movies I watched, and wo...          0
8      "The worst movie i've seen in years (and i've ...          0
9      "Five medical students (Kevin Bacon, David Lab...          1
10     "'The Mill on the Floss' was one of the lesser...          1
11     "I just saw this film at the phoenix film fest...          0
12     "\"The Love Letter\" is one of those movies th...          0
13     "Another fantastic offering from the Monk