# Sentiment Analysis using word2vec
In this tutorial competition, we dig a little "deeper" into sentiment analysis. Google's Word2Vec is a deep-learning inspired method that focuses on the meaning of words. Word2Vec attempts to understand meaning and semantic relationships among words. It works in a way that is similar to deep approaches, such as recurrent neural nets or deep neural nets, but is computationally more efficient. This tutorial focuses on Word2Vec for sentiment analysis.

### Reference
* https://www.kaggle.com/c/word2vec-nlp-tutorial/overview
* https://www.kaggle.com/varun08/sentiment-analysis-using-word2vec/data

In [None]:
# !nltk.download('popular')

In [None]:
# Configure the root logger: timestamped messages at INFO level and above
import logging

log_format = "%(asctime)s : %(levelname)s : %(message)s"
logging.basicConfig(level=logging.INFO, format=log_format)

In [None]:
# Firstly, please note that the performance of google word2vec is better on big datasets.
# In this example we are considering only 25000 training examples from the imdb dataset.
# Therefore, the performance is similar to the "bag of words" model.

# Importing libraries
import numpy as np
import pandas as pd

# BeautifulSoup is used to remove html tags from the text
from bs4 import BeautifulSoup
import re  # For regular expressions

# Stopwords can be useful to understand the semantics of the sentence.
# Therefore stopwords are not removed while creating the word2vec model.
# But they will be removed  while averaging feature vectors.
from nltk.corpus import stopwords

# word2vec expects a list of lists.
# Using punkt tokenizer for better splitting of a paragraph into sentences.

import nltk.data


# Load the English Punkt sentence tokenizer from NLTK's data files.
# NOTE(review): presumably the "punkt" resource has to be downloaded first
# (e.g. via nltk.download) — confirm for the installed NLTK version.
tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")

In [None]:
# Load the tab-separated training and test tables from disk
train_path = "./data/labeledTrainData.tsv.gz"
test_path = "./data/testData.tsv.gz"

train = pd.read_csv(train_path, delimiter="\t")
test = pd.read_csv(test_path, delimiter="\t")

In [None]:
# This function converts a text to a sequence of words.
def review_wordlist(review, remove_stopwords=False):
    """Convert a raw review string into a list of lowercase words.

    Args:
        review: Raw review text; may contain HTML markup.
        remove_stopwords: When True, NLTK's English stopwords are dropped
            from the result.

    Returns:
        A list of lowercase tokens made only of the letters a-z, in the
        order in which they appear in the review.
    """
    # 1. Removing html tags. The parser is named explicitly so the result does
    #    not depend on which optional parsers happen to be installed, and so
    #    bs4 does not warn about having to guess one.
    review_text = BeautifulSoup(review, "html.parser").get_text()

    # 2. Replacing every non-letter character with a space.
    review_text = re.sub(r"[^a-zA-Z]", " ", review_text)

    # 3. Converting to lower case and splitting on whitespace
    words = review_text.lower().split()

    # 4. Optionally remove stopwords (a set gives O(1) membership tests)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]

    return words


# This function splits a review into sentences
def review_sentences(review, tokenizer, remove_stopwords=False):
    """Split a review into sentences and turn each sentence into a word list.

    Args:
        review: Raw review text.
        tokenizer: Object whose ``tokenize(text)`` returns the sentences of
            the text.
        remove_stopwords: Forwarded to ``review_wordlist``.

    Returns:
        A list of word lists, one per non-empty sentence.
    """
    stripped_review = review.strip()
    return [
        review_wordlist(sentence_text, remove_stopwords)
        for sentence_text in tokenizer.tokenize(stripped_review)
        if len(sentence_text) > 0  # skip empty sentences
    ]

In [None]:
# !pip install -U tqdm

In [None]:
from tqdm.auto import tqdm

# Pool the tokenized sentences of every training review into one flat list
# (a list of word lists, which is what word2vec is trained on).
sentences = []
print("Parsing sentences from training set")
for review in tqdm(train["review"]):
    sentences.extend(review_sentences(review, tokenizer))

### How word2vec is Trained

![](./figures/w2v_context.png)
![](./figures/w2v_training.png)

In [None]:
import multiprocessing


# Creating the model and setting values for the various parameters
num_features = 300  # Word vector dimensionality
min_word_count = 10  # Minimum word count
# `workers` is a thread count and has to be an int: true division (`/`) would
# yield a float such as 4.0. Floor-divide instead, and keep at least one
# worker so single-core machines do not end up with zero threads.
num_workers = max(1, multiprocessing.cpu_count() // 2)  # Number of parallel threads
context = 10  # Context window size
downsampling = 1e-3  # (0.001) Downsample setting for frequent words

# Initializing the train model
from gensim.models import word2vec

print("Training model....")
model = word2vec.Word2Vec(
    sentences,
    workers=num_workers,
    vector_size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
)

# Saving the model for later use. Can be loaded using Word2Vec.load()
model_name = "word2vec.model"
model.save(model_name)

In [None]:
# https://github.com/RaRe-Technologies/gensim-data#models
# Load pre-trained GloVe word vectors through gensim's downloader API.
# It may take some time to finish
# NOTE: this rebinds `model`, so the cells below query these pre-trained
# vectors rather than the Word2Vec model trained above.
import gensim.downloader as api

model = api.load("glove-wiki-gigaword-50")  # trained with 6B tokens

In [None]:
# Few tests: ask the model which word does not belong with the others
odd_one_out_candidates = ["man", "woman", "dog", "child", "kitchen"]
model.doesnt_match(odd_one_out_candidates)

In [None]:
# Same test with place names: three countries and one city
model.doesnt_match(["france", "england", "germany", "berlin"])

In [None]:
# Show the words in the model's vocabulary that are most similar to "man"
model.most_similar("man")

In [None]:
# Same query for a sentiment-bearing word
model.most_similar("awful")

### Load the model trained on bigger corpus (for better result)

### Solving Word Analogies!

* Man is to Woman what King is to ___?
* USA is to hamburger what UK is to ___?
* Korea is to kimchi what USA is to ___?

![](./figures/analogy.png)

In [None]:
# "Man is to woman what king is to ___": the words in `positive` are added and
# those in `negative` subtracted (king + woman - man) before looking up neighbours.
model.most_similar(positive=["king", "woman"], negative=["man"])
# The other analogies listed above, left commented out to try by hand:
# model.most_similar(positive=["hamburger", "uk"], negative=["usa"])
# model.most_similar(positive=["kimchi", "usa"], negative=["korea"])

In [None]:
# Shape of the loaded model's embedding matrix:
# (number of words in the vocabulary, word vector dimensionality)
model.vectors.shape

In [None]:
# Function to average all word vectors in a paragraph
def featureVecMethod(words, model, num_features, index2word_set):
    """Return the mean word vector of the in-vocabulary words of one text.

    Args:
        words: Iterable of word strings.
        model: Object exposing ``get_vector(word)``, which returns the vector
            of a word.
        num_features: Length of each word vector.
        index2word_set: Container of the words known to ``model``; used only
            for membership tests.

    Returns:
        A float32 array of shape ``(num_features,)``. If none of the words is
        in ``index2word_set`` the result is all zeros rather than NaN, so an
        empty or fully out-of-vocabulary text cannot poison later steps.
    """
    # Pre-initialising empty numpy array for speed
    featureVec = np.zeros(num_features, dtype="float32")
    nwords = 0

    for word in words:
        if word in index2word_set:
            nwords += 1
            featureVec += model.get_vector(word)

    # Dividing the result by number of words to get average; skipped when no
    # word matched, because 0 / 0 would fill the vector with NaN.
    if nwords:
        featureVec /= nwords
    return featureVec


# Function for calculating the average feature vector
def getAvgFeatureVecs(reviews, model, num_features):
    """Build a matrix holding one averaged word vector per review.

    Args:
        reviews: Sized sequence of reviews, each one a list of words.
        model: Word-vector model providing ``index_to_key``; it is also
            passed on to ``featureVecMethod``.
        num_features: Length of each word vector.

    Returns:
        A float32 array of shape ``(len(reviews), num_features)``.
    """
    total = len(reviews)
    # A set makes the per-word vocabulary membership test fast.
    vocabulary = set(model.index_to_key)
    reviewFeatureVecs = np.zeros((total, num_features), dtype="float32")

    for position, review in enumerate(tqdm(reviews)):
        # Printing a status message every 1000th review
        if position % 1000 == 0:
            print("Review %d of %d" % (position, total))

        reviewFeatureVecs[position] = featureVecMethod(
            review, model, num_features, vocabulary
        )

    return reviewFeatureVecs

In [None]:
# Calculating average feature vector for training set
clean_train_reviews = [
    review_wordlist(review, remove_stopwords=True)
    for review in tqdm(train["review"])
]

# Take the dimensionality from the model actually in use; it need not be the
# value of 300 that was set for training.
num_features = model.vectors.shape[1]
trainDataVecs = getAvgFeatureVecs(clean_train_reviews, model, num_features)

In [None]:
# Calculating average feature vectors for test set
clean_test_reviews = [
    review_wordlist(review, remove_stopwords=True)
    for review in tqdm(test["review"])
]

testDataVecs = getAvgFeatureVecs(clean_test_reviews, model, num_features)

In [None]:
# Fitting a random forest classifier to the training data
from sklearn.ensemble import RandomForestClassifier

train_labels = train["sentiment"]
forest = RandomForestClassifier(n_estimators=100)

print("Fitting random forest to training data....")
forest = forest.fit(trainDataVecs, train_labels)

In [None]:
# Predicting the sentiment values for test data and saving the results in a csv file
result = forest.predict(testDataVecs)
submission_columns = {"id": test["id"], "sentiment": result}
output = pd.DataFrame(data=submission_columns)
# quoting=3 is csv.QUOTE_NONE: fields are written without surrounding quotes
output.to_csv("output.csv", index=False, quoting=3)

Submit the output at https://www.kaggle.com/c/word2vec-nlp-tutorial/leaderboard

# Bonus: Aspect-based Sentiment Analysis

In [None]:
import spacy
from spacy import displacy
from pprint import pprint

# Load the spaCy pipeline that supplies the part-of-speech tags and
# dependency labels used in the cells below.
nlp = spacy.load("en_core_web_md")

In [None]:
# Example sentences for the aspect-based analysis below.
# NOTE: this rebinds `sentences`, which earlier held the word2vec training corpus.
sentences = [
    "The food we had yesterday was delicious",
    "My time in Italy was very enjoyable",
    "I found the meal to be tasty",
    "The internet was slow.",
    "Our experience was suboptimal",
]

### First, we pick up the sentiment description

In [None]:
for sentence in sentences:
    doc = nlp(sentence)
    # Keep the last adjective of the sentence (empty string when there is none)
    adjectives = [token for token in doc if token.pos_ == "ADJ"]
    descriptive_term = adjectives[-1] if adjectives else ""
    # Sentence, its descriptive term, then a blank separator line
    print(sentence, descriptive_term, "", sep="\n")

### Try to also extract intensifiers (e.g., "very")

In [None]:
for sentence in sentences:
    doc = nlp(sentence)
    descriptive_term = ""
    for token in doc:
        if token.pos_ != "ADJ":
            continue
        # Prefix the adjective with its adverb children (e.g. "very"),
        # each followed by a space.
        intensifiers = "".join(
            child.text + " " for child in token.children if child.pos_ == "ADV"
        )
        descriptive_term = intensifiers + token.text
    # Sentence, its descriptive term, then a blank separator line
    print(sentence, descriptive_term, "", sep="\n")

### Now, identify the targets of the sentiments

In [None]:
# Draw the dependency parse of the first example sentence to see which
# dependency labels connect a sentiment word to its target.
doc = nlp(sentences[0])
displacy.render(doc, style="dep")

In [None]:
# Collect one {"aspect", "description"} record per example sentence
aspects = []
for sentence in sentences:
    doc = nlp(sentence)
    target = ""
    descriptive_term = ""
    for token in doc:
        # A noun acting as nominal subject is taken as the sentiment target
        if token.dep_ == "nsubj" and token.pos_ == "NOUN":
            target = token.text
        if token.pos_ != "ADJ":
            continue
        # Adjective, prefixed with its adverb children (each plus a space)
        intensifiers = "".join(
            child.text + " " for child in token.children if child.pos_ == "ADV"
        )
        descriptive_term = intensifiers + token.text
    record = {"aspect": target, "description": descriptive_term}
    aspects.append(record)
pprint(aspects)

### Classify the sentiment using `TextBlob`

In [None]:
from textblob import TextBlob

# Attach a TextBlob sentiment score to every extracted aspect
for aspect in aspects:
    description_blob = TextBlob(aspect["description"])
    aspect["sentiment"] = description_blob.sentiment  # or other sentiment classifiers
pprint(aspects)