# Word2Vec

## Imports

In [76]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
from gensim.models import word2vec
import json
import multiprocessing
from time import time
import spacy

nlp = spacy.load('en_core_web_sm')

## Load Data

In [77]:
def load_data(path):
    return pd.read_csv(path, header = 0, delimiter = '\t', quoting = 3)

In [78]:
# Train Data
train_data = load_data(path = 'Data/Raw/labeledTrainData.tsv')
print('Train Data Shape:', train_data.shape)

# Unlabeled Train Data
unlabled_train_data = load_data(path = 'Data/Raw/unlabeledTrainData.tsv')
print('Unlabled Train Data Shape:', unlabled_train_data.shape)

# Test Data
test_data = load_data(path = 'Data/Raw/testData.tsv')
print('Test Data Shape:', test_data.shape)

Train Data Shape: (25000, 3)
Unlabled Train Data Shape: (50000, 2)
Test Data Shape: (25000, 2)


## Preprocessing

### Sentence To Words

In [79]:
def sentence_to_words(sentence):

    # Remove Markups
    sentence =  BeautifulSoup(sentence).get_text()

    # Remove Numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)
    
    # Remvoe URLs
    sentence = re.sub(r'http\S+', '', sentence)

    # Lemmatize
    words = [token.lemma_.lower() for token in nlp(sentence)]

    return words

### Review To Sentence

In [80]:
def review_to_sentences(review):
    
    sentences = []

    # Generate sentences
    doc = nlp(review)
    review_sentences = [sent.text for sent in doc.sents]
    
    for sentence in review_sentences:
        if len(sentence) > 0:
            sentences.append(sentence_to_words(sentence))
    
    return sentences

### List of Sentences of Words

In [81]:
sentences = []

# Train Data
for i, review in enumerate(train_data['review'][:500]):
    sentences += review_to_sentences(review)
    
    if i % 100 == 0: print(f'Processing "Train Data" {i}...')

# Unlabeled Train Data
for i, review in enumerate(unlabled_train_data['review'][:500]):
    sentences += review_to_sentences(review)
    
    if i % 100 == 0: print(f'Processing "Unlabeled Train Data" {i}...')

Processing "Train Data" 0...
Processing "Train Data" 100...
Processing "Train Data" 200...




Processing "Train Data" 300...
Processing "Train Data" 400...
Processing "Unlabeled Train Data" 0...
Processing "Unlabeled Train Data" 100...




Processing "Unlabeled Train Data" 200...
Processing "Unlabeled Train Data" 300...




Processing "Unlabeled Train Data" 400...


## Save the Sentences into JSON

In [82]:
with open(r"Data/Processed/Word2Vec_sentences.json", "w") as file:
    json.dump(sentences, file)

## Load the Sentences from JSON

In [83]:
sentences = None

with open(r"Data/Processed/Word2Vec_sentences.json", "r") as file:
    sentences = json.load(file)

## Model (Word2Vec)

In [84]:
num_features = 300    # Word vector dimensionality                      
min_word_count = 40   # Minimum word count                        
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size                                                                                    
downsampling = 1e-3   # Downsample setting for frequent words

### Build the Model

In [85]:
w2v_model = word2vec.Word2Vec(
    workers = num_workers,
    vector_size = num_features,
    min_count = min_word_count,
    window = context,
    sample = downsampling
)

### Initialize the Model

In [86]:
t = time()

w2v_model.build_vocab(sentences, progress_per = 10000)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

Time to build vocab: 0.0 mins


### Train the Model

In [87]:
t = time()

w2v_model.train(sentences, total_examples = w2v_model.corpus_count, epochs = 30, report_delay = 1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

Time to train the model: 0.05 mins


## Save the Model

In [89]:
model_name = "Data/Processed/word2vec_model"
model.save(model_name)

## Evaluate the Model

In [108]:
w2v_model.wv.most_similar("scene".split())

[('sequence', 0.6082822680473328),
 ('shot', 0.5585779547691345),
 ('moment', 0.5281933546066284),
 ('side', 0.41745156049728394),
 ('hilarious', 0.41189679503440857),
 ('episode', 0.39389824867248535),
 ('violence', 0.37117457389831543),
 ('car', 0.36822807788848877),
 ('cut', 0.35873159766197205),
 ('score', 0.3525751531124115)]

In [114]:
w2v_model.wv.similarity("death", "war")

0.57044554

In [118]:
w2v_model.wv.most_similar("scene war music".split())

[('sequence', 0.6235834360122681),
 ('violence', 0.5777526497840881),
 ('score', 0.543147087097168),
 ('shot', 0.5370824337005615),
 ('musical', 0.4834674298763275),
 ('event', 0.47258278727531433),
 ('dialogue', 0.4615626931190491),
 ('title', 0.4596373438835144),
 ('side', 0.44574621319770813),
 ('country', 0.4351249933242798)]

In [120]:
w2v_model.wv.doesnt_match(['death', 'war', 'music'])

'music'

In [129]:
w2v_model.wv.syn0

AttributeError: 'KeyedVectors' object has no attribute 'syn0'