# WORD2VEC GENERATOR

This script is for generating multiple gensim word2vec models having different sized windows and vectors

**Import packages and modules**

In [1]:
import time
import os
import pandas as pd
from gensim.models import Word2Vec

**Get the number of processors**

In [2]:
import multiprocessing
WORKERS = multiprocessing.cpu_count()
print("Number of workers:",WORKERS)

Number of workers: 16


**Extract the total number of examples in combined train and test set**

In [3]:
input_dir = "../input/yelp_review_polarity_csv/"

In [4]:
train_data = pd.read_csv(input_dir+"train_data_processed.csv")
test_data = pd.read_csv(input_dir+"test_data_processed.csv")
total_examples = train_data.shape[0]+test_data.shape[0]
del(train_data, test_data)

## 1. Function to generate and save gensim word2vec models

**Class module to load sentences from files containing chunks of reviews**

In [5]:
class MySentences(object):
    
    def __init__(self, dirname):
        self.dirname = dirname
 
    def __iter__(self):
        for fname in os.listdir(self.dirname):
            for line in open(os.path.join(self.dirname, fname)):
                yield line.split()

**Create an object of the senntence loader with the given data path**

In [6]:
sentences = MySentences('../input/yelp_review_processed_chunks/')

**A function to generate and save gensim word2vec models**

In [7]:
def word2vec_generator(sentences, size, window, total_examples, workers, destination_folder = "./word2vec_models/"):
    '''
    A function to generate word2vec model
    '''    
    # create a word2vec model
    model = Word2Vec(sentences, size=size, window=window, sg= 1, min_count=1, workers=workers)
    # train model
    model.train(sentences, total_examples=total_examples, epochs=5)
    # file path
    fname = destination_folder + "word2vec_size_" + str(size) + "_window_" + str(window) + ".mdl"
    # save the model
    model. wv.save_word2vec_format(fname)

## 2. Generate multiple word2vec models having different sized windows and vectors

**size = 100; window = 5**

In [8]:
t_start = time.time()
size = 100
window = 5
word2vec_generator(sentences, size, window, total_examples, WORKERS)
t_end = time.time()
print("Time taken to save model 1: {:.2f} mins".format((t_end-t_start)/60))

Time taken to save model 1: 7.20 mins


**size = 100; window = 10**

In [9]:
t_start = time.time()
size = 100
window = 10
word2vec_generator(sentences, size, window, total_examples, WORKERS)
t_end = time.time()
print("Time taken to save model 2: {:.2f} mins".format((t_end-t_start)/60))

Time taken to save model 2: 11.17 mins


**size = 100; window = 15**

In [10]:
t_start = time.time()
size = 100
window = 15
word2vec_generator(sentences, size, window, total_examples, WORKERS)
t_end = time.time()
print("Time taken to save model 3: {:.2f} mins".format((t_end-t_start)/60))

Time taken to save model 3: 15.45 mins


**size = 100; window = 20**

In [11]:
t_start = time.time()
size = 100
window = 20
word2vec_generator(sentences, size, window, total_examples, WORKERS)
t_end = time.time()
print("Time taken to save model 4: {:.2f} mins".format((t_end-t_start)/60))

Time taken to save model 4: 19.58 mins


**size = 100; window = 25**

In [12]:
t_start = time.time()
size = 100
window = 25
word2vec_generator(sentences, size, window, total_examples, WORKERS)
t_end = time.time()
print("Time taken to save model 5: {:.2f} mins".format((t_end-t_start)/60))

Time taken to save model 5: 23.28 mins


**size = 200; window = 5**

In [13]:
t_start = time.time()
size = 200
window = 5
word2vec_generator(sentences, size, window, total_examples, WORKERS)
t_end = time.time()
print("Time taken to save model 6: {:.2f} mins".format((t_end-t_start)/60))

Time taken to save model 6: 8.90 mins


**size = 200; window = 10**

In [14]:
t_start = time.time()
size = 200
window = 10
word2vec_generator(sentences, size, window, total_examples, WORKERS)
t_end = time.time()
print("Time taken to save model 7: {:.2f} mins".format((t_end-t_start)/60))

Time taken to save model 7: 14.39 mins


**size = 200; window = 15**

In [15]:
t_start = time.time()
size = 200
window = 15
word2vec_generator(sentences, size, window, total_examples, WORKERS)
t_end = time.time()
print("Time taken to save model 8: {:.2f} mins".format((t_end-t_start)/60))

Time taken to save model 8: 19.74 mins


**size = 200; window = 20**

In [16]:
t_start = time.time()
size = 200
window = 20
word2vec_generator(sentences, size, window, total_examples, WORKERS)
t_end = time.time()
print("Time taken to save model 9: {:.2f} mins".format((t_end-t_start)/60))

Time taken to save model 9: 25.05 mins


**size = 200; window = 25**

In [17]:
t_start = time.time()
size = 200
window = 25
word2vec_generator(sentences, size, window, total_examples, WORKERS)
t_end = time.time()
print("Time taken to save model 10: {:.2f} mins".format((t_end-t_start)/60))

Time taken to save model 10: 30.06 mins


**size = 300; window = 5**

In [18]:
t_start = time.time()
size = 300
window = 5
word2vec_generator(sentences, size, window, total_examples, WORKERS)
t_end = time.time()
print("Time taken to save model 11: {:.2f} mins".format((t_end-t_start)/60))

Time taken to save model 11: 10.61 mins


**size = 300; window = 10**

In [19]:
t_start = time.time()
size = 300
window = 10
word2vec_generator(sentences, size, window, total_examples, WORKERS)
t_end = time.time()
print("Time taken to save model 12: {:.2f} mins".format((t_end-t_start)/60))

Time taken to save model 12: 17.51 mins


**size = 300; window = 15**

In [20]:
t_start = time.time()
size = 300
window = 15
word2vec_generator(sentences, size, window, total_examples, WORKERS)
t_end = time.time()
print("Time taken to save model 13: {:.2f} mins".format((t_end-t_start)/60))

Time taken to save model 13: 24.30 mins


**size = 300; window = 20**

In [21]:
t_start = time.time()
size = 300
window = 20
word2vec_generator(sentences, size, window, total_examples, WORKERS)
t_end = time.time()
print("Time taken to save model 14: {:.2f} mins".format((t_end-t_start)/60))

Time taken to save model 14: 30.59 mins


**size = 300; window = 25**

In [22]:
t_start = time.time()
size = 300
window = 25
word2vec_generator(sentences, size, window, total_examples, WORKERS)
t_end = time.time()
print("Time taken to save model 15: {:.2f} mins".format((t_end-t_start)/60))

Time taken to save model 15: 36.76 mins
