# Custom GloVe
_global vector for word representation - customized_

In [1]:
import numpy as np
import pandas as pd
import os
import re
import time

from gensim.models import Word2Vec
from tqdm import tqdm

tqdm.pandas()

In [2]:
def preprocessing(titles_array):
    
    """
    Take in an array of titles, and return the processed titles.
    
    (e.g. input: 'i am a boy', output - 'am boy')  -> since I remove those words with length 1
    
    Feel free to change the preprocessing steps and see how it affects the modelling results!
    """
    
    processed_array = []
    
    for title in tqdm(titles_array):
        
        # remove other non-alphabets symbols with space (i.e. keep only alphabets and whitespaces).
        processed = re.sub('[^a-zA-Z ]', '', title)
        
        words = processed.split()
        
        # keep words that have length of more than 1 (e.g. gb, bb), remove those with length 1.
        processed_array.append(' '.join([word for word in words if len(word) > 1]))
    
    return processed_array

In [3]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [4]:
df_train['processed'] = preprocessing(df_train['title'])
df_test['processed'] = preprocessing(df_test['title'])

sentences = pd.concat([df_train['processed'], df_test['processed']],axis=0)
train_sentences = list(sentences.progress_apply(str.split).values)

100%|██████████████████████████████████████████████████████████████████████| 666615/666615 [00:02<00:00, 288451.28it/s]
100%|██████████████████████████████████████████████████████████████████████| 172402/172402 [00:00<00:00, 291952.23it/s]
100%|██████████████████████████████████████████████████████████████████████| 839017/839017 [00:01<00:00, 502774.24it/s]


In [9]:
df_train.title

0                      nyx sex bomb pallete natural palette
1         etude house precious mineral any cushion pearl...
2                                  milani rose powder blush
3                       etude house baby sweet sugar powder
4              bedak revlon color stay aqua mineral make up
5                                   dr pure whitening cream
6                                chanel powder blush malice
7                            snail white cream original 100
8                                 sunprise all proof spf 50
9                    eyebrow powder nyx satuan rp 15.000 pc
10                              monistat chafing relief gel
11                             milani rose powder blush tea
12                              the balm meet matte trimony
13                     laneige water base cc cream spf36 pa
14        giordani gold age defying compact foundation d...
15        the body shop refill moisture white perfect fo...
16        lancome blush subtil long last

In [8]:
df_train['processed']

0                      nyx sex bomb pallete natural palette
1         etude house precious mineral any cushion pearl...
2                                  milani rose powder blush
3                       etude house baby sweet sugar powder
4              bedak revlon color stay aqua mineral make up
5                                   dr pure whitening cream
6                                chanel powder blush malice
7                                snail white cream original
8                                    sunprise all proof spf
9                           eyebrow powder nyx satuan rp pc
10                              monistat chafing relief gel
11                             milani rose powder blush tea
12                              the balm meet matte trimony
13                       laneige water base cc cream spf pa
14        giordani gold age defying compact foundation d...
15        the body shop refill moisture white perfect fo...
16        lancome blush subtil long last

In [5]:
# Parameters reference : https://www.quora.com/How-do-I-determine-Word2Vec-parameters
# Feel free to customise your own embedding

start_time = time.time()

model = Word2Vec(sentences=train_sentences, 
                 sg=1, 
                 size=100,  
                 workers=4)

print(f'Time taken : {(time.time() - start_time) / 60:.2f} mins')

Time taken : 0.58 mins


In [6]:
# Total number of vocab in our custom word embedding

len(model.wv.vocab.keys())

16689

In [7]:
# Check out the dimension of each word (we set it to 100 in the above training step)

model.wv.vector_size

100

In [10]:
# Check out how 'iphone' is represented (an array of 100 numbers)

model.wv.get_vector('iphone')

array([-0.25342065,  0.4158013 ,  0.0435044 , -0.09832075,  0.57854474,
       -0.43043593, -0.59564346, -0.28499427, -0.13541934,  0.28901628,
       -0.47354352,  0.38355872,  0.01143684,  0.6523723 ,  0.14603406,
        0.13597351,  0.53975284, -0.04991831,  0.09222654,  0.93449503,
       -0.30724114,  1.0729951 , -0.1273521 , -0.9999721 ,  0.3569624 ,
        0.2943077 , -0.80097723,  0.00157885,  0.6185943 ,  0.1769473 ,
       -0.45400771, -0.59157634, -0.18546109,  0.1224787 ,  0.19793004,
        0.15625094,  0.32764742,  0.45199618, -0.38252822, -0.03542918,
       -0.2804497 , -0.30722934,  0.18441322, -0.49494416,  0.12744379,
        0.36388293, -0.30477178,  0.38865706,  0.24851401,  1.3736799 ,
       -0.39971182,  0.40666267, -0.61906695,  0.02509379, -0.31460485,
        1.0258284 , -0.26852623,  0.41874593,  0.32998958, -0.12060785,
        0.5871205 , -0.08545645, -0.9912803 , -0.2865913 ,  0.86547625,
       -0.04333999,  0.3538542 ,  0.8313623 ,  0.5438686 , -0.32

In [11]:
# Find words with similar meaning to 'iphone'

model.wv.most_similar('iphone')

[('iphones', 0.7719532251358032),
 ('apple', 0.7023388147354126),
 ('cpo', 0.7015286684036255),
 ('iph', 0.6962280869483948),
 ('ten', 0.692862868309021),
 ('jetblack', 0.6899570226669312),
 ('originaliphone', 0.6825147867202759),
 ('blackmatte', 0.6746338605880737),
 ('spacegrey', 0.6744704246520996),
 ('fu', 0.6695471405982971)]

In [12]:
model.wv.save_word2vec_format('custom_glove_100d.txt')


# How to load:
# w2v = KeyedVectors.load_word2vec_format('custom_glove_100d.txt')

# How to get vector using loaded model
# w2v.get_vector('iphone')