In [26]:
import pandas as pd
import json
import os
from utils import preprocessing_helpers
from utils.fasttext_cleaner import fasttext_cleaning
from collections import Counter

In [10]:
data_folder = 'data/data_sentiment140' 

##### Prepare vocabulary

In [11]:
data = pd.read_csv(os.path.join(data_folder, 'train_data.csv')) 

Fasttext_cleaning will perform a basic cleaning and tokenize by whitespace, we pass the result to split(' ') to get a list of token

In [12]:
data['tokenized_text'] = data['text'].apply(lambda x: fasttext_cleaning(x).split(' '))

Small treatment for tweets, it separates the hashtag from the words , for example the token #jobs becomes two tokens #, jobs , we keep the # (since using an RNN model , might help ..but not sure if its better to keep it or remove it, so we'll just leave it for the moment.
For @xxx, we'll replace with a special token TWEETER_USER

In [13]:
data['tokenized_text'] = data['tokenized_text'].apply(preprocessing_helpers.small_cleaning_tweets)

In [25]:
data['tokenized_text'].sample(n=5)

64882          [<TWEETER_USER>, its, not, on, in, colorado]
819782           [unhappy, about, my, lack, of, discipline]
598282    [lost, a, follower, ,, was, it, something, i, ...
441008    [played, the, fight, night, round, demo, just,...
395274    [<TWEETER_USER>, wow, ,, i, ', ve, checked, th...
Name: tokenized_text, dtype: object

In [7]:
def create_vocabulary(list_of_tokenized_texts, min_count=2):
    """Take a list of of tokenized text and create a vocabulary dict word->id"""
    
    df_counter = Counter([token for tokenized_text in list_of_tokenized_texts for token in set(tokenized_text)])
    vocabulary_list = sorted(token for token,df_count in df_counter.items() if df_count>=min_count)
    vocabulary_list = ['<PAD>'] + vocabulary_list
    return vocabulary_list

In [8]:
vocabulary_list = create_vocabulary(data['tokenized_text'].tolist(), min_count=5)
with open(os.path.join(data_folder,'vocabulary_list.txt'), 'w') as f:
    f.write('\n'.join(vocabulary_list))

In [9]:
len(vocabulary_list) # Maybe increase the min_count in the previous cell if vocabulary is very very big

47874

##### Prepare Embedding Matrix

In [11]:
from gensim.models import Word2Vec
import numpy as np

In [12]:
embedding_size = 150

In [13]:
model = Word2Vec(sentences=data['tokenized_text'].tolist(),size=embedding_size, window=5, min_count=0, workers=4)
model.train(data['tokenized_text'].tolist(), total_examples=len(data), epochs=5)
word_vectors = model.wv

In [14]:
word_vectors.most_similar('fantastic')

[('great', 0.8513270616531372),
 ('fab', 0.8434876203536987),
 ('fabulous', 0.841629147529602),
 ('brilliant', 0.8153545260429382),
 ('wonderful', 0.7884383201599121),
 ('terrific', 0.7380696535110474),
 ('lovely', 0.6846454739570618),
 ('awesome', 0.6787641048431396),
 ('amazing', 0.6767555475234985),
 ('incredible', 0.6757609248161316)]

In [15]:
word_vectors.most_similar('hate')

[('despise', 0.6307151913642883),
 ('dislike', 0.6078054308891296),
 ('hates', 0.5803361535072327),
 ('sucks', 0.5413668155670166),
 ('suck', 0.5285280346870422),
 ('hating', 0.5043717622756958),
 ('loathe', 0.4923359751701355),
 ('haaate', 0.4749675989151001),
 ('stupid', 0.47068506479263306),
 ('love', 0.4668755531311035)]

In [21]:
word_vectors.most_similar('food')

[('sushi', 0.6935381889343262),
 ('pizza', 0.6706090569496155),
 ('snacks', 0.6616131663322449),
 ('meat', 0.630408763885498),
 ('subway', 0.6285064816474915),
 ('foods', 0.6203563213348389),
 ('ingredients', 0.616908073425293),
 ('pasta', 0.6168296933174133),
 ('muay', 0.6165291666984558),
 ('seafood', 0.614884614944458)]

##### Creating the embedding matrix

In [16]:
id_to_token = {token_id: token for token_id, token in enumerate(vocabulary_list)}

In [17]:
NUM_OOV = 1 # Number of uknown token buckets
total_size = len(vocabulary_list) + NUM_OOV 

In [19]:
emb_matrix = np.random.uniform(-0.5,0.5,size=(total_size, embedding_size))/embedding_size # Initialization of the embedding matrix
emb_matrix[0] = np.zeros(embedding_size) # PAD token will get the id 0 and will be full of 0s
for i in range(1, total_size-NUM_OOV): # Except pad token and OOV buckets
    word = id_to_token[i]
    emb_matrix[i] = word_vectors[word]

Saving the embedding matrix

In [20]:
with open(os.path.join(data_folder,'embedding_matrix.npy'), 'wb') as f:
    np.save(f, emb_matrix)