In [9]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from gensim.models import Word2Vec, KeyedVectors

### Load tokenized tweets

In [2]:
# Load the tokenized tweets from the pickle file.
# NOTE: unpickling can execute arbitrary code, so only load pickles from a trusted source.
with open('../data/tweets_tokenized.pickle', 'rb') as tokens_file:
    tweets_tokenized = pickle.load(tokens_file)


### Load word2vec model

In [13]:
# Load the saved Word2Vec model from disk; its word vectors are used to embed the tweets
w2v_model = Word2Vec.load("../models/w2v_model1.model")

### Embed tweets

- First, define a function that sums the word vectors of the individual words in each tweet and normalizes the result to unit length, giving a single vector representation of each tweet.

In [14]:
# This function embeds a tweet as the normalized sum of the vector representations of each word in the tweet
def embed_w2v_sum(tokens, w2v):
    """Embed a tweet as the L2-normalized sum of its tokens' word vectors.

    Parameters
    ----------
    tokens : iterable
        Tokens of a single tweet (hashable, presumably strings). Tokens that
        are not in the model's vocabulary are ignored.
    w2v : object
        Word2Vec-style model whose ``wv`` attribute exposes a 2-D ``vectors``
        matrix and either ``key_to_index`` (token -> row index, gensim >= 4)
        or ``vocab`` (token -> object with an ``index`` attribute, gensim < 4).

    Returns
    -------
    numpy.ndarray
        1-D vector of length ``w2v.wv.vectors.shape[1]``. It is all zeros when
        no token is in the vocabulary or when the summed vector has zero norm.
    """
    keyed_vectors = w2v.wv

    # Map every in-vocabulary token to its row in the word-vector matrix.
    # gensim >= 4 removed `vocab` in favour of `key_to_index`; support both.
    key_to_index = getattr(keyed_vectors, "key_to_index", None)
    if key_to_index is not None:
        idxs = [key_to_index[t] for t in tokens if t in key_to_index]
    else:
        vocab = keyed_vectors.vocab
        idxs = [vocab[t].index for t in tokens if t in vocab]

    # Dimensionality of the word vectors (number of columns), not the token count
    dim = keyed_vectors.vectors.shape[1]
    if not idxs:
        return np.zeros(dim)

    # A tweet is represented by the sum of the vectors it contains
    summed = np.sum(keyed_vectors.vectors[idxs, :], axis=0)

    # Scale the tweet vector to unit length; if the word vectors cancel out
    # exactly, dividing by the zero norm would yield NaNs, so return zeros.
    norm = np.linalg.norm(summed)
    if norm == 0:
        return np.zeros(dim)
    return summed / norm


In [16]:
# Embed every tokenized tweet and collect the resulting vectors into a single array
tweets_embedded = np.array(
    [embed_w2v_sum(tweet_tokens, w2v_model) for tweet_tokens in tweets_tokenized]
)

In [22]:
# Persist the embedded-tweet array to disk as a pickle
with open('tweets_embedded.pickle', 'wb') as out_file:
    pickle.dump(tweets_embedded, out_file)