In [None]:
#pip install --upgrade gensim

In [None]:
import nltk                                # Python library for NLP
import matplotlib.pyplot as plt            # library for visualization
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
import re
from sklearn.decomposition import PCA      # PCA library

In [None]:
# Load the pre-tokenized train and test tweet tables, decoding both CSV files as Latin-1.
# NOTE(review): the paths look like a mounted Google Drive (Colab) — confirm the drive is mounted before running.
train = pd.read_csv('/content/drive/MyDrive/train_token.csv', encoding='latin-1')
test = pd.read_csv('/content/drive/MyDrive/test_token.csv', encoding='latin-1')

In [None]:
# Keep only the rows whose 'TweetTokens' entry is present (not NaN/None).
train = train[train['TweetTokens'].notna()]
test = test[test['TweetTokens'].notna()]

In [None]:
# Read the word dictionary, discard rows with a missing word, and collect the
# remaining entries of the 'Unnamed: 0' column into a set for fast membership tests.
words_dictionary = pd.read_csv('/content/drive/MyDrive/words_dictionary.csv', encoding='latin-1')
words_dictionary = words_dictionary.dropna(subset=['Unnamed: 0'])
vocabulary = set(words_dictionary['Unnamed: 0'])

In [None]:
def remove_words(words, vocabulary):
  '''
  Filter a sequence of words down to those present in a vocabulary.

  Input:
     words = iterable of strings/words
     vocabulary = container of strings/words supporting the `in` operator

  Output:
     list with the words of `words` that are in `vocabulary`; the original
     order and any repetitions are preserved
  '''
  return [candidate for candidate in words if candidate in vocabulary]

Build the sentences: a list containing, for each tweet, the list of its words that are in the vocabulary.

In [None]:
# Build the Word2Vec training corpus: one list of in-vocabulary words per train tweet.
sentence = []

# Iterate over the token column directly instead of indexing row by row with iloc.
for tokens in train['TweetTokens']:
  # Replace every non-word character with a space, then split on whitespace.
  # The pattern is a raw string so that \w is not treated as an (invalid) string escape.
  tweet_words = re.sub(r"[^\w]", " ", tokens).split()
  sentence.append(remove_words(tweet_words, vocabulary))

In [None]:
# train model
SIZE = 100  # dimensionality of every word vector

# gensim 4.0 renamed the `size` keyword to `vector_size`. Try the current name
# first and fall back to the old one so the cell runs on either major version.
try:
  model = Word2Vec(sentence, min_count=1, vector_size=SIZE)
except TypeError:
  model = Word2Vec(sentence, min_count=1, size=SIZE)

# Keep only the word -> vector lookup table of the trained model.
embedings = model.wv

In [None]:
#embedings['covid']

In [None]:
def get_matrices(set_of_words, embedings):
    """
    Look up the embedding of every word and stack the results into a matrix.

    Input:
        set_of_words: iterable of words; each one is looked up in `embedings`.
        embedings: mapping from a word to its embedding vector.
    Output:
        X: list of the words, in iteration order.
        Y: a matrix whose i-th row is the embedding of X[i].
    """
    X = list(set_of_words)

    # one vector per word, stacked along a new first axis
    Y = np.stack([embedings[token] for token in X])
    return X, Y

In [None]:
# Look up the vector of every vocabulary word: `list_of_words` holds the words and
# `coordinates` their vectors stacked row by row, in the same order.
# NOTE(review): presumably every vocabulary word occurs in the training tweets; a word
# without a trained vector would likely make the lookup fail — confirm.
list_of_words, coordinates = get_matrices(vocabulary, embedings)

In [None]:
def compute_pca(X, n_components: int=2):
    """Project X onto its leading principal components.

    Args:
       X: array-like of dimension (m,n) where each row corresponds to a word vector
       n_components: Number of components you want to keep.

    Return:
       X_reduced: array of dimension (m, n_components) with the de-meaned data
           projected onto the eigenvectors of the covariance matrix that have
           the largest eigenvalues (largest first). The sign of each column
           is arbitrary, as eigenvectors are only defined up to sign.
    """
    # accept any array-like input (e.g. a list of lists), not only ndarrays
    X = np.asarray(X)

    # axis=0 gives one mean per column (feature); without it the mean of the
    # entire matrix would be subtracted instead
    X_demeaned = X - X.mean(axis=0)

    # calculate the covariance matrix
    # the default numpy.cov assumes the rows are variables, not columns so set rowvar to False
    covariance_matrix = np.cov(X_demeaned, rowvar=False)

    # the covariance matrix is symmetric, so eigh applies
    eigen_vals, eigen_vecs = np.linalg.eigh(covariance_matrix)

    # indices of the eigenvalues from highest to lowest
    idx_sorted_decreasing = np.argsort(eigen_vals)[::-1]

    # eigenvectors are the columns: keep all rows and pick the columns that
    # belong to the n_components largest eigenvalues
    eigen_vecs_subset = eigen_vecs[:, idx_sorted_decreasing[:n_components]]

    # project every de-meaned row onto the selected eigenvectors
    X_reduced = np.dot(X_demeaned, eigen_vecs_subset)
    return X_reduced

In [None]:
# Reduce the word vectors to compute_pca's default of 2 principal components.
reduced_coordinates = compute_pca(coordinates)

In [None]:
#fig, ax = plt.subplots()
#plt.xlim([0.25,0.45])
#plt.ylim([-0.05,0.05])

#for i in range(10,15):
#  ax.scatter(reduced_coordinates[i,0], reduced_coordinates[i,1])
#  ax.annotate(list_of_words[i], (reduced_coordinates[i,0]-0.001, reduced_coordinates[i,1]-0.001))

#plt.title('Primjer pozicije riječi nakon primjene LSH i PCA')
#plt.xlabel('prva koordinata')
#plt.ylabel('druga koordinata')


In [None]:
def get_document_embedding(tweet, embeddings):
    '''
    Represent a tweet as the sum of the embeddings of its words.

    Input:
        - tweet: iterable of tweet tokens; every token must be a key of `embeddings`
        - embeddings: a mapping from a word to its embedding vector
    Output:
        - doc_embedding: float64 vector with the sum of all word embeddings in
          the tweet; a zero vector of length SIZE when the tweet has no tokens
    Raises:
        - whatever `embeddings[word]` raises for a token it does not contain
    '''
    doc_embedding = None
    for word in tweet:
      # look the word up in the mapping that was passed in, not in a notebook global
      vector = np.array(embeddings[word], dtype=np.float64)
      doc_embedding = vector if doc_embedding is None else doc_embedding + vector

    if doc_embedding is None:
      # empty tweet: no vector to take the dimension from, use the configured size
      doc_embedding = np.zeros(SIZE)

    return doc_embedding

In [None]:
# One row per train tweet: the sum of the embeddings of its in-vocabulary words.
train_tweets = np.zeros((len(train),SIZE))

# A dedicated loop variable is used so that `list_of_words` (the word list aligned
# with `coordinates`) is not overwritten by this cell.
for row, tokens in enumerate(train['TweetTokens']):
  # Raw-string pattern: replace every non-word character with a space, then split.
  tweet_words = re.sub(r"[^\w]", " ", tokens).split()
  tweet_words = remove_words(tweet_words, vocabulary)
  train_tweets[row,:] = get_document_embedding(tweet_words, embedings)

In [None]:
# Save the per-tweet train vectors as CSV, without writing the index column.
pd.DataFrame(train_tweets).to_csv('/content/drive/MyDrive/train_lsh.csv',index = False)

In [None]:
# One row per test tweet: the sum of the embeddings of its in-vocabulary words.
test_tweets = np.zeros((len(test),SIZE))

# A dedicated loop variable is used so that `list_of_words` (the word list aligned
# with `coordinates`) is not overwritten by this cell.
for row, tokens in enumerate(test['TweetTokens']):
  # Raw-string pattern: replace every non-word character with a space, then split.
  tweet_words = re.sub(r"[^\w]", " ", tokens).split()
  tweet_words = remove_words(tweet_words, vocabulary)
  test_tweets[row,:] = get_document_embedding(tweet_words, embedings)

In [None]:
# Save the per-tweet test vectors as CSV, without writing the index column.
pd.DataFrame(test_tweets).to_csv('/content/drive/MyDrive/test_lsh.csv',index = False)