In [2]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('punkt')

import numpy as np
import pandas as pd
from gensim.models import word2vec

from google.colab import drive
drive.mount('/content/drive')

import re # For regular expressions

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## (a) Load the dataset

In [3]:
import re
def load_data():
    """ Read tweets from the file.
        Return:
            list of lists (list_words), with words from each of the processed tweets
    """
    tweets = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/AI_AS2/Corona_Tweets.csv', names=['text'])
    list_words = []
    ### iterate over all tweets from the dataset
    for i in tweets.index:
      ### remove URLs
      text = re.sub("https?://\S+|www\.\S+", " ", tweets.loc[i, 'text'])
      ### remove non-letter.
      text = re.sub("[^a-zA-Z]"," ",text)
      ### tokenize
      words = text.split()
      
      new_words = []
      ### iterate over all words of a tweet
      for w in words:
        ## TODO: remove the stop words and convert a word (w) to the lower case
        stops = set(stopwords.words("english"))
        if w not in stops:
          new_words.append(w.lower())
        
      list_words.append(new_words)
    return list_words

# check a few samples of twitter corpus
twitter_corpus = load_data()
print(twitter_corpus[:3])

[['trending', 'new', 'yorkers', 'encounter', 'empty', 'supermarket', 'shelves', 'pictured', 'wegmans', 'brooklyn', 'sold', 'online', 'grocers', 'foodkick', 'maxdelivery', 'coronavirus', 'fearing', 'shoppers', 'stock'], ['when', 'i', 'find', 'hand', 'sanitizer', 'fred', 'meyer', 'i', 'turned', 'amazon', 'but', 'pack', 'purell', 'check', 'coronavirus', 'concerns', 'driving', 'prices'], ['find', 'protect', 'loved', 'ones', 'coronavirus']]


## (b) Create co-occurrence matrix

In [4]:
def distinct_words(corpus):
    """ get a list of distinct words for the corpus.
        Params:
            corpus (list of list of strings): corpus of documents
        Return:
            corpus_words (list of strings): list of distinct words across the corpus, sorted (using python 'sorted' function)
            num_corpus_words (integer): number of distinct words across the corpus
    """
    corpus_words = set()
    for tweet in corpus:
      for word in tweet:
        corpus_words.add(word)
    corpus_words = sorted(list(corpus_words))
    num_corpus_words = len(corpus_words)
    return corpus_words, num_corpus_words

words, num_words = distinct_words(twitter_corpus)
print(words[:10], num_words)

['a', 'aadya', 'aadyasitara', 'aamiin', 'aapl', 'abajam', 'abandon', 'abandoning', 'abc', 'abeg'] 11454


In [5]:
def compute_co_occurrence_matrix(corpus, window_size=5):
    """ Compute co-occurrence matrix for the given corpus and window_size (default of 5).    
        Params:
            corpus (list of list of strings): corpus of documents
            window_size (int): size of context window
        Return:
            M (numpy matrix of shape = [number of corpus words x number of corpus words]): 
                Co-occurence matrix of word counts. 
                The ordering of the words in the rows/columns should be the same as the ordering of the words given by the distinct_words function.
            word2Ind (dict): dictionary that maps word to index (i.e. row/column number) for matrix M.
    """
    M = np.zeros((num_words, num_words), dtype=int)
    word2Ind = {}
    for i, w in enumerate(words):
      word2Ind[w] = i
    for tweet in corpus:
      for i, w in enumerate(tweet):
        w_idx = word2Ind[w]
        start = i - 5
        end = i + 5 + 1 #exclusive
        for j in range(start, end):
          if(i != j and j >= 0 and j < len(tweet)):
            c_idx = word2Ind[tweet[j]]
            M[w_idx, c_idx] += 1
            M[c_idx, w_idx] += 1
    return M, word2Ind

M, word2Ind = compute_co_occurrence_matrix(twitter_corpus)

## (c) SVD

In [6]:
# -----------------------------
# Run SVD
# Note: This may take several minutes (~20-30 minutes)
# ------------------------------
la = np.linalg
U, s, Vh = la.svd(M, full_matrices=False)

In [8]:
# Compute SVD embeddings
embedding_size = 75
S = np.diag(np.sqrt(s[:embedding_size]))
SVD_embeddings = np.dot(U[:, :embedding_size], S)

print(SVD_embeddings)
print(len(SVD_embeddings))

[[-1.14155327e+00  3.37115115e-01  1.68194484e-01 ... -5.13463566e-01
  -6.91859966e-03 -4.83082296e-01]
 [-1.78800131e-02 -2.31200903e-02 -1.16629915e-03 ...  9.58262009e-03
   7.46734194e-04 -1.39529727e-03]
 [-2.73409975e-03 -2.25576184e-03  3.02538431e-04 ...  1.03052699e-02
  -1.37167111e-03 -3.58876377e-03]
 ...
 [-1.14900967e-02 -6.29385799e-03  1.19180505e-03 ...  6.77193704e-03
   2.41658171e-02 -7.05097585e-03]
 [-8.40656902e-03 -4.38477606e-03  2.97293796e-04 ...  3.96570904e-03
  -6.79385620e-04  2.98318718e-03]
 [-3.37930568e-04 -5.01689942e-04 -1.51688773e-04 ...  2.74498963e-03
   1.18551928e-02 -4.63090044e-03]]
11454


## (d1) Word2Vec

In [None]:
# Creating the word2vec model and setting values for the various parameters

# Initializing the train model. 
num_features = ??   # Word vector dimensionality
min_word_count = 0  # Minimum word count. You can change it also.
num_workers = 4     # Number of parallel threads, can be changed
context = ??         # Context window size
downsampling = 1e-3 # (0.001) Downsample setting for frequent words, can be changed
# Initializing the train model
print("Training Word2Vec model....")
model = word2vec.Word2Vec(??)

# To make the model memory efficient
model.init_sims(replace=True)

## (d2) Compare SVD word embeddings with Word2Vec

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def svd_most_similar(query_word, n=10):
    """ return 'n' most similar words of a query word using the SVD word embeddings similar to word2vec's most_smilar    
        Params:
            query_word (strings): a query word
        Return:
            most_similar (list of strings): the list of 'n' most similar words
    """
    # get index of a query_word
    query_word_idx = word2Ind[query_word]
    # get word embedding for a query_word
    word = SVD_embeddings[query_word_idx]
    #cosine similarity matrix
    cos_similarity = cosine_similarity(SVD_embeddings, word.reshape(1, -1))
    most_similar = []
    'Write additional code to compute the list most_similar. Each entry in the list is a tuple (w, cos)
    'where w is one of the most similar word to query_word and cos is cosine similarity of w with query_word

    return most_similar   
    

## SVD vs Word2Vec: "???"

In [None]:
svd_most_similar("covid")

In [None]:
model.wv.most_similar("covid") #this word2vec trained model on tweets

In [None]:
svd_most_similar("grocery")

In [None]:
model.wv.most_similar("grocery")