In [4]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('punkt')

import numpy as np
import pandas as pd
from gensim.models import word2vec

from google.colab import drive
drive.mount('/content/drive')

import re # For regular expressions

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## (a) Load the dataset

In [5]:
import re
def load_data():
    """ Read tweets from the file.
        Return:
            list of lists (list_words), with words from each of the processed tweets
    """
    tweets = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/AI_AS2/Corona_Tweets.csv', names=['text'])
    list_words = []
    ### iterate over all tweets from the dataset
    for i in tweets.index:
      ### remove URLs
      text = re.sub("https?://\S+|www\.\S+", " ", tweets.loc[i, 'text'])
      ### remove non-letter.
      text = re.sub("[^a-zA-Z]"," ",text)
      ### tokenize
      words = text.split()
      
      new_words = []
      ### iterate over all words of a tweet
      for w in words:
        ## TODO: remove the stop words and convert a word (w) to the lower case
        stops = set(stopwords.words("english"))
        if w not in stops:
          new_words.append(w.lower())
        
      list_words.append(new_words)
    return list_words

# check a few samples of twitter corpus
twitter_corpus = load_data()
print(twitter_corpus[:3])

[['trending', 'new', 'yorkers', 'encounter', 'empty', 'supermarket', 'shelves', 'pictured', 'wegmans', 'brooklyn', 'sold', 'online', 'grocers', 'foodkick', 'maxdelivery', 'coronavirus', 'fearing', 'shoppers', 'stock'], ['when', 'i', 'find', 'hand', 'sanitizer', 'fred', 'meyer', 'i', 'turned', 'amazon', 'but', 'pack', 'purell', 'check', 'coronavirus', 'concerns', 'driving', 'prices'], ['find', 'protect', 'loved', 'ones', 'coronavirus']]


## (b) Create co-occurrence matrix

In [6]:
def distinct_words(corpus):
    """ get a list of distinct words for the corpus.
        Params:
            corpus (list of list of strings): corpus of documents
        Return:
            corpus_words (list of strings): list of distinct words across the corpus, sorted (using python 'sorted' function)
            num_corpus_words (integer): number of distinct words across the corpus
    """
    corpus_words = set()
    for tweet in corpus:
      for word in tweet:
        corpus_words.add(word)
    corpus_words = sorted(list(corpus_words))
    num_corpus_words = len(corpus_words)
    return corpus_words, num_corpus_words

words, num_words = distinct_words(twitter_corpus)
print(words[:10], num_words)

['a', 'aadya', 'aadyasitara', 'aamiin', 'aapl', 'abajam', 'abandon', 'abandoning', 'abc', 'abeg'] 11454


In [7]:
def compute_co_occurrence_matrix(corpus, window_size=5):
    """ Compute co-occurrence matrix for the given corpus and window_size (default of 5).    
        Params:
            corpus (list of list of strings): corpus of documents
            window_size (int): size of context window
        Return:
            M (numpy matrix of shape = [number of corpus words x number of corpus words]): 
                Co-occurence matrix of word counts. 
                The ordering of the words in the rows/columns should be the same as the ordering of the words given by the distinct_words function.
            word2Ind (dict): dictionary that maps word to index (i.e. row/column number) for matrix M.
    """
    M = np.zeros((num_words, num_words), dtype=int)
    word2Ind = {}
    for i, w in enumerate(words):
      word2Ind[w] = i
    for tweet in corpus:
      for i, w in enumerate(tweet):
        w_idx = word2Ind[w]
        start = i - 5
        end = i + 5 + 1 #exclusive
        for j in range(start, end):
          if(i != j and j >= 0 and j < len(tweet)):
            c_idx = word2Ind[tweet[j]]
            M[w_idx, c_idx] += 1
            M[c_idx, w_idx] += 1
    return M, word2Ind

M, word2Ind = compute_co_occurrence_matrix(twitter_corpus)

## (c) SVD

In [8]:
# -----------------------------
# Run SVD
# Note: This may take several minutes (~20-30 minutes)
# ------------------------------
la = np.linalg
U, s, Vh = la.svd(M, full_matrices=False)

In [36]:
# Compute SVD embeddings
embedding_size = 75
# S = np.diag(np.sqrt(s[:embedding_size]))
# SVD_embeddings = np.dot(U[:, :embedding_size], S)
SVD_embeddings = np.dot(U[:,:embedding_size], np.diag(s[:embedding_size]))
print(SVD_embeddings[0])

[-7.84307351e+01  1.31731350e+01  6.29266715e+00 -8.53945247e+00
 -1.56811281e+00  3.69061675e+00  5.50158212e+00  2.22654627e+00
  6.75253024e+00 -5.68496969e+00 -2.95857447e+00 -5.60707114e+00
 -5.75058186e+00 -6.04011825e+00 -1.04871500e+01 -1.48266916e+01
  1.29361686e+01 -2.64826386e+00 -2.78846964e+00 -1.79650257e+00
  5.37266122e+00  5.21876584e+00  5.86186791e+00 -9.84126850e+00
 -1.45755820e+00 -1.72920131e+00 -1.27543562e+00 -3.23865355e+00
 -1.25375963e+00  1.50121588e+00 -3.89140772e+00  1.20714686e+00
  5.74143474e+00  8.43531726e+00  1.49201624e+00  2.93110925e-01
  2.10806036e+00 -5.03668509e-01 -5.38797187e+00  3.77701690e-01
 -1.52282753e+00  3.87760881e+00  2.95026217e+00 -4.12544915e+00
 -9.45784039e-01  5.70358010e-01 -5.58276097e+00  4.64388492e+00
  4.10835612e-01  2.29242881e-02 -8.07158667e-01  1.08967413e+00
 -4.15866287e+00  6.79476820e+00  1.01967674e+00 -4.59859189e+00
 -1.80688555e+00  3.27596948e-01  2.46240598e+00 -3.30243894e+00
 -1.45924423e-01 -8.51104

## (d1) Word2Vec

In [38]:
# Creating the word2vec model and setting values for the various parameters

# Initializing the train model. 
num_features = 75 # Word vector dimensionality
min_word_count = 0  # Minimum word count. You can change it also.
num_workers = 4     # Number of parallel threads, can be changed
context = 5         # Context window size
downsampling = 1e-3 # (0.001) Downsample setting for frequent words, can be changed
# Initializing the train model
print("Training Word2Vec model....")
model = word2vec.Word2Vec(twitter_corpus,
                          workers=num_workers,
                          vector_size=num_features, # API Change to vector_size
                          min_count=min_word_count,
                          window=context,
                          sample=downsampling)

# To make the model memory efficient
model.init_sims(replace=True)

Training Word2Vec model....


  model.init_sims(replace=True)


## (d2) Compare SVD word embeddings with Word2Vec

In [55]:
from sklearn.metrics.pairwise import cosine_similarity

def svd_most_similar(query_word, n=10):
    """ return 'n' most similar words of a query word using the SVD word embeddings similar to word2vec's most_smilar    
        Params:
            query_word (strings): a query word
        Return:
            most_similar (list of strings): the list of 'n' most similar words
    """
    # get index of a query_word
    query_word_idx = word2Ind[query_word]
    # get word embedding for a query_word
    word = SVD_embeddings[query_word_idx]
    #cosine similarity matrix
    cos_similarity = cosine_similarity(SVD_embeddings, word.reshape(1, -1))
    most_similar = []
    # model.wv.most_similar(query_word)
    '''
        Write additional code to compute the list most_similar. Each entry in the list is a tuple (w, cos)
        where w is one of the most similar word to query_word and cos is cosine similarity of w with query_word
    '''
    # get index of top n most similar words
    similar_i = np.argsort(-cos_similarity.flatten())[1:n+1]
    
    # get similar words and cos_sim score
    for i in similar_i:
      word = list(word2Ind.keys())[i]
      cos_sim = cos_similarity[i][0]
      most_similar.append((word, cos_sim))

    # sort decreasing based on second item in tuple
    most_similar.sort(key=lambda x: x[1], reverse=True)

    return most_similar

most similar: [('outbreak', 0.9045967595232447), ('pandemic', 0.902674858151078), ('new', 0.9006981057580782), ('check', 0.8917558169488365), ('fear', 0.8896410163955274), ('due', 0.888897034145722), ('toiletpaper', 0.8861295445542877), ('change', 0.8827712372574316), ('probably', 0.8796619972326563), ('news', 0.8783238923862796)]


## SVD vs Word2Vec: "???"

In [56]:
svd_most_similar("covid")

[('outbreak', 0.9045967595232447),
 ('pandemic', 0.902674858151078),
 ('new', 0.9006981057580782),
 ('check', 0.8917558169488365),
 ('fear', 0.8896410163955274),
 ('due', 0.888897034145722),
 ('toiletpaper', 0.8861295445542877),
 ('change', 0.8827712372574316),
 ('probably', 0.8796619972326563),
 ('news', 0.8783238923862796)]

In [57]:
model.wv.most_similar("covid") #this word2vec trained model on tweets

[('panicbuying', 0.9997747540473938),
 ('coronaoutbreak', 0.9997419714927673),
 ('coronavirus', 0.9997290372848511),
 ('pandemic', 0.9997248649597168),
 ('coronavirusoutbreak', 0.9996583461761475),
 ('lockdown', 0.9996491074562073),
 ('corona', 0.9996290802955627),
 ('coronapocalypse', 0.9996092319488525),
 ('uk', 0.9996013045310974),
 ('due', 0.9995934963226318)]

In [58]:
svd_most_similar("grocery")

[('mailing', 0.8919418102338428),
 ('mall', 0.8866803774039633),
 ('liquor', 0.8831283831185118),
 ('ht', 0.8790862027429133),
 ('accusations', 0.8787353350030835),
 ('elys', 0.8784858187576227),
 ('llama', 0.8754550627530243),
 ('dollargeneral', 0.875004724156989),
 ('pajama', 0.8742027409938027),
 ('quarterly', 0.8737884191106725)]

In [59]:
model.wv.most_similar("grocery")

[('went', 0.9979035258293152),
 ('shelves', 0.9971742033958435),
 ('empty', 0.9971593618392944),
 ('local', 0.9970316290855408),
 ('today', 0.9967623353004456),
 ('no', 0.9967173933982849),
 ('retail', 0.9967158436775208),
 ('bread', 0.9966362714767456),
 ('packs', 0.9966211318969727),
 ('pasta', 0.9966022968292236)]