In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
cd /content/gdrive/MyDrive/AMULET/

/content/gdrive/MyDrive/AMULET


In [4]:
!chmod -R 777 /content/gdrive/MyDrive/AMULET/glove4LVR/GloVe/

In [5]:
import numpy as np

def generate(vocab_file, vectors_file, normalization = False):

    with open(vocab_file, 'r') as f:
        words = [x.rstrip().split(' ')[0] for x in f.readlines()]
    with open(vectors_file, 'r') as f:
        vectors = {}
        for line in f:
            vals = line.rstrip().split(' ')
            vectors[vals[0]] = [float(x) for x in vals[1:]]

    vocab_size = len(words)
    vocab_glove = {w: idx for idx, w in enumerate(words)}
    ivocab_glove = {idx: w for idx, w in enumerate(words)}

    vector_dim = len(vectors[ivocab_glove[0]])
    W = np.zeros((vocab_size, vector_dim))
    for word, v in vectors.items():
        if word == '<unk>':
            continue
        W[vocab_glove[word], :] = v

    # normalize each word vector to unit variance
    W_norm = np.zeros(W.shape)
    d = (np.sum(W ** 2, 1) ** (0.5))
    W_norm = (W.T / d).T

    if normalization:
      return (W_norm, vocab_glove, ivocab_glove)
    else:
      return (W, vocab_glove, ivocab_glove)

In [6]:
import subprocess
import time

def build_glove_representations(glove_path, vec_dim, num_iterations, window, dict_texts, year, normalization = False):
    """Build GloVe word representations"""
      
    with open(glove_path+str(year)+"_"+str(vec_dim)+str(num_iterations)+str(window), "w") as f:
      f.write('\n'.join(list(dict_texts.values())) + ' ')
    print(len(list(dict_texts.values())))


    print('GloVe word vectors production...')
    print(glove_path+"demo.sh", glove_path+str(year)+"_"+str(vec_dim)+str(num_iterations)+str(window), str(year), str(vec_dim), str(num_iterations), glove_path, str(window))
    subprocess.call([glove_path+"demo.sh", glove_path+str(year)+"_"+str(vec_dim)+str(num_iterations)+str(window), str(year), str(vec_dim), str(num_iterations), glove_path, str(window)])
      
    time.sleep(10)  
    print('Loading GloVe word vectors...')
    glove_representations, vocab_glove, ivocab_glove = generate(glove_path+"vocab_"+str(year)+str(vec_dim)+str(num_iterations)+str(window), glove_path+"vectors_"+str(year)+str(vec_dim)+str(num_iterations)+str(window)+".txt", normalization = normalization)
    print('W_glove_preview:\n', glove_representations, glove_representations.shape, len(vocab_glove), len(ivocab_glove))


    return glove_representations, vocab_glove, ivocab_glove

In [7]:
import json

year = 2019
_VEC_DIM = 50
_WINDOW = 10
_NUM_ITERATIONS = 50

glove_path = '/content/gdrive/MyDrive/AMULET/glove4LVR/GloVe/'

dataset_dict_201x = '/content/gdrive/MyDrive/AMULET/glove4LVR/GloVe/glove datasets/texts_dict_'+str(year)

with open(dataset_dict_201x, 'r') as f:
    texts_dict_201x = json.load(f)

glove_vectors, vocab, ivocab = build_glove_representations(glove_path, _VEC_DIM, _NUM_ITERATIONS, _WINDOW, texts_dict_201x, year, normalization = True)

print(glove_vectors)
print(vocab)
print(ivocab)

197180
GloVe word vectors production...
/content/gdrive/MyDrive/AMULET/glove4LVR/GloVe/demo.sh /content/gdrive/MyDrive/AMULET/glove4LVR/GloVe/2019_505010 2019 50 50 /content/gdrive/MyDrive/AMULET/glove4LVR/GloVe/ 10
Loading GloVe word vectors...


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

