In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
%cd /content/gdrive/MyDrive/AMULET/

/content/gdrive/MyDrive/AMULET


In [3]:
%pwd

'/content/gdrive/My Drive/AMULET'

In [4]:
import numpy as np

def generate(vocab_file, vectors_file, normalization = False):

    raw_freq = {}
    with open(vocab_file, 'r') as f:
        for x in f.readlines():
            raw_freq[x.rstrip().split(' ')[0]] = x.rstrip().split(' ')[1]
    with open(vocab_file, 'r') as f:
        words = [x.rstrip().split(' ')[0] for x in f.readlines()]
    with open(vectors_file, 'r') as f:
        vectors = {}
        for line in f:
            vals = line.rstrip().split(' ')
            vectors[vals[0]] = [float(x) for x in vals[1:]]

    vocab_size = len(words)
    vocab_glove = {w: idx for idx, w in enumerate(words)}
    ivocab_glove = {idx: w for idx, w in enumerate(words)}

    vector_dim = len(vectors[ivocab_glove[0]])
    W = np.zeros((vocab_size, vector_dim))
    for word, v in vectors.items():
        if word == '<unk>':
            continue
        W[vocab_glove[word], :] = v

    # normalize each word vector to unit variance
    W_norm = np.zeros(W.shape)
    d = (np.sum(W ** 2, 1) ** (0.5))
    W_norm = (W.T / d).T

    if normalization:
      return (W_norm, vocab_glove, ivocab_glove, raw_freq)
    else:
      return (W, vocab_glove, ivocab_glove, raw_freq)

In [7]:
year_a = 2015
year_b = 2016
_VEC_DIM = 50
_WINDOW = 10
_NUM_ITERATIONS = 50

glove_path = '/content/gdrive/MyDrive/AMULET/glove4LVR/GloVe/'
glove_path_a = '/content/gdrive/MyDrive/AMULET/glove4LVR/GloVe/'+ str(year_a) +'/'
glove_path_b = '/content/gdrive/MyDrive/AMULET/glove4LVR/GloVe/'+ str(year_b) +'/'

print('Loading GloVe word vectors for year:', year_a)
glove_representations_a, vocab_glove_a, ivocab_glove_a, raw_freq_a = generate(glove_path_a+"vocab_"+str(year_a)+str(_VEC_DIM)+str(_NUM_ITERATIONS)+str(_WINDOW), glove_path_a+"vectors_"+str(year_a)+str(_VEC_DIM)+str(_NUM_ITERATIONS)+str(_WINDOW)+".txt", normalization = True)
print('W_glove_preview_a:\n', glove_representations_a, glove_representations_a.shape, len(vocab_glove_a), len(ivocab_glove_a))
print('Loading GloVe word vectors for year:', year_b)
glove_representations_b, vocab_glove_b, ivocab_glove_b, raw_freq_b = generate(glove_path_b+"vocab_"+str(year_b)+str(_VEC_DIM)+str(_NUM_ITERATIONS)+str(_WINDOW), glove_path_b+"vectors_"+str(year_b)+str(_VEC_DIM)+str(_NUM_ITERATIONS)+str(_WINDOW)+".txt", normalization = True)
print('W_glove_preview_b:\n', glove_representations_b, glove_representations_b.shape, len(vocab_glove_b), len(ivocab_glove_b))

Loading GloVe word vectors for year: 2015
W_glove_preview_a:
 [[-0.01442009 -0.03402605  0.05198131 ... -0.06275311 -0.15993551
   0.22689601]
 [ 0.00874586 -0.09918768  0.0446918  ...  0.02031538 -0.0841706
   0.19365608]
 [-0.13387098  0.01632587  0.07410746 ... -0.01804057 -0.08962123
   0.09384304]
 ...
 [-0.0294471   0.01525655 -0.06686196 ...  0.02601947 -0.10047408
  -0.02435168]
 [ 0.11438165 -0.13976011 -0.0432953  ... -0.00589196 -0.13381387
  -0.31354613]
 [ 0.14632004  0.01801446 -0.0178317  ...  0.07368621 -0.12486345
  -0.20712641]] (146511, 50) 146511 146511
Loading GloVe word vectors for year: 2016
W_glove_preview_b:
 [[-0.00505951  0.04405434 -0.07584668 ... -0.02949477 -0.12789908
   0.15108067]
 [ 0.08243186 -0.00993407 -0.04499974 ... -0.06465371 -0.11312894
   0.10820854]
 [-0.00132215 -0.03341998 -0.0227822  ... -0.03928124 -0.07171849
   0.0045288 ]
 ...
 [ 0.0869546   0.11377376 -0.1438401  ...  0.01082068 -0.34008756
   0.27107312]
 [ 0.17948778 -0.0158364   0.

In [8]:
def topK(label, glove_representations, vocab_glove, ivocab_glove, vocab_glove_of_the_other, k=10, count = None, min_freq = 0):#w, space, ):
  # identify the top k neighbors of a word in a space

  # compute similarity of w with all words in the vocabulary
  sim = glove_representations.dot(glove_representations[vocab_glove[label], :])

  # sort similarities by descending order
  sort_sim = (sim.argsort())[::-1]

  # choose topK
  if count:
    print('COUNT IS TRUE')
    # consider only the neighbors whose raw frequency is greater than min_freq
    best = []
    for i in sort_sim:
      if label != ivocab_glove[i] and int(count[ivocab_glove[i]]) > min_freq and ivocab_glove[i] in vocab_glove_of_the_other.keys():
        best.append(i)
        if len(best) == k:
          break
  else:
     best = sort_sim[:(k + 1)]

  return [ivocab_glove[i] for i in best if label != ivocab_glove[i]]

In [None]:
!pip install tqdm

In [9]:
import pandas as pd
import json
from tqdm import tqdm
import csv

with open('/content/gdrive/MyDrive/AMULET/glove4LVR/GloVe/glove datasets/labels_dict_2013', 'r') as f13:
    data_2013 = json.load(f13)

with open('/content/gdrive/MyDrive/AMULET/glove4LVR/GloVe/glove datasets/labels_dict_2014', 'r') as f14:
    data_2014 = json.load(f14)

with open('/content/gdrive/MyDrive/AMULET/glove4LVR/GloVe/glove datasets/labels_dict_2015', 'r') as f15:
    data_2015 = json.load(f15)

with open('/content/gdrive/MyDrive/AMULET/glove4LVR/GloVe/glove datasets/labels_dict_2016', 'r') as f16:
    data_2016 = json.load(f16)

with open('/content/gdrive/MyDrive/AMULET/glove4LVR/GloVe/glove datasets/labels_dict_2017', 'r') as f17:
    data_2017 = json.load(f17)

with open('/content/gdrive/MyDrive/AMULET/glove4LVR/GloVe/glove datasets/labels_dict_2018', 'r') as f18:
    data_2018 = json.load(f18)

with open('/content/gdrive/MyDrive/AMULET/glove4LVR/GloVe/glove datasets/labels_dict_2019', 'r') as f19:
    data_2019 = json.load(f19)


data_labels = list(data_2013.keys())+list(data_2014.keys())+list(data_2015.keys())+list(data_2016.keys())+list(data_2017.keys())+list(data_2018.keys())+list(data_2019.keys())
print('before', len(data_labels))
data_labels = list(set(data_labels))
print('after', len(data_labels))
print(data_labels)

data_labels_processed = []
for label in data_labels:
    data_labels_processed.append(label.replace(',','').replace(' ', '#'))
print('data_labels_processed', len(data_labels_processed))
print(data_labels_processed)

# compute nearest neighbors overlap for all the common words
nn_scores = []
common_labels = 0
for i, label in enumerate(data_labels_processed):
  # ensure the word is in both spaces
  # frequency in text shoulδ be checked here
  if label in vocab_glove_a.keys() and label in vocab_glove_b.keys() and int(raw_freq_a[label])>10 and int(raw_freq_b[label])>10:
    # identify the top-k neighbors
    common_labels += 1
    neighbors_a = topK(label, glove_representations_a, vocab_glove_a, ivocab_glove_a, vocab_glove_b, k=50, count=raw_freq_a, min_freq = 10)
    print(common_labels, '.', label, ':', 'freq --',raw_freq_a[label], 'neighbors:', neighbors_a)
    neighbors_b = topK(label, glove_representations_b, vocab_glove_b, ivocab_glove_b, vocab_glove_a, k=50, count=raw_freq_b, min_freq = 10)
    print(common_labels, '.', label, ':', 'freq --',raw_freq_b[label], 'neighbors:', neighbors_b)
    nn_scores.append((len(set(neighbors_a).intersection(set(neighbors_b))), label))
    print()
print('len of ranking', len(nn_scores))

# rank these words
nn_scores_sorted = sorted(nn_scores)
print(year_a,'-',year_b)
for cncpt in nn_scores_sorted:
  print(cncpt)


# year_labels_f1_score_diffs = {}
# f1_scores_passing_year_protocol = csv.DictReader(open("/content/gdrive/MyDrive/Colab_notebooks/AMULET/Datasets/f1_score_per_year_for_most_frequent_labels.csv"))
# for row in f1_scores_passing_year_protocol:
#     print(row)
#     # convert to dict
#     dict_row = json.loads(json.dumps(row))
#     year_labels_f1_score_diffs['2013-2014'] = 

before 1386
after 198
['base sequence', 'pilot projects', 'postoperative complications', 'stroke', 'multivariate analysis', 'oxidative stress', 'recurrence', 'computational biology', 'longitudinal studies', 'polymorphism, single nucleotide', 'dna, bacterial', 'mice, nude', 'plant extracts', 'escherichia coli', 'apoptosis', 'equipment design', 'age factors', 'genotype', 'promoter regions, genetic', 'child', 'proportional hazards models', 'molecular structure', 'tandem mass spectrometry', 'hypertension', 'health knowledge, attitudes, practice', 'biopsy', 'databases, factual', 'disease models, animal', 'bacterial proteins', 'muscle, skeletal', 'treatment outcome', 'lung', 'phenotype', 'tumor necrosis factor-alpha', 'liver neoplasms', 'body mass index', 'pain measurement', 'quality of life', 'reproducibility of results', 'prognosis', 'logistic models', 'odds ratio', 'water', 'pregnancy', 'mass spectrometry', 'cells, cultured', 'protein conformation', 'time factors', 'models, biological', '