# Homework 1

## Load libraries, stopwords, link google drive and set up some utils fnctions

In [1]:
# Insatll packages
!pip install -q wget # to download data
!pip install -q spacy
!python -m spacy download en_core_web_sm > /dev/null 2>&1

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for wget (setup.py) ... [?25l[?25hdone


In [2]:
# Import library
%matplotlib inline
import numpy as np
import gensim
import matplotlib.pyplot as plt
import seaborn as sns
import wget
import spacy
import scipy.stats

from tqdm import tqdm
from nltk.corpus import stopwords
import nltk
import re
from collections import defaultdict

from IPython.display import display, HTML
import zipfile
import json
import time
import os
import math

import gdown
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [3]:
# Stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
# Connect google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Utils
def custom_unzip(zip_file, PATH):
  with zipfile.ZipFile(zip_file, 'r') as zip_ref:
    for member in tqdm(zip_ref.infolist(), desc='Extracting '):
      try:
        zip_ref.extract(member, PATH)

      except zipfile.error as e:
        pass

def get_data_from_user():
  print("Please get data from: ")
  link = {
      'text': 'sense embedding',
      'url': 'https://drive.google.com/file/d/1JZmM8--5By4aN0HoPbYaMMfHuYJyibyb/view?usp=sharing'
  }
  html_link = f'<a href="{link["url"]}" target="_blank">{link["text"]}</a>'
  display(HTML(html_link))
  print("and save it in: /content/drive/MyDrive/")

  trial = 0;
  while(trial < 10000000):
    time.sleep(1)

    folder = '/content/drive/MyDrive/sense-embeddings'
    if os.path.exists(folder):
      print("data acquired")
      break

    trial += 1
    if trial == 10000000:
      print("time exeeded")
      break

  return folder

def get_wordnet_pos(word):
  """Map POS tag to first character lemmatize() accepts"""
  tag = word[1][0].upper()
  tag_dict = {"J": wn.ADJ,
              "N": wn.NOUN,
              "V": wn.VERB,
              "R": wn.ADV}
  return tag_dict.get(tag, wn.NOUN)

def backup_preprocessed_data(data, destination):
  with open(destination, 'w') as file:
    file.writelines(' '.join(row) + '\n' for row in data)

## Get MOSAICo and Semantic Simlex999

### MOSAICo

In [7]:
# Get dataset
# Check if the dataset is on drive, if not ask the user to download it
mosaico = '/content/drive/MyDrive/AI_Robotics/NLP/sense-datasets/500000.jsonl'
is_here_mosaico = os.path.exists(mosaico)

simlex = '/content/drive/MyDrive/AI_Robotics/NLP/sense-datasets/semantic_simlex_v0.1.tsv'
is_here_simlex = os.path.exists(simlex)

external_download = False

if not is_here_mosaico or not is_here_simlex :

  if external_download:
    folder = get_data_from_user()

    # unzip the needed dataset
    if not is_here_mosaico:
      custom_unzip(folder + '/sample_annotated_sentences.zip', '/content/')
      mosaico = '/content/500000.jsonl'

    if not is_here_simlex:
      custom_unzip(folder + '/semantic_simlex_v0.1.zip', '/content/')
      simlex = '/content/semantic_simlex_v0.1.tsv'

  else:
    file_id = '1HiAj6q37Wu6yScq9dkydVLVNHv1JEciG'
    url = f"https://drive.google.com/uc?id={file_id}"
    mosaico = '/content/500000.jsonl'
    gdown.download(url, mosaico, quiet=False)

    file_id = '15TX6LudgvOCiMb0v5vJY6x6HM8sUzG0C'
    url = f"https://drive.google.com/uc?id={file_id}"
    simlex = '/content/semantic_simlex_v0.1.tsv'
    gdown.download(url, simlex, quiet=False)

else:
  print("data acquired")


Downloading...
From: https://drive.google.com/uc?id=1HiAj6q37Wu6yScq9dkydVLVNHv1JEciG
To: /content/500000.jsonl
100%|██████████| 213M/213M [00:02<00:00, 106MB/s] 
Downloading...
From: https://drive.google.com/uc?id=15TX6LudgvOCiMb0v5vJY6x6HM8sUzG0C
To: /content/semantic_simlex_v0.1.tsv
100%|██████████| 81.3k/81.3k [00:00<00:00, 19.4MB/s]


### Simlex999

In [None]:
# Extract simlex data
simplex_pairs = dict()
simlex_sense_pairs = dict()
with open(simlex, 'r') as simlex_file:
  next(simlex_file)

  for line in simlex_file:
    splitted_line = line.strip().split()

    w1, w2, pos, score, *_ = splitted_line
    simplex_pairs[(w1, w2)] = float(score)

    s1, s2 = splitted_line[-2], splitted_line[-1]
    # for sp1 in s1.split(','):
    #   for sp2 in s2.split(','):
    #     simlex_sense_pairs[(sp1, sp2)] = float(score)
    simlex_sense_pairs[(s1, s2)] = float(score)


In [None]:
simplex_pairs

In [None]:
simlex_sense_pairs

## Word Embedding

### Preprocessing

In [None]:
# Standard functions to process text
# nlp = spacy.load("en_core_web_sm")
regexp_alphbetic = re.compile('[^a-zA-Z]+')
lemmatizer = WordNetLemmatizer()

def preprocess_word_text(sentence, stopwords, lemmatize=True):
  # doc = nlp(sentence)
  doc = sentence.split(' ')
  pos = nltk.pos_tag(doc)
  sentence_tokens = []

  for token_id, token in enumerate(doc):
    token_text = lemmatizer.lemmatize(token, get_wordnet_pos(pos[token_id])) if lemmatize else token.text
    token_text = token_text.lower()

    # skip stopwords and NON alphanumeric
    if token_text in stopwords or regexp_alphbetic.search(token_text):
      continue

    sentence_tokens.append(token_text)

  return sentence_tokens

In [None]:
# Remove common words and tokenize
word_texts = []
stop_at = 500000

with open(mosaico, 'r') as file:
  for count, line in enumerate(tqdm(file,  total=stop_at, leave=True)):
    try:
      json_line = json.loads(line)
      sentence = json_line['text']

      text_token = preprocess_word_text(sentence, stop_words, lemmatize=True)
      word_texts.append(text_token)

    except json.JSONDecodeError as e:
      print(f"Error decoding JSON: {str(e)}")

    # Stop condiction
    if count+1 == stop_at : break

# remove words that appear only once
frequency = defaultdict(int)

for text in word_texts:
  for token in text:
    frequency[token] += 1

word_texts = [[token for token in text if frequency[token] > 1] for text in word_texts]

100%|█████████▉| 499999/500000 [19:43<00:00, 422.44it/s]


In [None]:
print(word_texts)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
backup_preprocessed_data(word_texts, 'non_semantic.txt')

### Model

In [None]:
# Get model
word_model = gensim.models.Word2Vec(word_texts, vector_size=100, window=2, epochs=20, min_count=1)

In [None]:
# Inference
word_model.wv.most_similar('box')

[('bag', 0.5282819271087646),
 ('cutout', 0.5128922462463379),
 ('button', 0.5043959021568298),
 ('desk', 0.4992261528968811),
 ('tray', 0.4921962022781372),
 ('screen', 0.48446744680404663),
 ('container', 0.4827355444431305),
 ('jar', 0.48166587948799133),
 ('kiosk', 0.46833348274230957),
 ('sofa', 0.4659251868724823)]

### Pretrained Model

In [None]:
# import gensim api to download pretarind model
import gensim.downloader as api

In [None]:
# Get gensim pretarined model
model_pretrained = api.load("glove-wiki-gigaword-50")
model_pretrained



<gensim.models.keyedvectors.KeyedVectors at 0x79245a9cc8e0>

In [None]:
# Inference
model_pretrained.most_similar('box')

[('boxes', 0.7834983468055725),
 ('piece', 0.7448906898498535),
 ('spot', 0.7257857322692871),
 ('filled', 0.7160665392875671),
 ('screen', 0.7088646292686462),
 ('onto', 0.7080516219139099),
 ('blank', 0.706522524356842),
 ('card', 0.7034342288970947),
 ('copy', 0.6995376348495483),
 ('empty', 0.6988034844398499)]

### SimLex999 evaluation

#### Compute correlation between human scores and word2vec similarities


In [None]:
def compute_correlation_score(model, word_pair2score, print_warning=True, save_data=False):
  human_scores = []
  system_scores = []

  if save_data: open('non_semantic.tsv', 'w').close() # clear output file

  for (w1, w2), score in tqdm(word_pair2score.items(),  total=len(word_pair2score.items()), leave=True):
    if (w1 not in model) or (w2 not in model):
      system_scores.append(-1)
      human_scores.append(score)

      if print_warning:
        print(f"WARNING ({w1} and {w2}) are not present in the embedding model!!" )

      continue

    system_similarity = model.similarity(w1, w2)
    human_scores.append(score)
    system_scores.append(system_similarity)

    if save_data:
      with open('non_semantic.tsv', 'a') as target_output:
        target_output.write(w1 + '\t' + w2 + '\t' + str(system_similarity) + '\n')

  human_scores = np.array(human_scores)
  system_scores = np.array(system_scores)

  pearson_r, _ = scipy.stats.pearsonr(human_scores, system_scores)    # Pearson's r
  spearman_rho = scipy.stats.spearmanr(human_scores, system_scores).statistic   # Spearman's rho

  return pearson_r, spearman_rho

#### Performances

In [None]:
# word2vect
compute_correlation_score(word_model.wv, simplex_pairs, print_warning=True, save_data=True)

100%|██████████| 999/999 [00:00<00:00, 16575.98it/s]






(0.41546162329260283, 0.4145001988694379)

In [None]:
# Pretarined gensim model
compute_correlation_score(model_pretrained, simplex_pairs)

100%|██████████| 999/999 [00:00<00:00, 60955.03it/s]


(0.2941386830730656, 0.2645792192990813)

## Sense Embedding


### Preprocessing

In [None]:
# Process text and substitute word with sense
# nlp = spacy.load("en_core_web_sm")
regexp_alphbetic = re.compile('[^a-zA-Z]+')
lemmatizer = WordNetLemmatizer()

def preprocess_sense_text(sentence, annotations, stopwords, lemmatize=True):

  doc = sentence.split(' ')
  pos = nltk.pos_tag(doc)

  sentence_tokens = []
  line_word_sense = dict()

  # get the mapping from some word to the corresponding sense
  # to disanbiguate the text
  for i in range(len(annotations)):
    index = annotations[i]['token_span'][0]
    sense = annotations[i]['label']
    line_word_sense[index] = sense


  for token_id, token in enumerate(doc):

    # tokenization
    token_text = lemmatizer.lemmatize(token, get_wordnet_pos(pos[token_id])) if lemmatize else token.text
    token_text = token_text.lower()

    if token_id in line_word_sense.keys():
      sense = line_word_sense[token_id]

      if token_text in sense[:sense.index('%')]:
        sentence_tokens.append(line_word_sense[token_id])

    # skip stopwords and NON alphanumeric
    elif not ((token_text in stopwords) or regexp_alphbetic.search(token_text)):
      sentence_tokens.append(token_text)

    else:
      continue

  return sentence_tokens

In [None]:
# Extract information from dataset
stop_at = 500000
sense_texts = [] # unambiguous document

with open(mosaico, 'r') as file:
  for count, line in enumerate(tqdm(file,  total=stop_at, leave=True)):
    try:
      json_line = json.loads(line)
      sentence = json_line['text']
      annotation = json_line['annotations']

      text_token = preprocess_sense_text(sentence, annotation, stop_words, lemmatize=True)
      sense_texts.append(text_token)

    except json.JSONDecodeError as e:
      print(f"Error decoding JSON: {str(e)}")

    # Stop condiction
    if count+1 == stop_at : break

# remove words that appear only once
frequency = defaultdict(int)
for text in sense_texts:
  for token in text:
    frequency[token] += 1

sense_texts = [[token for token in text if frequency[token] > 0] for text in sense_texts]

100%|█████████▉| 499999/500000 [18:29<00:00, 450.62it/s]


In [None]:
print(sense_texts)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
backup_preprocessed_data(sense_texts, 'semantic.txt')

### Model

In [None]:
# restor backup
sense_texts = []

with open('semantic.txt', 'r') as file:
  for line in file:
    value = line.split(' ')
    sense_texts.append(value)
    count += 1

    if count == stop_at:  break

In [None]:
# Get model
sense_model = gensim.models.Word2Vec(sense_texts, vector_size=100, window=2, epochs=20, min_count=1)

### SimLex999 semantic evaluation

#### Compute correlation between human scores and word2vec semantic similarities

In [None]:
def compute_semantic_correlation_score(model, simlex_sense_pairs,
                                       print_warning=True, save_data=False):
  human_scores = []
  system_scores = []

  if save_data: open('semantic.tsv', 'w').close() # clear output file

  for (senses_1, senses_2), score in simlex_sense_pairs.items():
    senses_1 = senses_1.split(',')
    senses_2 = senses_2.split(',')
    senses_1_in_model = [s for s in senses_1 if s in model]
    senses_2_in_model = [s for s in senses_2 if s in model]


    if len(senses_1_in_model) == 0 or len(senses_2_in_model) == 0:

      # sense is not present in the model
      s1_str = " ".join(senses_1)
      s2_str = " ".join(senses_2)

      if print_warning:
        print(f"WARNING ({s1_str} and {s2_str}) are not present in the embedding model!!" )

      system_scores.append(-1)
      human_scores.append(float(score))

      continue

    else:
      all_similarities = []

      for s1 in senses_1_in_model:
        for s2 in senses_2_in_model:
          all_similarities.append(model.similarity(s1, s2))

      system_similarity = max(all_similarities)

      if save_data:
        with open('semantic.tsv','a') as output:
          word_tag_1 = senses_1[0][:senses_1[0].index('%')]
          word_tag_2 = senses_2[0][:senses_2[0].index('%')]
          output.write(word_tag_1 + '\t' + word_tag_1 + '\t' + str(system_similarity) + '\n')

      human_scores.append(float(score))
      system_scores.append(system_similarity)

  human_scores = np.array(human_scores)
  system_scores = np.array(system_scores)

  # Calculate Pearson's r (Pearson correlation coefficient) and
  # Spearman's rho (Spearman rank correlation coefficient)
  pearson_r, _ = scipy.stats.pearsonr(human_scores, system_scores)    # Pearson's r
  spearman_rho = scipy.stats.spearmanr(human_scores, system_scores).statistic   # Spearman's rho

  return pearson_r, spearman_rho

#### Performances

In [None]:
compute_semantic_correlation_score(sense_model.wv, simlex_sense_pairs, print_warning=False, save_data=True)



(nan, nan)

## Explicit Representation

In [None]:
vocabulary = set()
stop_at = 10000
count = 0
reduced = []

with open('semantic.txt', 'r') as file:
  for line in file:
    value = line.split(' ')
    reduced.append(value)
    new_set = set(value)
    vocabulary = vocabulary.union(new_set)
    count += 1
    if count == stop_at:  break

vocabulary = list(vocabulary)
print(vocabulary)
print(reduced)

Output hidden; open in https://colab.research.google.com to view.

In [None]:
explicit = {word_key: {} for word_key in vocabulary}

for lines in reduced:
  for indx, word in enumerate(lines):
    for window in [-2,-1,1,2]:
      new_indx = indx+window

      if 0 <= new_indx < len(lines):
        compare_word = lines[new_indx]
        voc_indx = vocabulary.index(compare_word)

        if voc_indx in explicit[word].keys():
          explicit[word][voc_indx] += 1

        else:
          explicit[word][voc_indx] = 1

print(explicit)

In [None]:
# cosine similarity function
def cosine_similarity(w1, w2):

  common_keys = set(w1.keys()).intersection(w2.keys())
  dot_product = 0
  norm1 = 0
  norm2 = 0

  for key in common_keys:
    norm1 += w1[key]**2
    norm2 += w2[key]**2
    dot_product = w1[key]*w2[key]

  norm_product = (math.sqrt(norm1) * math.sqrt(norm2))

  cosine_sim = dot_product/norm_product if norm_product != 0 else 0
  return cosine_sim

In [None]:
def compute__explicit_semantic_correlation_score(explicit, simlex_sense_pairs,
                                       print_warning=True, save_data=False):
  system_scores = []
  human_scores = []
  for (senses_1, senses_2), score in simlex_sense_pairs.items():

    all_similarities = []

    for s1 in senses_1:
      for s2 in senses_2:
        if s1 in explicit.keys() and s2 in explicit.keys():
          w1 = explicit[s1]
          w2 = explicit[s2]
          cosine_sim = cosine_similarity(w1, w2)
          all_similarities.append(cosine_sim)

        else:
          all_similarities.append(-1)

    system_similarity = max(all_similarities)
    system_scores.append(system_similarity)
    human_scores.append(float(score))

  # Calculate Pearson's r (Pearson correlation coefficient)
  # Spearman's rho (Spearman rank correlation coefficient)
  pearson_r, _ = scipy.stats.pearsonr(human_scores, system_scores)    # Pearson's r
  spearman_rho = scipy.stats.spearmanr(human_scores, system_scores).statistic   # Spearman's rho

  return pearson_r,spearman_rho

In [None]:
compute__explicit_semantic_correlation_score(explicit, simlex_sense_pairs,print_warning=True, save_data=False)

(0.017800447597569845, 0.019522777579842776)

## Personalized Page Rank

In [None]:
# TODO
# compute sense/word PPMI
# get beack sense to sense PPMI
# populate the graph with this