# Homework 2.2

## Load libraries, stopwords, link google drive

In [1]:
# Install packages
!pip install -q wget # to download data
!pip install -q spacy
!pip install -q word2number
!python -m spacy download en_core_web_sm > /dev/null 2>&1
!pip -q install gdown==4.6.0

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for word2number (setup.py) ... [?25l[?25hdone


In [2]:
# Import library
import math
import numpy as np
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt

import os
import time
import json
import wget
import gdown
import gensim
import zipfile
from tqdm import tqdm
from IPython.display import display, HTML

import re
import nltk
import spacy
import scipy.stats
from nltk.corpus import stopwords
from collections import defaultdict
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from word2number import w2n

# Fetch the NLTK resources used below: the averaged perceptron POS tagger and WordNet
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

# Shared helpers: spaCy pipeline, WordNet lemmatizer and a pattern matching runs of non-letters.
# NOTE(review): nlp, lemmatizer and regexp_alphbetic are all assigned again in the
# "Preprocess" cell (there the pattern also accepts digits), so on a top-to-bottom run
# the values defined here are overridden.
nlp = spacy.load("en_core_web_sm")
lemmatizer = WordNetLemmatizer()
regexp_alphbetic = re.compile('[^a-zA-Z]+')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [3]:
# Stopwords
# English stopword list from NLTK, kept as a set; process() passes it to tokenize_text()
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
# Connect google drive
# NOTE(review): no visible cell reads from or writes to /content/drive
# (downloads go to /content/, results are saved with relative paths) - confirm the mount is still needed
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Preprocess

In [5]:
# Data Preprocess for Word2Vec embedding

# Helpers read as globals by the tokenization functions of this cell.
# The pattern matches any run of characters outside [a-zA-Z0-9]; it replaces the
# letters-only pattern assigned in the import cell, so digits alone do not make a token "non alphanumeric".
nlp = spacy.load("en_core_web_sm")
lemmatizer = WordNetLemmatizer()
regexp_alphbetic = re.compile('[^a-zA-Z0-9]+')

def get_wordnet_pos(word):
  """Translate a (token, POS tag) pair into the WordNet POS constant for lemmatize().

  Only the first character of the tag (``word[1][0]``, upper-cased) is inspected:
  J -> adjective, V -> verb, R -> adverb; N and every other tag -> noun.
  """
  initial = word[1][0].upper()

  if initial == "J":
    return wn.ADJ
  if initial == "V":
    return wn.VERB
  if initial == "R":
    return wn.ADV

  # "N" and any unrecognised tag fall back to noun
  return wn.NOUN

def tokenize_text(sentence, stopwords, use_nltk=False, lemmatize=True):
  """Turn one sentence into a list of lowercase tokens without stopwords.

  Args:
    sentence: raw text to tokenize.
    stopwords: collection of lowercase words to drop (tested with ``in``).
    use_nltk: if True, split the sentence on single spaces and lemmatize with the
      global WordNet ``lemmatizer`` guided by NLTK POS tags; otherwise run the
      global spaCy pipeline ``nlp``.
    lemmatize: if True keep each token's lemma, otherwise its surface form.

  Returns:
    The kept tokens, in sentence order. Tokens that are stopwords or that match
    the global ``regexp_alphbetic`` pattern are skipped. In the spaCy branch a
    token that is numeric once its dots are removed is kept verbatim.
  """

  sentence_tokens = []

  if use_nltk:
    # consecutive spaces produce empty strings: they are not tokens, drop them
    doc = [piece for piece in sentence.split(' ') if piece]
    # POS tags are only needed to choose the WordNet POS for lemmatization
    pos = nltk.pos_tag(doc) if lemmatize else None

    for token_id, token in enumerate(doc):
      # in this branch `token` is a plain str (not a spaCy Token), so the
      # surface form is the token itself
      if lemmatize:
        token_text = lemmatizer.lemmatize(token, get_wordnet_pos(pos[token_id]))
      else:
        token_text = token
      token_text = token_text.lower()

      # skip stopwords and NON alphanumeric
      if token_text in stopwords or regexp_alphbetic.search(token_text):
        continue
      sentence_tokens.append(token_text)

  else:
    doc = nlp(sentence)

    for token in doc:

      # numbers (possibly with dots, e.g. "3.5") are kept as they are written
      if token.text.replace('.', '').isnumeric():
        sentence_tokens.append(token.text)
        continue

      token_text = token.lemma_ if lemmatize else token.text
      token_text = token_text.lower()

      # skip stopwords and NON alphanumeric
      if token_text in stopwords or regexp_alphbetic.search(token_text):
        continue
      sentence_tokens.append(token_text)

  return sentence_tokens

def process(document, check_frequency=True, use_nltk=False, use_tqdm=False):
  """Tokenize every sentence of `document` with tokenize_text().

  Args:
    document: sequence of sentences (strings).
    check_frequency: if True, drop every token whose total count over all
      sentences is 1.
    use_nltk: forwarded to tokenize_text().
    use_tqdm: if True, show a tqdm progress bar while tokenizing.

  Returns:
    One token list per sentence, in the same order as `document`.
  """
  sentences = tqdm(document, total=len(document), leave=True) if use_tqdm else document
  tokenized = [tokenize_text(sentence, stop_words, use_nltk, lemmatize=True)
               for sentence in sentences]

  if not check_frequency:
    return tokenized

  # count every token over the whole document, then keep those seen more than once
  counts = defaultdict(int)
  for tokens in tokenized:
    for tok in tokens:
      counts[tok] += 1

  return [[tok for tok in tokens if counts[tok] > 1] for tokens in tokenized]


def is_verb_or_adverb(word):
  """Tell whether at least half of the WordNet synsets of `word` are verbs, or adverbs.

  Returns False when WordNet has no synset for the word and for the special
  words 'no', 'not' and 'yes'.
  """
  total = len(wn.synsets(word))
  if total == 0:
    return False

  if word in ('no', 'not', 'yes'):
    return False

  verb_share = len(wn.synsets(word, pos=wn.VERB)) / total
  if verb_share >= 0.5:
    return True

  adverb_share = len(wn.synsets(word, pos=wn.ADV)) / total
  return adverb_share >= 0.5

# find numbers in answers; they might be useful
def is_number(word):
    """Return True if w2n.word_to_num() accepts `word`, False if it raises ValueError."""
    try:
        w2n.word_to_num(word)
    except ValueError:
        # word_to_num signals "not a number" with ValueError
        return False
    else:
        return True

## Get cleaned class data

In [6]:
# Data extraction

def get_txt(file_location, filename):
  !gdown {file_location}
  print("Text file acquired")
  return'/content/' + filename

def get_data_from_txt(file_txt):
  """Parse a tab-separated Q&A file into items, questions, answers, keywords and corpus.

  Expected line format: ``item<TAB>question<TAB>answer``. Lines with fewer than
  three tab-separated fields (for example blank lines) are skipped.

  Args:
    file_txt: path of the UTF-8 text file to read.

  Returns:
    A tuple ``(class_items, questions, answers, keywords, corpus)``:
      class_items: item names in order of appearance (a new entry every time
        the first column changes);
      questions: dict ``position -> list`` of the distinct question strings seen
        at that position inside an item (the position restarts at 0 on item change);
      answers: dict ``position -> list`` of answer strings, one per line read;
      keywords: dict ``(item, question) -> list`` pre-filled with ``['yes']`` or
        ``['no']`` when the first word of the answer is yes/no, with
        ``['not <word>']`` when the answer negates a word of the question,
        and with ``[]`` otherwise;
      corpus: one ``"item: question answer"`` string per line read.
  """

  class_items = []
  questions = dict()
  answers = dict()
  keywords = dict()
  corpus = []

  index = 0

  with open(file_txt, 'r', encoding='utf-8') as qna_file:

    for raw_line in qna_file:
      fields = raw_line.rstrip('\n').split('\t')

      # blank or malformed line: nothing to extract
      if len(fields) < 3:
        continue

      item, question, answer = fields[0], fields[1], fields[2]

      # class items: a change in the first column starts a new item
      if not class_items or class_items[-1] != item:
        class_items.append(item)
        index = 0

      # questions: keep each distinct wording once per position
      known_questions = questions.setdefault(index, [])
      if question not in known_questions:
        known_questions.append(question)

      # answers
      answers.setdefault(index, []).append(answer)

      key = (item, question)
      first_token = answer.lower().split(' ')[0]

      # direct yes / no answers: match the whole word (punctuation allowed around
      # it), so that e.g. "Normally" or "Nothing" is not mistaken for "no"
      if re.search(r'\byes\b', first_token):
        keywords[key] = ['yes']

      elif re.search(r'\bno\b', first_token):
        keywords[key] = ['no']

      # negated answers (e.g. Is X aquatic? X is *not aquatic*)
      else:
        negated = []
        # drop the trailing question mark, if there is one
        question_text = question[:-1] if question.endswith('?') else question

        for word in question_text.split(' '):
          # empty words (double spaces) would match any "not " in the answer
          if word and ('not ' + word) in answer:
            negated = ['not ' + word]  # the last matching word wins

        keywords[key] = negated

      # q&a corpus
      corpus.append(item + ': ' + question + ' ' + answer)

      # increase question index while the item is still the same
      index += 1

  print("Items, questions and answers for class {} acquired".format(os.path.basename(file_txt)))
  return class_items, questions, answers, keywords, corpus

In [7]:
# Data from class 05 - 06

# NOTE: the download may get stuck; if it does, run this cell again
# class 05
# get_txt() runs gdown on the Drive link and returns '/content/' + filename,
# which get_data_from_txt() then parses
file_location = 'https://drive.google.com/uc?id=1M5m6gmVnn4D0in_s_EqvTYLxeQPinpxE'
filename = 'class_05.clean.txt'
file_txt_05 = get_txt(file_location, filename)
c05_items, c05_questions, c05_answers, c05_keywords, c05_corpus = get_data_from_txt(file_txt_05)

Downloading...
From: https://drive.google.com/uc?id=1M5m6gmVnn4D0in_s_EqvTYLxeQPinpxE
To: /content/class_05.clean.txt
  0% 0.00/503k [00:00<?, ?B/s]100% 503k/503k [00:00<00:00, 114MB/s]
Text file acquired
Items, questions and answers for class class_05.clean.txt acquired


In [8]:
# NOTE: the download may get stuck; if it does, run this cell again
# class 06
# same steps as for class 05: download with get_txt(), then parse with get_data_from_txt()
file_location = 'https://drive.google.com/uc?id=1ri2shK3sgRHxpVox_aQnN5hz6HiQ0WZ4'
filename = 'class_06.clean.txt'
file_txt_06 = get_txt(file_location, filename)
c06_items, c06_questions, c06_answers, c06_keywords, c06_corpus = get_data_from_txt(file_txt_06)

Downloading...
From: https://drive.google.com/uc?id=1ri2shK3sgRHxpVox_aQnN5hz6HiQ0WZ4
To: /content/class_06.clean.txt
  0% 0.00/1.32M [00:00<?, ?B/s]100% 1.32M/1.32M [00:00<00:00, 116MB/s]
Text file acquired
Items, questions and answers for class class_06.clean.txt acquired


## Custom functions for text analysis

In [9]:
# Utils functions

# display results in tabular form
def preatty_print(keywords):
  """Print the (concept, question) -> keywords mapping as an aligned table.

  Args:
    keywords: dict whose keys are (concept, question) tuples and whose values
      are lists of keywords. May be empty, in which case only the header is printed.

  Returns:
    0, always.
  """
  concept_title = 'Concepts'
  question_title = 'Questions'

  # a column is as wide as its longest cell, header title included, so that
  # header and rows line up even for long concepts or short questions
  concept_width = max([len(concept_title)] + [len(key[0]) for key in keywords])
  question_width = max([len(question_title)] + [len(key[1]) for key in keywords])

  header = "{} \t {} \t Keywords".format(concept_title.ljust(concept_width),
                                         question_title.ljust(question_width))
  print(header)
  print("-" * (len(header) + 10))

  for key, value in keywords.items():
    concept = key[0]
    question = key[1]
    # a single keyword is shown bare, anything else as the list itself
    answer = value[0] if len(value) == 1 else value

    print("{} \t {} \t {}".format(concept.ljust(concept_width),
                                  question.ljust(question_width),
                                  answer))

  return 0

# save data
def save_data(keywords, filename):
  """Write the keywords to `filename` as tab-separated text.

  Each line is ``concept<TAB>question<TAB>keywords``, where the keywords of an
  entry are joined with ';' (an entry without keywords gets an empty third field).

  Args:
    keywords: dict mapping (concept, question) tuples to a list of keywords.
    filename: path of the file to (over)write, encoded as UTF-8.

  Returns:
    0, always.
  """
  with open(filename, 'w', encoding='utf-8') as save_file:
    for key, value in keywords.items():
      concept = key[0]
      question = key[1]
      # str() every keyword: a non-string value (e.g. a number) must not break the join
      answer = ';'.join(str(w) for w in value)

      save_file.write(concept + '\t' + question + '\t' + answer + '\n')

  return 0

# Suggested by ChatGPT: a way to lemmatize a single item.
# It works by placing the item in a fake context sentence, so that it is interpreted as a noun
def check_item_in_model(item, model, enable_warn=False):
  """Return a form of `item` that is present in `model.wv`, or None.

  The item is returned unchanged when it is in `model.wv`. Otherwise it is put
  in the sentence "The <item> are important.", parsed with the global spaCy
  pipeline `nlp`, and the lemma of the token at index 1 is tried instead.

  Args:
    item: word to look up.
    model: object exposing a `wv` container that supports `in`.
    enable_warn: if True, print a warning when the lemma is not found either.
  """
  if item in model.wv:
    return item

  parsed = nlp(f"The {item} are important.")
  lemma = parsed[1].lemma_

  if lemma in model.wv:
    return lemma

  if enable_warn:
    print("Warning: {} not found".format(lemma))
  return None


In [10]:
# Keywords computation

def get_keywords(keywords, questions, answers, model):
  """Fill, in place, the empty entries of `keywords` with words taken from the answers.

  Entries that already hold keywords are left untouched. For every other entry
  each answer word is scored as ``0.7 * similarity(item, word) +
  0.3 * max similarity(question word, word)``; the words scoring at or above the
  mean are candidates, of which the first five (in answer order) are kept.
  Numbers found in the answer are always kept. The result is stored in answer order.

  Args:
    keywords: dict ``(item, question) -> list``; iterated in insertion order,
      which must be item by item with the questions in positional order.
    questions: dict ``position -> list of question tokens`` (as built by the
      tokenization cells of this notebook).
    answers: dict ``position -> list of token lists``, one token list per item,
      in item order.
    model: trained embedding model exposing ``wv`` with ``in`` and ``similarity``.

  Returns:
    0, always.
  """
  if not keywords:
    return 0

  old_item = next(iter(keywords))[0]
  answ_index = 0   # position of the current item among the items
  quest_index = 0  # position of the current question inside the item

  for key, answer in tqdm(keywords.items(), total=len(keywords), leave=True):

    item = key[0]

    # The item change must be detected BEFORE skipping pre-filled entries:
    # otherwise an item whose first question already has a keyword leaves both
    # indices pointing at the wrong question/answer for the rest of the run.
    if old_item != item:
      answ_index += 1 # increment answer index
      quest_index = 0 # reset question index
      old_item = item # update item

    # keywords already set while reading the file (yes / no / not ...)
    if answer != []:
      quest_index += 1
      continue

    current_question = questions[quest_index]
    current_answer = answers[quest_index][answ_index]

    # Resolve the item once per answer. A composite item (e.g. polar bear) is
    # represented by its first two words; both are used for every answer word.
    name_parts = item.split(' ')
    first_name = name_parts[0].lower()
    second_name = name_parts[1].lower() if len(name_parts) > 1 else ''

    main_item = check_item_in_model(first_name, model)
    if not main_item:
      print("Error: {} not in model".format(first_name))

    second_item = ''
    if second_name:
      second_item = check_item_in_model(second_name, model) or ''

    # only question words known to the model can be compared
    known_qwords = [qword for qword in current_question if qword in model.wv]

    # (word, position in answer) -> score
    aword_scores = dict()

    # numerical keywords
    numbers = []

    for aw_index, aword in enumerate(current_answer):

      # ignore single letter words
      if len(aword) <= 1 : continue

      # a number is selected automatically as a keyword
      if is_number(aword) or aword.isnumeric():
        numbers.append((aword, aw_index, 1000.0))
        continue

      # without the item in the model no similarity can be computed
      if not main_item: continue

      # exclude the item itself (and its plural) from the possible keywords
      if aword in (main_item, main_item + 's', second_item, second_item + 's'):
        continue

      # ignore possible keywords not known by the model
      if aword not in model.wv: continue

      item_sim_value = model.wv.similarity(main_item, aword)
      if second_item: # average out
        item_sim_value = (item_sim_value + model.wv.similarity(second_item, aword)) / 2

      # best match with any question word; stays at -1000.0 if none is known
      best_q_value = -1000.0
      for qword in known_qwords:
        quest_sim_value = model.wv.similarity(qword, aword)
        if quest_sim_value > best_q_value:
          best_q_value = quest_sim_value

      aword_scores[(aword, aw_index)] = (0.7*item_sim_value + 0.3*best_q_value)

    if aword_scores:
      mean = np.asarray(list(aword_scores.values())).mean()
      # first five words (in answer order) whose score reaches the mean
      candidates = [(w[0], w[1], s) for w, s in aword_scores.items() if s >= mean][:5]
    else:
      candidates = []
      if not numbers:
        # nothing usable in this answer: report it and go on with the next entry
        print("Error: no candidate available")

    candidates = sorted(candidates + numbers, key=lambda x: x[1])

    for w, i, s in candidates:
      answer.append(w)

    # increment question index
    quest_index += 1

  return 0

## Word Embedding method

#### Class 05

In [11]:
# tokenized questions 05
# c05_proc_questions: question position -> list of tokens
c05_proc_questions = dict()
for index, questions in tqdm(c05_questions.items(), total=len(c05_questions), leave=True):

  # several wordings may be stored for one position: join them into a single
  # string so that process() returns exactly one token list
  if len(questions) > 1: questions = [' '.join(questions)]
  proc_questions = process(questions, check_frequency=False)
  c05_proc_questions[index] = proc_questions[0]

# tokenized answers 05
# c05_proc_answers: question position -> list of token lists (one per stored answer)
c05_proc_answers = dict()
for index, answers in tqdm(c05_answers.items(), total=len(c05_answers), leave=True):
  proc_answers = process(answers, check_frequency=False)
  c05_proc_answers[index] = proc_answers

100%|██████████| 44/44 [00:00<00:00, 94.88it/s]
100%|██████████| 44/44 [00:41<00:00,  1.05it/s]


In [12]:
# tokenized corpus 05
# check_frequency=True: tokens that occur only once in the whole corpus are dropped
c05_proc_corpus =  process(c05_corpus, check_frequency=True, use_nltk=False, use_tqdm=True)

100%|██████████| 4092/4092 [00:52<00:00, 78.32it/s]


In [None]:
# model 05
# Word2Vec embeddings trained on the tokenized class 05 corpus
# NOTE(review): no seed is passed, so the embeddings (and the keywords derived
# from them) may differ between runs - confirm whether reproducibility is required
c05_model =  gensim.models.Word2Vec(c05_proc_corpus, vector_size=100, window=4, epochs=100, min_count=1)

In [None]:
# keyword extraction
# fills the empty lists of c05_keywords in place; the call itself returns 0
get_keywords(c05_keywords, c05_proc_questions, c05_proc_answers, c05_model)

In [None]:
# evaluation
# prints one row per (concept, question) pair with its keywords
preatty_print(c05_keywords)

In [None]:
# save data
# one line per (concept, question): concept, question and ';'-joined keywords, separated by tabs
save_data(c05_keywords, 'class_05.values.tsv')

0

#### Class 06


In [None]:
# tokenized questions 06
# c06_proc_questions: question position -> list of tokens
c06_proc_questions = dict()
for index, questions in tqdm(c06_questions.items(), total=len(c06_questions), leave=True):

  # several wordings may be stored for one position: join them into a single
  # string so that process() returns exactly one token list
  if len(questions) > 1: questions = [' '.join(questions)]
  proc_questions = process(questions, check_frequency=False)
  c06_proc_questions[index] = proc_questions[0]

# tokenized answers 06
# c06_proc_answers: question position -> list of token lists (one per stored answer)
c06_proc_answers = dict()
for index, answers in tqdm(c06_answers.items(), total=len(c06_answers), leave=True):
  proc_answers = process(answers, check_frequency=False)
  c06_proc_answers[index] = proc_answers


100%|██████████| 55/55 [00:13<00:00,  4.12it/s]
100%|██████████| 55/55 [00:59<00:00,  1.08s/it]


In [None]:
# tokenized corpus 06
# check_frequency=True: tokens that occur only once in the whole corpus are dropped
c06_proc_corpus =  process(c06_corpus, check_frequency=True, use_nltk=False, use_tqdm=True)

100%|██████████| 7370/7370 [01:22<00:00, 89.07it/s] 


In [None]:
# model 06
# Word2Vec embeddings trained on the tokenized class 06 corpus
# NOTE(review): no seed is passed, so the embeddings (and the keywords derived
# from them) may differ between runs - confirm whether reproducibility is required
c06_model =  gensim.models.Word2Vec(c06_proc_corpus, vector_size=100, window=4, epochs=100, min_count=1)

In [None]:
# keyword extraction
# fills the empty lists of c06_keywords in place; the call itself returns 0
get_keywords(c06_keywords, c06_proc_questions, c06_proc_answers, c06_model)

100%|██████████| 7210/7210 [09:49<00:00, 12.22it/s]


0

In [None]:
# evaluation
# prints one row per (concept, question) pair with its keywords
preatty_print(c06_keywords)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
closet   	 Does a closet have any moving parts?                                                                       	 ['move', 'part', 'door', 'also', 'door']
closet   	 Is a closet flexible or rigid?                                                                             	 ['rigid', 'structure', 'storage', 'layout', 'however']
closet   	 Does a closet break if I walk on it?                                                                       	 ['walk', 'shelf', 'may', 'structure']
closet   	 What is the purpose of a closet?                                                                           	 ['purpose', 'storage', 'clothing', 'shoe', 'living']
closet   	 When was a closet invented?                                                                                	 ['concept', 'storage', 'clothing', 'personal', 'challenge']
closet   	 Does a closet have any historical or cultural significance?                  

0

In [None]:
# save data
# one line per (concept, question): concept, question and ';'-joined keywords, separated by tabs
save_data(c06_keywords, 'class_06.values.tsv')

0