<a href="https://colab.research.google.com/github/jeremilev/comp472-project/blob/asg2/Asg2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TASK 1: Evaluation of word2vec-google-news-300 pre-trained model

In [69]:
import gensim.downloader as api
from gensim.models import Word2Vec
from gensim import similarities
from enum import Enum
import json
from dataclasses import dataclass
import nltk
import numpy as np
import pandas as pd
import csv
import random


In [2]:
GOOGLE_MODEL = "word2vec-google-news-300"

print(api.info(GOOGLE_MODEL))
model = api.load(GOOGLE_MODEL)

{'num_records': 3000000, 'file_size': 1743563840, 'base_dataset': 'Google News (about 100 billion words)', 'reader_code': 'https://github.com/RaRe-Technologies/gensim-data/releases/download/word2vec-google-news-300/__init__.py', 'license': 'not found', 'parameters': {'dimension': 300}, 'description': "Pre-trained vectors trained on a part of the Google News dataset (about 100 billion words). The model contains 300-dimensional vectors for 3 million words and phrases. The phrases were obtained using a simple data-driven approach described in 'Distributed Representations of Words and Phrases and their Compositionality' (https://code.google.com/archive/p/word2vec/).", 'read_more': ['https://code.google.com/archive/p/word2vec/', 'https://arxiv.org/abs/1301.3781', 'https://arxiv.org/abs/1310.4546', 'https://www.microsoft.com/en-us/research/publication/linguistic-regularities-in-continuous-space-word-representations/?from=http%3A%2F%2Fresearch.microsoft.com%2Fpubs%2F189726%2Frvecs.pdf'], 'che

In [3]:
class Label(Enum):
  GUESS = "guess"
  CORRECT = "correct"
  WRONG = "wrong"

@dataclass
class Question:
  question: str
  answer: str
  choices: [str]
  label: Label = Label.GUESS
  guessed_word: str = None

In [4]:
def predict_most_simmilar_word(question: Question) -> Question:
  highest_similarity = (0, "")
  for choice in question.choices:
      try:
        similarity = model.similarity(question.question, choice)
      except(KeyError):
        continue
      if similarity > highest_similarity[0]:
        highest_similarity = (similarity, choice)

  question.guessed_word = highest_similarity[1]
  if highest_similarity[0] == 0: return question
  if question.answer == question.guessed_word: question.label = Label.CORRECT
  else: question.label = Label.WRONG
  return question

def get_simmilar_list(questions: dict) -> list[Question]:
  for question in questions:
    predict_most_simmilar_word(question)

  return questions

In [5]:
# import files
!gdown  https://drive.google.com/uc?id=1LAclVWP_FJBhysLAR0ky1wIP3RveORXO

INP_FILENAME = "synonym.json"
OUTP_FILENAME = f"{GOOGLE_MODEL}-details.csv"
EVAL_FILENAME = "analysis.csv"

questions = []
with open(INP_FILENAME) as f:
  data = json.load(f)
  for obj in data:
    questions.append(Question(**obj))

res = get_simmilar_list(questions)
with open(OUTP_FILENAME, "a") as f:
  for question in res:
    if question.guessed_word == None:
      question.guessed_word = ""
    f.write(",".join([question.question,
                      question.answer,
                      question.guessed_word,
                      question.label.value]))
    f.write("\n")

with open(EVAL_FILENAME, "a") as f:
  length = len(model)
  C = len([x for x in res if x.label == Label.CORRECT])
  guesses = len([x for x in res if x.label == Label.GUESS])
  V = len(res) - guesses
  if V != 0:
    accuracy = C/V
  else: accuracy = 0

  f.write(",".join([GOOGLE_MODEL,str(length), str(C), str(V), str(accuracy)]))
  f.write("\n")

Downloading...
From: https://drive.google.com/uc?id=1LAclVWP_FJBhysLAR0ky1wIP3RveORXO
To: /content/synonym.json
  0% 0.00/16.2k [00:00<?, ?B/s]100% 16.2k/16.2k [00:00<00:00, 34.2MB/s]


# TASK 2: Other Pre-trained Models

## Two models with different corpora but same embeddings

In [6]:
from gensim.models import KeyedVectors
from huggingface_hub import hf_hub_download
wiki2018 = KeyedVectors.load_word2vec_format(hf_hub_download(repo_id="Word2vec/wikipedia2vec_enwiki_20180420_100d", filename="enwiki_20180420_100d.txt"))

enwiki_20180420_100d.txt:   0%|          | 0.00/3.49G [00:00<?, ?B/s]

@inproceedings{yamada2020wikipedia2vec,
  title = "{W}ikipedia2{V}ec: An Efficient Toolkit for Learning and Visualizing the Embeddings of Words and Entities from {W}ikipedia",
  author={Yamada, Ikuya and Asai, Akari and Sakuma, Jin and Shindo, Hiroyuki and Takeda, Hideaki and Takefuji, Yoshiyasu and Matsumoto, Yuji},
  booktitle = {Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations},
  year = {2020},
  publisher = {Association for Computational Linguistics},
  pages = {23--30}
}

In [7]:
wiki_gigaword = api.load('glove-wiki-gigaword-100')



## Two models with same corpora but different embeddings

In [9]:
twitter_25 = api.load('glove-twitter-25')
twitter_50 = api.load('glove-twitter-50')



##

In [10]:
!gdown https://drive.google.com/uc?id=1-T1NeG8vIkCRw_Vz7xSVMb4k1TiOAbMy

Downloading...
From: https://drive.google.com/uc?id=1-T1NeG8vIkCRw_Vz7xSVMb4k1TiOAbMy
To: /content/synonym.csv
  0% 0.00/4.42k [00:00<?, ?B/s]100% 4.42k/4.42k [00:00<00:00, 13.8MB/s]


In [40]:
# Read dataset and convert it to a numpy array for easier index referencing later on
synonym_data = pd.read_csv('synonym.csv')
synonym_np = synonym_data.to_numpy()
synonym_data.head()

Unnamed: 0,question,answer,0,1,2,3
0,enormously,tremendously,appropriately,uniquely,tremendously,decidedly
1,provisions,stipulations,stipulations,interrelations,jurisdictions,interpretations
2,haphazardly,randomly,dangerously,densely,randomly,linearly
3,prominent,conspicuous,battered,ancient,mysterious,conspicuous
4,zenith,pinnacle,completion,pinnacle,outset,decline


## wiki2018

In [44]:
wiki2018_results = np.zeros((80,4), dtype= 'U50')
synonym_np[0,1]

'tremendously'

In [55]:
# iterate over question to get words
c = 0
v = 80
for i in range(0,80):
  wiki2018_results[i,1] = synonym_np[i,1]
for i in range(0,80):
  wiki2018_results[i,0] = synonym_np[i,0]
  try:
    cossim_list = []
    for j in range(2,6):
      try:
        cossim_list.append(wiki2018.similarity(synonym_np[i,0], synonym_np[i,j]))
      except Exception as e:
            print(e)
    #cossim_list = [wiki2018.similarity(synonym_np[i,0], synonym_np[i,2]),wiki2018.similarity(synonym_np[i,0], synonym_np[i,3]),wiki2018.similarity(synonym_np[i,0], synonym_np[i,4]),wiki2018.similarity(synonym_np[i,0], synonym_np[i,5])]

    guess_value = max(cossim_list)
    guess_index = cossim_list.index(guess_value)
    guess = synonym_np[i,guess_index+2]
    wiki2018_results[i,1] = synonym_np[i,1]
    if guess == synonym_np[i,1]:
      wiki2018_results[i,3] = 'correct'
      wiki2018_results[i,2] = guess
      c +=1
    else:
      wiki2018_results[i,3] = 'wrong'
      wiki2018_results[i,2] = guess
  except Exception as e:
    print(e)
    v -= 1

wiki2018_results

"Key 'bipartisanly' not present"


array([['enormously', 'tremendously', 'tremendously', 'correct'],
       ['provisions', 'stipulations', 'stipulations', 'correct'],
       ['haphazardly', 'randomly', 'randomly', 'correct'],
       ['prominent', 'conspicuous', 'conspicuous', 'correct'],
       ['zenith', 'pinnacle', 'pinnacle', 'correct'],
       ['flawed', 'imperfect', 'imperfect', 'correct'],
       ['urgently', 'desperately', 'desperately', 'correct'],
       ['consumed', 'eaten', 'eaten', 'correct'],
       ['advent', 'coming', 'coming', 'correct'],
       ['concisely', 'succinctly', 'succinctly', 'correct'],
       ['salutes', 'greetings', 'ceremonies', 'wrong'],
       ['solitary', 'alone', 'restless', 'wrong'],
       ['hasten', 'accelerate', 'accelerate', 'correct'],
       ['perseverance', 'endurance', 'generosity', 'wrong'],
       ['fanciful', 'imaginative', 'imaginative', 'correct'],
       ['showed', 'demonstrated', 'demonstrated', 'correct'],
       ['constantly', 'continually', 'continually', 'correct'],

In [54]:
np.savetxt("word2vec-enwiki-2018-100-details.csv", wiki2018_results, delimiter=",",fmt='%s')
size = len(wiki2018)
model_info = ['word2vec-enwiki-2018-100', size, c, v, c/v]
with open('analysis.csv', 'a', newline='') as f:
  writer = csv.writer(f)
  writer.writerow(model_info)


## wiki_gigaword

In [57]:
# iterate over question to get words
wiki_giga_results = np.zeros((80,4), dtype= 'U50')
c = 0
v = 80
for i in range(0,80):
  wiki_giga_results[i,1] = synonym_np[i,1]
for i in range(0,80):
  wiki_giga_results[i,0] = synonym_np[i,0]
  try:
    cossim_list = []
    for j in range(2,6):
      try:
        cossim_list.append(wiki_gigaword.similarity(synonym_np[i,0], synonym_np[i,j]))
      except Exception as e:
            print(e)

    guess_value = max(cossim_list)
    guess_index = cossim_list.index(guess_value)
    guess = synonym_np[i,guess_index+2]
    wiki_giga_results[i,1] = synonym_np[i,1]
    if guess == synonym_np[i,1]:
      wiki_giga_results[i,3] = 'correct'
      wiki_giga_results[i,2] = guess
      c +=1
    else:
      wiki_giga_results[i,3] = 'wrong'
      wiki_giga_results[i,2] = guess
  except Exception as e:
    print(e)
    v -= 1

wiki_giga_results

"Key 'verbosely' not present"
"Key 'bipartisanly' not present"
"Key 'apathetically' not present"


array([['enormously', 'tremendously', 'tremendously', 'correct'],
       ['provisions', 'stipulations', 'stipulations', 'correct'],
       ['haphazardly', 'randomly', 'randomly', 'correct'],
       ['prominent', 'conspicuous', 'ancient', 'wrong'],
       ['zenith', 'pinnacle', 'pinnacle', 'correct'],
       ['flawed', 'imperfect', 'imperfect', 'correct'],
       ['urgently', 'desperately', 'desperately', 'correct'],
       ['consumed', 'eaten', 'eaten', 'correct'],
       ['advent', 'coming', 'coming', 'correct'],
       ['concisely', 'succinctly', 'succinctly', 'correct'],
       ['salutes', 'greetings', 'greetings', 'correct'],
       ['solitary', 'alone', 'restless', 'wrong'],
       ['hasten', 'accelerate', 'accelerate', 'correct'],
       ['perseverance', 'endurance', 'generosity', 'wrong'],
       ['fanciful', 'imaginative', 'imaginative', 'correct'],
       ['showed', 'demonstrated', 'demonstrated', 'correct'],
       ['constantly', 'continually', 'continually', 'correct'],
    

In [58]:
np.savetxt("glove-wiki-gigaword-100-details.csv", wiki_giga_results, delimiter=",",fmt='%s')
size = len(wiki_gigaword)
model_info = ['glove-wiki-gigaword-100', size, c, v, c/v]
with open('analysis.csv', 'a', newline='') as f:
  writer = csv.writer(f)
  writer.writerow(model_info)

## twitter_25

In [71]:
# iterate over question to get words
twitter25_results = np.zeros((80,4), dtype= 'U50')
c = 0
v = 80
for i in range(0,80):
  twitter25_results[i,1] = synonym_np[i,1]
for i in range(0,80):
  twitter25_results[i,0] = synonym_np[i,0]
  try:
    cossim_list = []
    for j in range(2,6):
      try:
        cossim_list.append(twitter_25.similarity(synonym_np[i,0], synonym_np[i,j]))
      except Exception as e:
            print(e)

    guess_value = max(cossim_list)
    guess_index = cossim_list.index(guess_value)
    guess = synonym_np[i,guess_index+2]
    twitter25_results[i,1] = synonym_np[i,1]
    if guess == synonym_np[i,1]:
      twitter25_results[i,3] = 'correct'
      twitter25_results[i,2] = guess
      c +=1
    else:
      twitter25_results[i,3] = 'wrong'
      twitter25_results[i,2] = guess
  except Exception as e:
    print(e)
    randi = random.randint(2, 5)
    twitter25_results[i,2] = synonym_np[i,randi]
    twitter25_results[i,3] = 'guess'
    v -= 1

twitter25_results

"Key 'interrelations' not present"
"Key 'linearly' not present"
"Key 'prudently' not present"
"Key 'unequaled' not present"
"Key 'peculiarly' not present"
"Key 'peculiarly' not present"
"Key 'peculiarly' not present"
"Key 'peculiarly' not present"
max() arg is an empty sequence
"Key 'shrewdly' not present"
"Key 'verbosely' not present"
"Key 'halfheartedly' not present"
"Key 'halfheartedly' not present"
"Key 'halfheartedly' not present"
"Key 'halfheartedly' not present"
max() arg is an empty sequence
"Key 'descriptively' not present"
"Key 'steadier' not present"
"Key 'haltingly' not present"


array([['enormously', 'tremendously', 'tremendously', 'correct'],
       ['provisions', 'stipulations', 'interrelations', 'wrong'],
       ['haphazardly', 'randomly', 'densely', 'wrong'],
       ['prominent', 'conspicuous', 'ancient', 'wrong'],
       ['zenith', 'pinnacle', 'pinnacle', 'correct'],
       ['flawed', 'imperfect', 'imperfect', 'correct'],
       ['urgently', 'desperately', 'desperately', 'correct'],
       ['consumed', 'eaten', 'eaten', 'correct'],
       ['advent', 'coming', 'coming', 'correct'],
       ['concisely', 'succinctly', 'succinctly', 'correct'],
       ['salutes', 'greetings', 'greetings', 'correct'],
       ['solitary', 'alone', 'fearless', 'wrong'],
       ['hasten', 'accelerate', 'accelerate', 'correct'],
       ['perseverance', 'endurance', 'generosity', 'wrong'],
       ['fanciful', 'imaginative', 'imaginative', 'correct'],
       ['showed', 'demonstrated', 'repeated', 'wrong'],
       ['constantly', 'continually', 'instantly', 'wrong'],
       ['issues',

In [72]:
np.savetxt('twitter-25-details.csv', twitter25_results, delimiter=",",fmt='%s')
size = len(twitter_25)
model_info = ['glove-twitter-25', size, c, v, c/v]
with open('analysis.csv', 'a', newline='') as f:
  writer = csv.writer(f)
  writer.writerow(model_info)

## twitter_50

In [73]:
# iterate over question to get words
twitter50_results = np.zeros((80,4), dtype= 'U50')
c = 0
v = 80
for i in range(0,80):
  twitter50_results[i,1] = synonym_np[i,1]
for i in range(0,80):
  twitter50_results[i,0] = synonym_np[i,0]
  try:
    cossim_list = []
    for j in range(2,6):
      try:
        cossim_list.append(twitter_50.similarity(synonym_np[i,0], synonym_np[i,j]))
      except Exception as e:
            print(e)

    guess_value = max(cossim_list)
    guess_index = cossim_list.index(guess_value)
    guess = synonym_np[i,guess_index+2]
    twitter50_results[i,1] = synonym_np[i,1]
    if guess == synonym_np[i,1]:
      twitter50_results[i,3] = 'correct'
      twitter50_results[i,2] = guess
      c +=1
    else:
      twitter50_results[i,3] = 'wrong'
      twitter50_results[i,2] = guess
  except Exception as e:
    print(e)
    randi = random.randint(2, 5)
    twitter50_results[i,2] = synonym_np[i,randi]
    twitter50_results[i,3] = 'guess'
    v -= 1

twitter50_results

"Key 'interrelations' not present"
"Key 'linearly' not present"
"Key 'prudently' not present"
"Key 'unequaled' not present"
"Key 'peculiarly' not present"
"Key 'peculiarly' not present"
"Key 'peculiarly' not present"
"Key 'peculiarly' not present"
max() arg is an empty sequence
"Key 'shrewdly' not present"
"Key 'verbosely' not present"
"Key 'halfheartedly' not present"
"Key 'halfheartedly' not present"
"Key 'halfheartedly' not present"
"Key 'halfheartedly' not present"
max() arg is an empty sequence
"Key 'descriptively' not present"
"Key 'steadier' not present"
"Key 'haltingly' not present"


array([['enormously', 'tremendously', 'tremendously', 'correct'],
       ['provisions', 'stipulations', 'interrelations', 'wrong'],
       ['haphazardly', 'randomly', 'densely', 'wrong'],
       ['prominent', 'conspicuous', 'battered', 'wrong'],
       ['zenith', 'pinnacle', 'pinnacle', 'correct'],
       ['flawed', 'imperfect', 'imperfect', 'correct'],
       ['urgently', 'desperately', 'desperately', 'correct'],
       ['consumed', 'eaten', 'eaten', 'correct'],
       ['advent', 'coming', 'coming', 'correct'],
       ['concisely', 'succinctly', 'succinctly', 'correct'],
       ['salutes', 'greetings', 'ceremonies', 'wrong'],
       ['solitary', 'alone', 'restless', 'wrong'],
       ['hasten', 'accelerate', 'accelerate', 'correct'],
       ['perseverance', 'endurance', 'endurance', 'correct'],
       ['fanciful', 'imaginative', 'imaginative', 'correct'],
       ['showed', 'demonstrated', 'repeated', 'wrong'],
       ['constantly', 'continually', 'instantly', 'wrong'],
       ['issues'

In [74]:
np.savetxt('twitter-50-details.csv', twitter50_results, delimiter=",",fmt='%s')
size = len(twitter_50)
model_info = ['glove-twitter-50', size, c, v, c/v]
with open('analysis.csv', 'a', newline='') as f:
  writer = csv.writer(f)
  writer.writerow(model_info)

# TASK 3: Training Own Models

In [None]:
### PREPROCESSING ###
#nltk.download()
#nltk.download('gutenberg')
nltk.download('punkt')
"""
TO DO
Fully implement preprocessing for all books.
Create word embeddings from sentences.
"""


"""
tokenizing (grouping) - word tokenizer will seperate words.
corpora - body of text
lexicon - words and their meanings
"""

#nltk.download('twitter_samples')
sample = open("/content/books/testfile.txt")
booktext = sample.read()
booktext = booktext.replace("\n"," ")


#sample = open("/content/books/alice.txt")
#booktext = sample.read()
#sample = open("/content/books/the_gold_thimble.txt")
#booktext += " " + sample.read()
#sample = open("/content/books/a_spring_harvest.txt")
#booktext += " " + sample.read()
#sample = open("/content/books/a_middle_english_vocabulary.txt")
#booktext += " " + sample.read()
#sample = open("/content/books/a_tangled_tale.txt")
#booktext += " " + sample.read()
#booktext = booktext.replace("\n"," ")

data = []

#iterate through each sentence in the file
for i in nltk.sent_tokenize(booktext):
  temp = []
  #tokenize sentence into words
  for j in nltk.word_tokenize(i):
    temp.append(j.lower())
  data.append(temp)


#tokens = nltk.sent_tokenize(booktext)
#print(tokens)

#wordvec = [nltk.word_tokenize(booktext)]
#wordvec





[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


FileNotFoundError: ignored

In [None]:
### TRAINING MODEL ###
"""
TO DO
Generate 4 different models.
What are the default values saw in class? For the CBOW
"""
#Window and Embeddings(vectors) sizes
W1 = 3
W2 = 5
E5 = 50
E6 = 200



#Create CBOW models
#Parameter example of Word2Vec(sentences=text, vector_size=100, window=5, min_count=1, workers=4)
own_model1 = Word2Vec(booktext, vector_size=E5, window=W1, min_count=1)
#own_model2 = Word2Vec(booktext, vector_size=E5, window=W2, min_count=1)
#own_model3 = Word2Vec(booktext, vector_size=E6, window=W1, min_count=1)
#own_model4 = Word2Vec(booktext, vector_size=E6, window=W2, min_count=1)



In [None]:
### CREATING CSV FILES ###
"""
TO DO
Verify if correct
"""
def create_model_file(model_name):
  model_name = str(model_name) + "details.csv"
  with open(model_name, "a") as f:
    for question in res:
      if question.guessed_word == None:
        question.guessed_word = ""
      f.write(",".join([question.question,
                        question.answer,
                        question.guessed_word,
                        question.label.value]))
      f.write("\n")


def append_results(model_name, embed_size, window_size):
  model_name = str(model_name) + "-" + str(embed_size) + "-" + str(window_size)
  with open(EVAL_FILENAME, "a") as f:
    f.write(",".join([model_name,str(length), str(C), str(V), str(accuracy)]))
    f.write("\n")


