<a href="https://colab.research.google.com/github/jeremilev/comp472-project/blob/asg2/Asg2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TASK 1: Evaluation of word2vec-google-news-300 pre-trained model

In [1]:
import gensim.downloader as api
from gensim.models import Word2Vec
from gensim import similarities
from enum import Enum
import json
from dataclasses import dataclass
import nltk

In [2]:
# Catalogue name of the pre-trained vectors evaluated in Task 1.
GOOGLE_MODEL = "word2vec-google-news-300"

# Print the catalogue entry first, then load the vectors. `model` is the
# notebook-level object that the Task 1 helper functions read from.
print(api.info(name=GOOGLE_MODEL))
model = api.load(name=GOOGLE_MODEL)

{'num_records': 3000000, 'file_size': 1743563840, 'base_dataset': 'Google News (about 100 billion words)', 'reader_code': 'https://github.com/RaRe-Technologies/gensim-data/releases/download/word2vec-google-news-300/__init__.py', 'license': 'not found', 'parameters': {'dimension': 300}, 'description': "Pre-trained vectors trained on a part of the Google News dataset (about 100 billion words). The model contains 300-dimensional vectors for 3 million words and phrases. The phrases were obtained using a simple data-driven approach described in 'Distributed Representations of Words and Phrases and their Compositionality' (https://code.google.com/archive/p/word2vec/).", 'read_more': ['https://code.google.com/archive/p/word2vec/', 'https://arxiv.org/abs/1301.3781', 'https://arxiv.org/abs/1310.4546', 'https://www.microsoft.com/en-us/research/publication/linguistic-regularities-in-continuous-space-word-representations/?from=http%3A%2F%2Fresearch.microsoft.com%2Fpubs%2F189726%2Frvecs.pdf'], 'che

In [3]:
class Label(Enum):
  """Verdict attached to a synonym question; the values are written to the CSV output."""
  # Default verdict of a freshly created Question (no guess checked yet).
  GUESS = "guess"
  # The guessed word equals the expected answer.
  CORRECT = "correct"
  # A word was guessed but it is not the expected answer.
  WRONG = "wrong"

@dataclass
class Question:
  """One multiple-choice synonym question together with the model's verdict.

  Instances are created with ``Question(**obj)`` from the entries of the input
  JSON file, so the field names must match the keys used in that file.
  """
  # Word whose synonym is being asked for.
  question: str
  # Expected synonym.
  answer: str
  # Candidate words the guess is picked from.
  choices: list[str]
  # Verdict; stays GUESS until a guessed word has been compared with `answer`.
  label: Label = Label.GUESS
  # Word picked by the model. None until a guess is recorded; the notebook
  # also stores "" here when no guess is available.
  guessed_word: str = None

In [4]:
def get_simmilar_word(question: Question) -> Question:
  """Pick the choice closest to the question word and label the outcome.

  Every choice that the notebook-level ``model`` knows is scored against
  ``question.question`` with ``model.similarity``. The best-scoring choice is
  stored in ``question.guessed_word`` and ``question.label`` becomes CORRECT
  when it equals ``question.answer``, WRONG otherwise.

  Args:
    question: the question to answer; it is updated in place.

  Returns:
    The same ``question`` instance.
    * Question word missing from the model: ``guessed_word`` is set to ""
      and the label is left as it was.
    * None of the choices present in the model: the question is returned
      untouched.
  """
  # Without a vector for the question word nothing can be compared.
  if question.question not in model:
    question.guessed_word = ""
    return question

  # Only choices that have a vector can be scored.
  known_choices = [choice for choice in question.choices if choice in model]
  if not known_choices:
    return question

  # Score the choices directly. Scanning a top-N neighbour list instead would
  # miss any choice that is not among the N nearest words of the whole
  # vocabulary, leaving the question without a guess.
  question.guessed_word = max(
      known_choices,
      key=lambda choice: model.similarity(question.question, choice))

  if question.guessed_word == question.answer:
    question.label = Label.CORRECT
  else:
    question.label = Label.WRONG

  return question

def get_simmilar_list(questions: dict) -> list[Question]:
  """Apply ``get_simmilar_word`` to every item of ``questions``.

  The items are updated in place and the container that was passed in is
  returned as-is. Despite the ``dict`` annotation, the notebook calls this
  with a list of ``Question`` objects.
  """
  for entry in questions:
    get_simmilar_word(entry)
  return questions

In [12]:
# import files
!gdown  https://drive.google.com/uc?id=1LAclVWP_FJBhysLAR0ky1wIP3RveORXO

INP_FILENAME = "synonym.json"
OUTP_FILENAME = f"{GOOGLE_MODEL}-details.csv"
EVAL_FILENAME = "analysis.csv"

questions = []
with open(INP_FILENAME) as f:
  data = json.load(f)
  for obj in data:
    questions.append(Question(**obj))

res = get_simmilar_list(questions)
with open(OUTP_FILENAME, "a") as f:
  for question in res:
    if question.guessed_word == None:
      question.guessed_word = ""
    f.write(",".join([question.question,
                      question.answer,
                      question.guessed_word,
                      question.label.value]))
    f.write("\n")

with open(EVAL_FILENAME, "a") as f:
  length = len(model)
  C = len([x.label == Label.CORRECT for x in res])
  guesses = len([x.label == Label.GUESS for x in res])
  V = len(res) - guesses
  if V != 0:
    accuracy = C/V
  else: accuracy = 0

  f.write(",".join([GOOGLE_MODEL,str(length), str(C), str(V), str(accuracy)]))
  f.write("\n")

Downloading...
From: https://drive.google.com/uc?id=1LAclVWP_FJBhysLAR0ky1wIP3RveORXO
To: /content/synonym.json
  0% 0.00/16.2k [00:00<?, ?B/s]100% 16.2k/16.2k [00:00<00:00, 48.0MB/s]


# TASK 2: Other Pre-trained Models

## Two models trained on different corpora but with the same embedding size

In [None]:
# Two 300-dimensional embedding sets (per their catalogue names) built from
# different corpora.
# NOTE(review): 'ruscorpora' is presumably a Russian corpus - confirm that it
# is a meaningful choice for the synonym questions used in Task 1.
c1_300 = api.load(name='word2vec-ruscorpora-300')
c2_300 = api.load(name='glove-wiki-gigaword-300')



In [15]:
# Names of every model available in the gensim downloader catalogue.
print(list(api.info()['models']))


['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


## Two models trained on the same corpus but with different embedding sizes

In [14]:
# For this task we import pretrained models from Hugging Face; the citations
# are in the text cells below them.
from gensim.models import KeyedVectors
from huggingface_hub import hf_hub_download

# Fetch the vector file from the Hub and parse it as word2vec text format.
# NOTE(review): this repo id says "dewiki" while the companion 300d cell loads
# an "enwiki" file, although the section compares models from the same
# corpus - confirm which file is intended.
c3_100 = KeyedVectors.load_word2vec_format(
    hf_hub_download(
        repo_id="Word2vec/wikipedia2vec_dewiki_20180420_100d",
        filename="dewiki_20180420_100d.txt",
    )
)
# model.most_similar("your_word")


KeyboardInterrupt: ignored

@inproceedings{yamada2020wikipedia2vec,
  title = "{W}ikipedia2{V}ec: An Efficient Toolkit for Learning and Visualizing the Embeddings of Words and Entities from {W}ikipedia",
  author={Yamada, Ikuya and Asai, Akari and Sakuma, Jin and Shindo, Hiroyuki and Takeda, Hideaki and Takefuji, Yoshiyasu and Matsumoto, Yuji},
  booktitle = {Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations},
  year = {2020},
  publisher = {Association for Computational Linguistics},
  pages = {23--30}
}


In [None]:
# For this task we import pretrained models from Hugging Face; the citations
# are in the text cells below them.
from gensim.models import KeyedVectors
from huggingface_hub import hf_hub_download

# Fetch the vector file from the Hub and parse it as word2vec text format.
c4_300 = KeyedVectors.load_word2vec_format(
    hf_hub_download(
        repo_id="Word2vec/wikipedia2vec_enwiki_20180420_win10_300d",
        filename="enwiki_20180420_win10_300d.txt",
    )
)
# model.most_similar("your_word")

@inproceedings{yamada2020wikipedia2vec,
  title = "{W}ikipedia2{V}ec: An Efficient Toolkit for Learning and Visualizing the Embeddings of Words and Entities from {W}ikipedia",
  author={Yamada, Ikuya and Asai, Akari and Sakuma, Jin and Shindo, Hiroyuki and Takeda, Hideaki and Takefuji, Yoshiyasu and Matsumoto, Yuji},
  booktitle = {Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations},
  year = {2020},
  publisher = {Association for Computational Linguistics},
  pages = {23--30}
}


# TASK 3: Training Own Models

In [None]:
#!pip uninstall nltk

In [None]:
#!pip install -U nltk

In [None]:
### PREPROCESSING ###
#nltk.download()
#nltk.download('gutenberg')
nltk.download('punkt')

# TODO
# - Fully implement preprocessing for all books.
# - Create word embeddings from sentences.

# Glossary
# - tokenizing (grouping): the word tokenizer separates a text into words.
# - corpora: body of text
# - lexicon: words and their meanings

#nltk.download('twitter_samples')
# `with` closes the file handle as soon as the text has been read.
with open("/content/books/testfile.txt") as sample:
  booktext = sample.read()
booktext = booktext.replace("\n"," ")

# Full corpus (not enabled yet): read each of the books below the same way,
# join the texts with " " and replace the newlines as above.
#   /content/books/alice.txt
#   /content/books/the_gold_thimble.txt
#   /content/books/a_spring_harvest.txt
#   /content/books/a_middle_english_vocabulary.txt
#   /content/books/a_tangled_tale.txt

# One list of lower-cased word tokens per sentence of the text.
data = [
    [word.lower() for word in nltk.word_tokenize(sentence)]
    for sentence in nltk.sent_tokenize(booktext)
]


#tokens = nltk.sent_tokenize(booktext)
#print(tokens)

#wordvec = [nltk.word_tokenize(booktext)]
#wordvec





[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


FileNotFoundError: ignored

In [None]:
### TRAINING MODEL ###
# TODO
# - Generate 4 different models.
# - What are the default CBOW values seen in class?

#Window and Embeddings(vectors) sizes
W1 = 3
W2 = 5
E5 = 50
E6 = 200



#Create CBOW models
#Parameter example of Word2Vec(sentences=text, vector_size=100, window=5, min_count=1, workers=4)
# Word2Vec expects an iterable of token lists (one per sentence), i.e. `data`
# from the preprocessing cell. Passing the raw `booktext` string would make
# gensim iterate over single characters instead of words.
own_model1 = Word2Vec(sentences=data, vector_size=E5, window=W1, min_count=1)
#own_model2 = Word2Vec(sentences=data, vector_size=E5, window=W2, min_count=1)
#own_model3 = Word2Vec(sentences=data, vector_size=E6, window=W1, min_count=1)
#own_model4 = Word2Vec(sentences=data, vector_size=E6, window=W2, min_count=1)



In [None]:
### CREATING CSV FILES ###
"""
TO DO
Verify if correct
"""
def create_model_file(model_name, results=None):
  """Write the per-question details CSV for one model.

  The file is named ``<model_name>-details.csv``, matching the naming of the
  Task 1 details file. It holds one row per question: question, answer,
  guessed word, label. It is rewritten on every call, so re-running a cell
  does not duplicate rows.

  Args:
    model_name: prefix of the output file name (converted with ``str``). A
      prefix that already ends with "-" does not get a second dash.
    results: iterable of evaluated questions to write. When omitted, the
      notebook-level ``res`` list is used, as before.
  """
  if results is None:
    results = res

  prefix = str(model_name)
  separator = "" if prefix.endswith("-") else "-"
  details_path = prefix + separator + "details.csv"

  with open(details_path, "w") as f:
    for question in results:
      # A question without a guess is written with an empty guess column;
      # the question object itself is left unchanged.
      guessed_word = "" if question.guessed_word is None else question.guessed_word
      f.write(",".join([question.question,
                        question.answer,
                        guessed_word,
                        question.label.value]))
      f.write("\n")


def append_results(model_name, embed_size, window_size, results=None, vocab_size=None):
  """Append one summary row for a model to the analysis CSV (``EVAL_FILENAME``).

  Row layout: ``<model_name>-<embed_size>-<window_size>``, vocabulary size,
  number of correct labels, number of non-guess labels, accuracy.

  Args:
    model_name: base name of the model.
    embed_size: embedding size, used only to build the row's name.
    window_size: window size, used only to build the row's name.
    results: evaluated questions of this model. When given, the counts and
      the accuracy are computed from their labels. When omitted, the
      notebook-level ``C``, ``V`` and ``accuracy`` values are written, as
      before.
    vocab_size: vocabulary size of this model. When omitted, the
      notebook-level ``length`` value is written, as before.
  """
  model_name = str(model_name) + "-" + str(embed_size) + "-" + str(window_size)

  if results is None:
    # Legacy behaviour: reuse whatever the notebook computed last.
    correct, answered, acc = C, V, accuracy
  else:
    results = list(results)
    correct = sum(q.label == Label.CORRECT for q in results)
    answered = sum(q.label != Label.GUESS for q in results)
    # Accuracy over answered questions only; 0 when nothing was answered.
    acc = correct / answered if answered != 0 else 0

  size = length if vocab_size is None else vocab_size

  with open(EVAL_FILENAME, "a") as f:
    f.write(",".join([model_name,str(size), str(correct), str(answered), str(acc)]))
    f.write("\n")


