In [None]:
import csv
import json
from dataclasses import dataclass
from enum import Enum
from typing import Optional

import gensim.downloader as api
from gensim import similarities
from gensim.models import Word2Vec

In [None]:
# Name of the pretrained Google News vectors; used both as the identifier passed
# to gensim's downloader and as the key in the registry below.
GOOGLE_MODEL = "word2vec-google-news-300"

# Registry of loaded embedding models keyed by name. Later cells add more
# entries, and the evaluation helpers look models up here by key.
models = {GOOGLE_MODEL: api.load(GOOGLE_MODEL)}


In [None]:
class Label(Enum):
    """Outcome recorded for a single question."""

    # Initial state: no prediction has been compared with the answer.
    GUESS = "guess"
    # The predicted word equals the expected answer.
    CORRECT = "correct"
    # A word was predicted, but it is not the expected answer.
    WRONG = "wrong"

@dataclass
class Question:
    """One multiple-choice question together with its prediction state.

    Attributes:
        question: The query word that is looked up in the embedding model.
        answer: The expected (correct) choice.
        choices: Candidate words offered for the question.
        label: Outcome of the prediction; stays ``Label.GUESS`` until a
            prediction has been compared with ``answer``.
        guessed_word: The predicted word, or ``None`` before any prediction.
    """

    question: str
    answer: str
    # ``[str]`` is a list literal, not a type; ``list[str]`` is the annotation.
    choices: list[str]
    label: Label = Label.GUESS
    # The default is ``None``, so the annotation has to admit it.
    guessed_word: Optional[str] = None

In [48]:
def get_simmilar_word(question: Question, model=None) -> Question:
    """Guess the answer from the model's nearest neighbours of the question word.

    The first neighbour, in the order returned by ``most_similar_cosmul``, that
    is also one of ``question.choices`` becomes the guess.

    Args:
        question: The question to answer; it is updated in place.
        model: Key into the module-level ``models`` registry. ``None`` selects
            ``GOOGLE_MODEL``.

    Returns:
        The same ``question`` object. ``guessed_word`` is set to ``""`` when the
        lookup raises ``KeyError``; ``label`` is only changed when one of the
        neighbours matches a choice.
    """
    # Resolve the vectors from the shared registry: there is no module-level
    # ``model`` object this function could rely on.
    vectors = models[GOOGLE_MODEL if model is None else model]

    try:
        suggestions = vectors.most_similar_cosmul(positive=[question.question])
    except KeyError:
        # The question word is unknown to the model: leave the label untouched.
        question.guessed_word = ""
        return question

    for suggestion, _ in suggestions:
        if suggestion in question.choices:
            question.guessed_word = suggestion
            question.label = (
                Label.CORRECT if suggestion == question.answer else Label.WRONG
            )
            break

    return question

def predict_most_simmilar_word(question: Question, model=GOOGLE_MODEL) -> Question:
    """Pick the choice most similar to the question word and label the result.

    Args:
        question: The question to answer; it is updated in place.
        model: Key into the module-level ``models`` registry.

    Returns:
        The same ``question`` object. When no choice could be scored (every
        similarity lookup raised ``KeyError``), ``guessed_word`` is ``""`` and
        ``label`` is left unchanged; otherwise ``label`` becomes
        ``Label.CORRECT`` or ``Label.WRONG``.
    """
    vectors = models[model]

    # ``None`` means "nothing scored yet". Seeding the search with a similarity
    # of 0 would discard every choice whose similarity is <= 0 and would confuse
    # a genuine score of 0 with "no usable choice".
    best_choice = None
    best_similarity = float("-inf")
    for choice in question.choices:
        try:
            similarity = vectors.similarity(question.question, choice)
        except KeyError:
            # One of the two words is unknown to the model; skip this pair.
            continue
        if similarity > best_similarity:
            best_similarity, best_choice = similarity, choice

    if best_choice is None:
        question.guessed_word = ""
        return question

    question.guessed_word = best_choice
    question.label = (
        Label.CORRECT if best_choice == question.answer else Label.WRONG
    )
    return question

def get_simmilar_list(questions: list[Question], model=GOOGLE_MODEL) -> list[Question]:
    """Run ``predict_most_simmilar_word`` over every question.

    Args:
        questions: Questions to answer; each one is updated in place.
        model: Key into the module-level ``models`` registry.

    Returns:
        The same ``questions`` object that was passed in.
    """
    for question in questions:
        predict_most_simmilar_word(question, model)
    return questions



In [44]:
# Test set read by analyze_model: a JSON document whose items are unpacked
# into Question(**item).
INP_FILENAME = "/content/synonym.json"
# Summary file to which analyze_model appends one row per evaluated model.
EVAL_FILENAME = "analysis.csv"

def analyze_model(model=GOOGLE_MODEL):
    """Evaluate one registered model on the test set and write the results.

    Two CSV files are produced:

    * ``<model>-details.csv`` - one row per question: question word, expected
      answer, guessed word, label.
    * ``EVAL_FILENAME`` - one appended row: model name, ``len(models[model])``,
      number of correct answers, number of answered questions, accuracy.

    Args:
        model: Key into the module-level ``models`` registry.
    """
    details_filename = f"{model}-details.csv"

    with open(INP_FILENAME) as f:
        questions = [Question(**obj) for obj in json.load(f)]

    results = get_simmilar_list(questions, model)

    # "w" rather than "a": the details file describes a single evaluation, so
    # re-running the notebook must not stack duplicate rows in it.
    with open(details_filename, "w", newline="") as f:
        # csv.writer quotes fields that contain commas or quotes, which a plain
        # ",".join would silently turn into extra columns.
        writer = csv.writer(f, lineterminator="\n")
        for question in results:
            writer.writerow([
                question.question,
                question.answer,
                # A question without a guess still gets an (empty) cell.
                "" if question.guessed_word is None else question.guessed_word,
                question.label.value,
            ])

    correct = sum(1 for q in results if q.label == Label.CORRECT)
    unanswered = sum(1 for q in results if q.label == Label.GUESS)
    answered = len(results) - unanswered
    # Accuracy is measured over answered questions only; 0 when there are none.
    accuracy = correct / answered if answered else 0

    # Append: every evaluated model contributes one row to the shared summary.
    with open(EVAL_FILENAME, "a", newline="") as f:
        csv.writer(f, lineterminator="\n").writerow(
            [model, len(models[model]), correct, answered, accuracy]
        )

In [None]:
# Evaluate the default model (GOOGLE_MODEL) and write its CSV outputs.
analyze_model()

In [None]:
from gensim.models import KeyedVectors
from huggingface_hub import hf_hub_download
WIKI2018_MODEL = "wiki2018"
# Fetch the vector file from the Hugging Face Hub and register the vectors
# loaded from it under WIKI2018_MODEL.
models[WIKI2018_MODEL] = KeyedVectors.load_word2vec_format(
    hf_hub_download(
        repo_id="Word2vec/wikipedia2vec_enwiki_20180420_100d",
        filename="enwiki_20180420_100d.txt",
    )
)

In [None]:
# NOTE(review): the constant name reads "GIGIWORLD" while the model identifier
# says "gigaword"; the name is kept because other cells may refer to it.
WIKI_GIGIWORLD_MODEL = "glove-wiki-gigaword-100"
models[WIKI_GIGIWORLD_MODEL] = api.load(WIKI_GIGIWORLD_MODEL)



In [None]:
# Two GloVe Twitter models (presumably 25- and 50-dimensional, going by their
# names - TODO confirm), registered under their downloader identifiers.
TWITTER_25_MODEL = 'glove-twitter-25'
TWITTER_50_MODEL = 'glove-twitter-50'
models[TWITTER_25_MODEL] = api.load(TWITTER_25_MODEL)
models[TWITTER_50_MODEL] = api.load(TWITTER_50_MODEL)



In [49]:
# Evaluate every model currently in the registry, in insertion order.
for model_name in models:
    analyze_model(model_name)

Task 3

In [114]:
import string
from os import listdir
from os.path import isfile, join
from nltk.tokenize import sent_tokenize
# ``from nltk.tokenize import ...`` does not bind the name ``nltk`` itself, so
# import the package here; otherwise this cell fails on a fresh kernel.
import nltk
nltk.download('punkt')

from nltk.corpus import brown
import urllib.request
from urllib.error import HTTPError



def sentenceise_book(book_file_name: str, folder: str = "/content/books/") -> list[str]:
    """Read one book file and split its text into sentences.

    Args:
        book_file_name: File name of the book, relative to ``folder``.
        folder: Directory containing the book. Defaults to ``/content/books/``,
            so existing single-argument calls keep reading from there.

    Returns:
        The sentences produced by ``sent_tokenize`` for the whole file.
    """
    # Explicit encoding so the result does not depend on the platform locale.
    with open(join(folder, book_file_name), encoding="utf-8") as f:
        return sent_tokenize(f.read())

def preprocess_string(inp: str) -> list[str]:
    """Turn a piece of text into a list of lower-case word tokens.

    ASCII punctuation (``string.punctuation``) is deleted, a byte-order mark
    (U+FEFF) is treated as a separator, and the remainder is split on
    whitespace.

    Args:
        inp: Raw text, e.g. one sentence or one line of a book.

    Returns:
        The tokens in order of appearance; an empty list when nothing is left.
    """
    # One table does both jobs: map U+FEFF to a space and delete punctuation.
    table = str.maketrans("\ufeff", " ", string.punctuation)
    # split() without an argument treats "\n", "\r", tabs and runs of spaces
    # alike and never yields empty strings, so no filtering pass is needed.
    return inp.translate(table).lower().split()

def preprocess(inp: list[str]):
    """Tokenise each text in ``inp`` and drop the ones that yield no tokens.

    Args:
        inp: Pieces of text, each handed to ``preprocess_string``.

    Returns:
        A list with one non-empty token list per retained input text, in the
        original order.
    """
    tokenised = (preprocess_string(sentence) for sentence in inp)
    return [tokens for tokens in tokenised if len(tokens) > 0]


def train_model(
    inp: list[list[str]],
    vector_size: int = 50,
    window: int = 10,
    epochs: int = 20,
) -> Word2Vec:
    """Train a Word2Vec model on tokenised sentences.

    The hyperparameters default to the values that used to be hard-coded, so
    ``train_model(corpus)`` behaves as before.

    Args:
        inp: Training corpus, one list of tokens per sentence.
        vector_size: Forwarded to ``Word2Vec(vector_size=...)``.
        window: Forwarded to ``Word2Vec(window=...)``.
        epochs: Forwarded to ``Word2Vec(epochs=...)``.

    Returns:
        The ``Word2Vec`` instance constructed from ``inp``.
    """
    return Word2Vec(
        sentences=inp,
        vector_size=vector_size,
        window=window,
        epochs=epochs,
    )

def create_wordList_from_folder(folder: str):
    """Build a tokenised corpus from every regular file directly inside ``folder``.

    Args:
        folder: Directory whose files are read as books.

    Returns:
        A list of token lists, one per sentence that yields at least one token,
        across all files.
    """
    res = []
    # Sorted so the corpus order does not depend on the directory listing order.
    for file_name in sorted(listdir(folder)):
        path = join(folder, file_name)
        if not isfile(path):
            continue
        # Read from ``folder`` itself: sentenceise_book(file_name) resolves
        # names against /content/books/, not against ``folder``.
        with open(path, encoding="utf-8") as f:
            raw_book = sent_tokenize(f.read())
        res.extend(preprocess(raw_book))
    return res

def create_wordList_from_gutenberg(max_books: int):
    """Build a tokenised corpus from Project Gutenberg plain-text books.

    Book IDs ``1`` to ``max_books - 1`` are requested in order. Each line of a
    book is handed to ``preprocess`` as one piece of text.

    Args:
        max_books: Exclusive upper bound of the book IDs to request.

    Returns:
        A list of token lists, one per line that yields at least one token,
        across all books that could be downloaded.
    """
    res = []
    for i in range(1, max_books):
        url = f"https://www.gutenberg.org/cache/epub/{i}/pg{i}.txt"
        try:
            # The context manager closes the HTTP response once it has been read.
            with urllib.request.urlopen(url) as response:
                # errors="ignore": one undecodable byte must not abort the crawl.
                lines = [line.decode("utf-8", errors="ignore") for line in response]
        except HTTPError:
            print(f"{i} could not be dowloaded")
            continue
        res.extend(preprocess(lines))
    return res


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [92]:
# Scratch check: fetch one Gutenberg book and print its decoded lines.
# NOTE(review): this prints every line of the book, and the response object is
# never closed.
data = urllib.request.urlopen("https://www.gutenberg.org/cache/epub/72294/pg72294.txt")


print([line.decode('utf-8') for line in data])



In [None]:


BOOK_FOLDER = "/content/books/"
CUSTOM_MODEL = "team_J_model"

# Training corpus: Project Gutenberg books with IDs below 500.
l = create_wordList_from_gutenberg(500)
# Register the trained word vectors (``.wv``), not the Word2Vec object itself:
# the evaluation code calls ``similarity`` and ``len`` on registry entries, and
# in gensim 4 those live on the KeyedVectors exposed as ``.wv``.
models[CUSTOM_MODEL] = train_model(l).wv

40 could not be dowloaded
52 could not be dowloaded
114 could not be dowloaded
116 could not be dowloaded
129 could not be dowloaded
182 could not be dowloaded
183 could not be dowloaded
184 could not be dowloaded
185 could not be dowloaded
186 could not be dowloaded
187 could not be dowloaded
188 could not be dowloaded
189 could not be dowloaded
190 could not be dowloaded
191 could not be dowloaded
192 could not be dowloaded
193 could not be dowloaded
194 could not be dowloaded
195 could not be dowloaded
196 could not be dowloaded
197 could not be dowloaded
198 could not be dowloaded
199 could not be dowloaded
239 could not be dowloaded
256 could not be dowloaded


In [None]:
# Evaluate every model currently in the registry, in insertion order.
for model_name in models:
    analyze_model(model_name)

In [68]:
import string
import nltk
from nltk.corpus import brown
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
from matplotlib import pyplot

nltk.download("brown")

# Raw Brown corpus sentences (each sentence is a list of tokens). The intended
# preprocessing - lower-casing all words and removing single-punctuation
# tokens - is not performed in this cell; presumably it follows later (TODO confirm).
document = brown.sents()

print(document[0:10])

[['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.'], ['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.'], ['The', 'September-October', 'term', 'jury', 'had', 'been', 'charged', 'by', 'Fulton', 'Superior', 'Court', 'Judge', 'Durwood', 'Pye', 'to', 'investigate', 'reports', 'of', 'possible', '``', 'irregularities', "''", 'in', 'the', 'hard-fought', 'primary', 'which', 'was', 'won', 'by', 'Mayor-nominate', 'Ivan', 'Allen', 'Jr.', '.'], ['``', 'Only', 'a', 'relative', 'handful', 'of', 'such', 'rep

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
