In [2]:
import csv
import json
from dataclasses import dataclass
from enum import Enum
from typing import Optional

import gensim.downloader as api
from gensim import similarities
from gensim.models import Word2Vec

In [3]:
# Name of the pretrained Google News embedding, also used as its key in `models`.
GOOGLE_MODEL = "word2vec-google-news-300"

# Registry of loaded embeddings, keyed by model name.
models = {}
models[GOOGLE_MODEL] = api.load(GOOGLE_MODEL)


In [4]:
class Label(Enum):
  """Outcome recorded for a synonym question; the values are written to the details CSV."""
  # Default state of a Question: no choice could be scored, so no real prediction was made.
  GUESS = "guess"
  # The predicted word equals the expected answer.
  CORRECT = "correct"
  # A word was predicted but it differs from the expected answer.
  WRONG = "wrong"

@dataclass
class Question:
  """One multiple-choice synonym question together with the model's prediction.

  The first three fields are filled from the input JSON (via ``Question(**obj)``);
  ``label`` and ``guessed_word`` are filled in by the prediction functions.
  """
  # Word whose closest synonym is being asked for.
  question: str
  # Expected (correct) synonym.
  answer: str
  # Candidate words the model has to choose from.
  choices: list[str]
  # Stays GUESS until a prediction function scores at least one choice.
  label: Label = Label.GUESS
  # Word picked by the model; None until a prediction function has run.
  guessed_word: Optional[str] = None

In [5]:
def get_simmilar_word(question: Question, model=GOOGLE_MODEL) -> Question:
  """Answer a question by scanning the model's nearest neighbours of the question word.

  The first neighbour (in the order returned by ``most_similar_cosmul``) that is
  also one of the choices becomes the guessed word.

  Args:
    question: Question to answer; it is updated in place.
    model: Key into the ``models`` registry selecting the embedding to use.

  Returns:
    The same ``question`` object. If the question word is unknown to the model,
    ``guessed_word`` is set to "" and the label is left untouched. If no neighbour
    is among the choices, both ``guessed_word`` and the label are left untouched.
  """
  # Resolve the embedding by name; the original read an undefined global `model`.
  vectors = models[model]

  try:
    suggestions = vectors.most_similar_cosmul(positive=[question.question])
  except KeyError:
    # Question word is not in the vocabulary.
    question.guessed_word = ""
    return question

  for suggestion, _ in suggestions:
    if suggestion in question.choices:
      question.guessed_word = suggestion
      question.label = Label.CORRECT if suggestion == question.answer else Label.WRONG
      break

  return question

def predict_most_simmilar_word(question: Question, model=GOOGLE_MODEL) -> Question:
  """Pick the choice whose embedding is most similar to the question word.

  Args:
    question: Question to answer; it is updated in place.
    model: Key into the ``models`` registry selecting the embedding to use.

  Returns:
    The same ``question`` object. ``guessed_word`` is the best-scoring choice, or
    "" when no choice could be scored (question word or every choice missing from
    the vocabulary); in that case the label is left untouched. Otherwise the label
    becomes CORRECT or WRONG.
  """
  vectors = models[model]

  best_choice = ""
  # None means "nothing scored yet". Using 0 as the sentinel would discard
  # choices whose similarity is zero or negative and misreport them as guesses.
  best_similarity = None
  for choice in question.choices:
    try:
      similarity = vectors.similarity(question.question, choice)
    except KeyError:
      # Either the question word or this choice is out of vocabulary.
      continue
    if best_similarity is None or similarity > best_similarity:
      best_similarity, best_choice = similarity, choice

  question.guessed_word = best_choice
  if best_similarity is None:
    return question

  question.label = Label.CORRECT if question.answer == best_choice else Label.WRONG
  return question

def get_simmilar_list(questions: list[Question], model=GOOGLE_MODEL) -> list[Question]:
  """Run ``predict_most_simmilar_word`` on every question.

  Args:
    questions: Questions to answer; each one is updated in place.
    model: Key into the ``models`` registry selecting the embedding to use.

  Returns:
    The same ``questions`` object that was passed in.
  """
  for question in questions:
    predict_most_simmilar_word(question, model)

  return questions



In [6]:
!gdown  https://drive.google.com/uc?id=1LAclVWP_FJBhysLAR0ky1wIP3RveORXO

# JSON file with the synonym questions; read by analyze_model.
INP_FILENAME = "/content/synonym.json"
# Summary file that analyze_model appends one row per evaluated model to.
EVAL_FILENAME = "analysis.csv"

def analyze_model(model = GOOGLE_MODEL):
  """Evaluate one embedding model on the synonym test set and log the results.

  Reads the questions from ``INP_FILENAME``, predicts an answer for each and
  appends:
    * one row per question (question, answer, guessed word, label) to
      ``"<model>-details.csv"``;
    * one summary row (model name, vocabulary size, number of correct answers,
      number of non-guess answers, accuracy) to ``EVAL_FILENAME``.

  Both files are opened in append mode, so repeated runs add rows.

  Args:
    model: Key into the ``models`` registry selecting the embedding to evaluate.
  """
  details_filename = f"{model}-details.csv"

  with open(INP_FILENAME) as f:
    questions = [Question(**obj) for obj in json.load(f)]

  res = get_simmilar_list(questions, model)

  # csv.writer quotes fields containing commas or quotes, which a plain
  # ",".join would silently turn into extra columns.
  with open(details_filename, "a", newline="") as f:
    writer = csv.writer(f, lineterminator="\n")
    for question in res:
      guessed_word = "" if question.guessed_word is None else question.guessed_word
      writer.writerow([question.question,
                       question.answer,
                       guessed_word,
                       question.label.value])

  vocabulary_size = len(models[model])
  correct = sum(1 for q in res if q.label == Label.CORRECT)
  guesses = sum(1 for q in res if q.label == Label.GUESS)
  # Questions the model actually answered (i.e. did not have to guess).
  answered = len(res) - guesses
  accuracy = correct / answered if answered != 0 else 0

  with open(EVAL_FILENAME, "a", newline="") as f:
    csv.writer(f, lineterminator="\n").writerow(
      [model, vocabulary_size, correct, answered, accuracy])

In [7]:
#analyze_model()

In [8]:
from gensim.models import KeyedVectors
from huggingface_hub import hf_hub_download
WIKI2018_MODEL = "wiki2018"
# Fetch the vectors file from the Hugging Face hub, then parse it as word2vec text.
models[WIKI2018_MODEL] = KeyedVectors.load_word2vec_format(
  hf_hub_download(
    repo_id="Word2vec/wikipedia2vec_enwiki_20180420_100d",
    filename="enwiki_20180420_100d.txt",
  )
)

KeyboardInterrupt: ignored

In [9]:
# Register the 100-dimensional GloVe Wikipedia/Gigaword vectors under their downloader name.
# NOTE(review): the constant says "GIGIWORLD" while the model id says "gigaword" - likely a typo.
WIKI_GIGIWORLD_MODEL = "glove-wiki-gigaword-100"
models[WIKI_GIGIWORLD_MODEL] = api.load(WIKI_GIGIWORLD_MODEL)

KeyboardInterrupt: ignored

In [None]:
# Two GloVe Twitter embeddings that differ only in the size given in their names (25 vs 50).
TWITTER_25_MODEL = 'glove-twitter-25'
TWITTER_50_MODEL = 'glove-twitter-50'
# Register both in the shared `models` registry under their downloader names.
models[TWITTER_25_MODEL] = api.load(TWITTER_25_MODEL)
models[TWITTER_50_MODEL] = api.load(TWITTER_50_MODEL)

In [None]:
#for model_name in models.keys():
#  analyze_model(model_name)

Task 3

In [22]:
import string
from os import listdir
from os.path import isfile, join
from nltk.tokenize import sent_tokenize
import nltk
# Fetch the 'punkt' tokenizer data (presumably what sent_tokenize needs at runtime).
nltk.download('punkt')

from nltk.corpus import brown
import urllib.request
from urllib.error import HTTPError



def sentenceise_book(book_file_name: str, folder: str = "/content/books/") -> list[str]:
  """Read a book from disk and split its text into sentences.

  Args:
    book_file_name: File name of the book, relative to ``folder``.
    folder: Directory containing the book. Defaults to the previously
      hard-coded location, so existing one-argument calls are unaffected.

  Returns:
    The sentences produced by ``sent_tokenize`` on the whole file content.
  """
  # Explicit encoding so the result does not depend on the platform default.
  with open(join(folder, book_file_name), encoding="utf-8") as f:
    return sent_tokenize(f.read())

def preprocess_string(inp: str) -> list[str]:
  """Turn a piece of raw text into a list of lower-cased word tokens.

  Byte-order marks and every character in ``string.punctuation`` are removed,
  then the text is split on whitespace.

  Args:
    inp: Raw text, e.g. one sentence or one line of a book.

  Returns:
    The lower-cased words in order; an empty list if nothing is left.
  """
  # A BOM is not whitespace, so turn it into a separator explicitly.
  inp = inp.replace("\ufeff", " ")
  # Only ASCII punctuation is stripped; typographic quotes and dashes are kept.
  inp = inp.translate(str.maketrans("", "", string.punctuation))
  # split() without arguments handles newlines, tabs and runs of spaces alike
  # and never yields empty strings.
  return [word.lower() for word in inp.split()]

def preprocess(inp: list[str]):
  """Tokenise each text in ``inp`` with ``preprocess_string``.

  Returns one word list per input text, in order, leaving out texts that
  produce no words at all.
  """
  tokenised = (preprocess_string(sentence) for sentence in inp)
  return [words for words in tokenised if len(words) > 0]


def train_model(inp: list[list[str]], window_size:int, embedding_size:int) -> KeyedVectors:
  """Train a Word2Vec model on tokenised sentences and return its word vectors.

  Args:
    inp: Training corpus, one list of word tokens per sentence.
    window_size: Value passed to Word2Vec as ``window``.
    embedding_size: Value passed to Word2Vec as ``vector_size``.

  Returns:
    The ``wv`` attribute of the trained model.
  """
  trained = Word2Vec(sentences=inp, vector_size=embedding_size, window=window_size)
  return trained.wv

def create_wordList_from_folder(folder: str) -> list[list[str]]:
  """Build a training corpus from every regular file directly inside ``folder``.

  Each file is read as text, split into sentences with ``sent_tokenize`` and
  tokenised with ``preprocess``.

  Args:
    folder: Directory containing the book files.

  Returns:
    One list of word tokens per non-empty sentence, across all files.
  """
  res = []
  # Sorted so the corpus order does not depend on the filesystem's listing order.
  for file_name in sorted(listdir(folder)):
    path = join(folder, file_name)
    if not isfile(path):
      continue
    # Read from `folder` itself; previously the files were listed from `folder`
    # but always opened from a hard-coded directory.
    with open(path, encoding="utf-8") as f:
      raw_book = sent_tokenize(f.read())
    res.extend(preprocess(raw_book))
  return res

def create_wordList_from_gutenberg(max_books: int) -> list[list[str]]:
  """Build a training corpus from Project Gutenberg books with ids 1..max_books.

  Each book is downloaded as plain text and every line is tokenised with
  ``preprocess`` (lines, not sentences, are the training units here). Books
  that cannot be downloaded are reported and skipped.

  Args:
    max_books: Highest book id to try (inclusive).

  Returns:
    One list of word tokens per non-empty line, across all downloaded books.
  """
  res = []
  # +1 so that `max_books` itself is included; range() excludes its upper bound.
  for i in range(1, max_books + 1):
    try:
      print(f"downloading book {i}")
      # The context manager closes the HTTP connection once the body is read.
      with urllib.request.urlopen(f"https://www.gutenberg.org/cache/epub/{i}/pg{i}.txt") as response:
        # errors="replace": a stray undecodable byte should not abort the whole run.
        raw_data = [line.decode('utf-8', errors="replace") for line in response]
      print(f"creating sentences for {i}")
      res.extend(preprocess(raw_data))
    except HTTPError:
      print(f"{i} could not be dowloaded")
  print("wordlist finsihed")
  return res


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Scratch check: download one Gutenberg book and show its decoded lines.
# The context manager closes the HTTP connection once the body has been read.
with urllib.request.urlopen("https://www.gutenberg.org/cache/epub/72294/pg72294.txt") as data:
  print([line.decode('utf-8') for line in data])

In [23]:


BOOK_FOLDER = "/content/books/"
CUSTOM_MODEL = "team_J_model"
BOOK_COUNT = 50

# Hyperparameter grid: every embedding size is combined with every window size.
EMBEDDING_SIZES = [10,20]
WINDOW_SIZES = [3, 5]

# The corpus does not depend on the hyperparameters, so download it only once
# instead of once per (embedding, window) pair.
gutenberg_sentences = create_wordList_from_gutenberg(BOOK_COUNT)

for embedding in EMBEDDING_SIZES:
  for window in WINDOW_SIZES:
    print("training model")
    # train_model requires both hyperparameters; calling it with the corpus
    # alone raises a TypeError.
    models[f"{CUSTOM_MODEL}_E{embedding}_W{window}"] = train_model(
      gutenberg_sentences,
      window_size=window,
      embedding_size=embedding,
    )

downloading book 1
creating sentences for 1
downloading book 2
creating sentences for 2
downloading book 3
creating sentences for 3
downloading book 4
creating sentences for 4
downloading book 5
creating sentences for 5
downloading book 6
creating sentences for 6
downloading book 7
creating sentences for 7
downloading book 8
creating sentences for 8
downloading book 9
creating sentences for 9
downloading book 10
creating sentences for 10
downloading book 11
creating sentences for 11
downloading book 12
creating sentences for 12
downloading book 13
creating sentences for 13
downloading book 14
creating sentences for 14
downloading book 15
creating sentences for 15
downloading book 16
creating sentences for 16
downloading book 17
creating sentences for 17
downloading book 18
creating sentences for 18
downloading book 19
creating sentences for 19
downloading book 20
creating sentences for 20
downloading book 21
creating sentences for 21
downloading book 22
creating sentences for 22
downlo

In [24]:
# Evaluate every embedding currently registered in `models`.
for model_name in models:
  analyze_model(model_name)

In [None]:
import string
import nltk
from nltk.corpus import brown
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
from matplotlib import pyplot

# Fetch the Brown corpus data used through `nltk.corpus.brown` below.
nltk.download("brown")

# Load the corpus sentences.
# NOTE(review): lowercasing and punctuation removal are not done in the lines
# visible here - confirm they happen further down.
document = brown.sents()

# Quick look at the first ten sentences.
print(document[0:10])