# Word2Vec models comparison

This notebook compares different word2vec models on a range of intrinsic word-embedding evaluation tasks (similarity/relatedness, association, and analogy).

### Import and load datasets

In [1]:
# imports
import random
!pip install ray
import ray
import xml.etree.ElementTree as ET
# ray.init()
!pip install xlrd
!pip install nltk
import nltk
nltk.download('wordnet')
import glob
!pip install tqdm
from tqdm import tqdm
import pandas as pd
!pip install gensim
from gensim.models import KeyedVectors
from gensim.models.word2vec import Word2Vec
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter

Collecting ray
  Downloading https://files.pythonhosted.org/packages/e2/bd/8dbe8a02c7a56b11554fce6da92151d68e508b6c57809693a4c5170b975a/ray-0.8.7-cp37-cp37m-win_amd64.whl (14.2MB)
Collecting msgpack<2.0.0,>=1.0.0 (from ray)
  Downloading https://files.pythonhosted.org/packages/51/10/19ddf3b6f8bfb2b273dddbcdc8293e79545c55688bdf7c09fc51bab2e4df/msgpack-1.0.0-cp37-cp37m-win_amd64.whl (72kB)
Collecting opencensus (from ray)
  Downloading https://files.pythonhosted.org/packages/8a/9c/d40e3408e72d02612acf247d829e3fa9ff15c59f7ad81418ed79962f8681/opencensus-0.7.10-py2.py3-none-any.whl (126kB)
Collecting aiohttp (from ray)
  Downloading https://files.pythonhosted.org/packages/0b/b3/744a16bdaba2e4df90f6ff10b9ade9c2dce3f01d94848f3949aa4ce7868d/aiohttp-3.6.2-cp37-cp37m-win_amd64.whl (649kB)
Collecting google (from ray)
  Downloading https://files.pythonhosted.org/packages/ac/35/17c9141c4ae21e9a29a43acdfd848e3e468a810517f862cad07977bf8fe9/google-3.0.0-py2.py3-none-any.whl (45kB)
Collecting aioredis

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Mohit\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!






In [229]:
# lemmatizer - noun lemma -- https://stackoverflow.com/questions/25534214/nltk-wordnet-lemmatizer-shouldnt-it-lemmatize-all-inflections-of-a-word
def lemma(word):
    """Return the WordNet lemma of ``word``.

    ``lemmatize`` is called without a part-of-speech argument, so the
    lemmatizer's default (noun, per the note above) applies.
    """
    wordnet_lemmatizer = nltk.stem.WordNetLemmatizer()
    return wordnet_lemmatizer.lemmatize(word)

# preprocess the word - lowercase, then lemmatize
def pre(word):
    """Normalise ``word``: lowercase it, then pass the result through ``lemma``."""
    lowered = word.lower()
    return lemma(lowered)

def check_word(word):
    """Return True when ``word`` contains none of: space, ".", "-", "/"."""
    forbidden = (" ", ".", "-", "/")
    return not any(symbol in word for symbol in forbidden)

## Load similarity/relatedness dataset

In [2]:
# load the files
def load_similarity_datasets(data_dir="../data/word-sim"):
    """Load every word-similarity dataset found directly inside ``data_dir``.

    Each file is read as a headerless three-column table. A tab separator is
    tried first; if that does not produce exactly three columns (or the parse
    itself fails), the file is re-read with a single space as separator.

    Parameters
    ----------
    data_dir : str, optional
        Directory holding one dataset per file.

    Returns
    -------
    dict
        Maps dataset name (file name with ".txt" removed) to a DataFrame with
        columns ``word_1``, ``word_2`` and ``similarity_score``.
    """
    import os  # local import so this cell does not depend on an edit to the imports cell

    column_names = ['word_1', 'word_2', 'similarity_score']
    sim_data = {}
    # sorted(): glob's ordering depends on the file system, so sort for a stable load order
    for file_path in sorted(glob.glob(os.path.join(data_dir, "*"))):
        # basename instead of a fixed-width slice, so the name stays right for any directory
        file_name = os.path.basename(file_path).replace(".txt", "")
        print(file_name)
        try:
            df = pd.read_csv(file_path, sep="\t", header=None)
            df.columns = column_names
        except ValueError:
            # A space-separated file parses as a single column under sep="\t",
            # so the column assignment raises ValueError (pandas' ParserError
            # is a ValueError subclass as well). Anything else propagates.
            df = pd.read_csv(file_path, sep=" ", header=None)
            df.columns = column_names
        sim_data[file_name] = df
    return sim_data

# load similarity datasets
similarity_datasets = load_similarity_datasets()

EN-MC-30
EN-MEN-TR-3k
EN-MTurk-287
EN-MTurk-771
EN-RG-65
EN-RW-STANFORD
EN-SIMLEX-999
EN-SimVerb-3500
EN-VERB-143
EN-WS-353-ALL
EN-WS-353-REL
EN-WS-353-SIM
EN-YP-130


## Load association datasets

In [155]:
def prepare_r123_strength_table(cue_data, cue_name):
    """Build the response-strength table for a single cue.

    The ``R1``, ``R2`` and ``R3`` columns of ``cue_data`` are pooled. Missing
    entries and responses containing "-" are dropped, the rest are normalised
    with ``pre`` and counted.

    Parameters
    ----------
    cue_data : pandas.DataFrame
        Rows for one cue; must contain the columns ``R1``, ``R2`` and ``R3``.
    cue_name
        The cue itself; it is normalised with ``pre`` before being stored.

    Returns
    -------
    pandas.DataFrame
        One row per distinct response with columns ``response``, ``R123``
        (count), ``N`` (total retained responses for the cue), ``R123.Str``
        (``R123 / N``) and ``cue``. Has zero rows, but the same columns, when
        no response survives the filtering.
    """
    # pool the three response columns into one flat sequence
    raw_responses = cue_data.loc[:, ['R1', 'R2', 'R3']].values.reshape(-1)
    # str(x) == "nan" is how missing cells show up; str() coercion also keeps
    # non-string cells (e.g. numeric ones) from failing on .lower() inside pre()
    cleaned = [pre(str(x)) for x in raw_responses
               if str(x) != "nan" and "-" not in str(x)]
    counts = Counter(cleaned)
    # build the frame explicitly so an empty Counter still yields both columns
    responses = pd.DataFrame({
        'response': pd.Series(list(counts.keys()), dtype=object),
        'R123': pd.Series(list(counts.values()), dtype='int64'),
    })
    responses['N'] = responses['R123'].sum()
    responses['R123.Str'] = responses['R123'] / responses['N']
    responses['cue'] = pre(cue_name)
    return responses

def prepare_swow_8500(file_path="../data/association/swow-8500.xlsx"):
    """Load the SWOW-8500 workbook and build one strength table per cue.

    Rows are grouped by the ``cue`` column. Cues whose text contains a space
    are skipped; every other group is passed to
    ``prepare_r123_strength_table`` and the resulting tables are concatenated.

    Parameters
    ----------
    file_path : str, optional
        Location of the Excel workbook.

    Returns
    -------
    pandas.DataFrame
        The per-cue tables stacked with ``pd.concat`` (original per-table
        indices are kept).
    """
    # handle swow-8500
    data = pd.read_excel(file_path)
    cue_tables = []
    # Group by the column name itself, not a one-element list: with a list key
    # newer pandas yields 1-tuples as group keys, which would break both the
    # space check and the later pre(cue_name) call.
    for cue_name, cue_data in tqdm(data.groupby('cue'), position=0, leave=True, desc="Loading SWOW"):
        if " " in str(cue_name):
            continue
        cue_tables.append(prepare_r123_strength_table(cue_data, cue_name))
    return pd.concat(cue_tables)

# swow_8500_df = prepare_swow_8500(df)

def prepare_eat_dataset(file_path='../data/association/eat-stimulus-response.xml'):
    """Load the EAT stimulus/response XML into a flat table.

    Source: http://rali.iro.umontreal.ca/rali/?q=en/Textual%20Resources/EAT

    Every ``stimulus`` element directly under the root is visited. Stimuli and
    responses whose ``word`` attribute fails ``check_word`` are skipped; the
    remaining words are normalised with ``pre``.

    Parameters
    ----------
    file_path : str, optional
        Location of the XML file.

    Returns
    -------
    pandas.DataFrame
        Columns ``cue``, ``response``, ``occ_count`` (the response's ``n``
        attribute) and ``occ_conf`` (its ``r`` attribute). The two attribute
        values are stored exactly as read from the XML, i.e. as strings.
    """
    column_names = ['cue', 'response', 'occ_count', 'occ_conf']
    root = ET.parse(file_path).getroot()
    eat_table = []
    for stimulus in tqdm(root.findall("stimulus"), position=0, leave=True, desc="Loading EAT"):
        stimulus_word = stimulus.attrib['word']
        if not check_word(stimulus_word):
            continue
        # normalise the cue once per stimulus instead of once per response
        cue = pre(stimulus_word)
        for res in stimulus.findall("response"):
            res_word = res.attrib['word']
            if check_word(res_word):
                eat_table.append({'cue': cue, 'response': pre(res_word),
                                  'occ_count': res.attrib['n'], 'occ_conf': res.attrib['r']})
    # explicit columns keep the schema even when no row was collected
    return pd.DataFrame(eat_table, columns=column_names)

def load_association_dataset():
    """Load both association datasets into a dict keyed "swow8500" and "eat"."""
    datasets = {}
    datasets["swow8500"] = prepare_swow_8500()
    datasets["eat"] = prepare_eat_dataset()
    return datasets

association_datasets = load_association_dataset()

## Load Analogy datasets

In [296]:
def load_google_analogy():
    """Parse the Google analogy set into ``{section title: [analogy, ...]}``.

    A line containing ":" opens a new section whose title is the line minus
    its first two characters. Every other line is split on whitespace, tokens
    failing ``check_word`` are dropped, the rest are normalised with ``pre``,
    and the line is kept only if exactly four words remain.
    """
    sections = {}
    with open("../data/analogy/google_analogy_set.txt", 'r') as handle:
        for raw_line in handle:
            text = raw_line.replace("\n", "")
            if ":" not in text:
                words = [pre(token) for token in text.split() if check_word(token)]
                if len(words) == 4:
                    sections[current_title].append(words)
                continue
            # header line: start a fresh (empty) section
            current_title = text[2:]
            sections[current_title] = []
    return sections

# x = load_google_analogy()


def load_bats_analogy(bats_dir="../data/analogy/BATS_3.0", random_choices=63):
    """Sample four-word analogies from the BATS word-pair files.

    Every sub-directory of ``bats_dir`` whose name starts with a digit is
    scanned, except those with "Inflectional_morphology" in their path. Each
    line of each file is split on whitespace, tokens failing ``check_word``
    are dropped and the rest normalised with ``pre``; a line is kept only if
    exactly two distinct words remain. For every file with more than
    ``random_choices`` usable pairs, ``random_choices`` analogies are built by
    drawing two different pairs and concatenating them.

    Parameters
    ----------
    bats_dir : str, optional
        Root directory of the BATS data.
    random_choices : int, optional
        Number of analogies sampled per file.

    Returns
    -------
    list of list of str
        The sampled analogies that consist of four distinct words.

    Notes
    -----
    Re-seeds the global ``random`` generator with 0 on every call, so the
    sample is reproducible for a given set of files.
    """
    random.seed(0)
    file_analogy = []
    # sorted(): glob order depends on the file system, and the seeded sampling
    # below is only reproducible if files are visited in a fixed order
    for section_path in sorted(glob.glob(bats_dir + "/[0-9]*")):
        if "Inflectional_morphology" in section_path:
            continue
        for file_path in sorted(glob.glob(section_path + "/*")):
            file_analogy_prefix = []
            with open(file_path, 'r') as f:
                for line in f:
                    analogy_prefix = [pre(x) for x in line.split() if check_word(x)]
                    # require a genuine pair: exactly two tokens, both different
                    if len(analogy_prefix) == 2 and len(set(analogy_prefix)) == 2:
                        file_analogy_prefix.append(analogy_prefix)
            # the original compared against the undefined name `random_choice`
            if len(file_analogy_prefix) > random_choices:
                for _ in range(random_choices):
                    a, b = random.sample(file_analogy_prefix, 2)
                    # a + b builds a new list, leaving the sampled pairs untouched
                    file_analogy.append(a + b)
    # two different pairs can still share a word; keep only fully distinct analogies
    return [x for x in file_analogy if len(set(x)) == 4]

file_analogy = load_bats_analogy()
