# Word2Vec models comparison

We compare several word2vec models on a range of intrinsic word-embedding evaluation tasks.

### Import and load datasets

In [None]:
# imports
import pickle
import random
!pip install ray
import ray
import xml.etree.ElementTree as ET
# ray.init()
!pip install xlrd
!pip install nltk
import nltk
nltk.download('wordnet')
import glob
!pip install tqdm
from tqdm import tqdm
import pandas as pd
!pip install gensim
from gensim.models import KeyedVectors
from gensim.models.word2vec import Word2Vec
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter

Collecting ray
  Downloading ray-0.8.7-cp37-cp37m-manylinux1_x86_64.whl (22.0 MB)
[K     |████████████████████████████████| 22.0 MB 15.4 MB/s eta 0:00:01
[?25hCollecting filelock
  Downloading filelock-3.0.12-py3-none-any.whl (7.6 kB)
Collecting aiohttp
  Downloading aiohttp-3.6.2-cp37-cp37m-manylinux1_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 32.0 MB/s eta 0:00:01
Collecting aioredis
  Downloading aioredis-1.3.1-py3-none-any.whl (65 kB)
[K     |████████████████████████████████| 65 kB 2.8 MB/s  eta 0:00:01
Collecting opencensus
  Downloading opencensus-0.7.10-py2.py3-none-any.whl (126 kB)
[K     |████████████████████████████████| 126 kB 27.8 MB/s eta 0:00:01
[?25hCollecting google
  Downloading google-3.0.0-py2.py3-none-any.whl (45 kB)
[K     |████████████████████████████████| 45 kB 1.3 MB/s  eta 0:00:01
[?25hCollecting msgpack<2.0.0,>=1.0.0
  Downloading msgpack-1.0.0-cp37-cp37m-manylinux1_x86_64.whl (275 kB)
[K     |████████████████████████████████

In [None]:
# lemmatizer - noun lemma -- https://stackoverflow.com/questions/25534214/nltk-wordnet-lemmatizer-shouldnt-it-lemmatize-all-inflections-of-a-word
def lemma(word):
    """Return the WordNet lemma of ``word``.

    No part-of-speech argument is passed to the lemmatizer, so its default
    (the noun lemma, per the note above this function) is used.
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return lemmatizer.lemmatize(word)

# preprocess the word - lowercase and lemma
def pre(word):
    """Normalise ``word``: lowercase it, then lemmatize it with ``lemma``."""
    lowered = word.lower()
    return lemma(lowered)

def check_word(word):
    """Return True when ``word`` contains no space, ".", "-" or "/"."""
    forbidden = (" ", ".", "-", "/")
    return not any(mark in word for mark in forbidden)

## Load similarity/relatedness dataset

In [None]:
# load the files
def load_similarity_datasets(data_dir="../data/word-sim"):
    """Load every word similarity/relatedness dataset found in ``data_dir``.

    Each file is read as a headerless three-column table. A tab-separated
    parse is tried first; if that does not produce exactly three columns the
    file is re-read as single-space-separated.

    Args:
        data_dir: Directory containing the dataset files (13 files in the
            default location, per the original note).

    Returns:
        dict mapping dataset name (file name with ".txt" removed) to a
        DataFrame with columns ``word_1``, ``word_2``, ``similarity_score``.
    """
    import os  # local import: `os` is not among the notebook's top-level imports

    columns = ['word_1', 'word_2', 'similarity_score']
    sim_data = {}
    for file_path in glob.glob(os.path.join(data_dir, "*")):
        # Take the name from the path itself rather than a fixed-width slice,
        # so it stays correct whatever the directory prefix is.
        file_name = os.path.basename(file_path).replace(".txt", "")
        print(file_name)
        try:
            df = pd.read_csv(file_path, sep="\t", header=None)
            # Raises ValueError (length mismatch) when the file is not
            # tab-separated and was therefore parsed into the wrong column count.
            df.columns = columns
        except ValueError:
            df = pd.read_csv(file_path, sep=" ", header=None)
            df.columns = columns
        sim_data[file_name] = df
    return sim_data

# Load the similarity datasets into a dict keyed by dataset (file) name;
# the loader prints each name as it is read.
similarity_datasets = load_similarity_datasets()

EN-VERB-143
EN-SimVerb-3500
EN-RG-65
EN-RW-STANFORD
EN-MTurk-771
EN-MEN-TR-3k
EN-MC-30
EN-MTurk-287
EN-SIMLEX-999
EN-WS-353-REL
EN-YP-130
EN-WS-353-ALL
EN-WS-353-SIM


## Load association datasets

In [None]:
def prepare_r123_strength_table(cue_data, cue_name):
    """Build the R123 response-strength table for a single cue.

    The ``R1``, ``R2`` and ``R3`` columns of ``cue_data`` are pooled; entries
    whose string form is "nan" or contains "-" are dropped, the rest are
    normalised with ``pre`` and counted.

    Args:
        cue_data: DataFrame holding the ``R1``/``R2``/``R3`` columns for the cue.
        cue_name: The cue; stored in the result as ``pre(str(cue_name))``.

    Returns:
        DataFrame with columns ``response``, ``R123`` (count of the response),
        ``N`` (total of all counts), ``R123.Str`` (``R123 / N``) and ``cue``.
        It has zero rows, but the same columns, when no usable response
        remains after filtering.
    """
    # Flatten the three response columns row by row into one sequence.
    raw_responses = cue_data.loc[:, ['R1', 'R2', 'R3']].values.ravel()
    kept = [
        pre(str(x))
        for x in raw_responses
        if str(x) != "nan" and "-" not in str(x)
    ]
    counts = Counter(kept)
    # Build from (response, count) pairs: unlike from_dict(...).reset_index(),
    # this still yields both columns when `counts` is empty, instead of
    # failing on the column rename.
    responses = pd.DataFrame(list(counts.items()), columns=['response', 'R123'])
    responses['N'] = responses['R123'].sum()
    responses['R123.Str'] = responses['R123'] / responses['N']
    responses['cue'] = pre(str(cue_name))
    return responses

def prepare_swow_8500(conf_filter=0.1):
    """Load SWOW-8500 and keep cue/response pairs with R123.Str >= conf_filter.

    Cues whose string form contains a space are skipped. Per the original
    note, the default threshold of 0.1 leaves 8270 unique cues.

    Args:
        conf_filter: Minimum ``R123.Str`` a row must have to be kept.

    Returns:
        DataFrame concatenating the per-cue tables produced by
        ``prepare_r123_strength_table``, filtered by ``conf_filter``.
    """
    data = pd.read_excel("../data/association/swow-8500.xlsx")
    per_cue_tables = []
    # Group by the column label itself, not a one-element list: with a list,
    # recent pandas yields 1-tuple keys, and str(cue_name) would then be
    # "('word',)" rather than "word".
    for cue_name, cue_data in tqdm(data.groupby('cue'), position=0, leave=True, desc="Loading SWOW"):
        if " " in str(cue_name):
            continue  # multi-word cue
        per_cue_tables.append(prepare_r123_strength_table(cue_data, cue_name))
    swow_8500 = pd.concat(per_cue_tables)
    return swow_8500.loc[swow_8500['R123.Str'] >= conf_filter]

# swow_8500_df = prepare_swow_8500(df)

def clean_eat_dataset(data, conf_filter=0.1):
    """Return the rows of ``data`` whose ``occ_conf`` is >= ``conf_filter``.

    ``occ_conf`` is converted to float and ``occ_count`` to int in the
    returned frame. The conversion is done on a copy, so the caller's
    DataFrame is left untouched.

    Args:
        data: DataFrame with ``occ_conf`` and ``occ_count`` columns.
        conf_filter: Minimum ``occ_conf`` a row must have to be kept.

    Returns:
        A new DataFrame with the converted columns, keeping the original
        index labels of the surviving rows.
    """
    # assign() returns a new frame and replaces the columns outright, so the
    # numeric dtypes are guaranteed (an in-place .loc[:, col] write may keep
    # the original object dtype).
    converted = data.assign(
        occ_conf=data['occ_conf'].astype(float),
        occ_count=data['occ_count'].astype(int),
    )
    return converted.loc[converted['occ_conf'] >= conf_filter]

def prepare_eat_dataset(conf_filter=0.1):
    """Load the EAT stimulus/response pairs and filter them by confidence.

    Source: http://rali.iro.umontreal.ca/rali/?q=en/Textual%20Resources/EAT

    Stimuli and responses rejected by ``check_word`` are skipped; the
    remaining words are normalised with ``pre``. The response's ``n`` and
    ``r`` XML attributes become ``occ_count`` and ``occ_conf``.

    Args:
        conf_filter: Minimum ``occ_conf`` a pair must have to be kept.

    Returns:
        DataFrame with columns ``cue``, ``response``, ``occ_count``,
        ``occ_conf``, as returned by ``clean_eat_dataset``.
    """
    tree = ET.parse('../data/association/eat-stimulus-response.xml')
    root = tree.getroot()
    rows = []
    for stimulus in tqdm(root.findall("stimulus"), position=0, leave=True, desc="Loading EAT"):
        stimulus_word = stimulus.attrib['word']
        if not check_word(stimulus_word):
            continue
        cue = pre(stimulus_word)  # same for every response of this stimulus
        for res in stimulus.findall("response"):
            res_word = res.attrib['word']
            if check_word(res_word):
                rows.append({'cue': cue, 'response': pre(res_word),
                             'occ_count': res.attrib['n'], 'occ_conf': res.attrib['r']})
    # Name the columns explicitly so the frame has them even when no row was kept.
    eat_table = pd.DataFrame(rows, columns=['cue', 'response', 'occ_count', 'occ_conf'])
    # Forward the caller's threshold (it was previously pinned to 0.1 here).
    return clean_eat_dataset(eat_table, conf_filter=conf_filter)

def load_association_dataset():
    """Load both association datasets into a dict keyed "swow8500" and "eat"."""
    datasets = {}
    datasets["swow8500"] = prepare_swow_8500()
    datasets["eat"] = prepare_eat_dataset()
    return datasets

# Load both association datasets when this cell runs; the loaders read the
# SWOW spreadsheet and the EAT XML file from ../data/association/.
association_datasets = load_association_dataset()

In [None]:
# Scratch experiment, left commented out: recompute occ_conf within each cue
# as occ_count / sum(occ_count).
# # association_datasets['eat']['occ_count'].astype(int)
# eat_dataset = []
# for cue_name, cue_data in association_datasets['eat'].groupby(['cue']):
#     cue_data_copy = cue_data.copy()
#     cue_data_copy.loc[:, 'occ_conf'] = cue_data_copy.loc[:, 'occ_conf'].astype(float)
#     cue_data_copy.loc[:, 'occ_count'] = cue_data_copy.loc[:, 'occ_count'].astype(int)
#     cue_data_copy.loc[:, 'occ_conf'] = cue_data_copy['occ_count']/cue_data_copy['occ_count'].sum()
#     eat_dataset.append(cue_data_copy)
# eat_dataset = pd.concat(eat_dataset)

# NOTE(review): `x` is not assigned by any active code in view; its only
# assignment is the commented-out `x = clean_eat_dataset(...)` in the next
# cell, so this line depends on notebook execution order — confirm.
x.query("occ_conf >= 0.1")['cue'].nunique()

6673

In [None]:
# association_datasets['eat']['occ_conf'].astype(float).describe()
# x = clean_eat_dataset(association_datasets['eat'].copy())
# Number of distinct cues in the loaded EAT table.
association_datasets['eat']['cue'].nunique()

7182

## Load Analogy datasets

In [None]:
def load_google_analogy():
    """Read the Google analogy set into a dict of section title -> analogies.

    A line containing ":" starts a new section, whose title is the line with
    its first two characters removed. Every other line is split on
    whitespace; tokens rejected by ``check_word`` are dropped, the rest are
    normalised with ``pre``, and the line is kept only if exactly four
    tokens remain.
    """
    sections = {}
    with open("../data/analogy/google_analogy_set.txt", 'r') as handle:
        for raw_line in handle:
            text = raw_line.rstrip("\n")
            if ":" in text:
                # section header
                current = text[2:]
                sections[current] = []
                continue
            words = [pre(token) for token in text.split() if check_word(token)]
            if len(words) == 4:
                sections[current].append(words)
    return sections

# x = load_google_analogy()


def load_bats_analogy(samples_per_file=63, seed=0):
    """Sample analogy word lists from the BATS 3.0 files.

    Every section directory matching ``[0-9]*`` is visited, except those
    whose path contains "Inflectional_morphology". In each file, a line is
    kept as a word pair when, after dropping tokens rejected by
    ``check_word`` and normalising with ``pre``, it has exactly two distinct
    tokens. Files with more than ``samples_per_file`` pairs contribute
    ``samples_per_file`` analogies, each made by concatenating two randomly
    sampled pairs.

    Args:
        samples_per_file: Number of analogies drawn from each eligible file.
        seed: Value passed to ``random.seed`` before sampling (this reseeds
            the global ``random`` generator, as before).

    Returns:
        List of word lists; only those with exactly four distinct words are
        returned.
    """
    random.seed(seed)
    file_analogy = []
    # glob returns paths in arbitrary order; sort both listings so that the
    # fixed seed really gives the same sample on every machine.
    for section_path in sorted(glob.glob("../data/analogy/BATS_3.0/[0-9]*")):
        if "Inflectional_morphology" in section_path:
            continue
        for file_path in sorted(glob.glob(section_path + "/*")):
            file_analogy_prefix = []
            with open(file_path, 'r') as f:
                for line in f:
                    analogy_prefix = [pre(x) for x in line.split() if check_word(x)]
                    if len(set(analogy_prefix)) == 2:
                        file_analogy_prefix.append(analogy_prefix)
            if len(file_analogy_prefix) > samples_per_file:
                for _ in range(samples_per_file):
                    first, second = random.sample(file_analogy_prefix, 2)
                    # New list, so the stored pairs are never modified.
                    file_analogy.append(first + second)
    return [x for x in file_analogy if len(set(x)) == 4]

# Build the sampled BATS analogy list (the loader seeds `random` itself).
file_analogy = load_bats_analogy()


## Test dataset file

In [None]:
import pickle  # repeats the import from the first cell; harmless
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# this file must come from a trusted source.
with open("../data/all_datasets.pickle", 'rb') as f:
    all_dataset = pickle.load(f)

In [None]:
# all_dataset['association_datasets']['eat']['cue'].nunique()
# Size of the stored Google analogy collection.
len(all_dataset['analogy_datasets']['google_analogy'])#.keys()

994