In [2]:
import os, pickle
from joblib import Parallel, delayed
from gensim.models import KeyedVectors
import numpy as np
from numpy.linalg import norm
from scipy.spatial.distance import cosine, euclidean
from scipy.stats import pearsonr
import pandas as pd
import matplotlib.pyplot as plt

# import tensorflow as tf
# import tensorflow.contrib.eager as tfe
# tf.enable_eager_execution()

In [33]:
def get_fns_and_meta(data_pth, folders):
    """ Collect metadata for every word-vector file found under the
        given sub-folders of `data_pth`.

        Parameters
        ----------
        data_pth : str
            Root directory holding one sub-directory per model family.
        folders : iterable of str
            Sub-directory names to scan. 'glove' and 'w2v' get a
            dimensionality entry; other names get none.

        Returns
        -------
        dict
            Keyed by the file name minus its last four characters
            (e.g. a '.txt' / '.bin' suffix). Each value is a dict with
            'root', 'class', 'fn' and, for 'glove' / 'w2v', 'd'.

        Raises
        ------
        ValueError
            If a file in the 'glove' folder has no '<digits>d'
            component in its dot-separated name.
    """
    models_meta = {}
    for folder in folders:
        for file in os.listdir(os.path.join(data_pth, folder)):
            # Hidden entries (e.g. '.DS_Store') are not model files; the
            # old slice-based parsing crashed on them with int('...').
            if file.startswith('.'):
                continue

            model_meta = {'root': data_pth, 'class': folder, 'fn': file}

            if folder == 'glove':
                # GloVe names look like 'glove.6B.50d.w2vformat.txt':
                # the dimensionality is the '<digits>d' component.
                dims = [int(part[:-1]) for part in file.split('.')
                        if part.endswith('d') and part[:-1].isdecimal()]
                if not dims:
                    raise ValueError(
                        "cannot read vector dimensionality from GloVe "
                        "file name %r" % file)
                model_meta['d'] = dims[0]
            elif folder == 'w2v':
                model_meta['d'] = 300

            # key = file name without its 4-character suffix
            models_meta[file[:-4]] = model_meta

    return models_meta

def load_relsim_data(path='', fn='relsim_mean_ratings.csv'):
    """ Read the relational-similarity ratings CSV and add integer
        relation-type columns.

        Parameters
        ----------
        path : str
            Directory containing the file ('' = current directory).
            A trailing separator is optional.
        fn : str
            CSV file name.

        Returns
        -------
        pandas.DataFrame
            The CSV contents plus 'rel1_type' / 'rel2_type', obtained by
            dropping the last character of 'relation1' / 'relation2'
            and converting the remainder to int.
    """
    # os.path.join instead of `path + fn`, so 'data' and 'data/' both work
    df = pd.read_csv(os.path.join(path, fn))
    df['rel1_type'] = df['relation1'].apply(lambda x: int(x[:-1]))
    df['rel2_type'] = df['relation2'].apply(lambda x: int(x[:-1]))

    return df

def words_in_vocab(words, model):
    """ Return True only if every word in `words` is known to `model`.

        `model` may expose its vocabulary as a `.vocab` attribute, or
        be a plain {word: vector} mapping, in which case `.keys()` is
        used. An empty `words` yields True.
    """
    # Only a missing `.vocab` attribute triggers the mapping fallback;
    # any other error now propagates instead of being swallowed.
    try:
        known_words = model.vocab
    except AttributeError:
        known_words = model.keys()

    return all(w in known_words for w in words)


def compute_similarity(u, v, metric='e'):
    """ Similarity between two vectors (larger = more similar).

        Supported `metric` values:
          'inner product' / 'ip'  -> np.dot(u, v)
          'cosine' / 'c'          -> 1 - cosine distance
          'euclidean' / 'e'       -> negative Euclidean distance
          'dawn_euclidean' / 'd'  -> 1 - Euclidean distance

        Raises
        ------
        ValueError
            For any other `metric` (previously this silently
            returned None).
    """
    if metric in ('inner product', 'ip'):
        return np.dot(u, v)

    if metric in ('cosine', 'c'):
        # scipy's `cosine` is a distance, so flip it into a similarity
        return 1 - cosine(u, v)

    if metric in ('euclidean', 'e'):
        return -euclidean(u, v)

    if metric in ('dawn_euclidean', 'd'):
        return 1 - euclidean(u, v)

    raise ValueError("unknown similarity metric: %r" % (metric,))

    
def get_analogy_words(trial):
    """ Return the four words of a trial as
        [pair1_word1, pair1_word2, pair2_word1, pair2_word2].
    """
    word_fields = ('pair1_word1', 'pair1_word2',
                   'pair2_word1', 'pair2_word2')
    return [getattr(trial, field) for field in word_fields]


def get_word_vector(word, model, normalize=True):
    """ Look up `word` in `model`; by default the vector is divided by
        its norm, otherwise it is returned exactly as stored.
    """
    vec = model[word]

    if not normalize:
        return vec

    return vec / norm(vec)


def get_relsim_vocab(df):
    """ Return the distinct words found in the four word columns of
        `df`, as a list (order not guaranteed).
    """
    word_columns = ('pair1_word1', 'pair1_word2',
                    'pair2_word1', 'pair2_word2')

    distinct_words = set()
    for column in word_columns:
        distinct_words.update(df[column].unique())

    return list(distinct_words)


def create_condensed_model_relsim(df, model):
    """ Build a condensed {word: vector} model restricted to the words
        appearing in the relational similarity data `df`.
    """
    relsim_words = get_relsim_vocab(df)
    condensed = create_condensed_model(relsim_words, model)
    return condensed


def create_condensed_model(vocab, model):
    """ Create a condensed model as a {word: vector}
        dictionary object for a smaller vocabulary.

        Parameters
        ----------
        vocab : iterable of str
            Words to keep. Pass the words themselves: iterating a
            pandas DataFrame yields its column labels, not its cells.
        model : object with `.vocab` and `model[word]`, or a mapping
            Source of the vectors. Models without a `.vocab` attribute
            (e.g. an already condensed dict) are handled via `.keys()`,
            matching `words_in_vocab`.

        Returns
        -------
        dict
            {word: model[word]} for every word of `vocab` the model
            knows; unknown words are skipped.
    """
    try:
        known_words = model.vocab
    except AttributeError:
        known_words = model.keys()

    return {word: model[word] for word in vocab if word in known_words}


def load_model(model_fn='GoogleNews-vectors-negative300.bin',
               data_pth = '../../../../datasets/word-vector-datasets/',
               binary=True, load_condensed_stem=None, 
               condensed_vocab=None, save_condensed=False, 
               condensed_path=None):
    
    """ Load a word vector model w/ gensim, or a cached condensed
        {word: vector} dict built from it.

        Parameters
        ----------
        model_fn : str
            File name of the word2vec-format model. Names containing
            'glove' are read as text from '<data_pth>/glove/'; names
            containing 'GoogleNews' are read from '<data_pth>/w2v/'.
        data_pth : str
            Root directory of the word-vector datasets.
        binary : bool
            Passed to gensim; forced to False for 'glove' files.
        load_condensed_stem : str or None
            Suffix of the condensed cache file; the cache name is
            `model_fn[:-3] + load_condensed_stem`.
        condensed_vocab : iterable of str or None
            Words to keep in the condensed model.
        save_condensed : bool
            Pickle a newly built condensed model to the cache path.
        condensed_path : str or None
            Directory holding the condensed cache files.

        Returns
        -------
        The full gensim KeyedVectors when any of `load_condensed_stem`,
        `condensed_vocab`, `condensed_path` is None; otherwise the
        condensed dict (read from cache when the file exists).
    """
    
    if 'glove' in model_fn:
        binary = False
        data_pth = os.path.join(data_pth, 'glove')
    elif 'GoogleNews' in model_fn:
        data_pth = os.path.join(data_pth, 'w2v')
    model_path = os.path.join(data_pth, model_fn)

    # Identity checks: `None not in [...]` compares with ==, which is
    # ambiguous when the vocabulary is a numpy array.
    want_condensed = all(
        arg is not None
        for arg in (load_condensed_stem, condensed_vocab, condensed_path))

    if not want_condensed:
        return KeyedVectors.load_word2vec_format(model_path, binary=binary)

    c_model_fn = model_fn[:-3] + load_condensed_stem
    c_model_path = os.path.join(condensed_path, c_model_fn)

    if os.path.isfile(c_model_path):
        # NOTE: unpickling executes arbitrary code -- only point
        # `condensed_path` at cache files you created yourself.
        with open(c_model_path, "rb") as cache_file:
            return pickle.load(cache_file)

    model = KeyedVectors.load_word2vec_format(model_path, binary=binary)
    c_model = create_condensed_model(condensed_vocab, model)

    if save_condensed:
        # Create the cache directory so the expensive load above is
        # not wasted on a missing-folder error.
        cache_dir = os.path.dirname(c_model_path)
        if cache_dir:
            os.makedirs(cache_dir, exist_ok=True)
        with open(c_model_path, "wb") as cache_file:
            pickle.dump(c_model, cache_file)

    return c_model


def get_diff_vecs(words, model, dims=None):
    """ Difference vectors for the two word pairs in `words`:
        (vec(words[0]) - vec(words[1]), vec(words[2]) - vec(words[3])),
        using the default (normalized) word vectors.

        When `dims` is given, both differences are indexed with it
        before being returned.
    """
    vecs = [get_word_vector(words[i], model) for i in range(4)]

    first_diff = vecs[0] - vecs[1]
    second_diff = vecs[2] - vecs[3]

    if dims is not None:
        return first_diff[dims], second_diff[dims]

    return first_diff, second_diff
    

def naive_train_test_split(df, val_percent=0.2, 
                           shuffle=True, seed=1):
    """ Split row positions of `df` into train / validation sets.

        Doesn't avoid shared single words
        across train and test sets!!

        Parameters
        ----------
        df : object with `.shape[0]` rows
        val_percent : float
            Fraction of rows assigned to validation.
        shuffle : bool
            Shuffle positions before splitting.
        seed : int
            Seed for the shuffle. The same seed always gives the same
            split, so pass a different seed per repetition.

        Returns
        -------
        (train_idxs, val_idxs) : positional index arrays, suitable
        for `df.iloc[...]`.
    """
    train_percent = 1 - val_percent

    n = df.shape[0]
    idxs = np.arange(n)
    if shuffle:
        # A private RandomState yields the same permutation as
        # np.random.seed(seed) + np.random.shuffle, without resetting
        # the global RNG for the rest of the notebook.
        np.random.RandomState(seed).shuffle(idxs)

    n_train = int(n * train_percent)

    return idxs[:n_train], idxs[n_train:]


def score_preds(df):
    """ Pearson correlation (scipy `pearsonr` result) between
        `mean_rating` and `preds`, over rows flagged `in_vocab`.
    """
    scored_rows = df[df.in_vocab==True]
    return pearsonr(scored_rows.mean_rating, scored_rows.preds)


def get_rel_sim_preds(df, model, dims=None,
                      metric='e'):
    """ Predict a relational similarity for every trial (row) of `df`.

        For each row the similarity of the two pair difference vectors
        is computed with `metric` (optionally restricted to `dims`).
        Rows with a word missing from `model` get the placeholder 999
        and `in_vocab` False.

        NOTE: the 'preds' and 'in_vocab' columns are written onto the
        passed-in `df`, which is also the return value.
    """
    scores = []
    covered = []

    for _, trial in df.iterrows():
        trial_words = get_analogy_words(trial)

        if not words_in_vocab(trial_words, model):
            # placeholder prediction; excluded later via `in_vocab`
            scores.append(999)
            covered.append(False)
            continue

        first_diff, second_diff = get_diff_vecs(trial_words, model,
                                                dims=dims)
        scores.append(compute_similarity(first_diff, second_diff,
                                         metric=metric))
        covered.append(True)

    df['preds'] = scores
    df['in_vocab'] = covered
    return df

def search_for_best_axes(df, model, epsilon=0, verbose=0):
    """ Find the subset of dimensions (axis-aligned subspace)
        giving the best fit to human data.

        Greedy single pass: each dimension is tentatively dropped, and
        the drop is kept only if the Pearson score improves by more
        than `epsilon` over the current best.

        Parameters
        ----------
        df : pandas.DataFrame
            Trials to score. 'preds' / 'in_vocab' columns are
            (re)written on it by `get_rel_sim_preds`.
        model : gensim KeyedVectors or {word: vector} dict
        epsilon : float
            Minimum score gain required to drop a dimension.
        verbose : int
            0 = silent, 1 = base/final score, 2 = every improvement.

        Returns
        -------
        (feat_idx_keep, base_score, best_score)
            Kept dimension indices, score with all dimensions, and
            score with the kept dimensions.

        Raises
        ------
        ValueError
            If `model` is an empty mapping.
    """
    
    # Dimensionality comes from the model itself rather than from a
    # hard-coded probe word, which a condensed model may not contain.
    n_feats = getattr(model, 'vector_size', None)
    if n_feats is None:
        try:
            first_vector = next(iter(model.values()))
        except StopIteration:
            raise ValueError('model contains no word vectors')
        n_feats = np.asarray(first_vector).size
    feat_idx_keep = np.arange(n_feats)
    
    df_pred = get_rel_sim_preds(df, model)
    base_score = score_preds(df_pred)[0]
    best_score = base_score
    if verbose > 0:
        print('Base Score : %.4f, Features: %i' % (best_score, n_feats))
    
    for feat_idx in np.arange(n_feats):
        
        # candidate set = currently kept dimensions minus this one
        curr_feat_set_proposal = feat_idx_keep[feat_idx_keep!=feat_idx]

        df_pred = get_rel_sim_preds(df, model, dims=curr_feat_set_proposal)
        curr_score = score_preds(df_pred)[0]
        
        if (curr_score > best_score) and (curr_score-best_score > epsilon):
            best_score = curr_score
            feat_idx_keep = curr_feat_set_proposal
            if verbose > 1:
                print('-- New Best: %.4f, Features: %i' % (best_score, feat_idx_keep.size))
                
    if verbose > 0:                
        print('Final Score: %.4f, Features: %i' % (best_score, feat_idx_keep.size))
            
    return feat_idx_keep, base_score, best_score

In [36]:
# Load the human relational similarity data. With the default arguments
# this reads 'relsim_mean_ratings.csv' relative to the working directory.
df_rel_sim = load_relsim_data()

# Distinct words across the four word columns of the dataset; passed
# below as the vocabulary for the condensed models.
vocab = get_relsim_vocab(df_rel_sim)

In [38]:
# Build (or load from cache) one condensed {word: vector} model per
# word-vector file on disk. An earlier single-model call for
# 'glove.6B.50d.w2vformat.txt' used the same keyword arguments as the
# loop below.

data_pth = '../../../../datasets/word-vector-datasets/'
folders = ['glove','w2v']

models = get_fns_and_meta(data_pth, folders)

# store all condensed models in one dict, next to each model's metadata
for model_key in models.keys():
    model_meta = models[model_key]
    model_meta['model'] = load_model(
        model_fn=model_meta['fn'],
        data_pth=model_meta['root'],
        binary=True,  # load_model resets this itself for 'glove' file names
        load_condensed_stem='relsim.condensed.p',
        condensed_vocab=vocab,
        save_condensed=True,
        condensed_path='condensed_models/',
    )

In [None]:


# Load one word vector model w/ gensim and condense it to the relsim
# vocabulary, or reuse the pickled condensed model if it already exists.

# model_fn = 'glove.6B.50d.w2vformat.txt'
# model_fn = 'glove.840B.300d.w2vformat.txt'
model_fn = 'GoogleNews-vectors-negative300.bin'

binary = True # will mark false below if needed
data_pth = '../../../../datasets/word-vector-datasets/'

if 'glove' in model_fn:
    fn_stem = -13   # strips the trailing 'w2vformat.txt'
    binary = False
    data_pth += 'glove/'
else:
    fn_stem = -3    # strips the trailing 'bin'
    data_pth += 'w2v/'

c_model_fn = model_fn[:fn_stem]+'relsim.condensed.p'
c_model_path = 'condensed_models/' + c_model_fn

# load the human data
df_rel_sim = load_relsim_data()

if os.path.isfile(c_model_path):
    # NOTE: on a cache hit the full `model` is NOT loaded, so cells that
    # reference `model` need it from elsewhere.
    with open(c_model_path, "rb") as cache_file:
        c_model = pickle.load(cache_file)
else:
    model = KeyedVectors.load_word2vec_format(data_pth + model_fn, 
                                              binary=binary)
    # Condense on the dataset's words. Passing the DataFrame itself as
    # the vocabulary iterates its column labels, not the words, so
    # caches written that way should be deleted and rebuilt.
    c_model = create_condensed_model_relsim(df_rel_sim, model)
    with open(c_model_path, "wb") as cache_file:
        pickle.dump(c_model, cache_file)

In [None]:
# basic analysis: predict a similarity for every trial with the condensed
# model (default metric), then print the score_preds result, i.e. the
# correlation between mean_rating and preds over in-vocabulary trials.
# get_rel_sim_preds writes 'preds' / 'in_vocab' onto df_rel_sim itself.
df_rel_sim = get_rel_sim_preds(df_rel_sim, c_model)

print(score_preds(df_rel_sim))

In [None]:
# search for a subset of dimensions with best
# overall score across all types/subtypes
# (verbose=2 prints every accepted dimension drop; the returned
#  (kept dims, base score, best score) tuple is the cell output)
# NOTE(review): uses the full `model`, which the loading cell only
# defines when no condensed cache file exists -- confirm it is loaded.

search_for_best_axes(df_rel_sim, model, epsilon=0.0001, verbose=2)

In [None]:
# For each relation type: search for the subset of dimensions with the
# best score on all within-type trials, then repeat the search on
# random train splits and score the chosen dimensions on held-out trials.

n_splits = 10
epsilon = 0.0001

all_base_scores = []
all_best_scores = []

train_base_scores = []
train_best_scores = []

val_base_scores = []
val_best_scores = []

for rel_type in range(1, 11):
    
    # within-TYPE trials only (what Dawn did for paper!)
    exp_params = (df_rel_sim.rel1_type==rel_type) & (df_rel_sim.rel2_type==rel_type)
    
    df_exp = df_rel_sim[exp_params].copy()
    
    print('Type', rel_type, ' - All Data Score', df_exp.shape[0])

    feats_all_data, all_base_score, all_best_score = \
        search_for_best_axes(df_exp, model, verbose=1, epsilon=epsilon)
    all_base_scores.append(all_base_score)
    all_best_scores.append(all_best_score)
    print('')
    
    avg_train_base_scores = []
    avg_train_best_scores = []
    avg_val_base_scores = []
    avg_val_best_scores = []
    
    for split in range(n_splits):
        # Seed with the split index. With the default seed every
        # repetition produced the same partition, so the "mean over
        # n_splits" was a mean over identical splits.
        train_idxs, val_idxs = naive_train_test_split(df_exp, 
                                                      val_percent=0.2, 
                                                      shuffle=True,
                                                      seed=split)

        feats_train, train_base_score, train_best_score = \
            search_for_best_axes(df_exp.iloc[train_idxs].copy(), 
                                 model, verbose=0, epsilon=epsilon)
        
        # held-out score with all dimensions vs. with the train-selected ones
        df_val_base = get_rel_sim_preds(df_exp.iloc[val_idxs].copy(), model)
        df_val = get_rel_sim_preds(df_exp.iloc[val_idxs].copy(), model, dims=feats_train)
        val_base_score = score_preds(df_val_base)[0]
        val_best_score = score_preds(df_val)[0]
        print(val_best_score)
        
        avg_train_base_scores.append(train_base_score)
        avg_train_best_scores.append(train_best_score)
        avg_val_base_scores.append(val_base_score)
        avg_val_best_scores.append(val_best_score)
        
    print('mean val', np.mean(avg_val_best_scores))
    
    train_base_scores.append(np.mean(avg_train_base_scores))
    train_best_scores.append(np.mean(avg_train_best_scores))
    val_base_scores.append(np.mean(avg_val_base_scores))
    val_best_scores.append(np.mean(avg_val_best_scores))

    print('')
    print('')

In [None]:
### JOBLIB TEST!!! ###

# Same per-type search as the sequential cross-validation, with the
# train/validation repetitions run in parallel on a condensed model.

# Condense on the dataset's words. Passing the DataFrame itself as the
# vocabulary iterates its column labels, not the words.
condensed_model = create_condensed_model_relsim(df_rel_sim, model)

n_splits = 50
epsilon = 0.0001

all_base_scores = []
all_best_scores = []

train_base_scores = []
train_best_scores = []

val_base_scores = []
val_best_scores = []


def run_split(seed, df_exp):
    """ One train/validation repetition for the trials in `df_exp`.

        Returns (train_base_score, train_best_score,
                 val_base_score, val_best_score).
    """
    train_idxs, val_idxs = naive_train_test_split(df_exp, 
                                                  val_percent=0.2,
                                                  shuffle=True,
                                                  seed=seed)

    feats_train, train_base_score, train_best_score = \
        search_for_best_axes(df_exp.iloc[train_idxs].copy(), 
                             condensed_model, verbose=0, epsilon=epsilon)
    
    df_val_base = get_rel_sim_preds(df_exp.iloc[val_idxs].copy(), condensed_model)
    val_base_score = score_preds(df_val_base)[0]
    
    df_val = get_rel_sim_preds(df_exp.iloc[val_idxs].copy(), condensed_model, dims=feats_train)
    val_best_score = score_preds(df_val)[0]
    
    return train_base_score, train_best_score, val_base_score, val_best_score


for rel_type in range(1, 11):
    
    # within-TYPE trials only (what Dawn did for paper!)
    exp_params = (df_rel_sim.rel1_type==rel_type) & (df_rel_sim.rel2_type==rel_type)
    
    df_exp = df_rel_sim[exp_params].copy()
    
    print('Type', rel_type, ' - All Data Score', df_exp.shape[0])

    feats_all_data, all_base_score, all_best_score = \
        search_for_best_axes(df_exp, model, verbose=0, epsilon=epsilon)
    all_base_scores.append(all_base_score)
    all_best_scores.append(all_best_score)
    
    avg_train_base_scores = []
    avg_train_best_scores = []
    avg_val_base_scores = []
    avg_val_best_scores = []
    
    # n_jobs=-1 uses the available CPUs instead of forcing one worker
    # per split; the split index doubles as the seed.
    results = Parallel(n_jobs=-1)(delayed(run_split)(i, df_exp) for i in range(n_splits))
    for result in results: print(result)
    
    for result in results:        
        avg_train_base_scores.append(result[0])
        avg_train_best_scores.append(result[1])
        avg_val_base_scores.append(result[2])
        avg_val_best_scores.append(result[3])
        
    print('mean val', np.mean(avg_val_best_scores))
    
    train_base_scores.append(np.mean(avg_train_base_scores))
    train_best_scores.append(np.mean(avg_train_best_scores))
    val_base_scores.append(np.mean(avg_val_base_scores))
    val_best_scores.append(np.mean(avg_val_best_scores))

    print('')
    print('')

In [None]:
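# Type 1 epsilon test
# (columns appear to be: features kept, epsilon, best score, validation score -- TODO confirm)
# 121 0       0.5298 0.4289
# 121 0.00001 0.5298 0.4127
# 127 0.0001  0.5230 0.4552
# 204 0.001   0.4023 0.3364
# 300 0.01    0.1511 NA
# 300 0.1     0.1511 NA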

In [None]:
# Score the model separately on each relation type (1-10), using all
# vector dimensions and the negative-Euclidean metric ('e').
for rel_type in range(1, 11):
    
    # within-TYPE trials only (what Dawn did for paper!)
    exp_params = (df_rel_sim.rel1_type==rel_type) & (df_rel_sim.rel2_type==rel_type)
    
    # copy, so the prediction columns are not written onto df_rel_sim
    df_exp = df_rel_sim[exp_params].copy()
    
    # the number printed here is the trial count for this type
    print('Type', rel_type, ' - All Data Score', df_exp.shape[0])
    df_exp = get_rel_sim_preds(df_exp, model, metric='e')

    # first element of the pearsonr result = correlation coefficient
    print(score_preds(df_exp)[0])
    print('')

In [None]:
# Grouped bar chart of per-type scores: all dimensions vs. best subspace
# on all data vs. best subspace on held-out data (mean over n_splits).
fig, ax = plt.subplots(dpi=150)

# set width of bar
barWidth = 0.25
 
# set height of bar
bars1 = all_base_scores
bars2 = all_best_scores
bars3 = val_best_scores
 
# Set position of bar on X axis
r1 = np.arange(len(bars1))
r2 = r1 + barWidth
r3 = r2 + barWidth
 
# Make the plot (dashed lines mark the mean of each series)
# NOTE(review): the first label says GloVe -- confirm it matches the
# model that produced all_base_scores.
ax.bar(r1, bars1, color='black', width=barWidth, edgecolor='white', 
       label='Original GloVe')
ax.axhline(y=np.mean(bars1), color='black', linestyle='--')
ax.bar(r2, bars2, color='#2d7f5e', width=barWidth, edgecolor='white', 
       label='Best Subspace (All Data)')
ax.axhline(y=np.mean(bars2), color='#2d7f5e', linestyle='--')
# label follows n_splits, so it stays right for the 10- and 50-split runs
ax.bar(r3, bars3, color='purple', width=barWidth, edgecolor='white', 
       label='Best Subspace (Mean %ix Validation)' % n_splits)
ax.axhline(y=np.mean(bars3), color='purple', linestyle='--')
 
# Add xticks on the middle of the group bars; one tick per scored type
ax.set_ylabel('Pearson $r$', fontweight='bold')
ax.set_xlabel('Relation Type', fontweight='bold')
ax.set_xticks(r1 + barWidth)
ax.set_xticklabels(range(1, len(bars1) + 1))

ax.set_ylim([0,1])

# Create legend & Show graphic
ax.legend()
plt.show()

In [None]:
# Inspect the within-type trials for relation type 2 (both pairs type 2).
exp_params = (df_rel_sim.rel1_type==2) & (df_rel_sim.rel2_type==2)

# displayed as the cell output
df_rel_sim[exp_params]

In [None]:
# One scatter plot per relation type: human mean rating vs. Euclidean
# distance between the pair difference vectors, restricted to the
# dimensions kept by the greedy axis search. (A 2x5 subplot grid was
# tried earlier; each type currently gets its own figure.)

for rel_type in range(1, 11):
    plt.figure()
    
    # within-TYPE trials only (what Dawn did for paper!)
    exp_params = (df_rel_sim.rel1_type==rel_type) & (df_rel_sim.rel2_type==rel_type)
    
    df_exp = df_rel_sim[exp_params].copy()
    
    result = search_for_best_axes(df_exp, model, 
                                  epsilon=0.0001, verbose=0)
    good_feats = result[0]
    
    ratings, distances = [], []
    for r, row in df_exp.iterrows():

        words = get_analogy_words(row)

        # Skip trials with out-of-vocabulary words. Plotting them
        # re-used the previous row's `sim` (or failed on the first row).
        if not words_in_vocab(words, model):
            continue

        diff_pair1, diff_pair2 = \
            get_diff_vecs(words, model, dims=good_feats)

        sim = compute_similarity(diff_pair1, diff_pair2,
                                 metric='e')
        ratings.append(row.mean_rating)
        # metric 'e' is the negative distance; negate to plot the distance
        distances.append(-sim)

    plt.scatter(ratings, distances, 
                s=10, color='blue', alpha=0.5)
    plt.xlabel('Mean rating')
    plt.ylabel('Euclidean distance')
    plt.title('Relation type %i' % rel_type)
    print(rel_type)
    plt.show()