<h1>Ensemble model</h1>


Version 1, 17 Jun 2016<br>
Jan Šnajder

In [1]:
import sys

import scipy as sp
import pandas as pd

from composes.matrix.dense_matrix import DenseMatrix
from composes.matrix.sparse_matrix import SparseMatrix
from composes.semantic_space.space import Space
from composes.similarity.cos import CosSimilarity
from composes.similarity.similarity import Similarity
from composes.utils import io_utils
from composes.transformation.scaling.row_normalization import RowNormalization

%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [50]:
proj_path = '/home/jan/b9/derivsem/'
src_path = proj_path + 'src/'
data_path = '/data/dsm/sdewac/'
sys.path.append(src_path)

In [235]:
from Polysemy import *

In [67]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Distributional spaces

In [93]:
space = {}

### CBOW

In [116]:
# Load the w2, w5 and w10 CBOW model files and apply length-based row
# normalization to each; results are stored under 'cbow-w2' / 'cbow-w5' / 'cbow-w10'.
for window in (2, 5, 10):
    model_file = 'cbow/cbow_300dim_hs0/sdewac.300.cbow.hs0.w%d.vsm.pkl' % window
    space['cbow-w%d' % window] = io_utils.load(data_path + model_file).apply(RowNormalization(criterion='length'))

space_cbow = space['cbow-w2']

# Later cells use `space_cbow_norm`, which was never assigned in this notebook.
# Every space loaded above is already row-normalized, so it is aliased here.
# NOTE(review): confirm that the w2 space is the one those cells were meant to use.
space_cbow_norm = space_cbow

In [95]:
space_cbow.cooccurrence_matrix

<composes.matrix.sparse_matrix.SparseMatrix at 0x7f35279130d0>

In [62]:
type(space_cbow.cooccurrence_matrix)

composes.matrix.sparse_matrix.SparseMatrix

In [11]:
isinstance(space_cbow.cooccurrence_matrix, DenseMatrix)

True

In [41]:
sp.shape(space_cbow.cooccurrence_matrix)

(289699, 300)

### Count-based

In [117]:
model_file = 'count-based/sdewac_2015-11-23/sdewac-mst.prepro.bow-c10k-w5.ppmi.matrix.pkl'
space['ppmi'] = io_utils.load(data_path + model_file).apply(RowNormalization(criterion = 'length'))

# Later cells refer to the count-based space as `space_ppmi`; define the alias
# here so they do not depend on a variable left over from an earlier kernel session.
space_ppmi = space['ppmi']

In [78]:
space['ppmi'].cooccurrence_matrix

<composes.matrix.sparse_matrix.SparseMatrix at 0x7f34ace94810>

In [79]:
type(space['ppmi'].cooccurrence_matrix)

composes.matrix.sparse_matrix.SparseMatrix

In [80]:
sp.shape(space['ppmi'].cooccurrence_matrix)

(289393, 10000)

### TODO: COW model

### Gur350 check

In [53]:
gur350_gold = sp.genfromtxt(proj_path + "data/gur350-gold.txt", dtype=None, names=('w1', 'w2', 'gold', 'stdev'))

In [106]:
%%capture
predicted_cbow = space_cbow.get_sims(gur350_gold[['w1','w2']], CosSimilarity());

In [107]:
eval_correlation(predicted_cbow, gur350_gold['gold'])

'r = 0.469658, rho = 0.429867, r_cov = 0.655112, rho_cov = 0.686132, cov = 293 (0.84%)'

In [56]:
%%capture
predicted_ppmi = space_ppmi.get_sims(gur350_gold[['w1','w2']], CosSimilarity());

In [57]:
evalCorrelation(predicted_ppmi, gur350_gold['gold'])

'r = 0.469658, rho = 0.429867, r_cov = 0.655112, rho_cov = 0.686132, cov = 293 (0.84%)'

In [110]:
space.keys()

['cbow-w5', 'ppmi', 'cbow-w10', 'cbow-w2']

In [118]:
%%capture
pred = {}
for name in space.keys():
    pred[name] = space[name].get_sims(gur350_gold[['w1', 'w2']], CosSimilarity());

In [119]:
for name in sorted(space.keys()):
    print('%s: %s' % (name, evalCorrelation(pred[name], gur350_gold['gold'])))

cbow-w10: r = 0.590681, rho = 0.572978, r_cov = 0.740932, rho_cov = 0.745762, cov = 240 (0.69%)
cbow-w2: r = 0.603446, rho = 0.570147, r_cov = 0.769273, rho_cov = 0.782691, cov = 260 (0.74%)
cbow-w5: r = 0.598424, rho = 0.570678, r_cov = 0.754332, rho_cov = 0.759574, cov = 248 (0.71%)
ppmi: r = 0.469658, rho = 0.429867, r_cov = 0.655112, rho_cov = 0.686132, cov = 293 (0.84%)


# Data

In [126]:
pairs_df = pd.read_csv(proj_path + "data/pairs-all.txt", sep=' ')

In [127]:
pairs_df

Unnamed: 0,pattern,word1,word2,polysemy,invCL
0,dAA02,abgedeckt_A,unabgedeckt_A,0,0.051396
1,dAA02,abhängig_A,unabhängig_A,3,0.494741
2,dAA02,abkömmlich_A,unabkömmlich_A,0,0.354503
3,dAA02,absehbar_A,unabsehbar_A,1,0.214439
4,dAA02,absetzbar_A,unabsetzbar_A,2,0.047764
5,dAA02,absichtlich_A,unabsichtlich_A,1,0.245723
6,dAA02,abwendbar_A,unabwendbar_A,0,0.517306
7,dAA02,achtsam_A,unachtsam_A,1,0.353418
8,dAA02,ähnlich_A,unähnlich_A,1,0.068213
9,dAA02,anfechtbar_A,unanfechtbar_A,1,0.502672


In [143]:
def get_pairs(pairs_df, pattern):
    """Return the [word1, word2] pairs of one pattern as a list of two-element lists.

    Rows are selected where the 'pattern' column equals `pattern`; the original
    row order is kept.
    """
    is_requested_pattern = pairs_df['pattern'] == pattern
    word_columns = pairs_df.loc[is_requested_pattern, ['word1', 'word2']]
    return word_columns.values.tolist()

In [None]:
# Split into 50-30-20

# Model predictions

In [None]:
# TODO: lex fun

In [None]:
# TODO: hubness

In [123]:
model = {}
for s in space.keys():
    model['baseline-' + s] = BaselineModel(space[s])
    model['add-' + s] = AdditiveModel(space[s])

In [149]:
m = BaselineModel(space['cbow-w5'])
pairs = get_train_pairs(pairs_df, 'dAA02')
m.fit(pairs)

In [160]:
mrr_score(m, pairs, verbose=True, pos='A', max_neighbors=100)

abgedeckt_A: correct target 'unabgedeckt_A' is at rank 43 out of 100
abhängig_A: correct target 'unabhängig_A' is at rank 4 out of 100
abkömmlich_A: correct target 'unabkömmlich_A' is at rank 78 out of 100
absehbar_A: correct target 'unabsehbar_A' is at rank 11 out of 100
absetzbar_A: correct target 'unabsetzbar_A' is at rank 0 out of 100
absichtlich_A: correct target 'unabsichtlich_A' is at rank 2 out of 100
abwendbar_A: correct target 'unabwendbar_A' is at rank 5 out of 100
achtsam_A: correct target 'unachtsam_A' is at rank 0 out of 100
ähnlich_A: correct target 'unähnlich_A' is at rank 7 out of 100
anfechtbar_A: correct target 'unanfechtbar_A' is at rank 3 out of 100
angebracht_A: correct target 'unangebracht_A' is at rank 0 out of 100
angefochten_A: correct target 'unangefochten_A' is at rank 0 out of 100
angekündigt_A: correct target 'unangekündigt_A' is at rank 0 out of 100
angemeldet_A: correct target 'unangemeldet_A' is at rank 24 out of 100
angemessen_A: correct target 'unange

0.16140655961221811

In [228]:
m = LexfunModel(space['cbow-w5'])
pairs = get_train_pairs(pairs_df, 'dAA02')
m.fit(pairs[:400])

Training lexical function...dummy with 400 samples


In [229]:
mrr_score(m, pairs[400:], verbose=True, pos='A', max_neighbors=100)

Computing MRR score on 235 pairs
natürlich_A: correct target 'unnatürlich_A' is at rank 0 out of 100
nennbar_A: correct target 'unnennbar_A' is at rank 0 out of 100
normal_A: correct target 'unnormal_A' is at rank 45 out of 100
nötig_A: correct target 'unnötig_A' is at rank 51 out of 100
ordentlich_A: correct target 'unordentlich_A' is at rank 31 out of 100
organisch_A: correct target 'unorganisch_A' is at rank 0 out of 100
organisiert_A: correct target 'unorganisiert_A' is at rank 0 out of 100
orthodox_A: correct target 'unorthodox_A' is at rank 0 out of 100
pädagogisch_A: correct target 'unpädagogisch_A' is at rank 0 out of 100
parteiisch_A: correct target 'unparteiisch_A' is at rank 0 out of 100
parteilich_A: correct target 'unparteilich_A' is at rank 0 out of 100
passend_A: correct target 'unpassend_A' is at rank 17 out of 100
passierbar_A: correct target 'unpassierbar_A' is at rank 3 out of 100
pathetisch_A: correct target 'unpathetisch_A' is at rank 76 out of 100
persönlich_A: co

0.072392106891079

In [236]:
m = LexfunModel(space['cbow-w5'])
pairs = get_train_pairs(pairs_df, 'dAA02')
m.fit(pairs[:400])

Training lexical function...dummy with 400 samples


In [238]:
mrr_score(m, pairs[400:], pos='A', max_neighbors=100)

0.072392106891079

In [239]:
m = LexfunModel(space['cbow-w5'], learner='Ridge')
pairs = get_train_pairs(pairs_df, 'dAA02')
m.fit(pairs[:400])

Training lexical function...dummy with 400 samples


In [240]:
mrr_score(m, pairs[400:], pos='A', max_neighbors=100)

0.1942375195547534

# 3 Model and evaluation

In [None]:
vector_kaufen = space_cbow.get_row('kaufen_V')
get_neighbors(vector_kaufen, space_cbow, n_neighbors=5)

In [None]:
get_neighbors(vector_kaufen, space_cbow, n_neighbors=5, pos='A')

Sanity check: comparison with Composes implementation...

In [None]:
space_cbow.get_neighbours('kaufen_V', 5, CosSimilarity())

# 4 Baseline model

In [None]:
baseline_cbow = BaselineModel(space_cbow)

In [None]:
score(baseline_cbow, pairs)

In [None]:
# With POS restriction
score(baseline_cbow, pairs, pos='A')

# 5 Additive model (prototype-based)

In [None]:
additive_cbow = AdditiveModel(space_cbow)

Score on the train set:

In [None]:
additive_cbow.fit(pairs)
score(additive_cbow, pairs, pos='A')

Score using 10-fold CV:

In [None]:
score_cv(additive_cbow, pairs, random_state=42, pos='A', verbose=False)

Score using LOOCV:

In [None]:
score_cv(additive_cbow, pairs, random_state=42, pos='A', verbose=False, folds='loocv')

# 6 Diff vectors clustering

In [None]:
X, _ = get_diff_vectors(space_cbow, pairs)

In [None]:
shape(X)

In [None]:
from sklearn.cluster import KMeans
c = KMeans(n_clusters=2, random_state=42)
c.fit(X)
y1 = c.predict(X); y1

In [None]:
from sklearn.decomposition import PCA
X_2d = PCA(n_components=2).fit_transform(X)
scatter(X_2d[:,0], X_2d[:,1], c=c.predict(X), cmap='prism', s=50);

In [None]:
from k_medoids import KMedoids
c = KMedoids(n_clusters=2, random_state=42)
c.fit(X)
y1 = c.predict(X); y1

In [None]:
from sklearn.decomposition import PCA
X_2d = PCA(n_components=2).fit_transform(X)
scatter(X_2d[:,0], X_2d[:,1], c=c.predict(X), cmap='prism', s=50);

In [None]:
from k_medoids import KMedoids
c = KMedoids(n_clusters=2, random_state=42, distance_metric='cosine')
c.fit(X)
y1 = c.predict(X); y1

In [None]:
from sklearn.decomposition import PCA
X_2d = PCA(n_components=2).fit_transform(X)
scatter(X_2d[:,0], X_2d[:,1], c=c.predict(X), cmap='prism', s=50);

In [None]:
from sklearn.datasets import make_classification
X, _ = make_classification(n_features=2, n_classes=2, n_informative=2, n_redundant=0)

In [None]:
from k_medoids import KMedoids
c = KMedoids(n_clusters=2, random_state=42)
c.fit(X)
y1 = c.predict(X); y1

In [None]:
from sklearn.decomposition import PCA
scatter(X[:,0], X[:,1], c=c.predict(X), cmap='prism', s=50);

In [None]:
X = get_base_vectors(space_cbow, pairs)
shape(X)

In [None]:
from sklearn import mixture
g = mixture.GMM(n_components=3)

In [None]:
g.fit(X)

In [None]:
g.bic(X)

In [None]:
g.predict(X)

In [None]:
import matplotlib.pyplot as plt 

bic = []
aic = []
ks = range(1, 6)
for k in ks:
    g = mixture.GMM(n_components=k).fit(X) 
    bic.append(g.bic(X))
    aic.append(g.aic(X))
plt.plot(ks, aic, label="AIC")
plt.plot(ks, bic, label="BIC")
plt.legend()
plt.show()

In [None]:
g = mixture.GMM(n_components=2)
g.fit(X)

In [None]:
from sklearn.decomposition import PCA
X_2d = PCA(n_components=2).fit_transform(X)
scatter(X_2d[:,0], X_2d[:,1], c=g.predict(X), cmap='prism');

In [None]:
pairs1 = pairs[g.predict(X)==0]
pairs2 = pairs[g.predict(X)==1]

In [None]:
shape(pairs1)

In [None]:
shape(pairs2)

In [None]:
# Use the print() call form (as other cells in this notebook do) so the cell
# runs under both Python 2 and Python 3; the printed text is unchanged.
for w1, w2 in pairs1:
    print('%s %s' % (w1, w2))

In [None]:
# Use the print() call form (as other cells in this notebook do) so the cell
# runs under both Python 2 and Python 3; the printed text is unchanged.
for w1, w2 in pairs2:
    print('%s %s' % (w1, w2))

In [None]:
sp.savetxt(proj_path + "data/dVA01-bar-cluster1.txt", pairs1, fmt='%s')
sp.savetxt(proj_path + "data/dVA01-bar-cluster2.txt", pairs2, fmt='%s')

In [None]:
model1 = AdditiveModel(space_cbow)
model2 = AdditiveModel(space_cbow)
model1.fit(pairs1)
model2.fit(pairs2)

Difference between the two diff vectors...

In [None]:
from scipy.spatial.distance import cosine
1 - cosine(model1.diff_vector.mat, model2.diff_vector.mat)

Model scores on the train set...

In [None]:
score(model1, pairs1, pos='A')

In [None]:
score(model2, pairs2, pos='A')

CV scores (optimistic, because each test pair always comes from the correct cluster):

In [None]:
score_cv(model1, pairs1, random_state=42, pos='A', verbose=False)

In [None]:
score_cv(model2, pairs2, random_state=42, pos='A', verbose=False)

### Checking differences between kmeans and gmm

In [None]:
X = sp.random.random((100,300))

In [None]:
c1 = KMeans(n_clusters=2, random_state=42)
c1.fit(X)
y1 = c1.predict(X)
c2 = mixture.GMM(n_components=2, covariance_type='tied', random_state=42)
c2.fit(X)
y2 = c2.predict(X)
y1 == y2

In [None]:
y1

In [None]:
y2

=> Generally, gmm and kmeans give different cluster assignments.

# Cluster+predict model

Clustering of diff vectors:

In [None]:
m = ClusterAdditiveModel(space_cbow, n_clusters=3, cluster_select='BasePredictSim', random_state=42)
m.fit(pairs, verbose=True)

In [None]:
m.models

In [None]:
v = m.predict_with('kaufen_V', 0)
get_neighbors(v, space_cbow, pos='A')

In [None]:
v = m.predict('kaufen_V', verbose=True)
get_neighbors(v, space_cbow, pos='A')

Clustering of base words (rather than diff vectors):

In [None]:
m = ClusterAdditiveModel(space_cbow, n_clusters='BIC', clustering_instance='BaseWord', cluster_select='BasePredictSim', random_state=42)
m.fit(pairs, verbose=True)

In [None]:
v = m.predict('kaufen_V', verbose=True)
get_neighbors(v, space_cbow, pos='A')

In [None]:
m = ClusterAdditiveModel(space_cbow, n_clusters='BIC', clustering_instance='BaseWord', cluster_select='BaseClusterSim', random_state=42)
m.fit(pairs, verbose=True)

In [None]:
v = m.predict('kaufen_V', verbose=True)
get_neighbors(v, space_cbow, pos='A')

In [None]:
# `v1` was never assigned in this notebook. It is defined here the same way the
# `v2` cell below defines its vector (prediction with cluster model 0 instead of 1).
v1 = m.predict_with('kaufen_V', 0)
sp.linalg.norm(v1.mat)

In [None]:
avg_neighbors_sim(v1, space_cbow, pos='A')

In [None]:
v2 = m.predict_with('kaufen_V', 1)
get_neighbors(v2, space_cbow, pos='A')

In [None]:
sp.linalg.norm(v2.mat)

In [None]:
avg_neighbors_sim(v2, space_cbow, pos='A')

In [None]:
#TODO: Think again about vector normalization

In [None]:
m.predict('kaufen_V', verbose=True)

In [None]:
m = ClusterAdditiveModel(space_cbow, n_clusters=2, cluster_select='BasePredictSim', random_state=42)
m.fit(pairs)
score(m, pairs, verbose=False, pos='A')

In [None]:
m = ClusterAdditiveModel(space_ppmi, n_clusters=2, cluster_select='BasePredictSim', random_state=42)
m.fit(pairs)
score(m, pairs, verbose=False, pos='A')

### CV setup

In [None]:
pairs_train, pairs_holdout = pairs[0:50,:], pairs[50:,:]

In [None]:
shape(pairs_train), shape(pairs_holdout)

In [None]:
m_kmeans = ClusterAdditiveModel(space_cbow, clustering='kmeans', n_clusters=2, cluster_select='BasePredictSim', random_state=42)
score_cv(m_kmeans, pairs_train, pos='A', random_state=42, test_pairs_extra=pairs_holdout, verbose=True)

In [None]:
m_kmeans = ClusterAdditiveModel(space_cbow, clustering='kmeans', n_clusters=2, cluster_select='BasePredictSim', random_state=42)
score_cv(m_kmeans, pairs, pos='A', random_state=42)

In [None]:
m_kmeans = ClusterAdditiveModel(space_cbow, clustering='kmedoids', n_clusters=2, cluster_select='BasePredictSim', random_state=42)
score_cv(m_kmeans, pairs, pos='A', random_state=42)

In [None]:
m_kmeans = ClusterAdditiveModel(space_cbow, clustering='kmeans', n_clusters=2, cluster_select='BaseSim', random_state=66)
score_cv(m_kmeans, pairs, pos='A', random_state=66)

In [None]:
m_gmm = ClusterAdditiveModel(space_cbow, clustering='gmm', n_clusters=2, cluster_select='BaseSim', random_state=42)
score_cv(m_gmm, pairs, pos='A', random_state=42)

In [None]:
m_gmm = ClusterAdditiveModel(space_cbow, clustering='gmm', n_clusters=2, cluster_select='BaseSim', random_state=42)
score_cv(m_gmm, pairs, pos='A', random_state=42)

In [None]:
m_gmm = ClusterAdditiveModel(space_cbow, clustering='gmm', n_clusters=2, cluster_select='BaseSim', random_state=66)
score_cv(m_gmm, pairs, pos='A', random_state=66)

### Comparing kmeans and gmm

In [None]:
m_gmm = ClusterAdditiveModel(space_cbow, clustering='gmm', n_clusters=2, cluster_select='BaseSim', random_state=42)
m_gmm.fit(pairs)
y1 = m_gmm.cluster_assignments

m_kmeans = ClusterAdditiveModel(space_cbow, clustering='kmeans', n_clusters=2, cluster_select='BaseSim', random_state=42)
m_kmeans.fit(pairs)
y2 = m_kmeans.cluster_assignments

In [None]:
y1 == y2

=> In our case, cluster assignments are equivalent for kmeans and gmm.

# Exemplar model

In [None]:
m = AdditiveExemplarModel(space_cbow)
m.fit(pairs, verbose=True)

In [None]:
v = m.predict('kaufen_V', verbose=True)
get_neighbors(v, space_cbow, pos='A')

In [None]:
v = m.predict('singen_V', verbose=True)
get_neighbors(v, space_cbow, pos='A')

In [None]:
m = AdditiveExemplarModel(space_ppmi)
m.fit(pairs, verbose=True)

In [None]:
v = m.predict('kaufen_V', verbose=True)
get_neighbors(v, space_ppmi, pos='A')

# 7 Filtering based on GermanWN

In [None]:
# reads in pairs with polysemy level as the first attribute
def load_pairs(filename, pos1, pos2):
    """Read whitespace-separated rows of the form `<level> <word1> <word2>`.

    Parameters
    ----------
    filename : path of the text file to read.
    pos1, pos2 : suffix tags appended (after an underscore) to the first and
        second word of every row, e.g. 'V' and 'A' give 'kaufen_V', 'kaufbar_A'.

    Returns
    -------
    A list with one `[int(level), word1 + '_' + pos1, word2 + '_' + pos2]`
    entry per row. A real list is returned (not a lazy `map` object), so the
    result can be traversed more than once on Python 3 as well.
    """
    # Function-scope import: the `scipy.loadtxt` alias of this function is
    # deprecated, and numpy's own loader is the documented replacement.
    import numpy as np

    # ndmin=2 keeps a one-row file two-dimensional, so each `row` is always a
    # full record rather than a single field.
    rows = np.loadtxt(filename, dtype=str, ndmin=2)
    return [[int(row[0]), row[1] + '_' + pos1, row[2] + '_' + pos2] for row in rows]

In [None]:
dVA01 = load_pairs(proj_path + "data/dVA01-bar-data.polysemy", 'V', 'A')
dVA01_pairs = sp.array(dVA01)[:,[1,2]]
dVA01_monosemous_pairs = sp.array([(w1,w2) for (p, w1, w2) in dVA01 if p<=1])

In [None]:
dVA01_monosemous_pairs

In [None]:
len(dVA01_monosemous_pairs)

In [None]:
baseline_cbow = BaselineModel(space_cbow)
score_cv(baseline_cbow, dVA01_monosemous_pairs, pos='A')

In [None]:
additive_cbow = AdditiveModel(space_cbow)
additive_cbow.fit(dVA01_monosemous_pairs)
score_cv(additive_cbow, dVA01_monosemous_pairs, pos='A')

In [None]:
cluster_additive_cbow = ClusterAdditiveModel(space_cbow, n_clusters='BIC', cluster_select='BaseSim', random_state=42)
cluster_additive_cbow.fit(dVA01_monosemous_pairs, verbose=True)
score_cv(cluster_additive_cbow, dVA01_monosemous_pairs, random_state=42, pos='A', verbose=False)

In [None]:
# Sweep the number of clusters (k = 3..6) on the monosemous pairs and print
# the cross-validated score for each k.
for k in range(3, 7):
    print('\nk=%d' % k)
    cluster_additive_cbow = ClusterAdditiveModel(space_cbow, n_clusters=k, cluster_select='BaseSim', random_state=42)
    cluster_additive_cbow.fit(dVA01_monosemous_pairs, verbose=True)
    # print() call form, consistent with the line above and valid on Python 3.
    print(score_cv(cluster_additive_cbow, dVA01_monosemous_pairs, random_state=42, pos='A', verbose=False))

### Filtering based on InvCL

In [None]:
# reads in pairs with inclusion data (invCL)
# File format is:
#   offenbar_A offenbaren_V clarkeDE: 0.134455726236 invCL: 0.328609106994
def load_pairs(filename):
    """Read inclusion rows and return `[invCL, second_word, first_word]` lists.

    Each whitespace-separated row has six fields (see the format comment
    above): field 5 is converted to float, and fields 1 and 0 are returned in
    that (swapped) order.

    NOTE(review): this definition replaces the three-argument `load_pairs`
    defined earlier in the notebook; re-run the matching definition before
    re-running cells that call the other variant.

    Returns a real list (not a lazy `map` object), because the calling cell
    traverses the result twice.
    """
    # Function-scope import: the `scipy.loadtxt` alias of this function is
    # deprecated, and numpy's own loader is the documented replacement.
    import numpy as np

    # ndmin=2 keeps a one-row file two-dimensional.
    rows = np.loadtxt(filename, dtype=str, ndmin=2)
    return [[float(row[5]), row[1], row[0]] for row in rows]

In [None]:
dVA01 = load_pairs(proj_path + "data/dVA01-bar-data.inclusion")
dVA01_pairs = sp.array(dVA01)[:,[1,2]]
dVA01_inclusive_pairs = sp.array([(w1,w2) for (i, w1, w2) in dVA01 if i>=0.72])

In [None]:
dVA01_inclusive_pairs

In [None]:
# `xs` is only assigned much further down (a directory listing); the count
# wanted here is that of the filtered pairs, as in the polysemy section.
len(dVA01_inclusive_pairs)

In [None]:
baseline_cbow = BaselineModel(space_cbow)
score_cv(baseline_cbow, dVA01_inclusive_pairs, pos='A')

In [None]:
additive_cbow = AdditiveModel(space_cbow)
additive_cbow.fit(dVA01_inclusive_pairs)
score_cv(additive_cbow, dVA01_inclusive_pairs, pos='A')

In [None]:
cluster_additive_cbow = ClusterAdditiveModel(space_cbow, n_clusters='BIC', cluster_select='BaseSim', random_state=42)
cluster_additive_cbow.fit(dVA01_inclusive_pairs, verbose=True)
score_cv(cluster_additive_cbow, dVA01_inclusive_pairs, random_state=42, pos='A', verbose=False)

In [None]:
# Sweep the number of clusters (k = 3..6) on the invCL-filtered pairs and
# print the cross-validated score for each k.
for k in range(3, 7):
    print('k=%d' % k)
    cluster_additive_cbow = ClusterAdditiveModel(space_cbow, n_clusters=k, cluster_select='BaseSim', random_state=42)
    cluster_additive_cbow.fit(dVA01_inclusive_pairs, verbose=True)
    # print() call form, consistent with the line above and valid on Python 3.
    print(score_cv(cluster_additive_cbow, dVA01_inclusive_pairs, random_state=42, pos='A', verbose=False))

### Filtering based on Polysemy + InvCL

In [None]:
# intersect1d flattens its inputs, so on these (n, 2) arrays it would return
# the words the two sets share, not the (word1, word2) pairs they share.
# Compare whole rows as tuples instead.
sp.array(sorted(set(map(tuple, dVA01_monosemous_pairs)) & set(map(tuple, dVA01_inclusive_pairs))))

### Pandas dataframe

In [None]:
pairs_df = pd.read_csv(proj_path + "data/pairs.txt", sep=' ')

In [None]:
pairs_df

In [None]:
dVA01_wellbehaved_df = pairs_df[conjunction(pairs_df.pattern=='dVA01', pairs_df.polysemy<=1, pairs_df.invCL>=0.5)]
dVA01_wellbehaved_df

In [None]:
# Name the frame explicitly: `_` only holds the previous cell's output in an
# interactive session and breaks under Restart & Run All.
len(dVA01_wellbehaved_df)

In [None]:
# The filtered frame built two cells above is called `dVA01_wellbehaved_df`;
# `pairs_wellbehaved_df` is not defined anywhere in this notebook.
dVA01_wellbehaved_pairs = sp.array(dVA01_wellbehaved_df[['word1', 'word2']])
dVA01_wellbehaved_pairs

In [None]:
baseline_cbow = BaselineModel(space_cbow)
score_cv(baseline_cbow, dVA01_wellbehaved_pairs, pos='A', folds=len(dVA01_wellbehaved_pairs))

In [None]:
additive_cbow = AdditiveModel(space_cbow)
additive_cbow.fit(dVA01_wellbehaved_pairs)
score_cv(additive_cbow, dVA01_wellbehaved_pairs, pos='A', folds=len(dVA01_wellbehaved_pairs))

In [None]:
cluster_additive_cbow = ClusterAdditiveModel(space_cbow, n_clusters='BIC', cluster_select='BaseSim', random_state=42)
cluster_additive_cbow.fit(dVA01_wellbehaved_pairs, verbose=True)
score_cv(cluster_additive_cbow, dVA01_wellbehaved_pairs, random_state=42, pos='A', verbose=False, folds=len(dVA01_wellbehaved_pairs))

In [None]:
# Sweep the number of clusters (k = 3..6) on the well-behaved pairs, using as
# many folds as there are pairs, and print the score for each k.
for k in range(3, 7):
    print('k=%d' % k)
    cluster_additive_cbow = ClusterAdditiveModel(space_cbow, n_clusters=k, cluster_select='BaseSim', random_state=42)
    cluster_additive_cbow.fit(dVA01_wellbehaved_pairs, verbose=True)
    # print() call form, consistent with the line above and valid on Python 3.
    print(score_cv(cluster_additive_cbow, dVA01_wellbehaved_pairs, random_state=42, pos='A', verbose=False, folds=len(dVA01_wellbehaved_pairs)))

# 8 Grand experiment

In [None]:
def pattern_pos(pattern):
    """Return the characters at positions 1 and 2 of a pattern id as a tuple.

    For example 'dVV31' gives ('V', 'V'). Callers in this notebook use the
    second element as the part of speech passed to the scoring functions.
    """
    first_tag = pattern[1]
    second_tag = pattern[2]
    return (first_tag, second_tag)

In [None]:
pattern_pos('dVV31')

In [None]:
def avg_invCL(pairs_df, pattern):
    """Return the median of the 'invCL' column over the rows of one pattern.

    Despite the `avg` in the name, the statistic computed is the median.
    """
    in_pattern = pairs_df.pattern == pattern
    return pairs_df.loc[in_pattern, 'invCL'].median()

In [None]:
avg_invCL(pairs_df, 'dAA02') 

In [None]:
def partition_pairs(pairs_df, pattern, polysemy_threshold=None, invCL_threshold=None, only_pairs=False):
    """Split the rows of one pattern by polysemy and invCL thresholds.

    Parameters
    ----------
    pairs_df : DataFrame with columns 'pattern', 'word1', 'word2', 'polysemy'
        and 'invCL'.
    pattern : value of the 'pattern' column to select.
    polysemy_threshold : if given, a row passes only when polysemy <= threshold.
    invCL_threshold : if given, a row passes only when invCL >= threshold.
    only_pairs : when True, return arrays of [word1, word2] rows instead of
        dataframes.

    Returns
    -------
    (accepted, rejected): the rows of `pattern` that pass every given
    threshold, and the rows of `pattern` that fail at least one. With no
    thresholds, every row of the pattern is accepted and `rejected` is empty.
    """
    def _word_pairs(df):
        return df[['word1', 'word2']].values

    # Start from "every row passes" and narrow down per supplied threshold.
    # `is not None` (rather than `!= None`) keeps the check an identity test.
    passes = pd.Series(True, index=pairs_df.index)
    if polysemy_threshold is not None:
        passes = passes & (pairs_df.polysemy <= polysemy_threshold)
    if invCL_threshold is not None:
        passes = passes & (pairs_df.invCL >= invCL_threshold)

    in_pattern = pairs_df.pattern == pattern
    accepted = pairs_df[in_pattern & passes]
    rejected = pairs_df[in_pattern & ~passes]

    if only_pairs:
        return _word_pairs(accepted), _word_pairs(rejected)
    return accepted, rejected

In [None]:
df1, df2 = partition_pairs(pairs_df, 'dAA02', polysemy_threshold=1, invCL_threshold=0.5)

In [None]:
# print() call form so the cell also runs on Python 3; output text is unchanged.
print('%s %s' % (df1.shape, df2.shape))

In [None]:
df1

In [None]:
df2

In [None]:
def eval_pattern(space, pairs_df, pattern, folds=10, random_state=None, verbose=False,
                 polysemy_threshold=1, invCL_threshold=0.5):
    """Score every model variant on every data partition of one pattern.

    Parameters
    ----------
    space : semantic space handed to each model constructor.
    pairs_df : dataframe of word pairs, as expected by `partition_pairs`.
    pattern : pattern id; its third character is passed to `score_cv` as `pos`.
    folds, random_state : forwarded to `score_cv` (`random_state` also to the
        clustering models).
    verbose : when True, print each score as it is computed.
    polysemy_threshold, invCL_threshold : thresholds used for the 'Mono',
        'Incl' and 'MonoIncl' partitions. The defaults (1 and 0.5) are the
        values that were previously hard-coded.

    Returns
    -------
    A DataFrame indexed by model name with one column per data set; each cell
    holds the string '<rof> ± <rof_error>' built from the second and third
    values returned by `score_cv`.
    """
    def kmeans_variants(label, clustering_instance, cluster_select):
        # One k-means ClusterAdditiveModel for each cluster count k = 2..5.
        return [('CluAdditive (%s, kmeans, k=%d, %s)' % (label, k, cluster_select),
                 ClusterAdditiveModel(space, clustering_instance=clustering_instance, clustering='kmeans',
                                      n_clusters=k, cluster_select=cluster_select, random_state=random_state))
                for k in range(2, 6)]

    models = [
        ('Baseline', BaselineModel(space)),
        ('Additive', AdditiveModel(space)),
        ('AdditiveExemplar', AdditiveExemplarModel(space))]
    models += kmeans_variants('DiffVectors', 'DiffVector', 'BasePredictSim')
    models += kmeans_variants('BaseWord', 'BaseWord', 'BasePredictSim')
    models += kmeans_variants('BaseWord', 'BaseWord', 'BaseClusterSim')

    pairs_all, _ = partition_pairs(pairs_df, pattern, only_pairs=True)
    pairs_mono1, pairs_mono0 = partition_pairs(pairs_df, pattern, polysemy_threshold=polysemy_threshold,
                                               only_pairs=True)
    pairs_incl1, pairs_incl0 = partition_pairs(pairs_df, pattern, invCL_threshold=invCL_threshold,
                                               only_pairs=True)
    pairs_monoincl1, pairs_monoincl0 = partition_pairs(pairs_df, pattern, polysemy_threshold=polysemy_threshold,
                                                       invCL_threshold=invCL_threshold, only_pairs=True)

    _, deriv_pos = pattern_pos(pattern)

    # (name, pairs given to score_cv, extra test pairs or None). The last three
    # entries reuse the filtered pairs and add the rejected pairs as extra test data.
    data = [
        ('All', pairs_all, None),
        ('Mono', pairs_mono1, None),
        ('Incl', pairs_incl1, None),
        ('MonoIncl', pairs_monoincl1, None),
        ('Mono', pairs_mono1, pairs_mono0),
        ('Incl', pairs_incl1, pairs_incl0),
        ('MonoIncl', pairs_monoincl1, pairs_monoincl0)]

    model_names = [n for n, _ in models]
    # `is not None`: the extra test set is a numpy array, and `!= None` on an
    # array is an element-wise comparison rather than a presence check.
    data_names = ['%s (%s:%d+%d)' % (pattern, pairs_name, len(pairs_train),
                                     len(pairs_extra_test) if pairs_extra_test is not None else 0)
                  for pairs_name, pairs_train, pairs_extra_test in data]
    scores_df = pd.DataFrame(index=model_names, columns=data_names)

    for data_name, (_, pairs_train, pairs_extra_test) in zip(data_names, data):
        if verbose:
            print('Data: %s' % data_name)
        for model_name, model in models:
            _, rof, rof_error = score_cv(model, pairs_train, test_pairs_extra=pairs_extra_test,
                                         pos=deriv_pos, folds=folds, random_state=random_state)
            # Single .loc assignment instead of chained indexing, which may
            # write to a temporary copy.
            scores_df.loc[model_name, data_name] = '%.3f ± %.2f' % (rof, rof_error)
            if verbose:
                print('  %s: %.3f ± %.2f' % (model_name, rof, rof_error))

    return scores_df

In [None]:
df = eval_pattern(space_cbow, pairs_df, 'dAA02', folds=10, random_state=42); df

In [None]:
pd.unique(pairs_df['pattern'])

In [None]:
patterns = pd.unique(pairs_df['pattern'])
writer = pd.ExcelWriter('PolysemyDerivation-cbow-norm.xlsx')

for pattern in patterns:
    df = eval_pattern(space_cbow_norm, pairs_df, pattern, folds=10, random_state=42, verbose=True)
    df.to_excel(writer, pattern)
    writer.save()
    display(df)

In [None]:
patterns = pd.unique(pairs_df['pattern'])
writer = pd.ExcelWriter('PolysemyDerivation-ppmi.xlsx')

for pattern in patterns:
    df = eval_pattern(space_ppmi, pairs_df, pattern, folds=10, random_state=42)
    df.to_excel(writer, pattern)
    writer.save()
    display(df)

# Test

In [None]:
def median_invCL(pairs_df, pattern):
    """Return the median 'invCL' value among the rows whose pattern equals `pattern`."""
    pattern_rows = pairs_df[pairs_df.pattern == pattern]
    invCL_values = pattern_rows['invCL']
    return invCL_values.median()

In [None]:
def pattern_pos(pattern):
    """Return a 2-tuple holding the second and third characters of a pattern id.

    Example: 'dAV04' -> ('A', 'V').
    """
    tag_a, tag_b = pattern[1], pattern[2]
    return (tag_a, tag_b)

def partition_pairs(pairs_df, pattern, polysemy_threshold=None, invCL_threshold=None, only_pairs=False):
    """Partition one pattern's rows into those that meet the thresholds and those that do not.

    Parameters
    ----------
    pairs_df : DataFrame with columns 'pattern', 'word1', 'word2', 'polysemy'
        and 'invCL'.
    pattern : value of the 'pattern' column to select.
    polysemy_threshold : optional upper bound (inclusive) on 'polysemy'.
    invCL_threshold : optional lower bound (inclusive) on 'invCL'.
    only_pairs : when True, return arrays of [word1, word2] rows instead of
        dataframes.

    Returns
    -------
    (accepted, rejected): rows of `pattern` satisfying all supplied
    thresholds, and rows of `pattern` violating at least one. When no
    threshold is supplied, `rejected` is empty.
    """
    def _word_pairs(df):
        return df[['word1', 'word2']].values

    # Identity test (`is not None`) instead of `!= None`; the mask starts as
    # all-True and is narrowed by each threshold that was actually given.
    passes = pd.Series(True, index=pairs_df.index)
    if polysemy_threshold is not None:
        passes = passes & (pairs_df.polysemy <= polysemy_threshold)
    if invCL_threshold is not None:
        passes = passes & (pairs_df.invCL >= invCL_threshold)

    in_pattern = pairs_df.pattern == pattern
    accepted = pairs_df[in_pattern & passes]
    rejected = pairs_df[in_pattern & ~passes]

    if only_pairs:
        return _word_pairs(accepted), _word_pairs(rejected)
    return accepted, rejected


Data: dAV01 (MonoIncl:46+59)
 CluAdditive (BaseWord, kmeans, k=3, BaseClusterSim): 0.495 ± 0.12
 /proj/sci/b9/modality/ipython/k_medoids.py:176: UserWarning: Cluster 2 is empty!

Data: dAV04 (All:185+0)
CluAdditive (BaseWord, kmedoids, k=2, BasePredictSim): 0.351 ± 0.06

In [None]:
pairs_df = pd.read_csv('/home/jan/b9-modality/data/pairs-XX/pairs-AV.txt', sep=' ')

In [None]:
pattern = 'dAV04'
folds = 10

invCL_median = median_invCL(pairs_df, pattern)

pairs_all, _ = partition_pairs(pairs_df, pattern, only_pairs=True)
pairs_mono1, pairs_mono0 = partition_pairs(pairs_df, pattern, polysemy_threshold=1, only_pairs=True)
pairs_incl1, pairs_incl0 = partition_pairs(pairs_df, pattern, invCL_threshold=invCL_median, only_pairs=True)
pairs_monoincl1, pairs_monoincl0 = partition_pairs(pairs_df, pattern, polysemy_threshold=1, invCL_threshold=invCL_median, only_pairs=True)
pairs_train = pairs_all
pairs_extra_test = None

_, deriv_pos = pattern_pos(pattern)

model = ClusterAdditiveModel(space_cbow_norm, clustering_instance='BaseWord', clustering='kmedoids', n_clusters=3, cluster_select='BasePredictSim', random_state=42)

_, rof, rof_error = score_cv(model, pairs_train, test_pairs_extra=pairs_extra_test, pos=deriv_pos, folds=folds, random_state=42)
print('%.3f ± %.2f' % (rof, rof_error))

In [None]:
test_ix = [9, 18, 29, 30, 55, 56,  60,  65,  66,  75, 113, 119, 124, 126, 135, 146, 165, 170, 176]
train_ix = sp.delete(sp.arange(0,185), test_ix)
train_pairs = pairs_train[train_ix]
test_pairs = pairs_train[test_ix]

In [None]:
len(train_pairs), len(test_pairs)

In [None]:
X = get_base_vectors(space_cbow_norm, train_pairs)
Y = get_base_vectors(space_cbow_norm, test_pairs)

In [None]:
sp.shape(X)

In [None]:
# `self` does not exist at notebook level (this line was pasted from a method
# body); use the same space as the get_base_vectors cell a few cells above.
X = get_base_vectors(space_cbow_norm, train_pairs)

In [None]:
c = KMedoids(n_clusters=3, random_state=42, distance_metric='cosine')

In [None]:
c.fit(X)

In [None]:
c.predict(X)

In [None]:
c.predict(Y)

In [None]:
v = space_cbow_norm.get_row('Hund_N') + DenseMatrix(sp.zeros(1))

In [None]:
v.mat == space_cbow_norm.get_row('Hund_N').mat

In [None]:
space_cbow_norm.get_row('Hund_N').mat

In [None]:
v.mat

In [None]:
eval_pattern(space_cbow, pairs_df, 'dAV04', random_state=42, verbose=True)

Data: dAV04 (All:185+0)
  Baseline: 0.362 ± 0.06
  Additive: 0.367 ± 0.07
  AdditiveExemplar: 0.167 ± 0.05
  CluAdditive (DiffVectors, kmeans, k=2, BasePredictSim): 0.340 ± 0.05
  CluAdditive (DiffVectors, kmeans, k=3, BasePredictSim): 0.367 ± 0.07
  CluAdditive (DiffVectors, kmeans, k=4, BasePredictSim): 0.345 ± 0.05
  CluAdditive (DiffVectors, kmeans, k=5, BasePredictSim): 0.351 ± 0.06
  CluAdditive (DiffVectors, kmedoids, k=2, BasePredictSim): 0.351 ± 0.07
  CluAdditive (DiffVectors, kmedoids, k=3, BasePredictSim): 0.346 ± 0.06
  CluAdditive (DiffVectors, kmedoids, k=4, BasePredictSim): 0.345 ± 0.06
  CluAdditive (DiffVectors, kmedoids, k=5, BasePredictSim): 0.323 ± 0.07
  CluAdditive (BaseWord, kmeans, k=2, BasePredictSim): 0.356 ± 0.06
  CluAdditive (BaseWord, kmeans, k=3, BasePredictSim): 0.356 ± 0.07
  CluAdditive (BaseWord, kmeans, k=4, BasePredictSim): 0.361 ± 0.08
  CluAdditive (BaseWord, kmeans, k=5, BasePredictSim): 0.351 ± 0.06
  CluAdditive (BaseWord, kmedoids, k=2, BasePredictSim): 0.351 ± 0.06
Traceback (most recent call last):
  File "EvalPatterns.py", line 122, in <module>


# Pooled results (2x2 design)

In [None]:
m = ClusterAdditiveModel(space_cbow_norm, clustering_instance='BaseWord', clustering='kmedoids', n_clusters='AIC', cluster_select='BaseClusterSim', random_state=42)
m.fit(pairs, verbose=True)
#score(m, pairs, verbose=False, pos='A')

In [None]:
# `random_state` is not defined at notebook level (the call was copied out of
# eval_pattern); use the literal seed that the identical call in the Test
# section passes.
_, rof, rof_error = score_cv(model, pairs_train, test_pairs_extra=pairs_extra_test,
                             pos=deriv_pos, folds=folds, random_state=42)

In [None]:
# (intentionally empty: this cell held a lone '-', which is a syntax error and stops Run All)

In [None]:
pairs_df = pd.read_csv(proj_path + "data/all/pairs-all.txt", sep=' ')

In [None]:
r = eval_pattern_2(space_cbow_norm, pairs_df, 'dAA03', random_state=42, verbose=True); r

In [None]:
from IPython.display import display
import os
path = proj_path + 'ipython/results2/'
xs = os.listdir(path)
d = pd.read_excel(path + xs[0])
for x in xs[1:]:
    d2 = pd.read_excel(path + x)
    d = d.add(d2)
d

In [None]:
d / d['All']['n_pairs']

In [None]:
zs = ['dAN03', 'dAN04', 'dAN09', 'dAN10', 'dAN11', 'dAN12', 'dAN16', 'dNA01', 'dNA02', 'dNA05', 'dNA06', 'dNA25', 'dNA26', 'dNA27', 'dNV09', 'dVA02', 'dVA03', 'dVA12', 'dVA13', 'dVN07', 'dVN09']

In [None]:
ws = [x for x in xs if not any([z in x for z in zs])]

In [None]:
from IPython.display import display
import os
d = pd.read_excel(path + ws[0])
for w in ws[1:]:
    d2 = pd.read_excel(path + w)
    d = d.add(d2)
d

In [None]:
d / d['All']['n_pairs']

### TODO (24 Jan 2016)

* <strike>Fix score_cv</strike>
* <strike>Implement POS filter in the scoring functions</strike>
* <strike>invCL filtering</strike>
* <strike>Put all word pairs data into a single Pandas dataframe</strike>
* <strike>Check clustering variance</strike>
* Implement oracles
* <strike>Stability of GMM (initial centroids)</strike>
* <strike>GMM parameters (maybe use full cov matrix?)</strike>
* <strike>Implement k-nn instead of GMM</strike>
* <strike>All patterns</strike>
* <strike>Train on subset, predict on all</strike>
* <strike>Exemplar model</strike>
* <strike>Base-centroid cluster selection</strike>
* <strike>**Grand experiment**</strike>
* <strike>Cluster base words</strike>
* <strike>Check gmm vs kmeans results</strike>
* <strike>Fix margin of error for LOOCV</strike>
* Evaluate with a count-based model

### Vector plausibility

* Vecchi et al 2011. (http://aclweb.org/anthology/W/W11/W11-1301.pdf)


1. Vector length

2. Similarity to base verb vector

3. Avg/median similarity to N nearest neighbors




### Oracles

* n_clusters:
  * the number of clusters that maximizes RooN

* cluster_select:
  * choose the cluster that maximizes RooN (if there is such)
  * compute the gold diff vector and choose the cluster which maximizes the class likelihood