<h1>Polysemy in Derivational Models</h1>


Version 13, 11 Feb 2016<br>
Jan Šnajder

In [7]:
import numpy as np
import pandas as pd
import scipy as sp

from composes.matrix.dense_matrix import DenseMatrix
from composes.matrix.sparse_matrix import SparseMatrix
from composes.semantic_space.space import Space
from composes.similarity.cos import CosSimilarity
from composes.similarity.similarity import Similarity
from composes.utils import io_utils
from composes.transformation.scaling.row_normalization import RowNormalization

%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [8]:
from Polysemy import *
#from EvalPatterns import *

In [9]:
%load_ext autoreload
%autoreload 2

# 1 Data

In [10]:
proj_path = "/home/jan/b9-modality/"
data_path = "/data/dsm/sdewac/"

### Verb-Adjective pairs

In [11]:
pairs_file = proj_path + "data/verb.adj.pairs"
pairs = sp.loadtxt(pairs_file, dtype=str)
pairs = sp.array(map(lambda x : [x[0] + "_V", x[1] + "_A"], pairs))

In [12]:
shape(pairs)

(200, 2)

In [13]:
pairs[:5]

array([['offenbaren_V', 'offenbar_A'],
       ['sichten_V', 'sichtbar_A'],
       ['vergleichen_V', 'vergleichbar_A'],
       ['scheinen_V', 'scheinbar_A'],
       ['erkennen_V', 'erkennbar_A']], 
      dtype='|S18')

### Distributional model

In [14]:
space_cbow = io_utils.load(data_path + 'cbow/cbow_300dim_hs0/sdewac.300.cbow.hs0.vsm.pkl')
space_cbow_norm = space_cbow.apply(RowNormalization(criterion = 'length'))

IOError: [Errno 2] No such file or directory: '/data/dsm/sdewac/cbow/cbow_300dim_hs0/sdewac.300.cbow.hs0.vsm.pkl'

In [None]:
space_cbow.cooccurrence_matrix

In [None]:
type(space_cbow.cooccurrence_matrix)

In [None]:
isinstance(space_cbow.cooccurrence_matrix, DenseMatrix)

In [None]:
sp.shape(space_cbow.cooccurrence_matrix)

In [None]:
v1 = space_cbow.get_row('Hund_N').mat
v2 = space_cbow.get_row('Katze_N').mat

In [None]:
type(v1)

In [None]:
space_ppmi = io_utils.load(data_path + 'count-based/sdewac_2015-11-23/sdewac-mst.prepro.bow-c10k-w5.ppmi.matrix.pkl')
space_ppmi_norm = space_ppmi.apply(RowNormalization(criterion = 'length'))

In [None]:
space_ppmi.cooccurrence_matrix

In [None]:
type(space_ppmi.cooccurrence_matrix)

In [None]:
sp.shape(space_ppmi.cooccurrence_matrix)

In [None]:
v3 = space_ppmi.get_row('Hund_N').mat
v4 = space_ppmi.get_row('Katze_N').mat

In [None]:
type(v4)

In [None]:
type(v2.todense())

In [None]:
v1 - v2

In [None]:
sp.shape(sp.transpose(v1))

In [None]:
sp.shape(v2)

In [None]:
sp.dot(v1, v2)

In [None]:
v1.dot(v2.T)

In [None]:
cosine(v1, v2)

In [None]:
shape(v1)

In [None]:
shape(v2)

In [None]:
def my_cosine_distance(v1, v2):
    """Return the cosine distance between two row vectors of the same kind.

    Args:
        v1, v2: either two dense ``matrix`` rows or two sparse ``csr_matrix``
            rows (assumed to be 1 x N -- TODO confirm against callers).

    Returns:
        The cosine distance, i.e. one minus the cosine similarity.

    Raises:
        TypeError: if the arguments are not both dense or both sparse.
    """
    if isinstance(v1, matrix) and isinstance(v2, matrix):
        return cosine(v1, v2)
    if isinstance(v1, csr_matrix) and isinstance(v2, csr_matrix):
        # Sparse rows are handled with sparse dot products, so nothing is
        # densified; [0, 0] pulls the scalar out of the 1 x 1 product.
        dot = v1.dot(v2.T)[0, 0]
        norms = (v1.dot(v1.T)[0, 0] * v2.dot(v2.T)[0, 0]) ** 0.5
        return 1 - dot / norms
    # Previously this case fell through and silently returned None.
    raise TypeError('my_cosine_distance expects two dense or two sparse row vectors, '
                    'got %s and %s' % (type(v1).__name__, type(v2).__name__))

In [None]:
my_cosine(v1, v2)

In [None]:
v1.dot(v2.T)[0, 0] / (sp.linalg.norm(v1) * sp.linalg.norm(v2))

In [None]:
def my_dot(v1, v2):
    """Product of row vector `v1` with the transpose of row vector `v2`.

    Callers read the scalar out of the result with ``[0, 0]``.
    """
    return v1.dot(v2.T)


def my_norm(v):
    """Square root of the dot product of `v` with itself."""
    squared_length = my_dot(v, v)[0, 0]
    return sp.sqrt(squared_length)


def my_cosine(v1, v2):
    """Cosine similarity: dot product divided by the product of the norms."""
    numerator = my_dot(v1, v2)[0, 0]
    denominator = my_norm(v1) * my_norm(v2)
    return numerator / denominator


def my_cosine_dist(v1, v2):
    """Cosine distance, i.e. one minus `my_cosine`."""
    return 1 - my_cosine(v1, v2)

In [None]:
my_cosine(v1, v2)

In [None]:
my_cosine(v3, v4)

In [None]:
type(get_row_dense(space_cbow, 'Hund_N'))

In [None]:
type(get_row_dense(space_ppmi, 'Hund_N'))

In [None]:
sp.linalg.norm(space_cbow.get_row('Hund_N').mat)

In [None]:
sp.linalg.norm(space_cbow_norm.get_row('Hund_N').mat)

# 2 Checks

### Sanity check

In [None]:
space_cbow.get_sim('Hund_N','Katze_N', CosSimilarity())

In [None]:
space_cbow_norm.get_sim('Hund_N','Katze_N', CosSimilarity())

In [None]:
space_cbow.get_sim('Hund_N','Kaufvertrag_N', CosSimilarity())

In [None]:
space_cbow.get_sim('kaufen_V','Kaufvertrag_N', CosSimilarity())

In [None]:
space_ppmi.get_sim('Hund_N','Katze_N', CosSimilarity())

In [None]:
space_ppmi.get_sim('Hund_N','Kaufvertrag_N', CosSimilarity())

In [None]:
space_ppmi.get_sim('kaufen_V','Kaufvertrag_N', CosSimilarity())

### Gur350 check

In [None]:
gur350_gold= sp.genfromtxt(proj_path + "data/gur350-gold.txt", dtype=None, names=('w1', 'w2', 'gold'))

In [None]:
%%capture
predicted_cbow = space_cbow.get_sims(gur350_gold[['w1','w2']], CosSimilarity());

In [None]:
evalCorrelation(predicted_cbow, gur350_gold['gold'])

In [None]:
%%capture
predicted_ppmi = space_ppmi.get_sims(gur350_gold[['w1','w2']], CosSimilarity());

In [None]:
evalCorrelation(predicted_ppmi, gur350_gold['gold'])

In [None]:
# %%capture
# Collect one set of predicted similarities per context-window size.
# A fresh list is built here: writing into predicted_cbow[i] would overwrite
# individual entries of the predictions computed for the default space above.
predicted_cbow = []
for w in ['w2', 'w5', 'w10']:
    # NOTE: this rebinds space_cbow / space_cbow_norm, so after the loop both
    # refer to the 'w10' space and all later cells use that space.
    space_cbow = io_utils.load(data_path + 'cbow/cbow_300dim_hs0/sdewac.300.cbow.hs0.' + w + '.vsm.pkl')
    space_cbow_norm = space_cbow.apply(RowNormalization(criterion = 'length'))
    predicted_cbow.append(space_cbow.get_sims(gur350_gold[['w1','w2']], CosSimilarity()))

In [None]:
for i, w in enumerate(['w2', 'w5', 'w10']):
    print('%s: %s' % (w, evalCorrelation(predicted_cbow[i], gur350_gold['gold'])))

# 3 Model and evaluation

In [None]:
vector_kaufen = space_cbow.get_row('kaufen_V')
get_neighbors(vector_kaufen, space_cbow, n_neighbors=5)

In [None]:
get_neighbors(vector_kaufen, space_cbow, n_neighbors=5, pos='A')

Sanity check: comparison with Composes implementation...

In [None]:
space_cbow.get_neighbours('kaufen_V', 5, CosSimilarity())

# 4 Baseline model

In [None]:
baseline_cbow = BaselineModel(space_cbow)

In [None]:
score(baseline_cbow, pairs)

In [None]:
# With POS restriction
score(baseline_cbow, pairs, pos='A')

# 5 Additive model (prototype-based)

In [None]:
additive_cbow = AdditiveModel(space_cbow)

Score on the train set:

In [None]:
additive_cbow.fit(pairs)
score(additive_cbow, pairs, pos='A')

Score using 10-fold CV:

In [None]:
score_cv(additive_cbow, pairs, random_state=42, pos='A', verbose=False)

Score using LOOCV:

In [None]:
score_cv(additive_cbow, pairs, random_state=42, pos='A', verbose=False, folds='loocv')

# 6 Diff vectors clustering

In [None]:
X, _ = get_diff_vectors(space_cbow, pairs)

In [None]:
shape(X)

In [None]:
from sklearn.cluster import KMeans
c = KMeans(n_clusters=2, random_state=42)
c.fit(X)
y1 = c.predict(X); y1

In [None]:
from sklearn.decomposition import PCA
X_2d = PCA(n_components=2).fit_transform(X)
scatter(X_2d[:,0], X_2d[:,1], c=c.predict(X), cmap='prism', s=50);

In [None]:
from k_medoids import KMedoids
c = KMedoids(n_clusters=2, random_state=42)
c.fit(X)
y1 = c.predict(X); y1

In [None]:
from sklearn.decomposition import PCA
X_2d = PCA(n_components=2).fit_transform(X)
scatter(X_2d[:,0], X_2d[:,1], c=c.predict(X), cmap='prism', s=50);

In [None]:
from k_medoids import KMedoids
c = KMedoids(n_clusters=2, random_state=42, distance_metric='cosine')
c.fit(X)
y1 = c.predict(X); y1

In [None]:
from sklearn.decomposition import PCA
X_2d = PCA(n_components=2).fit_transform(X)
scatter(X_2d[:,0], X_2d[:,1], c=c.predict(X), cmap='prism', s=50);

In [None]:
from sklearn.datasets import make_classification
X, _ = make_classification(n_features=2, n_classes=2, n_informative=2, n_redundant=0)

In [None]:
from k_medoids import KMedoids
c = KMedoids(n_clusters=2, random_state=42)
c.fit(X)
y1 = c.predict(X); y1

In [None]:
from sklearn.decomposition import PCA
scatter(X[:,0], X[:,1], c=c.predict(X), cmap='prism', s=50);

In [None]:
X = get_base_vectors(space_cbow, pairs)
shape(X)

In [None]:
from sklearn import mixture
g = mixture.GMM(n_components=3)

In [None]:
g.fit(X)

In [None]:
g.bic(X)

In [None]:
g.predict(X)

In [None]:
import matplotlib.pyplot as plt 

bic = []
aic = []
ks = range(1, 6)
for k in ks:
    g = mixture.GMM(n_components=k).fit(X) 
    bic.append(g.bic(X))
    aic.append(g.aic(X))
plt.plot(ks, aic, label="AIC")
plt.plot(ks, bic, label="BIC")
plt.legend()
plt.show()

In [None]:
g = mixture.GMM(n_components=2)
g.fit(X)

In [None]:
from sklearn.decomposition import PCA
X_2d = PCA(n_components=2).fit_transform(X)
scatter(X_2d[:,0], X_2d[:,1], c=g.predict(X), cmap='prism');

In [None]:
pairs1 = pairs[g.predict(X)==0]
pairs2 = pairs[g.predict(X)==1]

In [None]:
shape(pairs1)

In [None]:
shape(pairs2)

In [None]:
for w1, w2 in pairs1:
    print w1, w2

In [None]:
for w1, w2 in pairs2:
    print w1, w2

In [None]:
sp.savetxt(proj_path + "data/dVA01-bar-cluster1.txt", pairs1, fmt='%s')
sp.savetxt(proj_path + "data/dVA01-bar-cluster2.txt", pairs2, fmt='%s')

In [None]:
model1 = AdditiveModel(space_cbow)
model2 = AdditiveModel(space_cbow)
model1.fit(pairs1)
model2.fit(pairs2)

Cosine similarity between the diff vectors of the two cluster models (values close to 1 mean the two clusters learned nearly the same shift):

In [None]:
from scipy.spatial.distance import cosine
1 - cosine(model1.diff_vector.mat, model2.diff_vector.mat)

Model scores on the train set...

In [None]:
score(model1, pairs1, pos='A')

In [None]:
score(model2, pairs2, pos='A')

CV score. Note that this estimate is optimistic, because every test pair is scored with the model of its own (correct) cluster:

In [None]:
score_cv(model1, pairs1, random_state=42, pos='A', verbose=False)

In [None]:
score_cv(model2, pairs2, random_state=42, pos='A', verbose=False)

### Checking differences between kmeans and gmm

In [None]:
X = sp.random.random((100,300))

In [None]:
c1 = KMeans(n_clusters=2, random_state=42)
c1.fit(X)
y1 = c1.predict(X)
c2 = mixture.GMM(n_components=2, covariance_type='tied', random_state=42)
c2.fit(X)
y2 = c2.predict(X)
y1 == y2

In [None]:
y1

In [None]:
y2

=> Generally, gmm and kmeans give different cluster assignments.

# Cluster+predict model

Clustering of diff vectors:

In [None]:
m = ClusterAdditiveModel(space_cbow, n_clusters=3, cluster_select='BasePredictSim', random_state=42)
m.fit(pairs, verbose=True)

In [None]:
m.models

In [None]:
v = m.predict_with('kaufen_V', 0)
get_neighbors(v, space_cbow, pos='A')

In [None]:
v = m.predict('kaufen_V', verbose=True)
get_neighbors(v, space_cbow, pos='A')

Clustering of base words (rather than diff vectors):

In [None]:
m = ClusterAdditiveModel(space_cbow, n_clusters='BIC', clustering_instance='BaseWord', cluster_select='BasePredictSim', random_state=42)
m.fit(pairs, verbose=True)

In [None]:
v = m.predict('kaufen_V', verbose=True)
get_neighbors(v, space_cbow, pos='A')

In [None]:
m = ClusterAdditiveModel(space_cbow, n_clusters='BIC', clustering_instance='BaseWord', cluster_select='BaseClusterSim', random_state=42)
m.fit(pairs, verbose=True)

In [None]:
v = m.predict('kaufen_V', verbose=True)
get_neighbors(v, space_cbow, pos='A')

In [None]:
# v1 must be the prediction made with cluster 0 (mirrors the v2 cell below);
# without this line v1 is still the raw row matrix from the data section,
# which has no .mat attribute.
v1 = m.predict_with('kaufen_V', 0)
sp.linalg.norm(v1.mat)

In [None]:
avg_neighbors_sim(v1, space_cbow, pos='A')

In [None]:
v2 = m.predict_with('kaufen_V', 1)
get_neighbors(v2, space_cbow, pos='A')

In [None]:
sp.linalg.norm(v2.mat)

In [None]:
avg_neighbors_sim(v2, space_cbow, pos='A')

In [None]:
#TODO: Think again about vector normalization

In [None]:
m.predict('kaufen_V', verbose=True)

In [None]:
m = ClusterAdditiveModel(space_cbow, n_clusters=2, cluster_select='BasePredictSim', random_state=42)
m.fit(pairs)
score(m, pairs, verbose=False, pos='A')

In [None]:
m = ClusterAdditiveModel(space_ppmi, n_clusters=2, cluster_select='BasePredictSim', random_state=42)
m.fit(pairs)
score(m, pairs, verbose=False, pos='A')

### CV setup

In [None]:
pairs_train, pairs_holdout = pairs[0:50,:], pairs[50:,:]

In [None]:
shape(pairs_train), shape(pairs_holdout)

In [None]:
m_kmeans = ClusterAdditiveModel(space_cbow, clustering='kmeans', n_clusters=2, cluster_select='BasePredictSim', random_state=42)
score_cv(m_kmeans, pairs_train, pos='A', random_state=42, test_pairs_extra=pairs_holdout, verbose=True)

In [None]:
m_kmeans = ClusterAdditiveModel(space_cbow, clustering='kmeans', n_clusters=2, cluster_select='BasePredictSim', random_state=42)
score_cv(m_kmeans, pairs, pos='A', random_state=42)

In [None]:
m_kmeans = ClusterAdditiveModel(space_cbow, clustering='kmedoids', n_clusters=2, cluster_select='BasePredictSim', random_state=42)
score_cv(m_kmeans, pairs, pos='A', random_state=42)

In [None]:
m_kmeans = ClusterAdditiveModel(space_cbow, clustering='kmeans', n_clusters=2, cluster_select='BaseSim', random_state=66)
score_cv(m_kmeans, pairs, pos='A', random_state=66)

In [None]:
m_gmm = ClusterAdditiveModel(space_cbow, clustering='gmm', n_clusters=2, cluster_select='BaseSim', random_state=42)
score_cv(m_gmm, pairs, pos='A', random_state=42)

In [None]:
m_gmm = ClusterAdditiveModel(space_cbow, clustering='gmm', n_clusters=2, cluster_select='BaseSim', random_state=42)
score_cv(m_gmm, pairs, pos='A', random_state=42)

In [None]:
m_gmm = ClusterAdditiveModel(space_cbow, clustering='gmm', n_clusters=2, cluster_select='BaseSim', random_state=66)
score_cv(m_gmm, pairs, pos='A', random_state=66)

### Comparing kmeans and gmm

In [None]:
m_gmm = ClusterAdditiveModel(space_cbow, clustering='gmm', n_clusters=2, cluster_select='BaseSim', random_state=42)
m_gmm.fit(pairs)
y1 = m_gmm.cluster_assignments

m_kmeans = ClusterAdditiveModel(space_cbow, clustering='kmeans', n_clusters=2, cluster_select='BaseSim', random_state=42)
m_kmeans.fit(pairs)
y2 = m_kmeans.cluster_assignments

In [None]:
y1 == y2

=> In our case, cluster assignments are equivalent for kmeans and gmm.

# Exemplar model

In [None]:
m = AdditiveExemplarModel(space_cbow)
m.fit(pairs, verbose=True)

In [None]:
v = m.predict('kaufen_V', verbose=True)
get_neighbors(v, space_cbow, pos='A')

In [None]:
v = m.predict('singen_V', verbose=True)
get_neighbors(v, space_cbow, pos='A')

In [None]:
m = AdditiveExemplarModel(space_ppmi)
m.fit(pairs, verbose=True)

In [None]:
v = m.predict('kaufen_V', verbose=True)
get_neighbors(v, space_ppmi, pos='A')

# 7 Filtering based on GermanWN

In [None]:
# reads in pairs with polysemy level as the first attribute
def load_pairs(filename, pos1, pos2):
    """Load word pairs annotated with a polysemy level.

    Each line of `filename` holds three whitespace-separated fields:
    polysemy level, first word, second word.

    Args:
        filename: path of the text file to read.
        pos1: POS tag appended (after an underscore) to the first word.
        pos2: POS tag appended (after an underscore) to the second word.

    Returns:
        A list of ``[polysemy (int), word1_<pos1>, word2_<pos2>]`` entries.
    """
    # ndmin=2 keeps a single-line file as one row instead of a flat vector.
    lines = np.loadtxt(filename, dtype=str, ndmin=2)
    # A real list (not a lazy map) so the result can be traversed repeatedly.
    return [[int(x[0]), x[1] + "_" + pos1, x[2] + "_" + pos2] for x in lines]

In [None]:
dVA01 = load_pairs(proj_path + "data/dVA01-bar-data.polysemy", 'V', 'A')
dVA01_pairs = sp.array(dVA01)[:,[1,2]]
dVA01_monosemous_pairs = sp.array([(w1,w2) for (p, w1, w2) in dVA01 if p<=1])

In [None]:
dVA01_monosemous_pairs

In [None]:
len(dVA01_monosemous_pairs)

In [None]:
baseline_cbow = BaselineModel(space_cbow)
score_cv(baseline_cbow, dVA01_monosemous_pairs, pos='A')

In [None]:
additive_cbow = AdditiveModel(space_cbow)
additive_cbow.fit(dVA01_monosemous_pairs)
score_cv(additive_cbow, dVA01_monosemous_pairs, pos='A')

In [None]:
cluster_additive_cbow = ClusterAdditiveModel(space_cbow, n_clusters='BIC', cluster_select='BaseSim', random_state=42)
cluster_additive_cbow.fit(dVA01_monosemous_pairs, verbose=True)
score_cv(cluster_additive_cbow, dVA01_monosemous_pairs, random_state=42, pos='A', verbose=False)

In [None]:
for k in range(3, 7):
    print('\nk=%d' % k)
    cluster_additive_cbow = ClusterAdditiveModel(space_cbow, n_clusters=k, cluster_select='BaseSim', random_state=42)
    cluster_additive_cbow.fit(dVA01_monosemous_pairs, verbose=True)
    print score_cv(cluster_additive_cbow, dVA01_monosemous_pairs, random_state=42, pos='A', verbose=False)

### Filtering based on InvCL

In [None]:
# reads in pairs with inclusion data (invCL)
# File format is:
#   offenbar_A offenbaren_V clarkeDE: 0.134455726236 invCL: 0.328609106994
def load_pairs(filename):
    """Load word pairs annotated with their invCL inclusion score.

    Note: this redefinition replaces the three-argument ``load_pairs`` defined
    earlier in the notebook; the words in this file already carry POS suffixes.

    Args:
        filename: path of a text file in the format shown above.

    Returns:
        A list of ``[invCL (float), second column, first column]`` entries,
        i.e. the two words are swapped relative to the file.
    """
    # ndmin=2 keeps a single-line file as one row instead of a flat vector.
    lines = np.loadtxt(filename, dtype=str, ndmin=2)
    # Field 5 is the number following the 'invCL:' label.
    # A real list (not a lazy map) so the result can be traversed repeatedly.
    return [[float(x[5]), x[1], x[0]] for x in lines]

In [None]:
dVA01 = load_pairs(proj_path + "data/dVA01-bar-data.inclusion")
dVA01_pairs = sp.array(dVA01)[:,[1,2]]
dVA01_inclusive_pairs = sp.array([(w1,w2) for (i, w1, w2) in dVA01 if i>=0.72])

In [None]:
dVA01_inclusive_pairs

In [None]:
len(dVA01_inclusive_pairs)

In [None]:
baseline_cbow = BaselineModel(space_cbow)
score_cv(baseline_cbow, dVA01_inclusive_pairs, pos='A')

In [None]:
additive_cbow = AdditiveModel(space_cbow)
additive_cbow.fit(dVA01_inclusive_pairs)
score_cv(additive_cbow, dVA01_inclusive_pairs, pos='A')

In [None]:
cluster_additive_cbow = ClusterAdditiveModel(space_cbow, n_clusters='BIC', cluster_select='BaseSim', random_state=42)
cluster_additive_cbow.fit(dVA01_inclusive_pairs, verbose=True)
score_cv(cluster_additive_cbow, dVA01_inclusive_pairs, random_state=42, pos='A', verbose=False)

In [None]:
for k in range(3, 7):
    print('k=%d' % k)
    cluster_additive_cbow = ClusterAdditiveModel(space_cbow, n_clusters=k, cluster_select='BaseSim', random_state=42)
    cluster_additive_cbow.fit(dVA01_inclusive_pairs, verbose=True)
    print score_cv(cluster_additive_cbow, dVA01_inclusive_pairs, random_state=42, pos='A', verbose=False)

### Filtering based on Polysemy + InvCL

In [None]:
# intersect1d flattens 2-D input and would intersect individual words;
# compare whole (word1, word2) pairs instead.
sorted(set(map(tuple, dVA01_monosemous_pairs)) & set(map(tuple, dVA01_inclusive_pairs)))

### Pandas dataframe

In [None]:
pairs_df = pd.read_csv(proj_path + "data/pairs.txt", sep=' ')

In [None]:
pairs_df

In [None]:
dVA01_wellbehaved_df = pairs_df[conjunction(pairs_df.pattern=='dVA01', pairs_df.polysemy<=1, pairs_df.invCL>=0.5)]
dVA01_wellbehaved_df

In [None]:
len(dVA01_wellbehaved_df)

In [None]:
# Word pairs of the filtered dataframe built above (dVA01_wellbehaved_df).
dVA01_wellbehaved_pairs = sp.array(dVA01_wellbehaved_df[['word1', 'word2']])
dVA01_wellbehaved_pairs

In [None]:
baseline_cbow = BaselineModel(space_cbow)
score_cv(baseline_cbow, dVA01_wellbehaved_pairs, pos='A', folds=len(dVA01_wellbehaved_pairs))

In [None]:
additive_cbow = AdditiveModel(space_cbow)
additive_cbow.fit(dVA01_wellbehaved_pairs)
score_cv(additive_cbow, dVA01_wellbehaved_pairs, pos='A', folds=len(dVA01_wellbehaved_pairs))

In [None]:
cluster_additive_cbow = ClusterAdditiveModel(space_cbow, n_clusters='BIC', cluster_select='BaseSim', random_state=42)
cluster_additive_cbow.fit(dVA01_wellbehaved_pairs, verbose=True)
score_cv(cluster_additive_cbow, dVA01_wellbehaved_pairs, random_state=42, pos='A', verbose=False, folds=len(dVA01_wellbehaved_pairs))

In [None]:
for k in range(3, 7):
    print('k=%d' % k)
    cluster_additive_cbow = ClusterAdditiveModel(space_cbow, n_clusters=k, cluster_select='BaseSim', random_state=42)
    cluster_additive_cbow.fit(dVA01_wellbehaved_pairs, verbose=True)
    print score_cv(cluster_additive_cbow, dVA01_wellbehaved_pairs, random_state=42, pos='A', verbose=False, folds=len(dVA01_wellbehaved_pairs))

# 8 Grand experiment

In [None]:
pairs_df = pd.read_csv(proj_path + "data/all/pairs-all.txt", sep=' ')

In [None]:
pairs_df

In [None]:
def pattern_pos(pattern):
    """Return the two POS letters encoded in a pattern id such as 'dVA01'.

    They are the characters at positions 1 and 2 of the id; callers use the
    second one as the POS of the derived word.
    """
    first_pos = pattern[1]
    second_pos = pattern[2]
    return (first_pos, second_pos)

In [None]:
pattern_pos('dVV31')

In [None]:
def avg_invCL(pairs_df, pattern):
    """Median invCL over the rows of `pairs_df` whose pattern is `pattern`.

    Note: despite the ``avg`` in the name, the statistic is the median.
    """
    rows_for_pattern = pairs_df.pattern == pattern
    return pairs_df.loc[rows_for_pattern, 'invCL'].median()

In [None]:
avg_invCL(pairs_df, 'dAA02') 

In [None]:
# For a given pattern, fetches rows from a dataframe that satisfy the given polysemy and invCL thresholds.
# Returns two dataframes: one containing rows that satisfy the conditions and one containing those that don't.
def partition_pairs(pairs_df, pattern, polysemy_threshold=None, invCL_threshold=None, only_pairs=False):
    """Split the rows of one pattern into those passing the filters and the rest.

    Args:
        pairs_df: dataframe with columns 'pattern', 'polysemy', 'invCL',
            'word1' and 'word2'.
        pattern: only rows whose 'pattern' equals this value are returned.
        polysemy_threshold: if given, a row passes only if polysemy <= threshold.
        invCL_threshold: if given, a row passes only if invCL >= threshold.
        only_pairs: if true, return arrays of (word1, word2) instead of dataframes.

    Returns:
        A 2-tuple (passing rows, failing rows). With no threshold given, every
        row of the pattern passes and the second item is empty.
    """
    def get_pairs(df):
        return df[['word1', 'word2']].values

    in_pattern = pairs_df.pattern == pattern

    # Start from "everything passes" and narrow down per active threshold, so
    # the complement below is always a proper boolean mask.
    passes = pd.Series(True, index=pairs_df.index)
    if polysemy_threshold is not None:
        passes = passes & (pairs_df.polysemy <= polysemy_threshold)
    if invCL_threshold is not None:
        passes = passes & (pairs_df.invCL >= invCL_threshold)

    accepted = pairs_df[in_pattern & passes]
    rejected = pairs_df[in_pattern & ~passes]

    if only_pairs:
        return get_pairs(accepted), get_pairs(rejected)
    return accepted, rejected

In [None]:
df1, df2 = partition_pairs(pairs_df, 'dAA02', polysemy_threshold=1, invCL_threshold=0.5)

In [None]:
print df1.shape, df2.shape

In [None]:
df1

In [None]:
df2

In [None]:
def eval_pattern(space, pairs_df, pattern, folds=10, random_state=None, verbose=False,
                 polysemy_threshold=1, invCL_threshold=0.5):
    """Cross-validate every model on every data setup of one derivation pattern.

    Args:
        space: semantic space handed to each model constructor.
        pairs_df: dataframe of word pairs, as consumed by `partition_pairs`.
        pattern: pattern id, e.g. 'dAA02'; the second value returned by
            `pattern_pos` is used as the POS filter for scoring.
        folds: passed through to `score_cv`.
        random_state: seed passed to the clustering models and to `score_cv`.
        verbose: if true, print each score as soon as it is computed.
        polysemy_threshold: upper polysemy bound of the 'Mono' subsets.
        invCL_threshold: lower invCL bound of the 'Incl' subsets.

    Returns:
        A dataframe with one row per model and one column per data setup; each
        cell holds a '<score> ± <error>' string built from the last two values
        returned by `score_cv` (presumably the score and its margin of error --
        confirm against `score_cv`).
    """
    def cluster_models(label, instance, select):
        # One clustered additive model per k in 2..5 for a given configuration.
        return [('CluAdditive (%s, kmeans, k=%d, %s)' % (label, k, select),
                 ClusterAdditiveModel(space, clustering_instance=instance, clustering='kmeans',
                                      n_clusters=k, cluster_select=select, random_state=random_state))
                for k in range(2, 6)]

    models = [('Baseline', BaselineModel(space)),
              ('Additive', AdditiveModel(space)),
              ('AdditiveExemplar', AdditiveExemplarModel(space))]
    models += cluster_models('DiffVectors', 'DiffVector', 'BasePredictSim')
    models += cluster_models('BaseWord', 'BaseWord', 'BasePredictSim')
    models += cluster_models('BaseWord', 'BaseWord', 'BaseClusterSim')

    pairs_all, _ = partition_pairs(pairs_df, pattern, only_pairs=True)
    pairs_mono1, pairs_mono0 = partition_pairs(
        pairs_df, pattern, polysemy_threshold=polysemy_threshold, only_pairs=True)
    pairs_incl1, pairs_incl0 = partition_pairs(
        pairs_df, pattern, invCL_threshold=invCL_threshold, only_pairs=True)
    pairs_monoincl1, pairs_monoincl0 = partition_pairs(
        pairs_df, pattern, polysemy_threshold=polysemy_threshold,
        invCL_threshold=invCL_threshold, only_pairs=True)

    _, deriv_pos = pattern_pos(pattern)

    # (name, training pairs, extra test-only pairs). The last three setups
    # train on the filtered subset and additionally test on the filtered-out rest.
    data = [
        ('All', pairs_all, None),
        ('Mono', pairs_mono1, None),
        ('Incl', pairs_incl1, None),
        ('MonoIncl', pairs_monoincl1, None),
        ('Mono', pairs_mono1, pairs_mono0),
        ('Incl', pairs_incl1, pairs_incl0),
        ('MonoIncl', pairs_monoincl1, pairs_monoincl0)]

    model_names = [name for name, _ in models]
    # `is not None` is required: the extra test sets are arrays, for which
    # `!= None` is an elementwise comparison without a single truth value.
    data_names = ['%s (%s:%d+%d)' % (pattern, pairs_name, len(pairs_train),
                                     len(pairs_extra_test) if pairs_extra_test is not None else 0)
                  for pairs_name, pairs_train, pairs_extra_test in data]
    scores_df = pd.DataFrame(index=model_names, columns=data_names)

    for col, (data_name, (_, pairs_train, pairs_extra_test)) in enumerate(zip(data_names, data)):
        if verbose:
            print('Data: %s' % data_name)
        for row, (model_name, model) in enumerate(models):
            _, rof, rof_error = score_cv(model, pairs_train, test_pairs_extra=pairs_extra_test,
                                         pos=deriv_pos, folds=folds, random_state=random_state)
            # Positional assignment: chained indexing (df[col][row] = ...) may
            # write to a temporary copy, and two setups share a column name
            # when an extra test set is empty.
            scores_df.iloc[row, col] = '%.3f ± %.2f' % (rof, rof_error)
            if verbose:
                print('  %s: %.3f ± %.2f' % (model_name, rof, rof_error))

    return scores_df

In [None]:
df = eval_pattern(space_cbow, pairs_df, 'dAA02', folds=10, random_state=42); df

In [None]:
pd.unique(pairs_df['pattern'])

In [None]:
patterns = pd.unique(pairs_df['pattern'])
writer = pd.ExcelWriter('PolysemyDerivation-cbow-norm.xlsx')

for pattern in patterns:
    df = eval_pattern(space_cbow_norm, pairs_df, pattern, folds=10, random_state=42, verbose=True)
    df.to_excel(writer, pattern)
    writer.save()
    display(df)

In [None]:
patterns = pd.unique(pairs_df['pattern'])
writer = pd.ExcelWriter('PolysemyDerivation-ppmi.xlsx')

for pattern in patterns:
    df = eval_pattern(space_ppmi, pairs_df, pattern, folds=10, random_state=42)
    df.to_excel(writer, pattern)
    writer.save()
    display(df)

# Test

In [None]:
def median_invCL(pairs_df, pattern):
    """Median of the 'invCL' column over the rows whose pattern is `pattern`."""
    invCL_values = pairs_df[pairs_df.pattern == pattern]['invCL']
    return invCL_values.median()

In [None]:
def pattern_pos(pattern):
    """Return the characters at positions 1 and 2 of a pattern id as a pair.

    For an id such as 'dAV04' this yields ('A', 'V').
    """
    pos_pair = (pattern[1], pattern[2])
    return pos_pair

# For a given pattern, fetches rows from a dataframe that satisfy the given polysemy and invCL thresholds.
# Returns two dataframes: one containing rows that satisfy the conditions and one containing those that don't.
def partition_pairs(pairs_df, pattern, polysemy_threshold=None, invCL_threshold=None, only_pairs=False):
    """Split the rows of one pattern into those passing the filters and the rest.

    Args:
        pairs_df: dataframe with columns 'pattern', 'polysemy', 'invCL',
            'word1' and 'word2'.
        pattern: only rows whose 'pattern' equals this value are returned.
        polysemy_threshold: if given, a row passes only if polysemy <= threshold.
        invCL_threshold: if given, a row passes only if invCL >= threshold.
        only_pairs: if true, return arrays of (word1, word2) instead of dataframes.

    Returns:
        A 2-tuple (passing rows, failing rows). With no threshold given, every
        row of the pattern passes and the second item is empty.
    """
    def get_pairs(df):
        return df[['word1', 'word2']].values

    in_pattern = pairs_df.pattern == pattern

    # Build the filter mask incrementally so that its complement is always a
    # proper boolean mask, even when no threshold is active.
    passes = pd.Series(True, index=pairs_df.index)
    if polysemy_threshold is not None:
        passes = passes & (pairs_df.polysemy <= polysemy_threshold)
    if invCL_threshold is not None:
        passes = passes & (pairs_df.invCL >= invCL_threshold)

    accepted = pairs_df[in_pattern & passes]
    rejected = pairs_df[in_pattern & ~passes]

    if only_pairs:
        return get_pairs(accepted), get_pairs(rejected)
    return accepted, rejected


Data: dAV01 (MonoIncl:46+59)
 CluAdditive (BaseWord, kmeans, k=3, BaseClusterSim): 0.495 ± 0.12
 /proj/sci/b9/modality/ipython/k_medoids.py:176: UserWarning: Cluster 2 is empty!

Data: dAV04 (All:185+0)
CluAdditive (BaseWord, kmedoids, k=2, BasePredictSim): 0.351 ± 0.06

In [None]:
pairs_df = pd.read_csv('/home/jan/b9-modality/data/pairs-XX/pairs-AV.txt', sep=' ')

In [None]:
pattern = 'dAV04'
folds = 10

invCL_median = median_invCL(pairs_df, pattern)

pairs_all, _ = partition_pairs(pairs_df, pattern, only_pairs=True)
pairs_mono1, pairs_mono0 = partition_pairs(pairs_df, pattern, polysemy_threshold=1, only_pairs=True)
pairs_incl1, pairs_incl0 = partition_pairs(pairs_df, pattern, invCL_threshold=invCL_median, only_pairs=True)
pairs_monoincl1, pairs_monoincl0 = partition_pairs(pairs_df, pattern, polysemy_threshold=1, invCL_threshold=invCL_median, only_pairs=True)
pairs_train = pairs_all
pairs_extra_test = None

_, deriv_pos = pattern_pos(pattern)

model = ClusterAdditiveModel(space_cbow_norm, clustering_instance='BaseWord', clustering='kmedoids', n_clusters=3, cluster_select='BasePredictSim', random_state=42)

_, rof, rof_error = score_cv(model, pairs_train, test_pairs_extra=pairs_extra_test, pos=deriv_pos, folds=folds, random_state=42)
print('%.3f ± %.2f' % (rof, rof_error))

In [None]:
# Hand-picked test fold; every remaining row of pairs_train is used for training.
test_ix = [9, 18, 29, 30, 55, 56,  60,  65,  66,  75, 113, 119, 124, 126, 135, 146, 165, 170, 176]
# Derive the index range from the data instead of hard-coding its size (185).
train_ix = sp.delete(sp.arange(0, len(pairs_train)), test_ix)
train_pairs = pairs_train[train_ix]
test_pairs = pairs_train[test_ix]

In [None]:
len(train_pairs), len(test_pairs)

In [None]:
X = get_base_vectors(space_cbow_norm, train_pairs)
Y = get_base_vectors(space_cbow_norm, test_pairs)

In [None]:
sp.shape(X)

In [None]:
# There is no `self` at notebook level; use the normalized space, as in the
# cell that builds X and Y above.
X = get_base_vectors(space_cbow_norm, train_pairs)

In [None]:
c = KMedoids(n_clusters=3, random_state=42, distance_metric='cosine')

In [None]:
c.fit(X)

In [None]:
c.predict(X)

In [None]:
c.predict(Y)

In [None]:
v = space_cbow_norm.get_row('Hund_N') + DenseMatrix(sp.zeros(1))

In [None]:
v.mat == space_cbow_norm.get_row('Hund_N').mat

In [None]:
space_cbow_norm.get_row('Hund_N').mat

In [None]:
v.mat

In [None]:
eval_pattern(space_cbow, pairs_df, 'dAV04', random_state=42, verbose=True)

Data: dAV04 (All:185+0)
  Baseline: 0.362 ± 0.06
  Additive: 0.367 ± 0.07
  AdditiveExemplar: 0.167 ± 0.05
  CluAdditive (DiffVectors, kmeans, k=2, BasePredictSim): 0.340 ± 0.05
  CluAdditive (DiffVectors, kmeans, k=3, BasePredictSim): 0.367 ± 0.07
  CluAdditive (DiffVectors, kmeans, k=4, BasePredictSim): 0.345 ± 0.05
  CluAdditive (DiffVectors, kmeans, k=5, BasePredictSim): 0.351 ± 0.06
  CluAdditive (DiffVectors, kmedoids, k=2, BasePredictSim): 0.351 ± 0.07
  CluAdditive (DiffVectors, kmedoids, k=3, BasePredictSim): 0.346 ± 0.06
  CluAdditive (DiffVectors, kmedoids, k=4, BasePredictSim): 0.345 ± 0.06
  CluAdditive (DiffVectors, kmedoids, k=5, BasePredictSim): 0.323 ± 0.07
  CluAdditive (BaseWord, kmeans, k=2, BasePredictSim): 0.356 ± 0.06
  CluAdditive (BaseWord, kmeans, k=3, BasePredictSim): 0.356 ± 0.07
  CluAdditive (BaseWord, kmeans, k=4, BasePredictSim): 0.361 ± 0.08
  CluAdditive (BaseWord, kmeans, k=5, BasePredictSim): 0.351 ± 0.06
  CluAdditive (BaseWord, kmedoids, k=2, BasePredictSim): 0.351 ± 0.06
Traceback (most recent call last):
  File "EvalPatterns.py", line 122, in <module>


# Pooled results (2x2 design)

In [None]:
m = ClusterAdditiveModel(space_cbow_norm, clustering_instance='BaseWord', clustering='kmedoids', n_clusters='AIC', cluster_select='BaseClusterSim', random_state=42)
m.fit(pairs, verbose=True)
#score(m, pairs, verbose=False, pos='A')

In [None]:
# `random_state` is not defined at notebook level; use the same seed (42)
# as the other cross-validation runs in this notebook.
_, rof, rof_error = score_cv(model, pairs_train, test_pairs_extra=pairs_extra_test,
                                         pos=deriv_pos, folds=folds, random_state=42)

In [None]:
-

In [None]:
pairs_df = pd.read_csv(proj_path + "data/all/pairs-all.txt", sep=' ')

In [None]:
r = eval_pattern_2(space_cbow_norm, pairs_df, 'dAA03', random_state=42, verbose=True); r

In [None]:
from IPython.display import display
import os
path = proj_path + 'ipython/results2/'
xs = os.listdir(path)
d = pd.read_excel(path + xs[0])
for x in xs[1:]:
    d2 = pd.read_excel(path + x)
    d = d.add(d2)
d

In [None]:
d / d['All']['n_pairs']

In [None]:
zs = ['dAN03', 'dAN04', 'dAN09', 'dAN10', 'dAN11', 'dAN12', 'dAN16', 'dNA01', 'dNA02', 'dNA05', 'dNA06', 'dNA25', 'dNA26', 'dNA27', 'dNV09', 'dVA02', 'dVA03', 'dVA12', 'dVA13', 'dVN07', 'dVN09']

In [None]:
ws = [x for x in xs if not any([z in x for z in zs])]

In [None]:
from IPython.display import display
import os
d = pd.read_excel(path + ws[0])
for w in ws[1:]:
    d2 = pd.read_excel(path + w)
    d = d.add(d2)
d

In [None]:
d / d['All']['n_pairs']

### TODO (24 Jan 2016)

* <strike>Fix score_cv</strike>
* <strike>Implement POS filter in the scoring functions</strike>
* <strike>invCL filtering</strike>
* <strike>Put all word pairs data into a single Pandas dataframe</strike>
* <strike>Check clustering variance</strike>
* Implement oracles
* <strike>Stability of GMM (initial centroids)</strike>
* <strike>GMM parameters (maybe use full cov matrix?)</strike>
* <strike>Implement k-nn instead of GMM</strike>
* <strike>All patterns</strike>
* <strike>Train on subset, predict on all</strike>
* <strike>Exemplar model</strike>
* <strike>Base-centroid cluster selection</strike>
* <strike>**Grand experiment**</strike>
* <strike>Cluster base words</strike>
* <strike>Check gmm vs kmeans results</strike>
* <strike>Fix margin of error for LOOCV</strike>
* Evaluate with a count-based model

### Vector plausibility

* Vecchi et al 2011. (http://aclweb.org/anthology/W/W11/W11-1301.pdf)


1. Vector length

2. Similarity to base verb vector

3. Avg/median similarity to N nearest neighbors




### Oracles

* n_clusters:
  * the number of clusters that maximizes RooN

* cluster_select:
  * choose the cluster that maximizes RooN (if there is such)
  * compute the gold diff vector and choose the cluster which maximizes the class likelihood