This notebook partitions the SimLex-999 pairs. It also measures the pairwise distances between the semantic features collected for each partition. This information is crucial for clustering similar features in the last stage.

We use the Numberbatch embedding as a tool for measuring pairwise distances.

### Libraries

In [1]:
import ast
import csv
import itertools
import pickle
import time

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from scipy.stats.stats import spearmanr

# Vital function for pairwise distances...
from scipy.spatial.distance import pdist

## 1. Reading concepts and SimLex-999 pairs

In [2]:
def Read_SimLex(path='../Data/SimLex.xlsx'):
    """Load the noun pairs of SimLex-999 and the concepts they mention.

    Parameters
    ----------
    path : str, optional
        Spreadsheet to read. It must provide the columns ``word1``,
        ``word2``, ``POS`` and ``SimLex999``.

    Returns
    -------
    Pairs : list of [str, str, float]
        One ``[word1, word2, similarity]`` entry per row whose ``POS``
        is ``'N'``.
    Concepts : list of str
        The distinct words appearing in those rows.
    """
    df = pd.read_excel(path)

    # Selecting only nouns
    nouns = df[df['POS'] == 'N']

    # Pairs
    Pairs = [[str(w1), str(w2), float(score)]
             for w1, w2, score
             in nouns[['word1', 'word2', 'SimLex999']].values.tolist()]

    # List of concepts. Built as a real list (not a lazy ``map`` object) so
    # that len() below also works on Python 3.
    Concepts = [str(c)
                for c in pd.unique(nouns[['word1', 'word2']].values.ravel('K'))]

    # A single parenthesised argument prints the same text on Python 2 and 3.
    print("There are %d concepts" % len(Concepts))
    return Pairs, Concepts

Pairs, Concepts = Read_SimLex()

# List of relations used FOR CLUSTERING...
ListRel = [
    'RelatedTo',
    'IsA',
    'CapableOf',
    'AtLocation',
    'CausesDesire',
    'HasA',
    'HasProperty',
    'MannerOf',
    'PartOf',
    'UsedFor',
]

There are 751 concepts


## 2. Partitioning

In [3]:
def find_concept_in_pairs (C, Pair_list, thr, G):
    """Collect every concept linked to ``C`` through pairs scoring above ``thr``.

    Two concepts are linked when they appear together in an entry of
    ``Pair_list`` whose similarity is strictly greater than ``thr``. Links
    are followed transitively, starting from ``C``.

    Parameters
    ----------
    C : str
        Concept the search starts from.
    Pair_list : list of [word1, word2, similarity]
        Pairs to search.
    thr : number
        Similarity threshold; only pairs strictly above it count as links.
    G : list
        Concepts already collected. They are part of the result and are
        never visited again. Pass ``[C]`` to obtain the whole group ``C``
        belongs to. The list itself is left unmodified.

    Returns
    -------
    list
        The distinct concepts of the group, in no particular order.
    """
    # Index the qualifying links once, so the traversal does not rescan
    # Pair_list for every concept it visits.
    neighbours = {}
    for p in Pair_list:
        if p[2] > thr:
            neighbours.setdefault(p[0], []).append(p[1])
            neighbours.setdefault(p[1], []).append(p[0])

    found = set(G)
    # An explicit stack replaces recursion: a long chain of linked concepts
    # can no longer exceed the interpreter's recursion limit.
    pending = [C]
    while pending:
        current = pending.pop()
        for other in neighbours.get(current, ()):
            if other not in found:
                found.add(other)
                pending.append(other)

    return list(found)

In [4]:
thr_s = 0     # Threshold value: only pairs scoring strictly above it link two concepts

def CreatingPartitions ():
    """Split the concepts of ``Pairs`` into groups of linked concepts.

    Reads the notebook-level ``Pairs`` and ``thr_s`` and relies on
    ``find_concept_in_pairs`` to gather the group of a concept.

    Returns
    -------
    list of list of str
        Groups holding more than one concept. Each group is sorted
        alphabetically; groups are ordered from largest to smallest, and
        alphabetically among groups of equal size.
    """
    Groups = []
    assigned = set()
    for pair in Pairs:
        # Links are symmetric, so a concept that already belongs to a group
        # would only yield that same group again: skip the traversal.
        if pair[0] in assigned:
            continue
        g = find_concept_in_pairs(pair[0], Pairs, thr_s, [pair[0]])
        g.sort()
        assigned.update(g)
        Groups.append(g)

    # Sorting: largest first, ties broken alphabetically
    Groups.sort(key = lambda g: (-len(g), g))

    # Filtering out partitions of size 1 (a concept linked to no other one)
    Groups = [x for x in Groups if len(x) > 1]

    # A single parenthesised argument prints the same text on Python 2 and 3.
    print("There are %d partitions" % len(Groups))
    return Groups

Groups = CreatingPartitions()

There are 137 partitions


### 2.1 Saving partitions in file

In [5]:
def Pairs_per_Partition(Save = None):
    """List, for every partition, the pairs that join two of its concepts.

    Reads the notebook-level ``Groups``, ``Pairs`` and ``thr_s``. Only pairs
    whose similarity is strictly greater than ``thr_s`` are kept.

    Parameters
    ----------
    Save : bool, optional
        When truthy, partition number ``i`` is also written with ``np.save``:
        its concepts to ``.../01 Elementos/Elementos_Grupo<i>`` and its pairs
        (before sorting) to ``.../02 Parejas/Parejas_Grupo<i>``.

    Returns
    -------
    list of list
        One list per partition with its ``[word1, word2, similarity]``
        pairs, sorted from lowest to highest similarity.
    """
    # Index the qualifying pairs by the two words they join (in either
    # order), so that Pairs is read once instead of once per couple of
    # concepts. Lists keep the order in which the pairs appear in Pairs.
    pairs_by_words = {}
    for p in Pairs:
        if p[2] > thr_s:
            pairs_by_words.setdefault(frozenset(p[:2]), []).append(p)

    GPairs = []
    for i, G in enumerate(Groups):  # i: partition number
        gPairs = []
        # Every couple of distinct concepts of the partition.
        # NOTE(review): assumes a partition never lists a concept twice,
        # which holds for the output of CreatingPartitions.
        for first, second in itertools.combinations(G, 2):
            gPairs.extend(pairs_by_words.get(frozenset((first, second)), ()))

        if Save:
            np.save('../Data/ClusteringData/01 Elementos/Elementos_Grupo' + str(i), G)
            np.save('../Data/ClusteringData/02 Parejas/Parejas_Grupo' + str(i), gPairs)

        # List of pairs, ordered from lowest to highest similarity [per partition]
        GPairs.append(sorted(gPairs, key = lambda x: x[2]))
    return GPairs

# Write the concepts and the pairs of every partition to disk.
Pairs_per_Partition(Save = True)

# Parenthesised form prints the same text on Python 2 and 3.
print("Done!")

Done!


## 3. Measuring pairwise distance for features.

To save time, the pairwise distances are stored in files. 

### 3.1 Reading semantic features collected

In [6]:
def formatt2 (L):
    """Convert one raw row of the feature CSV into typed values.

    Parameters
    ----------
    L : list of str
        Row with at least six fields.

    Returns
    -------
    list
        ``L[0]``, ``L[1]`` and ``L[2]`` unchanged, followed by ``L[3]``
        parsed into a Python object, then ``L[4]`` and ``L[5]`` as floats.

    Raises
    ------
    ValueError, SyntaxError
        If ``L[3]`` is not a plain Python literal once its blanks have been
        replaced by commas, or if ``L[4]`` / ``L[5]`` is not a number.
    """
    # NOTE(review): L[3] presumably holds a list whose items are separated
    # by blanks; turning the blanks into commas makes it a list literal.
    # Confirm against the CSV.
    # ast.literal_eval only accepts literals, so text coming from the file
    # can never be executed, unlike with eval().
    parsed = ast.literal_eval(L[3].replace(' ', ','))
    return L[:3] + [parsed, float(L[4]), float(L[5])]


def Reading_Feature_Matrix(path='../Data/ConceptNet_Semantic_Features.csv'):
    """Load the semantic features and group them by concept.

    The first line of the file is treated as a header and skipped. Every
    other row is converted with ``formatt2``. Consecutive rows sharing the
    same first field (the concept) form one group.

    Parameters
    ----------
    path : str, optional
        CSV file to read.

    Returns
    -------
    list of list
        One list of rows per run of consecutive rows with the same concept,
        in file order. Inside a group the rows are sorted by decreasing
        weight (field 4); rows of equal weight keep their file order.
        The list has exactly as many entries as there are such runs, and
        is empty when the file holds no data row.
    """
    with open(path) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        next(csv_reader, None)  # skip the header line, if any
        rows = [formatt2(row) for row in csv_reader]

    # Grouping is driven by the data instead of a fixed number of slots, so
    # the length of the result really reflects the content of the file.
    Feat_matrix = []
    for _, same_concept in itertools.groupby(rows, key = lambda row: row[0]):
        # Sorting each group based on weight (row[4])
        Feat_matrix.append(sorted(same_concept, key = lambda row: row[4], reverse = True))

    return Feat_matrix

Feat_matrix = Reading_Feature_Matrix()
# Sanity check == 751 concepts
# (a single parenthesised argument prints the same text on Python 2 and 3)
print("There are %d concepts" % len(Feat_matrix))

There are 751 concepts


### 3.2 Importing Numberbatch and defining distance function

In [7]:
# Numberbatch embeddings, loaded with pandas.
# This file can be downloaded from the Numberbatch repository.
A = pd.read_hdf('../Code/mini.h5')
# A.head(10)  # uncomment to preview the first rows

#### Dictionary of features

In [8]:
# Maps every feature (feat[2]) to the sum of the Numberbatch vectors of its
# lemmatized words (feat[3]).
Dict_Numberbatch = {}

# Width of one embedding row, taken from the table instead of hard-coding 300.
n_dims = A.shape[1]

# Loop over all concepts
for con in Feat_matrix:
    # For each feature:
    for feat in con:
        # Already in the dictionary: nothing to do. Testing the dict itself
        # is a hash lookup, whereas `in d.keys()` scans a list on Python 2.
        if feat[2] in Dict_Numberbatch:
            continue

        # Starting from a zero vector keeps the entry an array even when the
        # feature has no lemmatized word at all.
        total = np.zeros(n_dims)
        # For each lemmatized word
        for w in feat[3]:
            try:
                # NOTE(review): going through tolist() rebuilds the row with
                # NumPy's default dtype. This presumably avoids overflow if
                # the table stores small integers; confirm.
                total = total + np.array(A.loc['/c/en/' + w].tolist())
            except KeyError:
                # Word absent from the catalog: it contributes a zero vector.
                pass
        Dict_Numberbatch[feat[2]] = total

#### Distance function

In [9]:
def numberbatch_dist (w1, w2):
    """Cosine distance between the Numberbatch vectors of two features.

    Parameters
    ----------
    w1, w2 : sequence
        Only the first element of each is used: it is the key looked up in
        the notebook-level ``Dict_Numberbatch``.

    Returns
    -------
    float
        ``1.0`` when either vector has no non-zero component; otherwise one
        minus the cosine of the angle between the vectors, floored at ``0.0``.

    Raises
    ------
    KeyError
        If one of the features is missing from ``Dict_Numberbatch``.
    """
    word_vec1 = Dict_Numberbatch[w1[0]]  # Look up vector 1
    word_vec2 = Dict_Numberbatch[w2[0]]  # Look up vector 2

    # Test for an all-zero vector. Comparing the *sum* of the components
    # with 0 would also reject valid vectors whose components cancel out.
    if not np.any(word_vec1) or not np.any(word_vec2):
        return 1.

    cosine = np.dot(word_vec1, word_vec2) / (np.linalg.norm(word_vec1) * np.linalg.norm(word_vec2))
    # Rounding can push the cosine slightly above 1; never return a
    # negative distance.
    return max(0., 1. - cosine)

# Example: distance between the features 'dog' and 'cat'.
# Each argument is a one-element sequence holding the feature name.
numberbatch_dist(['dog'],['cat'])

0.4092759953852362

### 3.3 Measuring pairwise distance.
The results are saved for later use.
(This process can take up to 5 minutes)

In [10]:
# Smallest number of features for which distances are computed and stored.
MIN_FEATURES = 10

# Gather, once, the features of every (concept, relation) couple, so that
# Feat_matrix is not scanned again for each partition and each relation.
features_by_concept_rel = {}
for con in Feat_matrix:
    for ft in con:
        features_by_concept_rel.setdefault((ft[0], ft[1]), set()).add(ft[2])

for i, G in enumerate(Groups):  # i: partition number
    for rel in ListRel:

        # Distinct features that the concepts of this partition have for `rel`
        Features = set()
        for concept in G:
            Features.update(features_by_concept_rel.get((concept, rel), ()))

        # Sorted so that the stored order does not depend on the iteration
        # order of a set. The feature file and the distance file written
        # below always share this order.
        Features = sorted(Features)

        # Only takes into account lists of at least MIN_FEATURES features
        if len(Features) >= MIN_FEATURES:
            # One single-element row per feature. Built with a list
            # comprehension because np.array(map(...)) does not produce
            # this array on Python 3.
            Features = np.array([[name] for name in Features])
            # Storing list of features
            np.save('../Data/ClusteringData/03 Features_Group/Features_grupo_' + str(i) + '_' + str(rel), Features)

            # Pairwise distance
            X = pdist(Features, metric = numberbatch_dist)
            name_file = "Group_" + str(i) + "_Relation_" + rel
            print("Name of file: " + name_file)
            # Store array
            np.save('../Data/ClusteringData/04 PairwiseDistance/' + name_file, X)

print("Done!")

Name of file: Group_0_Relation_RelatedTo
Name of file: Group_0_Relation_IsA
Name of file: Group_0_Relation_CapableOf
Name of file: Group_0_Relation_AtLocation
Name of file: Group_0_Relation_CausesDesire
Name of file: Group_0_Relation_HasA
Name of file: Group_0_Relation_HasProperty
Name of file: Group_0_Relation_MannerOf
Name of file: Group_0_Relation_PartOf
Name of file: Group_0_Relation_UsedFor
Name of file: Group_1_Relation_RelatedTo
Name of file: Group_1_Relation_IsA
Name of file: Group_1_Relation_CapableOf
Name of file: Group_1_Relation_AtLocation
Name of file: Group_1_Relation_UsedFor
Name of file: Group_2_Relation_RelatedTo
Name of file: Group_2_Relation_IsA
Name of file: Group_2_Relation_CapableOf
Name of file: Group_2_Relation_AtLocation
Name of file: Group_3_Relation_RelatedTo
Name of file: Group_3_Relation_IsA
Name of file: Group_3_Relation_CausesDesire
Name of file: Group_3_Relation_HasProperty
Name of file: Group_4_Relation_RelatedTo
Name of file: Group_4_Relation_IsA
Name 

Name of file: Group_96_Relation_RelatedTo
Name of file: Group_96_Relation_IsA
Name of file: Group_97_Relation_RelatedTo
Name of file: Group_97_Relation_IsA
Name of file: Group_97_Relation_UsedFor
Name of file: Group_98_Relation_RelatedTo
Name of file: Group_98_Relation_IsA
Name of file: Group_101_Relation_RelatedTo
Name of file: Group_101_Relation_IsA
Name of file: Group_102_Relation_RelatedTo
Name of file: Group_102_Relation_IsA
Name of file: Group_102_Relation_CapableOf
Name of file: Group_103_Relation_IsA
Name of file: Group_104_Relation_RelatedTo
Name of file: Group_104_Relation_IsA
Name of file: Group_104_Relation_UsedFor
Name of file: Group_105_Relation_AtLocation
Name of file: Group_106_Relation_RelatedTo
Name of file: Group_106_Relation_UsedFor
Name of file: Group_107_Relation_RelatedTo
Name of file: Group_107_Relation_IsA
Name of file: Group_108_Relation_RelatedTo
Name of file: Group_109_Relation_IsA
Name of file: Group_110_Relation_IsA
Name of file: Group_111_Relation_IsA
Nam