# Measuring Semantic Similarity

This notebook measures the similarity between pairs of words from McRae's dataset. First it uses the HD Computing approach and then compares the results with similarity metrics from the NLTK library.

### Importing libraries and HD computing functions

In [17]:
import pandas as pd
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic

%run HDComputing_basics.ipynb

### Functions for reading dataset

In [18]:
# Lookup built from the feature spreadsheet: {feature: (feat_name, feat_value)}.
# It is filled by the first call to TranslateFeats and reused afterwards;
# re-running this cell empties it, which forces the file to be read again.
_FEATS_FILE = 'FEATS_brm.xlsx' #../McRaedataset/FEATS_brm.xlsx
_feats_lookup = {}

def TranslateFeats(ListFeat):
    """It receives a list of features such as ['is_blue', 'is_rectangular'] and it returns: [['color','blue'], ['shape','rectangular']]

    ListFeat: iterable with feature labels as written in the 'Feature' column of the spreadsheet.
    Returns a new list holding one [feat_name, feat_value] pair per feature (both converted
    with str), in the same order as the input.
    Raises IndexError when a feature is not listed in the 'Feature' column (same exception
    type as before, now with a message naming the feature).
    """
    if not _feats_lookup:
        # Read the spreadsheet once instead of on every call (this function is called once per concept)
        df = pd.read_excel(_FEATS_FILE)
        rows = zip(df['Feature'].tolist(), df['feat_name'].tolist(), df['feat_value'].tolist())
        for feature, name, value in rows:
            # setdefault keeps the first row if a feature appears more than once
            _feats_lookup.setdefault(feature, (name, value))
    ListPairs = []
    for feat in ListFeat:
        if feat not in _feats_lookup:
            raise IndexError("Feature %r not found in %s" % (feat, _FEATS_FILE))
        name, value = _feats_lookup[feat]
        ListPairs.append([str(name), str(value)])
    return ListPairs

def ReadDefinitions():
    """Loads the concept/feature spreadsheet and returns, for every distinct value of the
    'Concept' column, a two-element list [concept_name, features], where 'features' holds
    the 'Feature' entries of that concept's rows exactly as written in the file (all
    converted with str)."""
    # Spreadsheet -> dataframe
    table = pd.read_excel('CONCS_FEATS_concstats_brm.xlsx') #../McRaeDataset/CONCS_FEATS_concstats_brm.xlsx') #MINI_
    concept_col = table['Concept']
    # One entry per distinct concept, paired with the features found in its rows
    return [[str(label), [str(feat) for feat in list(table.loc[concept_col == label]['Feature'])]]
            for label in set(concept_col)]

def ClosestConcepts (concept, nc):
    """Reads McRae's matrix file ('cos_matrix_brm_IFR.xlsx'), sorts its rows by the column
    named 'concept' from highest to lowest value and returns the first 'nc' rows as a list
    of [label, value] pairs, where label is str() of the 'CONCEPT' entry of the row."""
    # Spreadsheet -> dataframe
    matrix = pd.read_excel('cos_matrix_brm_IFR.xlsx') #../McRaeDataset/cos_matrix_brm_IFR.xlsx')
    # Highest value first; only the label column and the queried column are needed
    ranked = matrix.sort_values(by=concept, ascending=False)[['CONCEPT', concept]]

    labels = ranked['CONCEPT'][0:nc]
    scores = ranked[concept][0:nc]
    return [[str(label), score] for label, score in zip(labels, scores)]

### Creating definitions dictionary

In [19]:
def CreateDictionary():
    """Fills the global Dict_defs: for every [name, features] entry returned by
    ReadDefinitions() it stores Dict_defs[name] = TranslateFeats(features)."""
    # Dict_defs is only modified by item assignment (never rebound), so no 'global' statement is required
    for name, feats in ReadDefinitions():
        Dict_defs[name] = TranslateFeats(feats)

## Storing ID vectors into memory

### Memory functions

In [20]:
def flat_list (L):
    """Flattens a list of lists (at any nesting level) into a new flat list.

    Only items whose type is exactly 'list' are opened; any other item (tuples, strings,
    numbers, ...) is copied to the result as it is. The input is not modified.

    The traversal is iterative (explicit stack of iterators): the nesting depth, not the
    length of the list, is what bounds the stack, so long lists do not hit the
    interpreter's recursion limit, and no tail copies (L[1:]) are made.
    """
    flat = []
    pending = [iter(L)]
    while pending:
        for item in pending[-1]:
            if type(item) is list:
                # Go into the sublist first; the outer iterator resumes afterwards
                pending.append(iter(item))
                break
            flat.append(item)
        else:
            # Innermost iterator exhausted: go back to the enclosing list
            pending.pop()
    return flat

def SaveConcepts(Dic):
    """Given a definitions dictionary, it builds one HDvector(N, label) for every distinct
    label found in it: the dictionary keys plus every element of the (flattened) values."""
    # Distinct labels: flattened definitions first, then the concept names
    labels = set(flat_list(Dic.values()) + Dic.keys())
    for label in labels:
        HDvector(N, label) # per the original note, creating the object is what stores it in memory
        
def FeatureVectors(Dic):
    """Collects the first element of every pair in every definition of the dictionary
    (the feature type: 'is', 'has', 'color', etc...) and stores the distinct ones, as a
    list, in the global 'feature_vectors'. Nothing is returned."""
    global feature_vectors
    feature_types = [pair[0] for definition in Dic.values() for pair in definition]
    feature_vectors = list(set(feature_types))
    
def CreateSemanticPointer (PairList):
    """Takes a list such as [[feat1,feat_val],[feat2,feat_val],[feat3,feat_val]], multiplies
    (operator *) the two entries of Dict named by each pair and returns ADD applied to the
    list of products, i.e. feat1*feat_val + feat2*feat_val ..."""
    return ADD([Dict[pair[0]] * Dict[pair[1]] for pair in PairList])

def SaveDefinitions(Dic):
    """For every key of the definitions dictionary it builds the definition vector
       (semantic pointer) of its value with CreateSemanticPointer and hands it to the
       setPointer method of Dict[key], the object vector (ID vector) stored under that key.
       Every key must therefore already be present in Dict."""
    # Walk the keys; each definition is looked up when its pointer is built
    for key in Dic:
        Dict[key].setPointer(CreateSemanticPointer(Dic[key]))
        
def NormalizeHammDist (Dist_list):
    """Given a distance list of the form [['name', dist], ['name', dist], ... ], it replaces
    each distance with round(1 - dist / (N / 2), 3) and returns the list.

    The pairs are modified in place: the returned list is the very same object that was
    passed in. N is the module-level value read at call time.
    """
    # True division: 'float(N / 2)' floors first under Python 2, which is wrong for an odd N
    half = N / 2.0
    for pair in Dist_list:
        pair[1] = round( 1. - pair[1] / half, 3 )
    return Dist_list

### Initializing memory

In [21]:
def Init_mem():
    """Runs init() and then encodes the dataset: CreateDictionary() fills Dict_defs, and
    FeatureVectors, SaveConcepts and SaveDefinitions are applied to it, in that order.
    Prints "End of encoding" when done."""
    init()
    # NOTE(review): 'thr' is a local that is never read afterwards; confirm whether a module-level threshold was meant
    thr = 0.4 * N
    # Read dataset and create definition dictionary
    CreateDictionary()
    # Feature type labels -> ID vectors in memory -> definitions (SP vectors) attached to the concepts
    for step in (FeatureVectors, SaveConcepts, SaveDefinitions):
        step(Dict_defs)
    print("End of encoding")

Init_mem()

End of initialization
End of encoding


### Getting list of closest vectors (HDcomputing and from Dataset)

In [22]:
# Lists the concepts closest to 'test_concept' in two ways: from the definition vector
# stored in memory (HDC_sim) and from the dataset's matrix file (DatSet_sim).
test_concept = 'bed'
num_concepts_1 = 20 # upper bound on the number of entries kept from each source

# Asking closest concept of another concept's definition...
# Only the first num_concepts_1 entries of the answer are kept; the stored output shows
# that fewer may come back.
HDC_sim = HDvector.getLabelSP(Dict[test_concept].getPointer())[:num_concepts_1]
# "Sin normalizar" = "not normalized": the values printed here are the ones returned by getLabelSP
print "Sin normalizar", HDC_sim
# Normalizing (NormalizeHammDist rewrites the pairs of HDC_sim in place and returns the same list)
HDC_sim = NormalizeHammDist(HDC_sim)
DatSet_sim = ClosestConcepts(test_concept, num_concepts_1)

print "Closest concepts using semantic features"
print "\nNumber of concepts: ", num_concepts_1

print "\n\nClosest concepts to '", test_concept,"' definition: ", HDC_sim
print "\n\nClosest concepts to '", test_concept,"' (from Dataset): ", DatSet_sim

Sin normalizar [['bed', 0], ['sofa', 4044], ['couch', 4172], ['nightgown', 4209], ['pajamas', 4258], ['pillow', 4410], ['cushion', 4432], ['robe', 4447], ['mink_(coat)', 4498]]
Closest concepts using semantic features

Number of concepts:  20


Closest concepts to ' bed ' definition:  [['bed', 1.0], ['sofa', 0.191], ['couch', 0.166], ['nightgown', 0.158], ['pajamas', 0.148], ['pillow', 0.118], ['cushion', 0.114], ['robe', 0.111], ['mink_(coat)', 0.1]]


Closest concepts to ' bed ' (from Dataset):  [['bed', 1.0], ['pillow', 0.374], ['sofa', 0.323], ['couch', 0.322], ['cushion', 0.284], ['mink_(coat)', 0.158], ['nightgown', 0.152], ['pajamas', 0.149], ['robe', 0.146], ['slippers', 0.145], ['dresser', 0.121], ['earmuffs', 0.105], ['carpet', 0.098], ['camisole', 0.087], ['chair', 0.082], ['bureau', 0.079], ['parka', 0.077], ['jeans', 0.073], ['banana', 0.067], ['mirror', 0.066]]


## Semantic Similarity using NLTK library

### Auxiliary functions for similarity metrics

In [23]:
def get_concepts_list ():
    "Reads 'CONCS_Synset_brm.xlsx' and returns its 'Concept' column as a list of strings (the names of the concepts)"
    table = pd.read_excel('CONCS_Synset_brm.xlsx')
    return [str(name) for name in list(table['Concept'])]
    
# Lookup built from the synset spreadsheet: {concept name: synset}.
# It is filled by the first call to get_synset and reused afterwards;
# re-running this cell empties it, which forces the file to be read again.
_SYNSET_FILE = 'CONCS_Synset_brm.xlsx'
_synset_lookup = {}

def get_synset (concept):
    """Given a concept name (string) it returns its synset (string)

    The name is searched in the 'Concept' column of the spreadsheet and str() of the
    matching 'Synset' entry is returned.
    Raises IndexError when the concept is not listed (same exception type as before,
    now with a message naming the concept).
    """
    if not _synset_lookup:
        # Read the spreadsheet once instead of on every call (Apply_sim_metric calls this once per concept)
        df = pd.read_excel(_SYNSET_FILE)
        for name, synset in zip(df['Concept'].tolist(), df['Synset'].tolist()):
            # setdefault keeps the first row if a concept appears more than once
            _synset_lookup.setdefault(name, synset)
    if concept not in _synset_lookup:
        raise IndexError("Concept %r not found in %s" % (concept, _SYNSET_FILE))
    return str(_synset_lookup[concept])

def Apply_sim_metric ( similarity_metric, num, in_concept, corpus = None):
    """Scores every name of the global list 'Concepts' against 'in_concept' and returns the
    'num' best ones as [name, score] pairs, highest score first.

    For each name, the synset string from get_synset(name) is turned into a synset with
    wn.synset and similarity_metric(in_concept, synset) is called; 'corpus' is added as a
    third argument only when it is truthy. Scores are rounded to 3 decimals."""
    # Extra positional argument for the metrics that need a corpus
    extra_args = (corpus,) if corpus else ()
    scored = [[name, round(similarity_metric(in_concept, wn.synset( get_synset(name) ), *extra_args), 3)]
              for name in Concepts]
    # Stable sort: entries with the same score keep the order they have in 'Concepts'
    scored.sort(key = lambda entry : entry[1], reverse = True)
    return scored[:num]

### Getting list of closest vectors (Path-based)

In [24]:
# List of concepts
Concepts = get_concepts_list() 

#Input concept
concept = wn.synset( get_synset("airplane") )

# Number of closest concepts to take into account
num_concepts_2 = 15 #20

print "\nMETRICS BASED ON PATH"
print "\nNumber of concepts: ", num_concepts_2

# PATH SIMILARITY
Path_sim = Apply_sim_metric(wn.path_similarity, num_concepts_2, concept)
print "\nWordNet path similarity: ", Path_sim

# Leacock-Chodorow similarity
LC_sim = Apply_sim_metric(wn.lch_similarity, num_concepts_2, concept ) 
print "\n\nLeacock-Chodorow similarity: ", LC_sim

# Wu-Palmer Similarity
WUP_sim = Apply_sim_metric(wn.wup_similarity, num_concepts_2, concept )
print "\n\nWu-Palmer similarity: ", WUP_sim


METRICS BASED ON PATH

Number of concepts:  15

WordNet path similarity:  [['airplane', 1.0], ['jet', 0.5], ['helicopter', 0.333], ['boat', 0.167], ['ship', 0.167], ['sled', 0.167], ['sleigh', 0.167], ['yacht', 0.167], ['bike', 0.143], ['missile', 0.143], ['sailboat', 0.143], ['scooter', 0.143], ['skateboard', 0.143], ['tank_(army)', 0.143], ['trailer', 0.143]]


Leacock-Chodorow similarity:  [['airplane', 3.638], ['jet', 2.944], ['helicopter', 2.539], ['boat', 1.846], ['ship', 1.846], ['sled', 1.846], ['sleigh', 1.846], ['yacht', 1.846], ['bike', 1.692], ['missile', 1.692], ['sailboat', 1.692], ['scooter', 1.692], ['skateboard', 1.692], ['tank_(army)', 1.692], ['trailer', 1.692]]


Wu-Palmer similarity:  [['airplane', 1.0], ['jet', 0.96], ['helicopter', 0.917], ['boat', 0.783], ['ship', 0.783], ['yacht', 0.783], ['sled', 0.762], ['sleigh', 0.762], ['sailboat', 0.75], ['bike', 0.727], ['missile', 0.727], ['scooter', 0.727], ['skateboard', 0.727], ['tank_(army)', 0.727], ['tricycle', 0

### Getting list of closest vectors (Information content-based)

In [None]:
## Corpus information content: loaded from 'ic-brown.dat' and passed to every metric below
brown_ic = wordnet_ic.ic('ic-brown.dat')
#semcor_ic = wordnet_ic.ic('ic-semcor.dat')

# 'concept' and 'num_concepts_2' are not set in this cell: they are the ones assigned
# in the path-based cell, which has to be run first.
print "METRICS BASED ON INFORMATION CONTENT"
print "\nNumber of concepts: ", num_concepts_2

# Resnik similarity.
Res_sim = Apply_sim_metric(wn.res_similarity, num_concepts_2, concept, brown_ic)
print "\nResnick similarity (brown_ic): ", Res_sim
#Res_sim2 = Apply_sim_metric(wn.res_similarity, 20, concept, semcor_ic)
#print "\nResnick similarity (semcor_ic): ", Res_sim2

# Jiang-Conrath similarity
JC_sim = Apply_sim_metric(wn.jcn_similarity, num_concepts_2, concept, brown_ic)
print "\n\nJiang-Conrath similarity (brown_ic): ", JC_sim
#JC_sim2 = Apply_sim_metric(wn.jcn_similarity, 20, concept, semcor_ic)
#print "\nJiang-Conrath similarity (semcor_ic): ", JC_sim2

# Lin similarity
Lin_sim = Apply_sim_metric(wn.lin_similarity, num_concepts_2, concept, brown_ic)
print "\n\nLin similarity (brown_ic): ", Lin_sim
#Lin_sim2 = Apply_sim_metric(wn.lin_similarity, 20, concept, semcor_ic)
#print "\nLin similarity (semcor_ic): ", Lin_sim2

### Intersections and unions
In the following cell we obtain the intersection sets for the McRae and HD computing similarity name lists. The goal is to compare which method is more similar to each of the similarity metrics.
The union sets are going to be used for creating surveys to be applied to humans.

In [None]:
# Creating name sets: only the label (first element) of each [label, score] pair is kept.
# The *_sim lists come from the previous cells (HDC/dataset, path-based and IC-based).
HDC_names = set([x[0] for x in HDC_sim])
DatSet_names = set([x[0] for x in DatSet_sim])
Path_names = set([x[0] for x in Path_sim])
LC_names = set([x[0] for x in LC_sim])
WUP_names = set([x[0] for x in WUP_sim])
Res_names = set([x[0] for x in Res_sim])
JC_names = set([x[0] for x in JC_sim])
Lin_names = set([x[0] for x in Lin_sim])

# Intersection between HDC and Dataset
# (despite the 'union_' prefix, union_1 and union_3 to union_6 hold intersections; only union_2 is a union)
union_1 = HDC_names.intersection(DatSet_names)

# Union of intersecion Path and intersection IC...
# PathInt: names shared by the three path-based lists; ICInt: names shared by the three IC-based lists
PathInt = Path_names.intersection(LC_names, WUP_names)
ICInt = Res_names.intersection(JC_names, Lin_names)
union_2 = PathInt.union(ICInt)

# Intersection between all NLTK metrics
union_3 = PathInt.intersection(ICInt)

# Intersection all NLTLK metrics and HDC and Dataset
union_4 = HDC_names.intersection(PathInt, ICInt)
union_5 = DatSet_names.intersection(PathInt, ICInt)
union_6 = HDC_names.intersection(DatSet_names, PathInt, ICInt)

# Ultimate union: every name that appears in at least one of the six sets above
ult_union = set.union( union_1, union_2, union_3, union_4, union_5, union_6 )
print "**Ultimate Union** lenght: ", len(ult_union), "\n\n", ult_union


### Previous cell, step by step (every intermediate set is printed)

In [None]:


# Same sets as the previous cell, recomputed one by one and printed as they are built.
# It needs the *_names sets created in the previous cell.
# (despite the 'union_' prefix, union_1 and union_3 to union_6 hold intersections; only union_2 is a union)

# Intersection between HDC and Dataset
union_1 = HDC_names.intersection(DatSet_names)
print "*1*HDC and Dataset: ", union_1

# Intersection between HDC and NLTK similarity metrics (one set printed per metric)
print "\n\nHDC & NLTK similarity metrics"
for sett in [Path_names, LC_names, WUP_names, Res_names, JC_names, Lin_names]:
    print "\n", HDC_names.intersection(sett)
    
# Intersection between Dataset and NLTK similarity metrics (one set printed per metric)
print "\n\nDataset & NLTK similarity metrics"
for sett in [Path_names, LC_names, WUP_names, Res_names, JC_names, Lin_names]:
    print "\n", DatSet_names.intersection(sett)

# Intersection between Path based metrics
# The figure printed after the set is its size as a percentage of num_concepts_2
print "\n\nIntersection Path based metrics"
PathInt = Path_names.intersection(LC_names, WUP_names)
print "\n", PathInt, "...", len(PathInt) / float( num_concepts_2 ) * 100, "%"

# Intersection between IC based metrics
print "\n\nIntersection IC based metrics"
ICInt = Res_names.intersection(JC_names, Lin_names)
print "\n", ICInt

# Union of intersecion Path and intersection IC...
print "\n\n*2*Union of intersections"
union_2 = PathInt.union(ICInt)
print "\n", union_2

# Intersection between all NLTK metrics
print "\n\n*3*Intersection between all NLTK metrics"
union_3 = PathInt.intersection(ICInt)
print "\n", union_3

# Intersection all NLTLK metrics and HDC and Dataset
print "\n\n*4*Intersection between NLTK metrics and HDC"
union_4 = HDC_names.intersection(PathInt, ICInt)
print "\nHDC: ", union_4
union_5 = DatSet_names.intersection(PathInt, ICInt)
print "\n*5*Dataset: ", union_5
union_6 = HDC_names.intersection(DatSet_names, PathInt, ICInt)
print "\n*6*ALL: ", union_6


# Ultimate union: every name that appears in at least one of the six sets above
ult_union = set.union( union_1, union_2, union_3, union_4, union_5, union_6 )
print "**Ultimate Union** lenght: ", len(ult_union), "\n", ult_union

# Union for all names: every label present in any of the eight name sets
Union = set().union(HDC_names, DatSet_names, Path_names, LC_names, WUP_names, Res_names, JC_names, Lin_names)
print "\n\nUnion lenght: ", len(Union), "\n", Union