# Getting list of similar concepts

### This notebook is based on 'Similarity metrics', but it iterates over multiple input concepts.

# Measuring Semantic Similarity

This program measures the similarity between pairs of words from McRae's dataset. First it uses the HD Computing approach and then compares it with similarity metrics from the NLTK library.

### Importing libraries and HD computing functions

In [9]:
import pandas as pd
import nltk
#nltk.download('wordnet')
#nltk.download('wordnet_ic')
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic

%run HDComputing_basics.ipynb


# Prefix prepended to the Excel filenames passed to pd.read_excel by the
# functions in this notebook; the empty string makes those paths relative to
# the current working directory.
#pathh = '../McRaedataset/'
pathh = ''

### Functions for reading dataset

In [10]:
# Parsed feature tables, keyed by the spreadsheet path they were read from
_FEATS_TABLE_CACHE = {}

def _load_feature_table(path):
    "Read the feature spreadsheet at 'path' (only once per path) and return {feature: (feat_name, feat_value)}"
    if path not in _FEATS_TABLE_CACHE:
        df = pd.read_excel(path)
        table = {}
        rows = zip(df['Feature'].tolist(), df['feat_name'].tolist(), df['feat_value'].tolist())
        for feature, feat_name, feat_value in rows:
            # First row wins for a repeated feature, like a positional [0] lookup on the filtered rows
            table.setdefault(feature, (feat_name, feat_value))
        _FEATS_TABLE_CACHE[path] = table
    return _FEATS_TABLE_CACHE[path]

def TranslateFeats(ListFeat):
    """It receives a list of features such as ['is_blue', 'is_rectangular'] and it returns
    [['color','blue'], ['shape','rectangular']].

    The pairs come from the 'feat_name' and 'feat_value' columns of FEATS_brm.xlsx, looked up
    by the 'Feature' column. The spreadsheet is parsed once per path and then served from a
    cache, so calling this function once per concept no longer re-reads the file every time.

    Raises IndexError when a feature is not present in the spreadsheet."""
    table = _load_feature_table(pathh + 'FEATS_brm.xlsx')
    ListPairs = []
    for feat in ListFeat:
        if feat not in table:
            # Same exception type an empty row lookup would produce, but with a usable message
            raise IndexError("feature %r not found in FEATS_brm.xlsx" % (feat,))
        feat_name, feat_value = table[feat]
        ListPairs.append([str(feat_name), str(feat_value)])
    return ListPairs

def ReadDefinitions():
    """Given an xlsx file it returns all the concepts feature values as they appear in the original dataset.

    Reads CONCS_FEATS_concstats_brm.xlsx and returns a list of [concept, [feature, ...]]
    entries (all strings). Concepts are listed in order of first appearance in the file and
    the features of each concept keep their file order."""
    df = pd.read_excel(pathh + 'CONCS_FEATS_concstats_brm.xlsx')
    Concepts = []
    # One grouped pass instead of one boolean-mask scan of the whole sheet per concept;
    # sort=False keeps the file order, which makes the result deterministic
    for name, rows in df.groupby('Concept', sort=False):
        Concepts.append([str(name), [str(feature) for feature in rows['Feature'].tolist()]])
    return Concepts

def ClosestConcepts (concept, nc):
    """Given a concept label this function reads the distance matrix from McRae's and returns the 'nc' closest concepts in a list.

    The matrix is split over three sheets of cos_matrix_brm_IFR.xlsx; the sheets are tried in
    order and the first one that has a column named 'concept' is used.

    Returns a list [[name, value], ...] with at most 'nc' entries, ordered by descending value
    of the 'concept' column.

    Raises KeyError when no sheet has a column for 'concept'."""
    workbook = pathh + 'cos_matrix_brm_IFR.xlsx'
    for sheet in ('1st_200', '2nd_200', 'last_141'):
        df = pd.read_excel(workbook, sheet)
        # Explicit membership test instead of a bare 'except:', which also swallowed
        # KeyboardInterrupt and any unrelated failure while reading the workbook
        if concept in df.columns:
            break
    else:
        raise KeyError(concept)

    ordered = df.sort_values(by=concept, ascending=False)[['CONCEPT', concept]].iloc[:nc]
    names = ordered['CONCEPT'].tolist()
    values = ordered[concept].tolist()
    return [[str(name), value] for name, value in zip(names, values)]

### Creating definitions dictionary

In [11]:
def CreateDictionary():
    "Fills the global Dict_defs with one entry per concept read by ReadDefinitions: name -> TranslateFeats(features)"
    global Dict_defs
    for entry in ReadDefinitions():
        name = entry[0]
        features = entry[1]
        Dict_defs[name] = TranslateFeats(features)

## Storing ID vectors into memory

### Memory functions

In [12]:
def flat_list (L):
    """Flattens a list of lists (at any level) into a new flat list, keeping the left-to-right order.

    Only items whose type is exactly 'list' are expanded; anything else (strings, tuples,
    numbers...) is copied as a single element. The input is not modified.

    Implemented with an explicit stack of iterators instead of recursion, so long lists do
    not hit the interpreter recursion limit and no intermediate slices are built."""
    flat = []
    stack = [iter(L)]
    while stack:
        for item in stack[-1]:
            if type(item) is list:
                # Descend into the nested list; the parent iterator resumes afterwards
                stack.append(iter(item))
                break
            flat.append(item)
        else:
            # Current level exhausted
            stack.pop()
    return flat

def SaveConcepts(Dic):
    """Given a definitions dictionary it stores in memory the entire set of concepts in the dictionary (including feature vectors).

    Every distinct label found among the keys and inside the (nested) values of 'Dic' gets
    one HDvector(N, label) call."""
    # list(...) so the code does not rely on keys()/values() returning lists (they are
    # views on Python 3 and cannot be indexed or concatenated)
    keys = list(Dic.keys())
    vals = list(Dic.values())
    all_concepts = list(set(flat_list(vals) + keys))
    # Process for storing list of concepts in memory
    for concept in all_concepts:
        HDvector(N, concept) # This creates an object and stores it in memory
        
def FeatureVectors(Dic):
    "Collects into the global 'feature_vectors' the distinct first elements of every pair in the dictionary values ('is','has','color', etc...)"
    global feature_vectors
    feature_vectors = list({pair[0] for pairs in Dic.values() for pair in pairs})
    
def CreateSemanticPointer (PairList):
    "Turns [[feat1,val1],[feat2,val2],...] into ADD([Dict[feat1]*Dict[val1], Dict[feat2]*Dict[val2], ...])"
    bound_pairs = [Dict[pair[0]] * Dict[pair[1]] for pair in PairList]
    return ADD(bound_pairs)

def SaveDefinitions(Dic):
    """Given the definitions dictionary, and having all its concepts previously stored in memory, this function
       creates a definition vector (semantic pointer) using HD operations and assigns it as a pointer to an
       object vector (ID vector).

       For every (key, value) of 'Dic' it calls Dict[key].setPointer(CreateSemanticPointer(value))."""
    # items() instead of the Python 2 only iteritems(); the loop visits the same pairs
    for key, value in Dic.items():
        Dict[key].setPointer(CreateSemanticPointer(value))
        
def NormalizeHammDist (Dist_list):
    "Rewrites in place every distance of [['name', dist], ...] as round(1 - dist / (N * 0.5), 3) and returns the same list"
    for entry in Dist_list:
        entry[1] = round( 1. - entry[1] / float(N * 0.5), 3 )
    return Dist_list

### Initializing memory

In [13]:
def Init_mem():
    """Initializes the memory and encodes the whole dataset into it.

    Runs, in this order: init() (defined outside this cell), CreateDictionary(),
    FeatureVectors(Dict_defs), SaveConcepts(Dict_defs) and SaveDefinitions(Dict_defs), and
    then prints "End of encoding"."""
    init()
    # NOTE(review): a local 'thr = 0.4 * N' was assigned here but never read, so it has been
    # dropped; if a global threshold was intended it has to be declared 'global' - confirm
    # Read dataset and create definition dictionary
    CreateDictionary()
    # Feature vectors
    FeatureVectors(Dict_defs)
    # Save concepts into memory (ID vectors)
    SaveConcepts(Dict_defs)
    # Associate definitions to concepts into memory (SP vectors)
    SaveDefinitions(Dict_defs)
    # Single-argument call form: prints the same text on Python 2 and Python 3
    print("End of encoding")

# Run the initialization so the encoded concepts are in memory for the cells that follow
Init_mem()

End of initialization
End of encoding


## Semantic Similarity using NLTK library

### Auxiliary functions for similarity metrics

In [14]:
# Data loaded by NLTK from 'ic-brown.dat'; it is handed to Apply_sim_metric as the 'corpus'
# argument for the metrics that take a third argument (jcn_similarity in this notebook)
brown_ic = wordnet_ic.ic('ic-brown.dat')

def get_concepts_list ():
    """Returns a list of strings: the names of the concepts.

    The names are the 'Concept' column of CONCS_Synset_brm.xlsx, in file order. A real list
    is returned (not a one-shot iterator) because the result is iterated repeatedly."""
    df = pd.read_excel(pathh + 'CONCS_Synset_brm.xlsx')
    return [str(name) for name in df['Concept'].tolist()]
    
# Parsed concept -> synset tables, keyed by the spreadsheet path they were read from
_SYNSET_TABLE_CACHE = {}

def _load_synset_table(path):
    "Read the synset spreadsheet at 'path' (only once per path) and return {concept: synset}"
    if path not in _SYNSET_TABLE_CACHE:
        df = pd.read_excel(path)
        table = {}
        for name, synset in zip(df['Concept'].tolist(), df['Synset'].tolist()):
            # First row wins for a repeated concept, like a positional [0] lookup on the filtered rows
            table.setdefault(name, synset)
        _SYNSET_TABLE_CACHE[path] = table
    return _SYNSET_TABLE_CACHE[path]

def get_synset (concept):
    """Given a concept name (string) it returns its synset (string).

    The value comes from the 'Synset' column of CONCS_Synset_brm.xlsx, looked up by the
    'Concept' column. The spreadsheet is parsed once per path and then served from a cache,
    because this function is called once per concept for every similarity metric.

    Raises IndexError when the concept is not present in the spreadsheet."""
    table = _load_synset_table(pathh + 'CONCS_Synset_brm.xlsx')
    if concept not in table:
        # Same exception type an empty row lookup would produce, but with a usable message
        raise IndexError("concept %r not found in CONCS_Synset_brm.xlsx" % (concept,))
    return str(table[concept])

def Apply_sim_metric ( similarity_metric, num, in_concept, corpus = None):
    """Scores every name in the global 'Concepts' against 'in_concept' and returns the 'num' best ones.

    Each name is turned into a synset with wn.synset(get_synset(name)) and scored with
    similarity_metric(in_concept, synset), or similarity_metric(in_concept, synset, corpus)
    when a truthy 'corpus' is given. The result is a list [[name, score rounded to 3 decimals], ...]
    ordered by descending score."""
    # The optional third positional argument is decided once, outside the loop
    extra_args = (corpus,) if corpus else ()
    scored = []
    for name in Concepts:
        candidate = wn.synset( get_synset(name) )
        score = similarity_metric(in_concept, candidate, *extra_args)
        scored.append([name, round(score, 3)])
    scored.sort(key = lambda entry : entry[1], reverse = True)
    return scored[:num]

### Getting list of closest vectors (HDcomputing and from Dataset)

In [15]:
# For every test concept: print its closest concepts according to the HD-computing memory,
# the dataset matrix and two NLTK metrics, then print the union of all the returned names.
# Alternative test sets: ['hose','piano','rope', 'sword', 'train', 'toilet'] / ['airplane', 'apple', 'bed', 'coin']
# Chosen for familiarity...
Test_Concepts =  ['stove', 'bowl', 'apple','toaster', 'piano']

num_concepts_1 = 6 #20
num_concepts_2 = 10 #20

# List of concepts for NLTK similarity metrics
Concepts = get_concepts_list() 

for test_concept in Test_Concepts:
    print "\n\nStarting with...", test_concept
    # Asking for the concepts closest to this concept's definition (semantic pointer)...
    HDC_sim = HDvector.getLabelSP(Dict[test_concept].getPointer())[:num_concepts_1]
    #HDC_sim = NormalizeHammDist(HDC_sim)
    DatSet_sim = ClosestConcepts(test_concept, num_concepts_1)
    print "HDC_sim: ", HDC_sim
    print "DatSet_sim: ", DatSet_sim
    
    concept = wn.synset( get_synset(test_concept) )
    # Path-based similarity (only wup_similarity is active)
    #Path_sim = Apply_sim_metric(wn.path_similarity, num_concepts_2, concept)
    #LC_sim = Apply_sim_metric(wn.lch_similarity, num_concepts_2, concept ) 
    WUP_sim = Apply_sim_metric(wn.wup_similarity, num_concepts_2, concept )
    print "WUP_sim: ", WUP_sim
    # Information Content (only jcn_similarity is active)
    #Res_sim = Apply_sim_metric(wn.res_similarity, num_concepts_2, concept, brown_ic)
    JC_sim = Apply_sim_metric(wn.jcn_similarity, num_concepts_2, concept, brown_ic)
    #Lin_sim = Apply_sim_metric(wn.lin_similarity, num_concepts_2, concept, brown_ic)
    print "JC_sim: ", JC_sim
    
    # SETS... Performing Unions and Intersections
    # Creating name sets (first element of every [name, score] entry)
    HDC_names = set([x[0] for x in HDC_sim])
    DatSet_names = set([x[0] for x in DatSet_sim])
    #Path_names = set([x[0] for x in Path_sim])
    #LC_names = set([x[0] for x in LC_sim])
    WUP_names = set([x[0] for x in WUP_sim])
    #Res_names = set([x[0] for x in Res_sim])
    JC_names = set([x[0] for x in JC_sim])
    #Lin_names = set([x[0] for x in Lin_sim])
    
    # Union of the HDC names and the Dataset names
    union_1 = HDC_names.union(DatSet_names)  # It used to be an intersection... but it works better this way...

    # Union of the WUP names and the JC names (the commented lines are the earlier intersection-based variant)
    #PathInt = Path_names.intersection(LC_names, WUP_names)
    #ICInt = Res_names.intersection(JC_names, Lin_names)
    union_2 = WUP_names.union(JC_names)

    # Intersection between all NLTK metrics (disabled)
    #union_3 = PathInt.intersection(ICInt)

    # Intersection of all NLTK metrics with HDC and Dataset (disabled)
    #union_4 = HDC_names.intersection(PathInt, ICInt)
    #union_5 = DatSet_names.intersection(PathInt, ICInt)
    #union_6 = HDC_names.intersection(DatSet_names, PathInt, ICInt)

    # FEWER CONCEPTS IN THE FINAL UNION IS BETTER... MAYBE THIS PROGRAM CAN BE APPLIED TO ALL THE CONCEPTS,
    # SELECTING THE ONES WITH THE SHORTEST LENGTH...
    # Ultimate union...
    ult_union = set.union( union_1, union_2 )
    print "\nUltimate Union for", test_concept,": \n", ult_union, "\nLenght: ", len(ult_union)



Starting with... stove
HDC_sim:  [['stove', 0], ['oven', 3280], ['toaster', 3325], ['mixer', 3763], ['spatula', 3933], ['dishwasher', 4006]]
DatSet_sim:  [['stove', 1.0], ['toaster', 0.727], ['oven', 0.676], ['microwave', 0.548], ['mixer', 0.503], ['pot', 0.477]]
WUP_sim:  [['stove', 1.0], ['microwave', 0.917], ['oven', 0.917], ['toaster', 0.917], ['dishwasher', 0.833], ['fridge', 0.8], ['freezer', 0.769], ['apron', 0.636], ['belt', 0.636], ['bra', 0.636]]
JC_sim:  [['stove', 1e+300], ['oven', 0.317], ['microwave', 0.202], ['fridge', 0.185], ['pants', 0.181], ['freezer', 0.172], ['dress', 0.162], ['dishwasher', 0.154], ['coat', 0.153], ['cloak', 0.142]]

Ultimate Union for stove : 
set(['apron', 'stove', 'fridge', 'spatula', 'microwave', 'oven', 'dress', 'bra', 'belt', 'dishwasher', 'coat', 'toaster', 'cloak', 'mixer', 'freezer', 'pot', 'pants']) 
Lenght:  17


Starting with... bowl
HDC_sim:  [['bowl', 0], ['dish', 3140], ['plate', 3869], ['bucket', 4044], ['tray', 4107], ['spoon', 4

KeyboardInterrupt: 