## Encoding the knowledge base into Hyperdimensional Vectors

In this notebook, the functions from the 'HDComputing' notebook are used to encode the McRae dataset. The following functions create a heteroassociative memory in which a knowledge base of semantic-feature representations of concepts is stored.

### Importing libraries and HD computing functions

In [114]:
import pandas as pd
import nltk

#Only done once... 
#nltk.download('wordnet')
#nltk.download('wordnet_ic')

from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic


%run HDComputing_basics.ipynb

### Functions for reading dataset

In [115]:
def TranslateFeats(ListFeat):
    """Translate feature labels into [feat_name, feat_value] pairs.

    Example: ['is_blue', 'is_rectangular'] ->
    [['color', 'blue'], ['shape', 'rectangular']].

    The mapping is read from 'FEATS_brm.xlsx' (located through the global
    path prefix `pathh`, which is defined outside this section). For every
    label, the first row whose 'Feature' column equals the label supplies the
    'feat_name' and 'feat_value' columns, both converted with str().

    Raises IndexError when a label has no matching row.
    """
    feats_table = pd.read_excel(pathh + 'FEATS_brm.xlsx')

    def as_pair(label):
        # All rows describing this feature; only the first one is used.
        matches = feats_table.loc[feats_table['Feature'] == label]
        return [str(matches['feat_name'].tolist()[0]),
                str(matches['feat_value'].tolist()[0])]

    return [as_pair(label) for label in ListFeat]

def ClosestConcepts(concept, nc):
    """Return the `nc` concepts closest to `concept` in McRae's cosine matrix.

    The matrix in 'cos_matrix_brm_IFR.xlsx' is spread over three sheets
    ('1st_200', '2nd_200', 'last_141'). The sheets are tried in that order
    until one contains a column named `concept`.

    Returns a list of [concept_name, value] lists, sorted by decreasing
    value, with at most `nc` entries.

    Raises KeyError when no sheet has a column for `concept`. Other errors
    (for example a missing workbook) are no longer swallowed and propagate
    immediately.
    """
    workbook = pathh + 'cos_matrix_brm_IFR.xlsx'
    sheets = ('1st_200', '2nd_200', 'last_141')

    def ranked_from(sheet):
        # sort_values raises KeyError when the sheet has no such column.
        df = pd.read_excel(workbook, sheet)
        return df.sort_values(by=concept, ascending=False)[['CONCEPT', concept]]

    for sheet in sheets[:-1]:
        try:
            ordered = ranked_from(sheet)
        except KeyError:
            continue
        break
    else:
        # Last chance: let any error from the final sheet reach the caller.
        ordered = ranked_from(sheets[-1])

    names = [str(label) for label in ordered['CONCEPT'][0:nc]]
    values = list(ordered[concept][0:nc])
    # Real lists (not lazy map/zip objects) so the result is the same on
    # Python 2 and Python 3.
    return [[label, value] for label, value in zip(names, values)]

# Encoding features

### Normal encoding
All features are included in definition

(TODO: add equation...)

In [116]:
def ReadDefinitions(max_num_feats, names_list):
    """Read the features of each requested concept in dataset order.

    max_num_feats: maximum number of features kept per concept.
    names_list: iterable with the names of the concepts to encode.

    Reads 'CONCS_FEATS_concstats_brm.xlsx' (located through the global path
    prefix `pathh`) and returns a list of [concept_name, [feature, ...]]
    entries, one per name in `names_list`, with the features in the order in
    which they appear in the file.
    """
    df = pd.read_excel(pathh + 'CONCS_FEATS_concstats_brm.xlsx')

    Concepts = []
    for n in names_list:
        row = df.loc[df['Concept'] == n]
        # Build a real list before slicing: a map object cannot be sliced on
        # Python 3.
        feats = [str(feat) for feat in row['Feature']][:max_num_feats]
        Concepts.append([str(n), feats])
    return Concepts

## Auxiliary function for giving weights to features

It repeats features within the feature's list...

- If the list length (len) is less than or equal to 3, it does nothing... [[feat_1 , val_1], [feat_2, val_2], [feat_3, val_3]] -> unchanged

- If 3 < len <= 5, it repeats the first two features twice each... 
        [[feat_1, val_1],...,[feat_5, val_5]] -> [[feat_1, val_1], [feat_1, val_1], [feat_2, val_2], [feat_2, val_2], 
                                                 [feat_3, val_3], [feat_4, val_4], [feat_5, val_5]]
                                                                                       
- If 5 < len <= 8, repeats feat 1 and 2 three times, feats 3 and 4 twice and 5 - 8 once... 

- If 8 < len, it repeats feats 1 and 2 four times, feats 3 and 4 three times, feats 5 - 7 twice, and feats 8 onwards once... 

In [117]:
def Repeat_features(selected_features):
    """Weight features by repeating the leading ones.

    The number of copies of each element depends on its position and on the
    total length of `selected_features`:

    - length <= 3: the input list itself is returned, untouched.
    - length <= 5: positions 0-1 appear twice, the rest once.
    - length <= 8: positions 0-1 three times, 2-3 twice, the rest once.
    - otherwise:   positions 0-1 four times, 2-3 three times, 4-6 twice,
                   the rest once.

    Returns a new list (except in the first case) with the relative order of
    the features preserved.
    """
    total = len(selected_features)
    if total <= 3:
        return selected_features

    # Each tier is (exclusive upper position, number of copies); positions
    # beyond the last tier get a single copy.
    if total <= 5:
        tiers = ((2, 2),)
    elif total <= 8:
        tiers = ((2, 3), (4, 2))
    else:
        tiers = ((2, 4), (4, 3), (7, 2))

    weighted = []
    for position, feature in enumerate(selected_features):
        copies = 1
        for upper_bound, count in tiers:
            if position < upper_bound:
                copies = count
                break
        weighted.extend([feature] * copies)
    return weighted

### Weighting by *Rank_PF*

Only the *max_num_feats* features with the highest frequency are encoded...

In [118]:
def ReadDefs_RankPF(max_num_feats, names_list):
    """Select, for each concept, the features with the lowest Rank_PF values.

    max_num_feats: maximum number of distinct features kept per concept.
    names_list: iterable with the names of the concepts to encode.

    Reads 'CONCS_FEATS_concstats_brm.xlsx' (located through the global path
    prefix `pathh`). For every concept the features are ordered by ascending
    'Rank_PF' (the notebook treats these as the most commonly mentioned
    features), cut to `max_num_feats`, and weighted with Repeat_features.

    Returns a list of [concept_name, [feature, ...]] entries.
    """
    stats = pd.read_excel(pathh + 'CONCS_FEATS_concstats_brm.xlsx')

    Concepts = []
    for name in names_list:
        # Rows of the table that belong to this concept.
        concept_rows = stats.loc[stats['Concept'] == name]

        # [feature, rank] entries with normalised types.
        ranked = [[str(entry[0]), int(entry[1])]
                  for entry in concept_rows[['Feature', 'Rank_PF']].values.tolist()]

        # Stable ascending sort by rank, then keep the leading entries.
        ranked.sort(key=lambda entry: entry[1])
        kept_names = [entry[0] for entry in ranked[:max_num_feats]]

        Concepts.append([str(name), Repeat_features(kept_names)])
    return Concepts

#Defs = ReadDefs_RankPF(10)
#print Defs

# Statistical measures
#numero_feats = [len(set(x[1])) for x in Defs]
#print numero_feats, len(numero_feats), max(numero_feats), min(numero_feats), sum(numero_feats)/len(numero_feats)


### Weighting by *Disting (D)*

Only the features classified as *Disting* in the dataset are selected.

In [119]:
def ReadDefs_Disting(max_num_feats, names_list):
    """Select, for each concept, the features flagged 'D' in the 'Disting' column.

    max_num_feats: maximum number of distinct features kept per concept.
    names_list: iterable with the names of the concepts to encode.

    Reads 'CONCS_FEATS_concstats_brm.xlsx' (located through the global path
    prefix `pathh`). The 'D' features are kept in file order, cut to
    `max_num_feats`, and weighted with Repeat_features.

    Returns a list of [concept_name, [feature, ...]] entries.
    """
    stats = pd.read_excel(pathh + 'CONCS_FEATS_concstats_brm.xlsx')

    Concepts = []
    for name in names_list:
        # Rows of the table that belong to this concept.
        concept_rows = stats.loc[stats['Concept'] == name]

        # (feature, Disting flag) with both fields converted to str.
        flagged = [(str(entry[0]), str(entry[1]))
                   for entry in concept_rows[['Feature', 'Disting']].values.tolist()]

        # Keep only the features whose flag is exactly 'D'.
        distinguishing = [feature for feature, flag in flagged if flag == 'D']

        Concepts.append([str(name), Repeat_features(distinguishing[:max_num_feats])])
    return Concepts

#Defs = ReadDefs_Disting(6)
#print Defs

# After removing the features with intercorr = 0, I want to see the average number of features per concept,
# as well as the max and min
#numero_feats = [len(set(x[1])) for x in Defs]
#print numero_feats, len(numero_feats), max(numero_feats), min(numero_feats), sum(numero_feats)/len(numero_feats)


### Weighting by *Intercorr_str_tax*

According to the McRae dataset, this variable measures the "*intercorrelational strength of feature for that concept*".
No combination with other variables here: only Intercorr_str_tax and its conditions are used...

In [120]:
def ReadDefs_Intercorr_str(max_num_feats, names_list):
    """Select, for each concept, the features with the highest Intercorr_Str_Tax.

    max_num_feats: maximum number of distinct features kept per concept.
    names_list: iterable with the names of the concepts to encode.

    Reads 'CONCS_FEATS_concstats_brm.xlsx' (located through the global path
    prefix `pathh`). Only features whose 'Intercorr_Str_Tax' is greater than
    0 are considered. They are ordered from highest to lowest value, cut to
    `max_num_feats`, and weighted with Repeat_features, which gives the most
    copies to the first elements.

    Returns a list of [concept_name, [feature, ...]] entries.
    """
    df = pd.read_excel(pathh + 'CONCS_FEATS_concstats_brm.xlsx')

    Concepts = []
    for name in names_list:
        # Rows of the table that belong to this concept.
        row = df.loc[df['Concept'] == name]

        # [feature, strength] entries with normalised types.
        scored = [[str(x[0]), float(x[1])]
                  for x in row[['Feature', 'Intercorr_Str_Tax']].values.tolist()]

        # Drop features without positive strength (NaN also fails this test).
        scored = [x for x in scored if x[1] > 0]

        # Descending sort: the previous ascending sort kept the *lowest*
        # strengths, contradicting the stated intent of keeping the highest.
        scored = sorted(scored, key=lambda x: x[1], reverse=True)[:max_num_feats]

        selected_features = [x[0] for x in scored]
        Concepts.append([str(name), Repeat_features(selected_features)])
    return Concepts

#Defs = ReadDefs_Intercorr_str(8)
#print Defs
# run tests to see whether any concept has no disting features... or what the max and the min are... 

# After removing the features with intercorr = 0, I want to see the average number of features per concept,
# as well as the max and min
#numero_feats = [len(set(x[1])) for x in Defs]
#print numero_feats, len(numero_feats), max(numero_feats), min(numero_feats), sum(numero_feats)/len(numero_feats)


### Combining *Intercorr_str_tax* and *Rank_PF*

In this function we complement the features selected based on the *Intercorr_str_tax* variable with features selected based on frequency of mention, that is to say *Rank_PF*.
The goal is for each concept to have at least **6** features. For those concepts where the *Intercorr_str_tax* variable does not provide 6 features, we complete the list by selecting the remaining features based on Rank_PF (frequency).

In [121]:
def ReadDefs_Intercorr_Rank(max_num_feats, names_list, min_num_feats=6):
    """Select features by Intercorr_Str_Tax, topped up by Rank_PF when too few.

    max_num_feats: maximum number of distinct features kept per concept.
    names_list: iterable with the names of the concepts to encode.
    min_num_feats: number of features the top-up step tries to reach
        (defaults to 6, the value previously hard-coded).

    Reads 'CONCS_FEATS_concstats_brm.xlsx' (located through the global path
    prefix `pathh`). For every concept:

    1. features with 'Intercorr_Str_Tax' > 0 are taken, highest value first;
    2. if fewer than `min_num_feats` were found, the remaining features are
       appended in ascending 'Rank_PF' order until that number is reached
       (or the concept runs out of features);
    3. the list is cut to `max_num_feats` and weighted with Repeat_features.

    Returns a list of [concept_name, [feature, ...]] entries.
    """
    df = pd.read_excel(pathh + 'CONCS_FEATS_concstats_brm.xlsx')

    Concepts = []
    for name in names_list:
        # Rows of the table that belong to this concept.
        row = df.loc[df['Concept'] == name]

        # [feature, rank, strength] entries. This must be a real list because
        # it is traversed twice (filter below, Rank_PF top-up later); a lazy
        # map object would already be exhausted on Python 3.
        all_feats = [[str(x[0]), int(x[1]), float(x[2])]
                     for x in row[['Feature', 'Rank_PF', 'Intercorr_Str_Tax']].values.tolist()]

        # Positive strengths only, highest first. The previous ascending sort
        # put the weakest correlations first, contradicting the stated intent.
        chosen = sorted((x for x in all_feats if x[2] > 0),
                        key=lambda x: x[2], reverse=True)

        if len(chosen) < min_num_feats:
            # Top up with the features of lowest Rank_PF not yet selected.
            # NOTE: the original author left a reminder to remove the cap on
            # the number of appended features; the cap is kept here.
            for candidate in sorted(all_feats, key=lambda x: x[1]):
                if len(chosen) >= min_num_feats:
                    break
                if candidate not in chosen:
                    chosen.append(candidate)

        # Only the feature names are needed from here on.
        selected_names = [x[0] for x in chosen]
        Concepts.append([str(name), Repeat_features(selected_names[:max_num_feats])])
    return Concepts

#Defs = ReadDefs_Intercorr_Rank(10)
#print Defs

# After removing the features with intercorr = 0, I want to see the average number of features per concept,
# as well as the max and min
#numero_feats = [len(set(x[1])) for x in Defs]
#print numero_feats, len(numero_feats), max(numero_feats), min(numero_feats), sum(numero_feats)/len(numero_feats)

## Storing ID vectors into memory

### Creating definitions dictionary

In [122]:
def CreateDictionary(mode, max_num_feats, names_list):
    """Fill the global `Dict_defs` with the definitions of the given concepts.

    mode: feature-selection strategy; one of 'normal', 'Rank_PF', 'Disting',
        'Intercorr_str' or 'Intercorr-Rank'.
    max_num_feats: maximum number of features per concept, forwarded to the
        selected reader.
    names_list: names of the concepts to encode, forwarded to the reader.

    For every [concept_name, features] entry produced by the reader,
    `Dict_defs[concept_name]` is set to TranslateFeats(features).

    Raises ValueError for an unknown `mode` (previously this surfaced as an
    UnboundLocalError on `data`).
    """
    global Dict_defs
    readers = {
        'normal': ReadDefinitions,
        'Rank_PF': ReadDefs_RankPF,  # most commonly mentioned features
        'Disting': ReadDefs_Disting,
        'Intercorr_str': ReadDefs_Intercorr_str,
        'Intercorr-Rank': ReadDefs_Intercorr_Rank,
    }
    if mode not in readers:
        raise ValueError("Unknown mode %r; expected one of: %s"
                         % (mode, ', '.join(sorted(readers))))

    for concept in readers[mode](max_num_feats, names_list):
        Dict_defs[concept[0]] = TranslateFeats(concept[1])

### Memory functions

In [123]:
def flat_list(L):
    """Flatten a list of lists (nested to any depth) into a new flat list.

    Only `list` instances are expanded; other containers (tuples, strings,
    ...) are kept as single elements. Element order is preserved.

    The traversal is iterative: the earlier recursive version used one stack
    frame per element and failed with a recursion error on inputs of roughly
    a thousand items, besides copying the tail of the list at every step.
    """
    flat = []
    # Stack of iterators, one per nesting level currently being walked.
    pending = [iter(L)]
    while pending:
        for item in pending[-1]:
            if isinstance(item, list):
                # Descend; the outer iterator keeps its position for later.
                pending.append(iter(item))
                break
            flat.append(item)
        else:
            # Current level exhausted: resume the enclosing one.
            pending.pop()
    return flat

def FeatureVectors(Dic):
    """Collect the distinct feature-type names used in a definitions dictionary.

    `Dic` maps a concept to a list of pairs; the first element of every pair
    (e.g. 'is', 'has', 'color') is gathered, duplicates are removed, and the
    result is stored as a list in the global `feature_vectors`. The order of
    that list is unspecified. Nothing is returned.
    """
    global feature_vectors
    distinct_types = {pair[0] for pairs in Dic.values() for pair in pairs}
    feature_vectors = list(distinct_types)
    
    
def SaveConcepts(Dic):
    """Create an HD vector for every label that appears in a definitions dictionary.

    The labels are the dictionary keys (concept names) plus every element of
    the flattened values (feature names and feature values). Each distinct
    label is passed once to HDvector(N, label); according to the original
    comment, constructing the object also stores it in memory.
    """
    # list(...) makes this work with Python 3 dict views as well as with the
    # plain lists returned on Python 2.
    labels = flat_list(list(Dic.values())) + list(Dic.keys())
    for concept in set(labels):
        HDvector(N, concept)  # creates the object and stores it in memory
    
def CreateSemanticPointer (PairList):
    """Build the definition vector for a list of [feature, value] pairs.

    [[feat1, val1], [feat2, val2], ...] becomes
    ADD([Dict[feat1] * Dict[val1], Dict[feat2] * Dict[val2], ...]), where
    `Dict`, `ADD` and `HDvector` come from outside this section.

    An empty `PairList` yields HDvector(N).
    """
    if not len(PairList):
        return HDvector(N)
    bound_pairs = [Dict[pair[0]] * Dict[pair[1]] for pair in PairList]
    return ADD(bound_pairs)

def SaveDefinitions(Dic):
    """Attach a definition vector to every concept of a definitions dictionary.

    For each (concept, pair list) item of `Dic`, the semantic pointer built
    by CreateSemanticPointer is assigned to the concept's vector through
    Dict[concept].setPointer(...). The concept vectors must already exist in
    the global `Dict` (defined outside this section).
    """
    # items() exists on both Python 2 and 3; iteritems() is Python 2 only.
    for key, value in Dic.items():
        Dict[key].setPointer(CreateSemanticPointer(value))
        
def NormalizeHammDist (Dist_list):
    """Normalise the distances of a [['name', dist], ...] list in place.

    Every distance d is replaced by round(1 - d / N, 3), where N is a global
    defined outside this section. The same list object, now modified, is
    returned.
    """
    scale = float(N)
    for entry in Dist_list:
        entry[1] = round(1. - entry[1] / scale, 3)
    return Dist_list

### Initializing memory

In [124]:
def Init_mem( mode = 'normal', max_num_feats = 100 , names_list = None):
    """Encode the dataset into memory.

    mode: feature-selection strategy forwarded to CreateDictionary.
    max_num_feats: maximum number of features per concept.
    names_list: names of the concepts to encode; it is forwarded unchanged,
        so callers are expected to supply an iterable of names.

    Steps: init() (defined outside this section) is called first, then the
    definitions dictionary is built, the feature-type names are collected,
    one vector per label is created, and finally each concept receives its
    definition vector.
    """
    init()
    # Parenthesised single-argument form prints the same text on Python 2
    # and Python 3.
    print("Begining to encode dataset...")
    # Read dataset and create definition dictionary
    CreateDictionary(mode, max_num_feats, names_list)
    # Feature vectors
    FeatureVectors(Dict_defs)
    # Save concepts into memory (ID vectors)
    SaveConcepts(Dict_defs)
    # Associate definitions to concepts into memory (SP vectors)
    SaveDefinitions(Dict_defs)
    print("End of encoding")

## Reading similarity from distance matrix (McRae)

In [125]:
def McRae_simi(pair_concepts):
    """Look up the similarity of two concepts in McRae's cosine matrix.

    pair_concepts: sequence whose first element selects the row (matched
        against the 'CONCEPT' column) and whose second element selects the
        column.

    The three sheets of 'cos_matrix_brm_IFR.xlsx' ('1st_200', '2nd_200',
    'last_141') are tried in order; the first one holding the requested
    row/column supplies the value.

    Raises KeyError or IndexError (from the last sheet) when the pair is not
    found anywhere. Other errors, such as a missing workbook, are no longer
    swallowed and propagate immediately.
    """
    workbook = pathh + 'cos_matrix_brm_IFR.xlsx'
    sheets = ('1st_200', '2nd_200', 'last_141')

    def lookup(sheet):
        df = pd.read_excel(workbook, sheet)
        # KeyError: column missing in this sheet; IndexError: row missing.
        return list(df.loc[df['CONCEPT'] == pair_concepts[0]][pair_concepts[1]])[0]

    for sheet in sheets[:-1]:
        try:
            return lookup(sheet)
        except (KeyError, IndexError):
            continue
    # Last chance: let any error from the final sheet reach the caller.
    return lookup(sheets[-1])

## Semantic similarity using NLTK library functions
### Auxiliary functions

In [126]:
# Information-content data read from the 'ic-brown.dat' file through NLTK's wordnet_ic reader.
# NOTE(review): presumably meant to be passed as the `corpus` argument of the similarity helpers below -- confirm.
brown_ic = wordnet_ic.ic('ic-brown.dat')

def get_concepts_list():
    """Return the concept names, as a list of strings.

    The names are read from the 'Concept' column of 'CONCS_Synset_brm.xlsx'
    (located through the global path prefix `pathh`).
    """
    df = pd.read_excel(pathh + 'CONCS_Synset_brm.xlsx')
    # A comprehension returns a real list on both Python 2 and Python 3,
    # whereas map() is lazy on Python 3.
    return [str(concept) for concept in df['Concept']]
    
def get_synset (concept):
    """Return the synset name (string) recorded for a concept name.

    The value comes from the 'Synset' column of the first row of
    'CONCS_Synset_brm.xlsx' whose 'Concept' column equals `concept`.
    Raises IndexError when the concept is not listed.
    """
    synset_table = pd.read_excel(pathh + 'CONCS_Synset_brm.xlsx')
    matching_synsets = list(synset_table.loc[synset_table['Concept'] == concept]['Synset'])
    return str(matching_synsets[0])

def apply_sim_metric ( similarity_metric, num, in_concept, corpus = None):
    """Return the `num` concepts that score highest against `in_concept`.

    similarity_metric: callable invoked as metric(in_concept, synset) or,
        when `corpus` is truthy, metric(in_concept, synset, corpus).
    num: number of entries to return.
    in_concept: first argument handed to the metric.
    corpus: optional extra argument for the metric.

    Every name in the global `Concepts` (defined outside this section) is
    converted to a WordNet synset and scored. The result is a list of
    [name, score rounded to 3 decimals] entries, highest score first.
    """
    extra_args = (corpus,) if corpus else ()
    scored = []
    for c in Concepts:
        c_synset = wn.synset(get_synset(c))
        score = similarity_metric(in_concept, c_synset, *extra_args)
        scored.append([c, round(score, 3)])
    # Stable descending sort by score.
    scored.sort(key=lambda entry: entry[1], reverse=True)
    return scored[:num]

def similarity_fun ( similarity_metric, pair, corpus = None):
    """Return the similarity of a pair of concepts, rounded to 3 decimals.

    similarity_metric: callable invoked as metric(synset_1, synset_2) or,
        when `corpus` is truthy, metric(synset_1, synset_2, corpus).
    pair: sequence with the two concept names; each is mapped to its WordNet
        synset through get_synset.
    corpus: optional extra argument for the metric.
    """
    first_synset = wn.synset(get_synset(pair[0]))
    second_synset = wn.synset(get_synset(pair[1]))
    extra_args = (corpus,) if corpus else ()
    return round(similarity_metric(first_synset, second_synset, *extra_args), 3)