## Encoding the knowledge base into Hyperdimensional Vectors

In this notebook the functions from the 'HDComputing' notebook are used to encode the McRae dataset. The following functions create a heteroassociative memory in which a knowledge base of semantic-feature representations of concepts is stored.

### Importing libraries and HD computing functions

In [47]:
import pandas as pd
import nltk

#Only done once... 
#nltk.download('wordnet')
#nltk.download('wordnet_ic')

from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic


%run HDComputing_basics.ipynb

### Functions for reading dataset

In [48]:
def TranslateFeats(ListFeat):
    """Translate feature labels into [feature name, feature value] pairs.

    Example: ['is_blue', 'is_rectangular'] -> [['color', 'blue'], ['shape', 'rectangular']].

    The mapping is read from 'FEATS_brm.xlsx' (located under the global `pathh`).
    Each label is matched against the 'Feature' column; the first matching row
    supplies the 'feat_name' and 'feat_value' cells, both converted with str().
    A label without any matching row raises IndexError.
    """
    table = pd.read_excel(pathh + 'FEATS_brm.xlsx')
    pairs = []
    for label in ListFeat:
        # All rows whose 'Feature' cell equals the label; only the first is used
        matches = table.loc[table['Feature'] == label]
        name = str(matches['feat_name'].tolist()[0])
        value = str(matches['feat_value'].tolist()[0])
        pairs.append([name, value])
    return pairs

def ClosestConcepts (concept, nc):
    """Return the 'nc' concepts closest to 'concept' according to McRae's matrix.

    The matrix is stored in 'cos_matrix_brm_IFR.xlsx' (under the global `pathh`),
    split across the sheets '1st_200', '2nd_200' and 'last_141'. The sheets are
    tried in that order until one has a column named after `concept`.

    Returns a list of [concept name (str), value] pairs, taken from the rows with
    the highest values in that column (sorted in descending order).

    Raises KeyError when no sheet has a column for `concept`. Any other error
    (for instance a missing workbook) is propagated immediately instead of being
    silently retried on the next sheet.
    """
    workbook = pathh + 'cos_matrix_brm_IFR.xlsx'
    ordered = None
    for sheet in ('1st_200', '2nd_200', 'last_141'):
        df = pd.read_excel(workbook, sheet)
        try:
            ordered = df.sort_values(by=concept, ascending=False)[['CONCEPT', concept]]
        except KeyError:
            # This sheet has no column for the concept: try the next one
            continue
        break

    if ordered is None:
        raise KeyError(concept)

    names = ordered['CONCEPT'][0:nc]
    values = ordered[concept][0:nc]
    # Build a real list (not a lazy iterator) so the result can be indexed and reused
    return [[str(name), value] for name, value in zip(names, values)]

# Encoding features

### Normal encoding
All features are included in definition

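(TODO: add equation. Following `CreateSemanticPointer` below, the definition vector of a concept is $\mathrm{ADD}(f_1 * v_1,\ f_2 * v_2,\ \dots,\ f_n * v_n)$, where $f_i$ and $v_i$ are the vectors of the $i$-th feature name and feature value.)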

In [49]:
def ReadDefinitions(max_num_feats, names_list): #names_list is the list of concepts to encode...
    """Read the raw (unweighted) feature labels of the requested concepts.

    The data comes from 'CONCS_FEATS_concstats_brm.xlsx' (under the global `pathh`).

    max_num_feats: maximum number of features kept per concept (file order).
    names_list: iterable with the names of the concepts to read.

    Returns a list of [concept name (str), list of feature labels (str)], one
    entry per name in `names_list` and in the same order. A name that does not
    appear in the 'Concept' column yields an empty feature list.
    """
    #Dataframe for excel document
    df = pd.read_excel( pathh + 'CONCS_FEATS_concstats_brm.xlsx')

    # Extract list of features for each name
    Concepts = []
    for n in names_list:
        row = df.loc[df['Concept'] == n]
        # A list comprehension (instead of a subscripted map) keeps the slice
        # valid on interpreters where map returns an iterator
        features = [str(label) for label in row['Feature']][:max_num_feats]
        Concepts.append([str(n), features])
    return Concepts

## Auxiliary function for giving weights to features

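It repeats features within the feature list, so that repeated features carry more weight. Note that the list-length thresholds implemented in `Repeat_features` below are 12, 16 and 18, not the 3, 5 and 8 quoted in the following description.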

- If the list length (len) is less than or equal to 3, it does nothing: [[feat_1, val_1], [feat_2, val_2], [feat_3, val_3]] is returned unchanged.

- If 3 < len <= 5, it repeats the first two features twice: 
        [[feat_1, val_1],...,[feat_5, val_5]] -> [[feat_1, val_1], [feat_1, val_1], [feat_2, val_2], [feat_2, val_2], 
                                                 [feat_3, val_3], [feat_4, val_4], [feat_5, val_5]]
                                                                                       
- If 5 < len <= 8, repeats feat 1 and 2 three times, feats 3 and 4 twice and 5 - 8 once... 

- If 8 < len repeats feat 1 and 2 four times, 3 and 4 three times, 5 - 7 twice, and 8 -> once... 

In [101]:
def Repeat_features(selected_features):
    """Weight a feature list by repeating its leading elements.

    The repetition scheme depends on the length of the list:
      - up to 12 elements: the very same list is returned, untouched;
      - 13 to 16: elements 1-2 appear twice;
      - 17 to 18: elements 1-2 three times, elements 3-4 twice;
      - 19 or more: elements 1-2 four times, 3-4 three times, 5-7 twice.
    Every other element appears once. Except for the first case, a new list is built.
    """
    total = len(selected_features)
    if total <= 12:
        return selected_features

    # Each plan entry is (exclusive upper position, copies), checked in order
    if total <= 16:
        plan = [(2, 2)]
    elif total <= 18:
        plan = [(2, 3), (4, 2)]
    else:
        plan = [(2, 4), (4, 3), (7, 2)]

    weighted = []
    for position, feature in enumerate(selected_features):
        copies = 1
        for upper, times in plan:
            if position < upper:
                copies = times
                break
        weighted.extend([feature] * copies)
    return weighted

def WeightFeatures2(sel_features):
    """Expand [feature, score] entries into a list of features with up to 2 copies.

    A feature whose score (second item) is at least 12 is listed twice; any
    other feature is listed once. Input order is preserved.
    """
    weighted = []
    for entry in sel_features:
        copies = 2 if 12 <= entry[1] else 1   # original note: 13... 15 works very well
        weighted += [entry[0]] * copies
    return weighted

def WeightFeatures3(sel_features):
    """Expand [feature, score] entries into a list of features with up to 3 copies.

    Copies per feature, by score (second item): >= 19 -> 3, >= 9 -> 2, otherwise 1.
    Input order is preserved.
    """
    tiers = ((19, 3), (9, 2))   # (minimum score, copies), highest tier first
    weighted = []
    for entry in sel_features:
        copies = next((times for floor, times in tiers if floor <= entry[1]), 1)
        weighted.extend([entry[0]] * copies)
    return weighted


def WeightFeatures4(sel_features):
    """Expand [feature, score] entries into a list of features with up to 4 copies.

    Copies per feature, by score (second item):
    >= 19 -> 4, >= 13 -> 3, >= 9 -> 2, otherwise 1. Input order is preserved.
    """
    def copies_for(score):
        # Guard clauses, from the highest tier down
        if 19 <= score:
            return 4
        if 13 <= score:
            return 3
        if 9 <= score:
            return 2
        return 1

    weighted = []
    for entry in sel_features:
        times = copies_for(entry[1])
        weighted.extend([entry[0]] * times)
    return weighted


def WeightFeatures5(sel_features):
    """Expand [feature, score] entries into a list of features with up to 5 copies.

    Copies per feature, by score (second item):
    >= 28 -> 5 | >= 21 -> 4 | >= 15 -> 3 | >= 10 -> 2 | otherwise 1.
    Input order is preserved.
    """
    tiers = ((28, 5), (21, 4), (15, 3), (10, 2))   # (minimum score, copies)
    weighted = []
    for entry in sel_features:
        score = entry[1]
        for floor, times in tiers:
            if floor <= score:
                break
        else:
            # No tier reached: the feature is listed once
            times = 1
        weighted.extend([entry[0]] * times)
    return weighted

def WeightFeatures6(sel_features):
    """Expand [feature, score] entries into a list of features with up to 6 copies.

    Copies per feature, by score (second item):
    >= 28 -> 6 | >= 24 -> 5 | >= 18 -> 4 | >= 13 -> 3 | >= 9 -> 2 | otherwise 1.
    Input order is preserved.
    """
    floors = (9, 13, 18, 24, 28)
    weighted = []
    for entry in sel_features:
        score = entry[1]
        # The floors are nested, so one copy plus one per floor reached
        # reproduces the tier table above
        copies = 1 + sum(1 for floor in floors if floor <= score)
        weighted.extend([entry[0]] * copies)
    return weighted



# FOR INTERCORR...
def WeightFeatures_Interr2(sel_features):
    """Expand [feature, score] entries into a list of features with up to 2 copies.

    A feature whose score (second item) is at least 6 is listed twice; any other
    feature is listed once. Input order is preserved.
    """
    return [entry[0]
            for entry in sel_features
            for _ in range(2 if entry[1] >= 6 else 1)]

def WeightFeatures_Interr3(sel_features):
    """Expand [feature, score] entries into a list of features with up to 3 copies.

    Copies per feature, by score (second item):
    <= 100 -> 1 | <= 150 -> 2 | otherwise 3. Input order is preserved.

    NOTE(review): a NaN score fails every '<=' test and therefore gets the
    maximum of 3 copies -- confirm this is intended.
    """
    ceilings = ((100.0, 1), (150.0, 2))   # (maximum score, copies), lowest first
    weighted = []
    for entry in sel_features:
        copies = next((times for ceiling, times in ceilings if entry[1] <= ceiling), 3)
        weighted.extend([entry[0]] * copies)
    return weighted

def WeightFeatures_Interr4(sel_features):
    """Expand [feature, score] entries into a list of features with up to 4 copies.

    Copies per feature, by score (second item):
    <= 50 -> 1 | <= 100 -> 2 | <= 150 -> 3 | otherwise 4. Input order is preserved.

    NOTE(review): a NaN score fails every '<=' test and therefore gets the
    maximum of 4 copies -- confirm this is intended.
    """
    def copies_for(score):
        # Guard clauses, from the lowest band up
        if score <= 50.0:
            return 1
        if score <= 100.0:
            return 2
        if score <= 150.0:
            return 3
        return 4

    weighted = []
    for entry in sel_features:
        times = copies_for(entry[1])
        weighted.extend([entry[0]] * times)
    return weighted

def WeightFeatures_Interr5(sel_features):
    """Expand [feature, score] entries into a list of features with up to 5 copies.

    Copies per feature, by score (second item):
    <= 10 -> 1 | <= 50 -> 2 | <= 100 -> 3 | <= 150 -> 4 | otherwise 5.
    Input order is preserved.

    NOTE(review): a NaN score fails every '<=' test and therefore gets the
    maximum of 5 copies -- confirm this is intended.
    """
    ceilings = ((10.0, 1), (50.0, 2), (100.0, 3), (150.0, 4))   # (maximum score, copies)
    weighted = []
    for entry in sel_features:
        score = entry[1]
        for ceiling, times in ceilings:
            if score <= ceiling:
                break
        else:
            # Above every ceiling: maximum weight
            times = 5
        weighted.extend([entry[0]] * times)
    return weighted


# Split in two... but combines PF and InterrCorr

def WeightFeatures_PF_Interr(sel_features):
    """Expand [feature, score_1, score_2] entries into a list of features with up to 2 copies.

    A feature is listed twice when its second item is at least 13 or its third
    item is at least 6; otherwise it is listed once. The third item is only
    inspected when the first condition fails. Input order is preserved.
    """
    weighted = []
    for entry in sel_features:
        doubled = entry[1] >= 13 or entry[2] >= 6
        weighted += [entry[0]] * (2 if doubled else 1)
    return weighted


### Weighting by *Production Frequency (Prod_Freq)*

We weight the features based on Production Frequency values... 

In [51]:
def ReadDefs_ProdFreq(max_num_feats, names_list):
    """Read the features of the requested concepts, weighted by 'Prod_Freq'.

    The data comes from 'CONCS_FEATS_concstats_brm.xlsx' (under the global `pathh`).
    For every name in `names_list`, its features are sorted by 'Prod_Freq' in
    descending order, truncated to `max_num_feats` entries and expanded with
    WeightFeatures2.

    Returns a list of [concept name (str), list of feature labels (str)].
    """
    table = pd.read_excel(pathh + 'CONCS_FEATS_concstats_brm.xlsx')

    encoded = []
    for name in names_list:
        # Rows describing the current concept
        rows = table.loc[table['Concept'] == name]
        # [feature label, production frequency] pairs in a uniform format
        scored = [[str(feature), float(freq)]
                  for feature, freq in rows[['Feature', 'Prod_Freq']].values.tolist()]
        # Most frequently produced features first
        scored.sort(key=lambda item: item[1], reverse=True)
        # The weighting step drops the frequency and keeps the (repeated) labels
        encoded.append([str(name), WeightFeatures2(scored[:max_num_feats])])

    return encoded

# pathh = '../Data/' 
# L = ['tomato']
# Defs = ReadDefs_ProdFreq(22, L)
# print Defs

### Weighting by *Intercorr_str_tax*

According to the McRae dataset, this variable measures the "*intercorrelational strength of feature for that concept*".
This mode does not combine variables: it uses only Intercorr_str_tax, together with threshold conditions.

In [52]:
def ReadDefs_Intercorr_str (max_num_feats, names_list):
    """Read the features of the requested concepts, weighted by 'Intercorr_Str_Tax'.

    The data comes from 'CONCS_FEATS_concstats_brm.xlsx' (under the global `pathh`).
    For every name in `names_list`, its features are sorted by
    'Intercorr_Str_Tax' in ascending order, truncated to `max_num_feats` entries
    and expanded with WeightFeatures_Interr2.

    Returns a list of [concept name (str), list of feature labels (str)].

    NOTE(review): because the sort is ascending, truncation keeps the features
    with the LOWEST values -- confirm this is the intended selection.
    """
    table = pd.read_excel(pathh + 'CONCS_FEATS_concstats_brm.xlsx')

    encoded = []
    for name in names_list:
        # Rows describing the current concept
        rows = table.loc[table['Concept'] == name]
        # [feature label, Intercorr_Str_Tax] pairs in a uniform format
        scored = [[str(feature), float(strength)]
                  for feature, strength in rows[['Feature', 'Intercorr_Str_Tax']].values.tolist()]
        scored.sort(key=lambda item: item[1])
        kept = scored[:max_num_feats]
        encoded.append([str(name), WeightFeatures_Interr2(kept)])
    return encoded
# pathh = '../Data/' 
# L = ['tomato']
# Defs = ReadDefs_Intercorr_str(22, L)
# print Defs

### Weighting by *Intercorr_str_no_tax*

According to the McRae dataset, this variable measures the "*intercorrelational strength of feature for that concept*".
This mode does not combine variables: it uses only Intercorr_str_No_tax, together with threshold conditions.

In [53]:
def ReadDefs_Intercorr_NoT_str (max_num_feats, names_list):
    """Read the features of the requested concepts, weighted by 'Intercorr_Str_No_Tax'.

    The data comes from 'CONCS_FEATS_concstats_brm.xlsx' (under the global `pathh`).
    For every name in `names_list`, its features are sorted by
    'Intercorr_Str_No_Tax' in ascending order, truncated to `max_num_feats`
    entries and expanded with WeightFeatures_Interr2.

    Returns a list of [concept name (str), list of feature labels (str)].

    NOTE(review): because the sort is ascending, truncation keeps the features
    with the LOWEST values -- confirm this is the intended selection.
    """
    table = pd.read_excel(pathh + 'CONCS_FEATS_concstats_brm.xlsx')

    encoded = []
    for name in names_list:
        # Rows describing the current concept
        rows = table.loc[table['Concept'] == name]
        # [feature label, Intercorr_Str_No_Tax] pairs in a uniform format
        scored = [[str(feature), float(strength)]
                  for feature, strength in rows[['Feature', 'Intercorr_Str_No_Tax']].values.tolist()]
        scored.sort(key=lambda item: item[1])
        kept = scored[:max_num_feats]
        encoded.append([str(name), WeightFeatures_Interr2(kept)])
    return encoded
# pathh = '../Data/' 
# L = ['tomato']
# Defs = ReadDefs_Intercorr_str(22, L)
# print Defs

### Combining *Intercorr_str_tax* and *Prod_Freq*

In this function we complement the features selected based on the *Intercorr_str_tax* variable with features selected based on frequency of mention, that is to say *Rank_PF*.
The goal is for each concept to have at least **6** features. For those concepts where the *Intercorr_str_tax* variable does not provide 6 features, we complete the list by selecting the remaining features based on Rank_PF (frequency).

In [54]:
def ReadDefs_Intercorr_PF (max_num_feats, names_list):
    """Read the features of the requested concepts, weighted by 'Prod_Freq' and 'Intercorr_Str_Tax'.

    The data comes from 'CONCS_FEATS_concstats_brm.xlsx' (under the global `pathh`).
    For every name in `names_list`, its first `max_num_feats` features (file
    order, no sorting) are expanded with WeightFeatures_PF_Interr.

    Returns a list of [concept name (str), list of feature labels (str)].
    """
    #Dataframe for excel document
    df = pd.read_excel( pathh + 'CONCS_FEATS_concstats_brm.xlsx')

    Concepts = []
    # Extract list of features for each concept
    for name in names_list:
        # Locating the concept by name
        row = df.loc[df['Concept'] == name]

        # Reading features with their Prod_Freq and Intercorr_Str_Tax values
        raw_features = row[['Feature','Prod_Freq','Intercorr_Str_Tax']].values.tolist()
        # Uniform format; a real list is built so that it can be sliced below
        # on interpreters where map returns an iterator
        selected_features = [[str(x[0]), int(x[1]), float(x[2])] for x in raw_features]

        # Creating final representation
        Concepts.append([str(name), WeightFeatures_PF_Interr(selected_features[:max_num_feats])])
    return Concepts

# Combining with Intercorr No Tax... 
def ReadDefs_Intercorr_NoT_PF (max_num_feats, names_list):
    """Read the features of the requested concepts, weighted by 'Prod_Freq' and 'Intercorr_Str_No_Tax'.

    The data comes from 'CONCS_FEATS_concstats_brm.xlsx' (under the global `pathh`).
    For every name in `names_list`, its first `max_num_feats` features (file
    order, no sorting) are expanded with WeightFeatures_PF_Interr.

    Returns a list of [concept name (str), list of feature labels (str)].
    """
    #Dataframe for excel document
    df = pd.read_excel( pathh + 'CONCS_FEATS_concstats_brm.xlsx')

    Concepts = []
    # Extract list of features for each concept
    for name in names_list:
        # Locating the concept by name
        row = df.loc[df['Concept'] == name]

        # Reading features with their Prod_Freq and Intercorr_Str_No_Tax values
        raw_features = row[['Feature','Prod_Freq','Intercorr_Str_No_Tax']].values.tolist()
        # Uniform format; a real list is built so that it can be sliced below
        # on interpreters where map returns an iterator
        selected_features = [[str(x[0]), int(x[1]), float(x[2])] for x in raw_features]

        # Creating final representation
        Concepts.append([str(name), WeightFeatures_PF_Interr(selected_features[:max_num_feats])])
    return Concepts

## Storing ID vectors into memory

### Creating definitions dictionary

In [55]:
def CreateDictionary( mode , max_num_feats, names_list):
    """Fill the global `Dict_defs` with the definitions of the requested concepts.

    mode: selects how the features are read and weighted. One of 'normal',
          'Prod_Freq', 'Intercorr_str_tax', 'Intercorr_str_No_tax',
          'Intercorr_PF' or 'Intercorr_NoT_PF'.
    max_num_feats: maximum number of features per concept (forwarded to the reader).
    names_list: names of the concepts to encode (forwarded to the reader).

    For every concept returned by the reader, `Dict_defs[name]` is set to the
    [feature name, feature value] pairs produced by TranslateFeats.

    Raises ValueError when `mode` is not one of the values listed above.
    """
    global Dict_defs
    if mode == 'normal':
        reader = ReadDefinitions
    elif mode == 'Prod_Freq':
        reader = ReadDefs_ProdFreq
    elif mode == 'Intercorr_str_tax':
        reader = ReadDefs_Intercorr_str
    elif mode == 'Intercorr_str_No_tax':
        reader = ReadDefs_Intercorr_NoT_str
    elif mode == 'Intercorr_PF':
        reader = ReadDefs_Intercorr_PF
    elif mode == 'Intercorr_NoT_PF':
        reader = ReadDefs_Intercorr_NoT_PF
    else:
        # Fail with an explicit message instead of hitting an unbound local below
        raise ValueError("Unknown encoding mode: %r" % (mode,))

    for concept in reader(max_num_feats, names_list):
        Dict_defs[concept[0]] = TranslateFeats(concept[1])

### Memory functions

In [56]:
def flat_list (L):
    """Flatten a list of lists (nested at any level) into a new flat list.

    Only elements whose type is exactly `list` are expanded; everything else
    (strings, tuples, numbers, ...) is kept as a single element. Element order
    is preserved.

    The traversal is iterative (explicit stack of iterators), so neither the
    length of the list nor its nesting depth is limited by the interpreter's
    recursion limit.
    """
    flat = []
    pending = [iter(L)]
    while pending:
        for item in pending[-1]:
            if type(item) is list:
                # Descend into the nested list; the outer iterator resumes later
                pending.append(iter(item))
                break
            flat.append(item)
        else:
            # Current level exhausted: go back to the enclosing one
            pending.pop()
    return flat

def FeatureVectors(Dic):
    """Collect the distinct feature-type labels ('is', 'has', 'color', ...) of a definitions dictionary.

    Each value of `Dic` is a list of pairs; the first item of every pair is
    gathered and the distinct labels are stored, as a list, in the global
    `feature_vectors`. Nothing is returned.
    """
    global feature_vectors
    labels = [pair[0] for definition in Dic.values() for pair in definition]
    feature_vectors = list(set(labels))
    
    
def SaveConcepts(Dic):
    """Given a definitions dictionary it stores in memory the entire set of concepts in the dictionary (including feature vectors).

    The set is made of every key of `Dic` plus every label found in its values
    (flattened with flat_list). One `HDvector(N, label)` is created per distinct
    label; per the original comment, creating the object also stores it in memory.
    """
    # Materialize the views as lists so they can be indexed and concatenated
    # regardless of the interpreter version
    keys = list(Dic.keys())
    vals = list(Dic.values())
    all_concepts = list(set(flat_list(vals) + keys))
    # Process for storing list of concepts in memory
    for concept in all_concepts:
        HDvector(N,concept) #This creates an object and store it in memory
    
def CreateSemanticPointer (PairList):
    """Turn [[feat1, val1], [feat2, val2], ...] into the vector feat1*val1 + feat2*val2 + ...

    The vectors of each pair are looked up in the global `Dict`, multiplied
    with `*`, and the products are combined with ADD. For an empty `PairList`
    the result is `HDvector(N)`.
    """
    if not len(PairList):
        return HDvector(N)
    bound = [Dict[pair[0]] * Dict[pair[1]] for pair in PairList]
    return ADD(bound)

def SaveDefinitions(Dic):
    """Given the definitions dictionary, and having all its concepts previously stored in memory, this function
       creates a definition vector (semantic pointer) using HD operations and assigns it as a pointer to an
       object vector (ID vector).

       For every (name, pairs) item of `Dic`, the vector built by CreateSemanticPointer(pairs) is
       attached to `Dict[name]` through its setPointer method."""
    # items() is available on every interpreter version, unlike iteritems()
    for key, value in Dic.items():
        Dict[key].setPointer(CreateSemanticPointer(value))
        
def NormalizeHammDist (Dist_list):
    """Normalize, in place, the distances of a list shaped [['name', dist], ['name', dist], ...].

    Every distance is replaced by round(1 - dist / N, 3), where N is the global
    defined elsewhere. The same (mutated) list object is returned.
    """
    for entry in Dist_list:
        entry[1] = round(1. - entry[1] / float(N), 3)
    return Dist_list

### Initializing memory

In [57]:
def Init_mem( mode = 'normal', max_num_feats = 100 , names_list = None):
    """Encode the dataset: read the definitions and store concepts and definitions in memory.

    mode: encoding mode, forwarded to CreateDictionary.
    max_num_feats: maximum number of features per concept, forwarded to CreateDictionary.
    names_list: names of the concepts to encode, forwarded as is to CreateDictionary
                (the readers iterate over it, so a list must be supplied).

    Steps: init() (defined outside this section), CreateDictionary, FeatureVectors,
    SaveConcepts and SaveDefinitions, the last three applied to the global `Dict_defs`.
    """
    init()
    # Single-argument print call: same output whether print is a statement or a function
    print("Begining to encode dataset...")
    # Read dataset and create definition dictionary
    CreateDictionary( mode, max_num_feats, names_list )
    # Feature vectors
    FeatureVectors(Dict_defs)
    # Save concepts into memory (ID vectors)
    SaveConcepts(Dict_defs)
    # Associate definitions to concepts into memory (SP vectors)
    SaveDefinitions(Dict_defs)
    print("End of encoding")

## Reading similarity from distance matrix (McRae)

In [58]:
def McRae_simi (pair_concepts):
    """Given a pair of concepts (in a list) it returns their similarity from the 'cos_matrix_brm_IFR.xlsx' file.

    The row is selected through the 'CONCEPT' column with `pair_concepts[0]` and
    the column is `pair_concepts[1]`. The sheets '1st_200', '2nd_200' and
    'last_141' are tried in that order; the first one holding the value wins.

    Raises KeyError (unknown column) or IndexError (unknown row) when the value
    is not found in the last sheet either. Any other error (for instance a
    missing workbook) is propagated immediately instead of being silently
    retried on the next sheet.
    """
    workbook = pathh + 'cos_matrix_brm_IFR.xlsx'
    sheets = ('1st_200', '2nd_200', 'last_141')
    for position, sheet in enumerate(sheets):
        df = pd.read_excel(workbook, sheet)
        try:
            return list(df.loc[df['CONCEPT'] == pair_concepts[0]][pair_concepts[1]])[0]
        except (KeyError, IndexError):
            # Not in this sheet: move on, unless there is nothing left to try
            if position == len(sheets) - 1:
                raise

## Semantic similarity using NLTK library functions
### Auxiliary functions

In [59]:
# Information-content data loaded from 'ic-brown.dat' through NLTK's wordnet_ic reader.
# NOTE(review): presumably meant to be passed as the `corpus` argument of
# apply_sim_metric / similarity_fun -- confirm against the callers.
brown_ic = wordnet_ic.ic('ic-brown.dat')

def get_concepts_list ():
    """Return the names of the concepts as a list of strings.

    The names are read from the 'Concept' column of 'CONCS_Synset_brm.xlsx'
    (under the global `pathh`), in file order.
    """
    df = pd.read_excel(pathh + 'CONCS_Synset_brm.xlsx')
    # A real list (not a lazy iterator) so callers can iterate it more than once
    return [str(name) for name in df['Concept']]
    
def get_synset (concept):
    """Given a concept name (string) it returns its synset (string).

    The synset is the 'Synset' cell of the first row of 'CONCS_Synset_brm.xlsx'
    (under the global `pathh`) whose 'Concept' cell equals `concept`. An unknown
    concept raises IndexError. The workbook is read on every call.
    """
    table = pd.read_excel(pathh + 'CONCS_Synset_brm.xlsx')
    matches = table.loc[table['Concept'] == concept]
    synsets = list(matches['Synset'])
    return str(synsets[0])

def apply_sim_metric ( similarity_metric, num, in_concept, corpus = None):
    """Return the `num` concepts with the highest similarity to `in_concept`.

    similarity_metric: callable invoked as similarity_metric(in_concept, synset)
                       or, when `corpus` is truthy, similarity_metric(in_concept, synset, corpus).
    num: number of entries to return.
    in_concept: passed unchanged as the first argument of the metric
                (presumably a WordNet synset -- confirm against callers).
    corpus: optional third argument for the metric.

    Every name in the global `Concepts` is converted to a synset with
    wn.synset(get_synset(name)) and scored. The result is a list of
    [name, score rounded to 3 decimals], sorted by score in descending order.
    """
    extra_args = (corpus,) if corpus else ()
    scored = []
    for name in Concepts:
        candidate = wn.synset(get_synset(name))
        score = similarity_metric(in_concept, candidate, *extra_args)
        scored.append([name, round(score, 3)])
    scored.sort(key=lambda entry: entry[1], reverse=True)
    return scored[:num]

def similarity_fun ( similarity_metric, pair, corpus = None):
    """Return the similarity between the two concepts of `pair`, rounded to 3 decimals.

    Both concept names (pair[0] and pair[1]) are converted to synsets with
    wn.synset(get_synset(name)). The metric is then called with the two synsets
    and, when `corpus` is truthy, with `corpus` as third argument.
    """
    synsets = (wn.synset(get_synset(pair[0])), wn.synset(get_synset(pair[1])))
    arguments = synsets + ((corpus,) if corpus else ())
    return round(similarity_metric(*arguments), 3)