This notebook defines the helper functions used by the *ConceptNet_VSA* notebook.

## Libraries

In [1]:
import requests
import csv
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats.stats import spearmanr
import pickle
import time

# Vital function for pairwise distances...
from scipy.spatial.distance import pdist

### Reading SimLex-999

In [1]:
def Read_SimLex():
    """Load the noun pairs of the SimLex spreadsheet at '../Data/SimLex.xlsx'.

    Only rows whose 'POS' column equals 'N' are kept.

    Returns
    -------
    (Pairs, Concepts)
        Pairs    -- list of [word1, word2, score] entries, converted to
                    (str, str, float), taken from the 'word1', 'word2' and
                    'SimLex999' columns.
        Concepts -- list of the distinct words (as str) appearing in the
                    'word1' / 'word2' columns of the kept rows.

    The number of concepts is printed as a side effect.
    """
    simlex = pd.read_excel('../Data/SimLex.xlsx')

    # Keep noun rows only
    nouns = simlex[simlex['POS'] == 'N']

    # One [word1, word2, score] entry per row, with explicit type conversion
    Pairs = []
    for first, second, score in nouns[['word1','word2','SimLex999']].values.tolist():
        Pairs.append([str(first), str(second), float(score)])

    # Distinct words over both word columns
    distinct_words = pd.unique(nouns[['word1','word2']].values.ravel('K'))
    Concepts = [str(word) for word in distinct_words]

    print("There are", len(Concepts), "concepts")
    return Pairs,Concepts

In [3]:
def find_concept_in_pairs (C, Pair_list, thr, G):
    """Collect the concepts connected to C through pairs scoring above thr.

    Two concepts are linked when some entry [word1, word2, score] of Pair_list
    contains both of them and score > thr. Starting from C, links are followed
    transitively; concepts already listed in G count as visited and are not
    expanded again.

    Parameters
    ----------
    C         : concept to start from.
    Pair_list : iterable of [word1, word2, score] entries.
    thr       : a pair links its two words only when its score is strictly
                greater than this value.
    G         : list of concepts already collected (normally [C]). It is not
                modified.

    Returns
    -------
    list
        The elements of G plus every concept reached from C, without
        duplicates and in no particular order.
    """
    # Build the adjacency map once instead of re-scanning Pair_list per concept.
    neighbours = {}
    for p in Pair_list:
        if p[2] > thr:
            neighbours.setdefault(p[0], []).append(p[1])
            neighbours.setdefault(p[1], []).append(p[0])

    # Iterative traversal: an explicit stack avoids Python's recursion limit on
    # long chains, and the set keeps membership tests cheap.
    reached = set(G)
    pending = [C]
    while pending:
        current = pending.pop()
        for other in neighbours.get(current, ()):
            if other not in reached:
                reached.add(other)
                pending.append(other)

    return list(reached)

### Creating partitions

In [4]:
thr_s = 0     # Similarity threshold: only pairs scoring above it link two concepts

def CreatingPartitions ():
    """Split the concepts of the global Pairs list into partitions.

    For the first word of every pair, find_concept_in_pairs is asked (with
    threshold thr_s) for the concepts connected to it; each distinct result is
    kept once, as a sorted list.

    Returns
    -------
    list of lists
        The partitions holding more than one concept, largest first; partitions
        of equal size are in ascending list order. The number of partitions is
        printed as a side effect.
    """
    found = []
    for pair in Pairs:
        seed = pair[0]
        members = sorted(find_concept_in_pairs(seed, Pairs, thr_s, [seed]))
        if members in found:
            continue
        found.append(members)

    # Largest partitions first, ties broken by the (sorted) member lists
    found.sort(key = lambda grp: (-len(grp), grp))

    # Partitions with a single concept are discarded
    Groups = [grp for grp in found if len(grp) > 1]

    print("There are",len(Groups),"partitions")
    return Groups

In [5]:
def Pairs_per_Partition(Save = None):
    """List, for every partition in the global Groups, the pairs it contains.

    A pair from the global Pairs list belongs to a partition when both of its
    words are members of that partition and its score is above thr_s.

    Parameters
    ----------
    Save : when truthy, the members and the pairs of partition i are written
           with np.save under '../Data/ClusteringData/' (pairs are saved in
           the order they were found, before sorting).

    Returns
    -------
    list of lists
        One list of pairs per partition, each sorted by increasing score.
    """
    GPairs = []
    for idx, members in enumerate(Groups):
        # Check every unordered couple of members (j < k) against all pairs
        linked = [p
                  for j in range(len(members))
                  for k in range(j + 1, len(members))
                  for p in Pairs
                  if members[j] in p and members[k] in p and p[2] > thr_s]

        if Save:
            np.save('../Data/ClusteringData/01 Elementos/Elementos_Grupo' + str(idx), members)
            np.save('../Data/ClusteringData/02 Parejas/Parejas_Grupo' + str(idx), linked)

        # Pairs of this partition, from lowest to highest similarity
        GPairs.append(sorted(linked, key = lambda x: x[2]))
    return GPairs

### Reading semantic features file

In [6]:
def formatt2 (L):
    """Convert one raw row (sequence of strings) into typed fields.

    Fields 0-2 are kept unchanged, field 3 has every blank replaced by a comma
    and is then evaluated as a Python expression, and fields 4 and 5 are
    converted to float. Any field after the sixth is dropped.
    """
    head = L[:3]
    # NOTE(review): eval() runs whatever text this column holds. Only use it on
    # trusted files; consider ast.literal_eval if the column is always a literal.
    evaluated = eval(L[3].replace(' ',','))
    numbers = [float(L[4]), float(L[5])]
    return head + [evaluated] + numbers


def Reading_Feature_Matrix():
    """Read '../Data/ConceptNet_Semantic_Features.csv' and group its rows by concept.

    The first row of the file is skipped as a header; every other row is
    converted with formatt2. Consecutive rows sharing the same first field
    (the concept) form one group, and each group is sorted by decreasing
    value of field 4.

    Returns
    -------
    list of lists
        One list of converted rows per concept, in file order. For
        compatibility with the previous fixed-size buffer, the result is
        padded with empty lists up to 751 entries when the file holds fewer
        concepts; larger files are no longer rejected with an IndexError.
    """
    # Length of the fixed buffer used by the earlier implementation; kept as a
    # minimum so code indexing up to this size keeps working.
    min_groups = 751

    # newline='' is the csv-module recommended way to open files for csv.reader
    with open('../Data/ConceptNet_Semantic_Features.csv', newline='') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        next(csv_reader, None)   # skip the header row (if any)
        Features_matrix = [formatt2(row) for row in csv_reader]

    # Start a new group every time the concept (field 0) changes
    Feat_matrix = []
    current_concept = None
    for feat in Features_matrix:
        if not Feat_matrix or feat[0] != current_concept:
            Feat_matrix.append([])
            current_concept = feat[0]
        Feat_matrix[-1].append(feat)

    # Pad to the legacy length (no-op when there are already enough groups)
    Feat_matrix.extend([] for _ in range(min_groups - len(Feat_matrix)))

    # Sorting each group based on weight (x[4]), highest first
    for con in Feat_matrix:
        con.sort(key = lambda x: x[4], reverse = True)

    return Feat_matrix

## Functions for clustering

### FA0) Function to extract relevant relations after clustering

In [7]:
def extrac_relation_name (string):
    """Extract the relation name from a file name.

    Given a name such as 'Group_0_Relation_AtLocation.npy' it returns the text
    between the last '_' and the last '.' that follows it ('AtLocation').
    When the name has no '_' the whole name is used, and when there is no '.'
    after the last '_' nothing is stripped, so 'IsA.npy' and
    'Group_0_Relation_IsA' both give 'IsA'.
    """
    # Text after the last underscore (or the whole string if there is none)
    tail = string.rsplit('_', 1)[-1]
    # Drop the extension, if present
    dot = tail.rfind('.')
    if dot < 0:
        return tail
    return tail[:dot]
# extrac_relation_name('Group_0_Relation_IsA.npy')  -> 'IsA'

### FA1) Locating feature in a cluster

In [8]:
def feat_to_cluster (L_clus, feat_val):
    """Return the position in L_clus of the first cluster containing feat_val.

    The position is the 0-based index into L_clus. When no cluster contains
    the value an empty list is returned.
    """
    positions = (pos for pos, members in enumerate(L_clus) if feat_val in members)
    return next(positions, [])

### FA2) Contamination Function

In [9]:
def contamina_vec(array, dif_bits):
    """Return a copy of a binary array at Hamming distance dif_bits from it.

    Exactly dif_bits distinct positions, drawn uniformly from the whole array
    with numpy's global random state, are flipped (0 becomes 1, anything else
    becomes 0). The input array is left untouched.

    Parameters
    ----------
    array    : binary sequence providing len(), copy() and item assignment.
    dif_bits : number of positions to flip, between 0 and len(array).

    Raises
    ------
    ValueError
        If dif_bits is negative or larger than len(array).
    """
    n_bits = len(array)
    if dif_bits < 0 or dif_bits > n_bits:
        raise ValueError("dif_bits must be between 0 and len(array)")

    new_array = array.copy()
    if dif_bits == 0:
        return new_array

    # Sample without replacement over range(n_bits): every position (including
    # the last one) is eligible and no position is picked twice, so the
    # distance is exactly dif_bits.
    flip_positions = np.random.choice(n_bits, size=dif_bits, replace=False)
    for i in flip_positions:
        new_array[i] = 1 if new_array[i] == 0 else 0
    return new_array

### FA3) Function to change definitions in the global dictionary

In [10]:
def change_dict_defs (list_concepts, rela, features_list, i_grupo):
    """Tag selected feature values inside the global definitions dictionary.

    For every concept in list_concepts, each entry of Dict_defs[concept] whose
    feature name (entry[0]) equals rela and whose feature value (entry[1]) is
    in features_list has its value rewritten in place as

        <value>_<first 5 chars of the name><last 2 chars of the name>_g<i_grupo>

    For example, with rela 'IsA' and i_grupo 0, the definition of 'bed'
        IsA * furniture + RelatedTo * furniture + ...
    becomes
        IsA * furniture_IsAsA_g0 + RelatedTo * furniture + ...

    Nothing is returned; Dict_defs is modified.
    """
    # The definitions dictionary is a module-level global.
    global Dict_defs
    for concept in list_concepts:
        for entry in Dict_defs[concept]:
            # Only entries of the requested relation...
            if entry[0] != rela:
                continue
            # ...whose value is one of the selected features
            if entry[1] not in features_list:
                continue
            # Identifying code built from the relation name and the group index
            relation_code = entry[0][:5] + entry[0][-2:]
            entry[1] = entry[1] + '_' + relation_code + "_g" + str(i_grupo)

### FA4) Get Pairs

In [11]:
def get_Pairs(concept_name):
    """Return the pairs in which concept_name takes part.

    Reads the module-level Groups and GPairs: for every partition of Groups
    containing concept_name, the entries of the matching GPairs list whose
    first or second element equals concept_name are collected, in order.
    """
    concep_pairs = []
    for i_g, members in enumerate(Groups):
        if concept_name not in members:
            continue
        concep_pairs.extend(
            pair_g for pair_g in GPairs[i_g]
            if pair_g[0] == concept_name or pair_g[1] == concept_name
        )
    return concep_pairs

#get_Pairs('alcohol')

### FA5) get_feat_values

In [12]:
def get_feat_values(Definition_list, feat_namm):
    """Return the feature values of a definition that belong to one feature name.

    Definition_list is a definition list such as the ones stored in Dict_defs:
    a sequence of entries whose element 0 is the feature name and element 1 the
    feature value. The values of the entries named feat_namm are returned in
    their original order.
    """
    values = []
    for entry in Definition_list:
        if entry[0] == feat_namm:
            values.append(entry[1])
    return values

#get_feat_values(Dict_defs['cocktail'], 'RelatedTo')

### FA6) Delete_relation_code

In [13]:
def delete_relation_code (string):
    """Strip the trailing relation code from a feature value.

    A coded value has the form '<value>_<relation code>_g<number>', e.g.
    'human_IsAsA_g0' -> 'human' and
    'learn_church_positions_UsedFor_g0' -> 'learn_church_positions'.
    Strings that do not end in such a code are returned unchanged: no '_' at
    all, a last segment that is not 'g' followed by a digit, or a 'g<digit>'
    segment with no relation segment before it.
    """
    nums = '0123456789'

    # Split off the last '_'-separated segment; it must look like g + number
    head, sep, suffix = string.rpartition('_')
    if not sep:
        return string   # no '_' at all
    if len(suffix) < 2 or suffix[0] != 'g' or suffix[1] not in nums:
        return string   # has '_' but the last segment is not g + number

    # The segment before it is the relation code; what precedes is the value
    value, sep, _relation = head.rpartition('_')
    if not sep:
        return string   # 'x_g0': there is no relation segment to remove
    return value

#print delete_relation_code('human_IsAsA_g0')
#print delete_relation_code('bar')

### FA7) Find intersection

In [14]:
def intersection(lst1, lst2):
    """Return the items of lst1 that also occur in lst2.

    Order and repetitions of lst1 are preserved. Intended to be folded
    (reduce) over several sub-lists to find the features present in all of them.
    """
    return list(filter(lambda value: value in lst2, lst1))