In [1]:
import pandas as pd
import pprint as pp
import networkx as nx
from operator import itemgetter, attrgetter

In [2]:
data_dir = './data/'

In [165]:
# Data
is_medium = True
get_empty = True
file_pre = 'medium' if is_medium else 'large'
empty_str = '_60percent_of_empty_profile'

graph = nx.read_gexf(data_dir + file_pre + 'Linkedin.gexf')
location = pd.read_pickle(data_dir + file_pre + 'Location.pickle')
employer = pd.read_pickle(data_dir + file_pre + 'Employer.pickle')
college = pd.read_pickle(data_dir + file_pre + 'College.pickle')

location_empty = pd.read_pickle(data_dir + file_pre + 'Location' + (empty_str if get_empty else '') + '.pickle')
employer_empty = pd.read_pickle(data_dir + file_pre + 'Employer' + (empty_str if get_empty else '') + '.pickle')
college_empty = pd.read_pickle(data_dir + file_pre + 'College' + (empty_str if get_empty else '') + '.pickle')
nodes_taken_out = pd.read_pickle(data_dir + file_pre + 'RemovedNodes_60percent_of_empty_profile.pickle')

In [4]:
# Add attributes to nodes in graph
possible_attributes = ['location', 'college', 'employer']

def addAttributes(G, attributes, attributes_name):
    nx.set_node_attributes(G, attributes, attributes_name)
        
def addAllAttributes(G):
    addAttributes(G, location_empty, 'location')
    addAttributes(G, employer_empty, 'employer')
    addAttributes(G, college_empty, 'college')

In [5]:
# Triangles 
# Check if new pair already considered in list, if not, it add it
def checkIfInPairsNewPair(triangles_pairs, node_1, node_2):
    if node_1 == node_2: return True
    for triangles_pair in triangles_pairs:
        if node_1 in triangles_pair and node_2 in triangles_pair: return True
    return False

# For one node, count number of trianlges with neighbors
def countTrianglesFromRelatedNodes(node, related_nodes, triangles_pairs):
    triangles_count = 0
    for related_node in related_nodes:  
        #print(f'checking pairs ({node}, {related_node})')
        if not checkIfInPairsNewPair(triangles_pairs, node, related_node): 
            #print(f'found coincidence')
            triangles_pairs.append((node, related_node))
            triangles_count += 0 if len(related_nodes) == 0 else 1
    return triangles_count

# Assumes, main node is already identified
# Delivers the quantity of triangles found and the array containing the pairs 
#   being the third one, the studied node
def locateTrianglesInNode(G, neighbors, node):
    triangles_count = 0
    triangles_pairs = []
    for n in neighbors:
        # convert to list in order to be able to use its properties
        neighbors_of_n = [n for n in G.neighbors(n)] 
        neighbors_of_n.remove(node) # ignore main node (node of study)
        related = [f_n for f_n in neighbors_of_n if f_n in neighbors]
        #print(f'neighbor: {n}, related {related}')
        triangles_count += countTrianglesFromRelatedNodes(n, related, triangles_pairs)
                
    return { 'triangles_count': triangles_count, 'triangles_pairs': triangles_pairs }

In [6]:
def searchFromAttributes(G, node_1, node_2, attributes):
    common_att = {}
    node_1_dict = G.nodes[node_1]
    node_2_dict = G.nodes[node_2]
    not_found_att = []
    for att in attributes:
        if att in node_1_dict and att in node_2_dict:
            node_1_att_values = node_1_dict[att]
            node_2_att_values = node_2_dict[att]
            #print(f'found att: {att}: {node_1_att_values} -- {node_2_att_values}')
            # find matches for att between both nodes
            for val in node_1_att_values:
                for val2 in node_2_att_values:
                    prepare_val = ''.join(val.lower().split(' ')) # prevent "errors of hand"
                    prepare_val2 = ''.join(val2.lower().split(' ')) #
                    if (prepare_val == prepare_val2): 
                        if not att in common_att: common_att[att] = []
                        common_att[att].append(val)
        else:
            not_found_att.append(att)
    #if (len(not_found_att) > 0): print('not found att: ', not_found_att)
    return common_att

def getPairRelationship(node_1, node_2, G, attributes):
    connections = searchFromAttributes(G, node_1, node_2, attributes)
    # --- do something else? ---
    return connections

def addValueToNode(G, node, value, attribute):
    node_attributes = G.nodes[node]
    #print(f'node att: {node_attributes}')
    new_values = []
    existant = False
    if attribute in node_attributes:
        new_values = node_attributes[attribute][:]
        prepare_val = ''.join(value.lower().split(' '))
        for val in new_values:
            joined_val = ''.join(val.lower().split(' '))
            if joined_val == prepare_val: 
                existant = True
                break
                
    new_values.append(value)
    #print(f'at node {node} new values: {new_values}')
    if not existant: addAttributes(G, { node: new_values }, attribute)

def verifyValueInAttribute(attribute, node_attributes, value):
    if attribute in node_attributes:
        new_values = node_attributes[attribute][:]
        prepare_val = ''.join(value.lower().split(' '))
        for val in new_values:
            joined_val = ''.join(val.lower().split(' '))
            if joined_val == prepare_val: 
                return True
    return False

def verifyAttributesInNode(attributes, node_attributes):
    total = len(attributes); this_total = 0
    for att in attributes: 
        if att in node_attributes: this_total += 1
    #if (this_total != total) and this_total > 0 : print(f'error? some att already: {this_total}')
    return (this_total == total)
        
def addValuesToNode(G, node, attribute, candidate_values):
    candidates = {}
    if verifyAttributesInNode(possible_attributes, G.nodes[node]): return # no need to analyze
    for index, candidate in enumerate(candidate_values):
        if index in candidates:  candidates[index] += 1
        else: candidates[index] = 1       
        
    # Decision Rules
    values_to_add = []
    if len(candidates) == 0: 
        #print('zero candidates'); 
        return
    elif len(candidates) == 1: 
        #print('one candidate');
        values_to_add = [candidate_values[0]]
    else:
        #print('several candidates');
        weights = [v for i, v in candidates.items()]
        max_weight = max(weights)
        indexes = [ i for i, v in candidates.items() if max_weight == v ]
        values_to_add = [ candidate_values[i] for i in indexes ]
        
    addAttributes(G, { node: values_to_add }, attribute)

In [7]:
# Obtain desired nodes, where to start searching, idealy in the nodes that have all the information

def checkIfAttributes(node, attributes):
    for att in attributes:
        if not att in node: return False
    return True

def checkWellLabeledNodes(G):
    well_labeled_nodes = []
    for node in G.nodes:
        if not checkIfAttributes(G.nodes[node], possible_attributes): continue
        neighbors = [n for n in G.neighbors(node)]
        completed = True
        for neighbor in neighbors:
            if not checkIfAttributes(G.nodes[neighbor], possible_attributes): 
                completed = False
                break
        if not completed : continue
        # passed both tests, good places to start:
        well_labeled_nodes.append(node)
    return well_labeled_nodes

In [23]:
# ---- RUN FOR AL DESIRED NODES -----

def firstProposal(G):
    addAllAttributes(G)
    well_labeled_nodes = checkWellLabeledNodes(G)
    pipe = [node for node in well_labeled_nodes if node in nodes_with_triangles]
    pipe = pipe + [node for node in G.nodes if node not in pipe]

    for node in pipe:
        neighbors = [n for n in G.neighbors(node)]
        triangles_in_node = locateTrianglesInNode(G, neighbors, node)

        prepare_new_values = {}
        for pair in triangles_in_node['triangles_pairs']:
            connections = getPairRelationship(pair[0], pair[1], G, possible_attributes)
            for att, matches in connections.items():
                # addValuesToNode(G, node, att, matches)
                for match in matches:
                    addValueToNode(G, node, match, att)

In [9]:
# ---- RUN FOR AL DESIRED NODES SECOND PROPOSAL -----
def secondProposal(G):
    addAllAttributes(G)
    well_labeled_nodes = checkWellLabeledNodes(G)
    pipe = [node for node in well_labeled_nodes if node in nodes_with_triangles]
    pipe = pipe + [node for node in G.nodes if node not in pipe]

    for node in pipe:
        neighbors = [n for n in G.neighbors(node)]
        triangles_in_node = locateTrianglesInNode(G, neighbors, node)

        for pair in triangles_in_node['triangles_pairs']:
            connections = getPairRelationship(pair[0], pair[1], G, possible_attributes)
            #print(connections)
            for att, matches in connections.items():
                #print(f'att: { att}, match: { matches}')
                addValuesToNode(G, node, att, matches)

In [10]:
# Get info as neeeded for evaluation
def extractFromGraph(G, attribute):
    extracted = {}
    for node in G.nodes:
        node_att = G.nodes[node]
        if attribute in node_att: extracted[node] = node_att[attribute]
    return extracted

In [17]:
def evaluation_accuracy(groundtruth, pred):
    """    Compute the accuracy of your model.

     The accuracy is the proportion of true results.

    Parameters
    ----------
    groundtruth :  : dict 
       A dict of attributes, either location, employer or college attributes. 
       key is a node, value is a list of attribute values.
    pred : dict 
       A dict of attributes, either location, employer or college attributes. 
       key is a node, value is a list of attribute values. 

    Returns
    -------
    out : float
       Accuracy.
    """
    true_positive_prediction=0   
    for p_key, p_value in pred.items():
        if p_key in groundtruth:
            # if prediction is no attribute values, e.g. [] and so is the groundtruth
            # May happen
            if not p_value and not groundtruth[p_key]:
                true_positive_prediction+=1
            # counts the number of good prediction for node p_key
            # here len(p_value)=1 but we could have tried to predict more values
            true_positive_prediction += len([c for c in p_value if c in groundtruth[p_key]])          
        # no else, should not happen: train and test datasets are consistent
    return true_positive_prediction*100/sum(len(v) for v in pred.values())


def getLeoInfo(G, att, in_dict_or_arr, groundtruth):
    los_q_quiere_leo = {}
    los_q_quiere_leo = { node: values for node, values in extractFromGraph(G, att).items() if node in in_dict_or_arr }
    print(f'# {len(los_q_quiere_leo)}')
    print(f'for {att}: {evaluation_accuracy(groundtruth,los_q_quiere_leo)}')

In [12]:
#Third Proposal dev

def updateNodeInfoInPipe(G, pipe, node, labeled_nodes, first_time = False):
    some_change = False
    neighbors = [n for n in G.neighbors(node)]
    well_labeled_neighbors = [n for n in neighbors if n in labeled_nodes]
    triangles_in_node = locateTrianglesInNode(G, neighbors, node)
    info_availability_percentage = len(well_labeled_neighbors) / len(neighbors) if triangles_in_node['triangles_count'] > 0 else 0
    node_info = {
        'node': node,
        'total_neigh': len(neighbors),
        'total_labeled_neigh': len(well_labeled_neighbors),
        'info_availability_percentage': info_availability_percentage,
        'trianlges': triangles_in_node['triangles_count'],
        'labels': len(G.nodes[node]) - 1,
    }
    if first_time: pipe.append(node_info); return True
    for node_dict in pipe:
        if node_dict['node'] == node:
            if not node_dict['info_availability_percentage'] == node_info['info_availability_percentage']: some_change = True;
            if node_dict['labels'] != len(G.nodes[node]) - 1: some_change = True
            node_dict['total_labeled_neigh'] = node_info['total_labeled_neigh']
            node_dict['info_availability_percentage'] = node_info['info_availability_percentage']
            node_dict['labels'] = len(G.nodes[node]) - 1
            break
    return some_change

def thirdProposal(G, nodes_taken_out):
    addAllAttributes(G)
    labeled_nodes = [n for n in G.nodes if n not in nodes_taken_out]

    pipe = []
    for node in nodes_taken_out:
        updateNodeInfoInPipe(G, pipe, node, labeled_nodes, True)
    
    #order the pipe
    pipe = sorted(pipe, key=itemgetter('info_availability_percentage'), reverse=True)

    some_change = True
    count = 0
    while some_change and count < 100:
        some_change = False
        count += 1
        for i, node_info in enumerate(pipe):
            node = node_info['node']
            neighbors = [n for n in G.neighbors(node)]
            triangles_in_node = locateTrianglesInNode(G, neighbors, node)
            for pair in triangles_in_node['triangles_pairs']:
                connections = getPairRelationship(pair[0], pair[1], G, possible_attributes)
                for att, matches in connections.items():
                    addValuesToNode(G, node, att, matches)
            some_change = updateNodeInfoInPipe(G, pipe, node, labeled_nodes) or some_change
            if pipe[i]['total_neigh'] == pipe[i]['total_labeled_neigh']: 
                labeled_nodes.append(pipe[i])
                del pipe[i]

In [112]:
# Studying nodes, discovered 3 kinds of them: nodes with triangles, one link nodes and several link nodes
# Obtain the nodes with trianlges, and the other 2 cases

def getDifferentQuantitiesForAttackingGraph(G):
    count_triangles = 0
    count_one_neighbor = 0
    count_plus_neighbors = 0
    nodes_with_triangles = []
    nodes_one_neighbor = []
    nodes_plus_neighbors = []
    for node in G.nodes:
        neighbors = [n for n in G.neighbors(node)]
        triangles_in_node = locateTrianglesInNode(G, neighbors, node)
        if triangles_in_node['triangles_count'] == 0 and len(neighbors) > 1: 
            count_plus_neighbors += 1
            nodes_plus_neighbors.append(node)
            #print('----SUP--- node: ', node , ' count: ', len(neighbors))
        elif triangles_in_node['triangles_count'] == 0 and len(neighbors) == 1:
            count_one_neighbor += 1
            #print('----EQUAL--- node: ', node)
            nodes_one_neighbor.append(node)
        else:
            count_triangles += 1
            nodes_with_triangles.append(node)
        #print('----TRI--- qty: ', triangles_in_node['triangles_count'])
    return { 
        'nodes_with_triangles': nodes_with_triangles,
        'nodes_one_neighbor': nodes_one_neighbor,
        'nodes_plus_neighbors': nodes_plus_neighbors,
        'count_triangles': count_triangles,
        'count_one_neighbor': count_one_neighbor,
        'count_plus_neighbors': count_plus_neighbors
    }

#print('# of nodes w/triangles: ', count_triangles, ' # of same with no: ', count_non_tri_easy, ' others: ', count_non_tri)
#print(count_triangles + count_non_tri_easy + count_non_tri)

In [95]:
# this method assumes that the values to evaluate are inside G
def accuracyEvaluation(G, attributes, groundtruth_dict, evaluate_this_nodes_dict = {}, this_nodes_title = ''):
    results = {}
    total_nodes = len(G)
    for att in attributes:
        correctness = wrongness = missingness = 0
        
        if att in groundtruth_dict:
            groundtruth = groundtruth_dict[att]
            correct = wrong = missing = 0
            for node in G.nodes:
                truth_values = groundtruth[node]
                node_att = G.nodes[node]
                if att not in node_att: missing += len(truth_values); continue;
                calculated = node_att[att]
                for val in truth_values:
                    if val in calculated: correct += 1
                    else: missing += 1
                for val in calculated:
                    if val not in truth_values: wrong += 1
            correctness = 100 * correct / (correct + missing)
            wrongness = 100 * wrong / (correct + missing)
            missingness = 100 * missing / (correct + missing)
            results[att] = { 
                '1_total_correct': correct, 
                '1_total_wrong': wrong, 
                '1_total_missing': missing,
                '3_overall_correctness': correctness, 
                '3_overall_wrongness': wrongness,
                '3_overall_missingness': missingness
            }
        if att in evaluate_this_nodes_dict and att in groundtruth_dict:
            evaluate_dict = evaluate_this_nodes_dict[att]
            groundtruth = groundtruth_dict[att]
            correct = wrong = missing = 0
            for node in evaluate_dict:
                truth_values = groundtruth[node]
                node_att = G.nodes[node]
                if att not in node_att: missing += len(truth_values); continue;
                calculated = node_att[att]
                for val in truth_values:
                    if val in calculated: correct += 1
                    else: missing += 1
                for val in calculated:
                    if val not in truth_values: wrong += 1
            correctness = 100 * correct / (correct + missing)
            wrongness = 100 * wrong / (correct + missing)
            missingness = 100 * missing / (correct + missing)
            located_str = this_nodes_title if this_nodes_title != '' else 'located'
            results[att]['4_' + located_str + '_correct'] = correct
            results[att]['4_' + located_str + '_wrong'] = wrong
            results[att]['4_' + located_str + '_missing'] = missing
            results[att]['5_' + located_str + '_correctness'] = correctness
            results[att]['5_' + located_str + '_wrongness'] = wrongness
            results[att]['5_' + located_str + '_missingness'] = missingness
        
        pp.pprint(results)
    results['graph_len'] = total_nodes
    return results
                

In [166]:
graph_first = nx.read_gexf(data_dir + file_pre + 'Linkedin.gexf')
#thirdProposal(graph_first, nodes_taken_out)
secondProposal(graph_first)

In [171]:
diff_qtt = getDifferentQuantitiesForAttackingGraph(graph_first)
pp.pprint(diff_qtt, depth=1)
nodes_with_triangles = diff_qtt['nodes_with_triangles']

{'count_one_neighbor': 341,
 'count_plus_neighbors': 40,
 'count_triangles': 430,
 'nodes_one_neighbor': [...],
 'nodes_plus_neighbors': [...],
 'nodes_with_triangles': [...]}


In [172]:
attributes = ['location', 'college', 'employer']
groundtruth_dict = { 'location': location, 'employer': employer, 'college': college }
located_to_study = {}
#located_to_study['location'] = extractFromGraph(graph_first, "location")
located_to_study['location'] = { node: 0 for node in nodes_with_triangles } 
located_to_study['employer'] = { node: 0 for node in nodes_with_triangles } 
located_to_study['college'] = { node: 0 for node in nodes_with_triangles } 
title = 'nodes_with_triangles'
results = accuracyEvaluation(graph_first, attributes, groundtruth_dict, located_to_study, title)

{'location': {'1_total_correct': 381,
              '1_total_missing': 430,
              '1_total_wrong': 51,
              '3_overall_correctness': 46.9790382244143,
              '3_overall_missingness': 53.0209617755857,
              '3_overall_wrongness': 6.288532675709002,
              '4_nodes_with_triangles_correct': 226,
              '4_nodes_with_triangles_missing': 204,
              '4_nodes_with_triangles_wrong': 51,
              '5_nodes_with_triangles_correctness': 52.55813953488372,
              '5_nodes_with_triangles_missingness': 47.44186046511628,
              '5_nodes_with_triangles_wrongness': 11.86046511627907}}


KeyError: 'U14078'

In [130]:
nodes_one_neighbor = diff_qtt['nodes_one_neighbor']
nodes_plus_neighbors = diff_qtt['nodes_plus_neighbors']
graph_all = nx.read_gexf(data_dir + file_pre + 'Linkedin.gexf')
addAttributes(graph_all, location, 'location')
addAttributes(graph_all, college, 'college')
addAttributes(graph_all, employer, 'employer')
for node in nodes_one_neighbor:
    neighbors = [n for n in graph_all.neighbors(node)]
    neighbor = neighbors[0]
    print('---- node ----')
    relationship = getPairRelationship(node, neighbor, graph_all, possible_attributes)
    
    if 'location' not in relationship and 'employer' not in relationship and 'college' not in relationship:
        print(f'--> START NOOOOOOOOT node1: {node}, node2: {neighbor}')
        pp.pprint(graph_all.nodes[node])
        pp.pprint(graph_all.nodes[neighbor])
        print('--> END NOOOOOOOOT ')
    else:
        print('--> COIN ')
        print(relationship)

---- node ----
--> COIN 
{'employer': ['university of illinois at urbana-champaign']}
---- node ----
--> COIN 
{'location': ['urbana-champaign illinois area'], 'employer': ['university of illinois at urbana-champaign']}
---- node ----
--> COIN 
{'college': ['university of illinois at urbana-champaign'], 'employer': ['university of illinois at urbana-champaign']}
---- node ----
--> START NOOOOOOOOT node1: U22859, node2: U22747
{}
{'college': ['indian institute of technology madras'],
 'employer': ['amazon',
              'ibm india research lab',
              'coordinated science lab',
              'toyota technical center'],
 'label': 'U22859',
 'location': ['bengaluru area india']}
{'college': ['university of illinois at urbana-champaign',
             'birla institute of technology'],
 'employer': ['university of illinois at urbana-champaign',
              'intel',
              'oracle india pvt ltd',
              'microsoft research india labs',
              'bhabha atomic res

              'act'],
 'label': 'U1045',
 'location': ['greater boston area']}
--> END NOOOOOOOOT 
---- node ----
--> START NOOOOOOOOT node1: U14043, node2: U13995
{}
{'college': ['university of tehran'],
 'employer': ['arcelormittal',
              'asm chicago regional chapter',
              'university of illinois at chicago'],
 'label': 'U14043',
 'location': ['greater chicago area']}
{'label': 'U13995', 'location': ['urbana-champaign illinois area']}
--> END NOOOOOOOOT 
---- node ----
--> START NOOOOOOOOT node1: U7159, node2: U7024
{}
{'label': 'U7159', 'location': ['san francisco bay area']}
{'college': ['university of illinois at urbana-champaign'],
 'employer': ['university of illinois at urbana-champaign'],
 'label': 'U7024',
 'location': ['urbana-champaign illinois area']}
--> END NOOOOOOOOT 
---- node ----
--> COIN 
{'location': ['urbana-champaign illinois area']}
---- node ----
--> START NOOOOOOOOT node1: U22838, node2: U22747
{}
{'college': ['birla institute of technology

In [132]:
node = 'U22859'
second_neighbors = [n for n in graph_all.neighbors('U22747')]
for neigh in second_neighbors:
    relationship = getPairRelationship(node, neigh, graph_all, possible_attributes)
    print(relationship)

{'location': ['bengaluru area india'], 'college': ['indian institute of technology madras'], 'employer': ['amazon', 'ibm india research lab', 'coordinated science lab', 'toyota technical center']}
{}
{}
{'location': ['bengaluru area india']}
{}
{'location': ['bengaluru area india']}
{'location': ['bengaluru area india']}
{}
{}
{'location': ['bengaluru area india']}
{}
{}
{}
{'employer': ['amazon']}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{'college': ['indian institute of technology madras']}
{}
{}
{'location': ['bengaluru area india']}
{}


In [153]:
# Understand percentages of coincidences of each attribute in first and second grade neighbors

# First
total_nodes = len(nodes_one_neighbor)
college_matches = 0
employer_matches = 0
location_matches = 0
location_percentage = 0
college_percentage = 0
employer_percentage = 0
for node in nodes_one_neighbor:
    neighbors = [n for n in graph_all.neighbors(node)]
    neighbor = neighbors[0]
    relationships = getPairRelationship(node, neighbor, graph_all, possible_attributes)
    #print('----node----')
    len_att = len(relationships)
    for att, values in relationships.items():
        if att == 'location': location_matches += 1; location_percentage += 1/len_att; #print(f'matches: {len(values)}')
        elif att == 'college': college_matches += 1; college_percentage += 1/len_att; #print(f'matches: {len(values)}')
        elif att == 'employer': employer_matches += 1; employer_percentage += 1/len_att; #print(f'matches: {len(values)}')
        else: print('error')

print(f'MATCHES -> attribute (matches with neighbor)||(matches according to total of matches of attribute)||(matches/nodes -> accuracy?)')
print(f'MATCHES -> college {college_matches}||{college_percentage}||{100*college_matches/total_nodes}')
print(f'MATCHES -> employer: {employer_matches}||{employer_percentage}||{100*employer_matches/total_nodes}') 
print(f'MATCHES -> location: {location_matches}||{location_percentage}||{100*location_matches/total_nodes}')

# Second
college_matches = 0
employer_matches = 0
location_matches = 0
location_percentage = 0
college_percentage = 0
employer_percentage = 0
total_nodes = 0
for node in nodes_one_neighbor:
    neighbors = [n for n in graph_all.neighbors(node)]
    neighbor = neighbors[0]
    second_neighbors = [n for n in graph_all.neighbors(neighbor)]
    relationships = getPairRelationship(node, neighbor, graph_all, possible_attributes)
    #
    #if len(relationships) > 0: continue
    len_second_neigh = len(second_neighbors)
    total_nodes += len_second_neigh
    for neigh in second_neighbors:
        relationships = getPairRelationship(node, neigh, graph_all, possible_attributes)
        len_att = len(relationships)
        for att, values in relationships.items():
            if att == 'location': location_matches += 1; location_percentage += 1/(len_att*len_second_neigh); #print(f'matches: {len(values)}')
            elif att == 'college': college_matches += 1; college_percentage += 1/(len_att*len_second_neigh); #print(f'matches: {len(values)}')
            elif att == 'employer': employer_matches += 1; employer_percentage += 1/(len_att*len_second_neigh); #print(f'matches: {len(values)}')
            else: print('error')
print('\n')
print(f'MATCHES -> attribute (total matches per attribute)||(percentage of fiability according to matches)||(percentage fiability according total nodes studied)')
print(f'MATCHES -> college {college_matches}||{college_percentage}||{100*college_matches/total_nodes}')
print(f'MATCHES -> employer: {employer_matches}||{employer_percentage}||{100*employer_matches/total_nodes}') 
print(f'MATCHES -> location: {location_matches}||{location_percentage}||{100*location_matches/total_nodes}')


# Second
college_matches = 0
employer_matches = 0
location_matches = 0
location_percentage = 0
college_percentage = 0
employer_percentage = 0
total_nodes = 0
for node in nodes_one_neighbor:
    neighbors = [n for n in graph_all.neighbors(node)]
    neighbor = neighbors[0]
    second_neighbors = [n for n in graph_all.neighbors(neighbor)]
    relationships = getPairRelationship(node, neighbor, graph_all, possible_attributes)
    if len(relationships) > 0: continue
    len_second_neigh = len(second_neighbors)
    total_nodes += len_second_neigh
    for neigh in second_neighbors:
        relationships = getPairRelationship(node, neigh, graph_all, possible_attributes)
        len_att = len(relationships)
        for att, values in relationships.items():
            if att == 'location': location_matches += 1; location_percentage += 1/(len_att*len_second_neigh); #print(f'matches: {len(values)}')
            elif att == 'college': college_matches += 1; college_percentage += 1/(len_att*len_second_neigh); #print(f'matches: {len(values)}')
            elif att == 'employer': employer_matches += 1; employer_percentage += 1/(len_att*len_second_neigh); #print(f'matches: {len(values)}')
            else: print('error')
print('\n')
print(f'MATCHES -> attribute (total matches per attribute)||(percentage of fiability according to matches)||(percentage fiability according total nodes studied)')
print(f'MATCHES -> college {college_matches}||{college_percentage}||{100*college_matches/total_nodes}')
print(f'MATCHES -> employer: {employer_matches}||{employer_percentage}||{100*employer_matches/total_nodes}') 
print(f'MATCHES -> location: {location_matches}||{location_percentage}||{100*location_matches/total_nodes}')

# con esto podemos ver que para cuando se tiene el vecino bien labeled, se puede asisgnar location con 56%
# de éxito y sino acudir a sus vecinos y asignar location con 23% de éxito aprox

MATCHES -> attribute (matches with neighbor)||(matches according to total of matches of attribute)||(matches/nodes -> accuracy?)
MATCHES -> college 58||43.16666666666667||17.008797653958943
MATCHES -> employer: 57||35.66666666666666||16.715542521994134
MATCHES -> location: 74||56.16666666666667||21.700879765395893


MATCHES -> attribute (total matches per attribute)||(percentage of fiability according to matches)||(percentage fiability according total nodes studied)
MATCHES -> college 1956||28.65213432954574||10.960439314132019
MATCHES -> employer: 1434||23.99486847834699||8.0354140983974
MATCHES -> location: 2652||50.33886256979451||14.86047293511151


MATCHES -> attribute (total matches per attribute)||(percentage of fiability according to matches)||(percentage fiability according total nodes studied)
MATCHES -> college 351||6.826363366828028||3.4213861000097476
MATCHES -> employer: 403||7.9485500615450615||3.9282581148260065
MATCHES -> location: 1118||23.4116846232819||10.8977483185