In [201]:
import pandas as pd
import pprint as pp
import networkx as nx
import pickle
from operator import itemgetter, attrgetter

In [2]:
data_dir = './data/'

In [261]:
# Data
is_medium = False
get_empty = True
file_pre = 'medium' if is_medium else 'large'
empty_str = '_60percent_of_empty_profile'

graph = nx.read_gexf(data_dir + file_pre + 'Linkedin.gexf')
location = pd.read_pickle(data_dir + file_pre + 'Location.pickle')
employer = pd.read_pickle(data_dir + file_pre + 'Employer.pickle')
college = pd.read_pickle(data_dir + file_pre + 'College.pickle')

location_empty = pd.read_pickle(data_dir + file_pre + 'Location' + (empty_str if get_empty else '') + '.pickle')
employer_empty = pd.read_pickle(data_dir + file_pre + 'Employer' + (empty_str if get_empty else '') + '.pickle')
college_empty = pd.read_pickle(data_dir + file_pre + 'College' + (empty_str if get_empty else '') + '.pickle')
nodes_taken_out = pd.read_pickle(data_dir + file_pre + 'RemovedNodes_60percent_of_empty_profile.pickle')

In [4]:
# Add attributes to nodes in graph
possible_attributes = ['location', 'college', 'employer']

def addAttributes(G, attributes, attributes_name):
    nx.set_node_attributes(G, attributes, attributes_name)
        
def addAllAttributes(G):
        addAttributes(G, location_empty, 'location')
    addAttributes(G, employer_empty, 'employer')
    addAttributes(G, college_empty, 'college')

In [5]:
# Triangles 
# Check if new pair already considered in list, if not, it add it
def checkIfInPairsNewPair(triangles_pairs, node_1, node_2):
    if node_1 == node_2: return True
    for triangles_pair in triangles_pairs:
        if node_1 in triangles_pair and node_2 in triangles_pair: return True
    return False

# For one node, count number of trianlges with neighbors
def countTrianglesFromRelatedNodes(node, related_nodes, triangles_pairs):
    triangles_count = 0
    for related_node in related_nodes:  
        #print(f'checking pairs ({node}, {related_node})')
        if not checkIfInPairsNewPair(triangles_pairs, node, related_node): 
            #print(f'found coincidence')
            triangles_pairs.append((node, related_node))
            triangles_count += 0 if len(related_nodes) == 0 else 1
    return triangles_count

# Assumes, main node is already identified
# Delivers the quantity of triangles found and the array containing the pairs 
#   being the third one, the studied node
def locateTrianglesInNode(G, neighbors, node):
    triangles_count = 0
    triangles_pairs = []
    for n in neighbors:
        # convert to list in order to be able to use its properties
        neighbors_of_n = [n for n in G.neighbors(n)] 
        neighbors_of_n.remove(node) # ignore main node (node of study)
        related = [f_n for f_n in neighbors_of_n if f_n in neighbors]
        #print(f'neighbor: {n}, related {related}')
        triangles_count += countTrianglesFromRelatedNodes(n, related, triangles_pairs)
                
    return { 'triangles_count': triangles_count, 'triangles_pairs': triangles_pairs }

In [6]:
def searchFromAttributes(G, node_1, node_2, attributes):
    common_att = {}
    node_1_dict = G.nodes[node_1]
    node_2_dict = G.nodes[node_2]
    not_found_att = []
    for att in attributes:
        if att in node_1_dict and att in node_2_dict:
            node_1_att_values = node_1_dict[att]
            node_2_att_values = node_2_dict[att]
            #print(f'found att: {att}: {node_1_att_values} -- {node_2_att_values}')
            # find matches for att between both nodes
            for val in node_1_att_values:
                for val2 in node_2_att_values:
                    prepare_val = ''.join(val.lower().split(' ')) # prevent "errors of hand"
                    prepare_val2 = ''.join(val2.lower().split(' ')) #
                    if (prepare_val == prepare_val2): 
                        if not att in common_att: common_att[att] = []
                        common_att[att].append(val)
        else:
            not_found_att.append(att)
    #if (len(not_found_att) > 0): print('not found att: ', not_found_att)
    return common_att

def getPairRelationship(node_1, node_2, G, attributes):
    connections = searchFromAttributes(G, node_1, node_2, attributes)
    # --- do something else? ---
    return connections

def addValueToNode(G, node, value, attribute):
    node_attributes = G.nodes[node]
    #print(f'node att: {node_attributes}')
    new_values = []
    existant = False
    if attribute in node_attributes:
        new_values = node_attributes[attribute][:]
        prepare_val = ''.join(value.lower().split(' '))
        for val in new_values:
            joined_val = ''.join(val.lower().split(' '))
            if joined_val == prepare_val: 
                existant = True
                break
                
    new_values.append(value)
    #print(f'at node {node} new values: {new_values}')
    if not existant: addAttributes(G, { node: new_values }, attribute)

def verifyValueInAttribute(attribute, node_attributes, value):
    if attribute in node_attributes:
        new_values = node_attributes[attribute][:]
        prepare_val = ''.join(value.lower().split(' '))
        for val in new_values:
            joined_val = ''.join(val.lower().split(' '))
            if joined_val == prepare_val: 
                return True
    return False

def verifyAttributesInNode(attributes, node_attributes):
    total = len(attributes); this_total = 0
    for att in attributes: 
        if att in node_attributes: this_total += 1
    #if (this_total != total) and this_total > 0 : print(f'error? some att already: {this_total}')
    return (this_total == total)
        
def addValuesToNode(G, node, attribute, candidate_values):
    candidates = {}
    if verifyAttributesInNode(possible_attributes, G.nodes[node]): return # no need to analyze
    for index, candidate in enumerate(candidate_values):
        if index in candidates:  candidates[index] += 1
        else: candidates[index] = 1       
        
    # Decision Rules
    values_to_add = []
    if len(candidates) == 0: 
        #print('zero candidates'); 
        return
    elif len(candidates) == 1: 
        #print('one candidate');
        values_to_add = [candidate_values[0]]
    else:
        #print('several candidates');
        weights = [v for i, v in candidates.items()]
        max_weight = max(weights)
        indexes = [ i for i, v in candidates.items() if max_weight == v ]
        values_to_add = [ candidate_values[i] for i in indexes ]
        
    addAttributes(G, { node: values_to_add }, attribute)

In [7]:
# Obtain desired nodes, where to start searching, idealy in the nodes that have all the information

def checkIfAttributes(node, attributes):
    for att in attributes:
        if not att in node: return False
    return True

def checkWellLabeledNodes(G):
    well_labeled_nodes = []
    for node in G.nodes:
        if not checkIfAttributes(G.nodes[node], possible_attributes): continue
        neighbors = [n for n in G.neighbors(node)]
        completed = True
        for neighbor in neighbors:
            if not checkIfAttributes(G.nodes[neighbor], possible_attributes): 
                completed = False
                break
        if not completed : continue
        # passed both tests, good places to start:
        well_labeled_nodes.append(node)
    return well_labeled_nodes

In [23]:
# ---- RUN FOR AL DESIRED NODES -----

def firstProposal(G):
    addAllAttributes(G)
    well_labeled_nodes = checkWellLabeledNodes(G)
    pipe = [node for node in well_labeled_nodes if node in nodes_with_triangles]
    pipe = pipe + [node for node in G.nodes if node not in pipe]

    for node in pipe:
        neighbors = [n for n in G.neighbors(node)]
        triangles_in_node = locateTrianglesInNode(G, neighbors, node)

        prepare_new_values = {}
        for pair in triangles_in_node['triangles_pairs']:
            connections = getPairRelationship(pair[0], pair[1], G, possible_attributes)
            for att, matches in connections.items():
                # addValuesToNode(G, node, att, matches)
                for match in matches:
                    addValueToNode(G, node, match, att)

In [9]:
# ---- RUN FOR AL DESIRED NODES SECOND PROPOSAL -----
def secondProposal(G):
    addAllAttributes(G)
    well_labeled_nodes = checkWellLabeledNodes(G)
    pipe = [node for node in well_labeled_nodes if node in nodes_with_triangles]
    pipe = pipe + [node for node in G.nodes if node not in pipe]

    for node in pipe:
        neighbors = [n for n in G.neighbors(node)]
        triangles_in_node = locateTrianglesInNode(G, neighbors, node)

        for pair in triangles_in_node['triangles_pairs']:
            connections = getPairRelationship(pair[0], pair[1], G, possible_attributes)
            #print(connections)
            for att, matches in connections.items():
                #print(f'att: { att}, match: { matches}')
                addValuesToNode(G, node, att, matches)

In [10]:
# Get info as neeeded for evaluation
def extractFromGraph(G, attribute):
    extracted = {}
    for node in G.nodes:
        node_att = G.nodes[node]
        if attribute in node_att: extracted[node] = node_att[attribute]
    return extracted

In [17]:
def evaluation_accuracy(groundtruth, pred):
    """    Compute the accuracy of your model.

     The accuracy is the proportion of true results.

    Parameters
    ----------
    groundtruth :  : dict 
       A dict of attributes, either location, employer or college attributes. 
       key is a node, value is a list of attribute values.
    pred : dict 
       A dict of attributes, either location, employer or college attributes. 
       key is a node, value is a list of attribute values. 

    Returns
    -------
    out : float
       Accuracy.
    """
    true_positive_prediction=0   
    for p_key, p_value in pred.items():
        if p_key in groundtruth:
            # if prediction is no attribute values, e.g. [] and so is the groundtruth
            # May happen
            if not p_value and not groundtruth[p_key]:
                true_positive_prediction+=1
            # counts the number of good prediction for node p_key
            # here len(p_value)=1 but we could have tried to predict more values
            true_positive_prediction += len([c for c in p_value if c in groundtruth[p_key]])          
        # no else, should not happen: train and test datasets are consistent
    return true_positive_prediction*100/sum(len(v) for v in pred.values())


def getLeoInfo(G, att, in_dict_or_arr, groundtruth):
    los_q_quiere_leo = {}
    los_q_quiere_leo = { node: values for node, values in extractFromGraph(G, att).items() if node in in_dict_or_arr }
    print(f'# {len(los_q_quiere_leo)}')
    print(f'for {att}: {evaluation_accuracy(groundtruth,los_q_quiere_leo)}')

In [12]:
#Third Proposal dev

def updateNodeInfoInPipe(G, pipe, node, labeled_nodes, first_time = False):
    some_change = False
    neighbors = [n for n in G.neighbors(node)]
    well_labeled_neighbors = [n for n in neighbors if n in labeled_nodes]
    triangles_in_node = locateTrianglesInNode(G, neighbors, node)
    info_availability_percentage = len(well_labeled_neighbors) / len(neighbors) if triangles_in_node['triangles_count'] > 0 else 0
    node_info = {
        'node': node,
        'total_neigh': len(neighbors),
        'total_labeled_neigh': len(well_labeled_neighbors),
        'info_availability_percentage': info_availability_percentage,
        'trianlges': triangles_in_node['triangles_count'],
        'labels': len(G.nodes[node]) - 1,
    }
    if first_time: pipe.append(node_info); return True
    for node_dict in pipe:
        if node_dict['node'] == node:
            if not node_dict['info_availability_percentage'] == node_info['info_availability_percentage']: some_change = True;
            if node_dict['labels'] != len(G.nodes[node]) - 1: some_change = True
            node_dict['total_labeled_neigh'] = node_info['total_labeled_neigh']
            node_dict['info_availability_percentage'] = node_info['info_availability_percentage']
            node_dict['labels'] = len(G.nodes[node]) - 1
            break
    return some_change

def thirdProposal(G, nodes_taken_out):
    addAllAttributes(G)
    labeled_nodes = [n for n in G.nodes if n not in nodes_taken_out]

    pipe = []
    for node in nodes_taken_out:
        updateNodeInfoInPipe(G, pipe, node, labeled_nodes, True)
    
    #order the pipe
    pipe = sorted(pipe, key=itemgetter('info_availability_percentage'), reverse=True)

    some_change = True
    count = 0
    while some_change and count < 100:
        some_change = False
        count += 1
        for i, node_info in enumerate(pipe):
            node = node_info['node']
            neighbors = [n for n in G.neighbors(node)]
            triangles_in_node = locateTrianglesInNode(G, neighbors, node)
            for pair in triangles_in_node['triangles_pairs']:
                connections = getPairRelationship(pair[0], pair[1], G, possible_attributes)
                for att, matches in connections.items():
                    addValuesToNode(G, node, att, matches)
            some_change = updateNodeInfoInPipe(G, pipe, node, labeled_nodes) or some_change
            if pipe[i]['total_neigh'] == pipe[i]['total_labeled_neigh']: 
                labeled_nodes.append(pipe[i])
                del pipe[i]

In [112]:
# Studying nodes, discovered 3 kinds of them: nodes with triangles, one link nodes and several link nodes
# Obtain the nodes with trianlges, and the other 2 cases

def getDifferentQuantitiesForAttackingGraph(G):
    count_triangles = 0
    count_one_neighbor = 0
    count_plus_neighbors = 0
    nodes_with_triangles = []
    nodes_one_neighbor = []
    nodes_plus_neighbors = []
    for node in G.nodes:
        neighbors = [n for n in G.neighbors(node)]
        triangles_in_node = locateTrianglesInNode(G, neighbors, node)
        if triangles_in_node['triangles_count'] == 0 and len(neighbors) > 1: 
            count_plus_neighbors += 1
            nodes_plus_neighbors.append(node)
            #print('----SUP--- node: ', node , ' count: ', len(neighbors))
        elif triangles_in_node['triangles_count'] == 0 and len(neighbors) == 1:
            count_one_neighbor += 1
            #print('----EQUAL--- node: ', node)
            nodes_one_neighbor.append(node)
        else:
            count_triangles += 1
            nodes_with_triangles.append(node)
        #print('----TRI--- qty: ', triangles_in_node['triangles_count'])
    return { 
        'nodes_with_triangles': nodes_with_triangles,
        'nodes_one_neighbor': nodes_one_neighbor,
        'nodes_plus_neighbors': nodes_plus_neighbors,
        'count_triangles': count_triangles,
        'count_one_neighbor': count_one_neighbor,
        'count_plus_neighbors': count_plus_neighbors
    }

#print('# of nodes w/triangles: ', count_triangles, ' # of same with no: ', count_non_tri_easy, ' others: ', count_non_tri)
#print(count_triangles + count_non_tri_easy + count_non_tri)

In [95]:
# this method assumes that the values to evaluate are inside G
def accuracyEvaluation(G, attributes, groundtruth_dict, evaluate_this_nodes_dict = {}, this_nodes_title = ''):
    results = {}
    total_nodes = len(G)
    for att in attributes:
        correctness = wrongness = missingness = 0
        
        if att in groundtruth_dict:
            groundtruth = groundtruth_dict[att]
            correct = wrong = missing = 0
            for node in G.nodes:
                truth_values = groundtruth[node]
                node_att = G.nodes[node]
                if att not in node_att: missing += len(truth_values); continue;
                calculated = node_att[att]
                for val in truth_values:
                    if val in calculated: correct += 1
                    else: missing += 1
                for val in calculated:
                    if val not in truth_values: wrong += 1
            correctness = 100 * correct / (correct + missing)
            wrongness = 100 * wrong / (correct + missing)
            missingness = 100 * missing / (correct + missing)
            results[att] = { 
                '1_total_correct': correct, 
                '1_total_wrong': wrong, 
                '1_total_missing': missing,
                '3_overall_correctness': correctness, 
                '3_overall_wrongness': wrongness,
                '3_overall_missingness': missingness
            }
        if att in evaluate_this_nodes_dict and att in groundtruth_dict:
            evaluate_dict = evaluate_this_nodes_dict[att]
            groundtruth = groundtruth_dict[att]
            correct = wrong = missing = 0
            for node in evaluate_dict:
                truth_values = groundtruth[node]
                node_att = G.nodes[node]
                if att not in node_att: missing += len(truth_values); continue;
                calculated = node_att[att]
                for val in truth_values:
                    if val in calculated: correct += 1
                    else: missing += 1
                for val in calculated:
                    if val not in truth_values: wrong += 1
            correctness = 100 * correct / (correct + missing)
            wrongness = 100 * wrong / (correct + missing)
            missingness = 100 * missing / (correct + missing)
            located_str = this_nodes_title if this_nodes_title != '' else 'located'
            results[att]['4_' + located_str + '_correct'] = correct
            results[att]['4_' + located_str + '_wrong'] = wrong
            results[att]['4_' + located_str + '_missing'] = missing
            results[att]['5_' + located_str + '_correctness'] = correctness
            results[att]['5_' + located_str + '_wrongness'] = wrongness
            results[att]['5_' + located_str + '_missingness'] = missingness
        
        pp.pprint(results)
    results['graph_len'] = total_nodes
    return results
                

In [267]:
graph_first = nx.read_gexf(data_dir + file_pre + 'Linkedin.gexf')

In [268]:
diff_qtt = getDifferentQuantitiesForAttackingGraph(graph_first)
pp.pprint(diff_qtt, depth=1)
nodes_with_triangles = diff_qtt['nodes_with_triangles']

{'count_one_neighbor': 5211,
 'count_plus_neighbors': 22,
 'count_triangles': 8799,
 'nodes_one_neighbor': [...],
 'nodes_plus_neighbors': [...],
 'nodes_with_triangles': [...]}


In [269]:
thirdProposal(graph_first, nodes_taken_out)
#secondProposal(graph_first)
#firstProposal(graph_first)

In [223]:
loc_att = nx.get_node_attributes(graph_first, 'location')
col_att = nx.get_node_attributes(graph_first, 'college')
emp_att = nx.get_node_attributes(graph_first, 'employer')
with open('./output/location' + 'FirstProposal' + file_pre + '.pickle', 'wb') as handle:
    pickle.dump(loc_att, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('./output/college' + 'FirstProposal' + file_pre + '.pickle', 'wb') as handle:
    pickle.dump(col_att, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('./output/employer' + 'FirstProposal' + file_pre + '.pickle', 'wb') as handle:
    pickle.dump(emp_att, handle, protocol=pickle.HIGHEST_PROTOCOL)


In [194]:
attributes = ['location']
groundtruth_dict = { 'location': location, 'employer': employer, 'college': college }
located_to_study = {}
#located_to_study['location'] = extractFromGraph(graph_first, "location")
located_to_study['location'] = { node: 0 for node in nodes_with_triangles } 
located_to_study['employer'] = { node: 0 for node in nodes_with_triangles } 
located_to_study['college'] = { node: 0 for node in nodes_with_triangles } 
title = 'nodes_with_triangles'
results = accuracyEvaluation(graph_first, attributes, groundtruth_dict, located_to_study, title)

{'location': {'1_total_correct': 404,
              '1_total_missing': 407,
              '1_total_wrong': 73,
              '3_overall_correctness': 49.81504315659679,
              '3_overall_missingness': 50.18495684340321,
              '3_overall_wrongness': 9.001233045622689,
              '4_nodes_with_triangles_correct': 249,
              '4_nodes_with_triangles_missing': 181,
              '4_nodes_with_triangles_wrong': 73,
              '5_nodes_with_triangles_correctness': 57.906976744186046,
              '5_nodes_with_triangles_missingness': 42.093023255813954,
              '5_nodes_with_triangles_wrongness': 16.976744186046513}}


KeyError: 'U14078'

In [262]:
#nodes_one_neighbor = diff_qtt['nodes_one_neighbor']
#nodes_plus_neighbors = diff_qtt['nodes_plus_neighbors']
graph_all = nx.read_gexf(data_dir + file_pre + 'Linkedin.gexf')
addAttributes(graph_all, location, 'location')
addAttributes(graph_all, college, 'college')
addAttributes(graph_all, employer, 'employer')
"""
for node in nodes_one_neighbor:
    neighbors = [n for n in graph_all.neighbors(node)]
    neighbor = neighbors[0]
    print('---- node ----')
    relationship = getPairRelationship(node, neighbor, graph_all, possible_attributes)
    
    if 'location' not in relationship and 'employer' not in relationship and 'college' not in relationship:
        print(f'--> START NOOOOOOOOT node1: {node}, node2: {neighbor}')
        pp.pprint(graph_all.nodes[node])
        pp.pprint(graph_all.nodes[neighbor])
        print('--> END NOOOOOOOOT ')
    else:
        print('--> COIN ')
        print(relationship)"""

"\nfor node in nodes_one_neighbor:\n    neighbors = [n for n in graph_all.neighbors(node)]\n    neighbor = neighbors[0]\n    print('---- node ----')\n    relationship = getPairRelationship(node, neighbor, graph_all, possible_attributes)\n    \n    if 'location' not in relationship and 'employer' not in relationship and 'college' not in relationship:\n        print(f'--> START NOOOOOOOOT node1: {node}, node2: {neighbor}')\n        pp.pprint(graph_all.nodes[node])\n        pp.pprint(graph_all.nodes[neighbor])\n        print('--> END NOOOOOOOOT ')\n    else:\n        print('--> COIN ')\n        print(relationship)"

In [132]:
node = 'U22859'
second_neighbors = [n for n in graph_all.neighbors('U22747')]
for neigh in second_neighbors:
    relationship = getPairRelationship(node, neigh, graph_all, possible_attributes)
    print(relationship)

{'location': ['bengaluru area india'], 'college': ['indian institute of technology madras'], 'employer': ['amazon', 'ibm india research lab', 'coordinated science lab', 'toyota technical center']}
{}
{}
{'location': ['bengaluru area india']}
{}
{'location': ['bengaluru area india']}
{'location': ['bengaluru area india']}
{}
{}
{'location': ['bengaluru area india']}
{}
{}
{}
{'employer': ['amazon']}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{'college': ['indian institute of technology madras']}
{}
{}
{'location': ['bengaluru area india']}
{}


In [177]:
# Understand percentages of coincidences of each attribute in first and second grade neighbors

# First
total_nodes = len(nodes_one_neighbor)
college_matches = 0
employer_matches = 0
location_matches = 0
location_percentage = 0
college_percentage = 0
employer_percentage = 0
for node in nodes_one_neighbor:
    neighbors = [n for n in graph_all.neighbors(node)]
    neighbor = neighbors[0]
    relationships = getPairRelationship(node, neighbor, graph_all, possible_attributes)
    #print('----node----')
    len_att = len(relationships)
    for att, values in relationships.items():
        if att == 'location': location_matches += 1; location_percentage += 1/len_att; #print(f'matches: {len(values)}')
        elif att == 'college': college_matches += 1; college_percentage += 1/len_att; #print(f'matches: {len(values)}')
        elif att == 'employer': employer_matches += 1; employer_percentage += 1/len_att; #print(f'matches: {len(values)}')
        else: print('error')

print(f'MATCHES -> attribute (matches with neighbor)||(matches according to total of matches of attribute)||(matches/nodes -> accuracy?)')
print(f'MATCHES -> college {college_matches}||{college_percentage}||{100*college_matches/total_nodes}')
print(f'MATCHES -> employer: {employer_matches}||{employer_percentage}||{100*employer_matches/total_nodes}') 
print(f'MATCHES -> location: {location_matches}||{location_percentage}||{100*location_matches/total_nodes}')

# Second
college_matches = 0
employer_matches = 0
location_matches = 0
location_percentage = 0
college_percentage = 0
employer_percentage = 0
total_nodes = 0
for node in nodes_one_neighbor:
    neighbors = [n for n in graph_all.neighbors(node)]
    neighbor = neighbors[0]
    second_neighbors = [n for n in graph_all.neighbors(neighbor)]
    relationships = getPairRelationship(node, neighbor, graph_all, possible_attributes)
    #
    #if len(relationships) > 0: continue
    len_second_neigh = len(second_neighbors)
    total_nodes += len_second_neigh
    for neigh in second_neighbors:
        relationships = getPairRelationship(node, neigh, graph_all, possible_attributes)
        len_att = len(relationships)
        for att, values in relationships.items():
            if att == 'location': location_matches += 1; location_percentage += 1/(len_att*len_second_neigh); #print(f'matches: {len(values)}')
            elif att == 'college': college_matches += 1; college_percentage += 1/(len_att*len_second_neigh); #print(f'matches: {len(values)}')
            elif att == 'employer': employer_matches += 1; employer_percentage += 1/(len_att*len_second_neigh); #print(f'matches: {len(values)}')
            else: print('error')
print('\n')
print(f'MATCHES -> attribute (total matches per attribute)||(percentage of fiability according to matches)||(percentage fiability according total nodes studied)')
print(f'MATCHES -> college {college_matches}||{college_percentage}||{100*college_matches/total_nodes}')
print(f'MATCHES -> employer: {employer_matches}||{employer_percentage}||{100*employer_matches/total_nodes}') 
print(f'MATCHES -> location: {location_matches}||{location_percentage}||{100*location_matches/total_nodes}')


# Second
college_matches = 0
employer_matches = 0
location_matches = 0
location_percentage = 0
college_percentage = 0
employer_percentage = 0
total_nodes = 0
for node in nodes_one_neighbor:
    neighbors = [n for n in graph_all.neighbors(node)]
    neighbor = neighbors[0]
    second_neighbors = [n for n in graph_all.neighbors(neighbor)]
    relationships = getPairRelationship(node, neighbor, graph_all, possible_attributes)
    if len(relationships) > 0: continue
    len_second_neigh = len(second_neighbors)
    total_nodes += len_second_neigh
    most_repeated_rel = {}
    for neigh in second_neighbors:
        relationships = getPairRelationship(node, neigh, graph_all, possible_attributes)
        len_att = len(relationships)
        for att, values in relationships.items():
            if not att in most_repeated_rel: most_repeated_rel[att] = {}
            if att == 'location': 
                location_matches += 1; 
                location_percentage += 1/(len_att*len_second_neigh); #print(f'matches: {len(values)}')
            elif att == 'college': 
                college_matches += 1; 
                college_percentage += 1/(len_att*len_second_neigh); #print(f'matches: {len(values)}')
            elif att == 'employer': 
                employer_matches += 1; 
                employer_percentage += 1/(len_att*len_second_neigh); #print(f'matches: {len(values)}')
            else: print('error')
            for value in values:
                if not value in most_repeated_rel[att] : most_repeated_rel[att][value] = 0
                most_repeated_rel[att][value] += 1
    print(relationships)
    print(most_repeated_rel)
print('\n')
print(f'MATCHES -> attribute (total matches per attribute)||(percentage of fiability according to matches)||(percentage fiability according total nodes studied)')
print(f'MATCHES -> college {college_matches}||{college_percentage}||{100*college_matches/total_nodes}')
print(f'MATCHES -> employer: {employer_matches}||{employer_percentage}||{100*employer_matches/total_nodes}') 
print(f'MATCHES -> location: {location_matches}||{location_percentage}||{100*location_matches/total_nodes}')

# con esto podemos ver que para cuando se tiene el vecino bien labeled, se puede asisgnar location con 56%
# de éxito y sino acudir a sus vecinos y asignar location con 23% de éxito aprox

MATCHES -> attribute (matches with neighbor)||(matches according to total of matches of attribute)||(matches/nodes -> accuracy?)
MATCHES -> college 58||43.16666666666667||17.008797653958943
MATCHES -> employer: 57||35.66666666666666||16.715542521994134
MATCHES -> location: 74||56.16666666666667||21.700879765395893


MATCHES -> attribute (total matches per attribute)||(percentage of fiability according to matches)||(percentage fiability according total nodes studied)
MATCHES -> college 1956||28.65213432954574||10.960439314132019
MATCHES -> employer: 1434||23.99486847834699||8.0354140983974
MATCHES -> location: 2652||50.33886256979451||14.86047293511151
{}
{'location': {'bengaluru area india': 6}, 'college': {'indian institute of technology madras': 2}, 'employer': {'amazon': 2, 'ibm india research lab': 1, 'coordinated science lab': 1, 'toyota technical center': 1}}
{}
{'location': {'baltimore maryland area': 1}, 'employer': {'designing events': 1}}
{}
{'location': {'davenport iowa area

{'location': {'greater chicago area': 7}}
{}
{'location': {'greater boston area': 1}, 'college': {'beijing university of chemical technology': 1}, 'employer': {'sproxil inc': 1, 'graphics visualization &amp; interaction group of brown university': 1, 'institute of computing technology chinese academy of sciences': 1}}
{}
{'location': {'united states': 1}, 'employer': {'siemens healthcare': 1}}
{}
{'location': {'bangladesh': 11}, 'employer': {'bangladesh navy': 1}}
{'location': ['greater chicago area']}
{'location': {'greater chicago area': 20}, 'college': {'university of illinois at chicago': 2}, 'employer': {'university of illinois at chicago': 3, 'uic-illinois college advising corps': 1, 'university of illinois - illinois college advising corps': 1, 'pattycake daycare': 1, 'walgreens': 1}}
{}
{'location': {'united states': 1}}
{}
{'employer': {'university of new south wales': 2, 'microsoft': 3, 'facebook': 1, 'google': 2, 'telco in a box': 1}, 'location': {'greater seattle area': 1},

In [186]:
for node in graph_all:
    node_att = graph_all.nodes[node]
    if 'location' in node_att:
        value = graph_all.nodes[node]['location'][0]
        nx.set_node_attributes(graph_all, { node: value }, 'location')

In [224]:
nx.attribute_assortativity_coefficient(graph_all, 'location')

0.13056238121454491

In [235]:
print(len(graph_all))
print(len(graph_first))

811
811


In [238]:
centrality_degree = nx.degree_centrality(graph_first)
centrality_degree

{'U27476': 0.009876543209876543,
 'U4665': 0.012345679012345678,
 'U1313': 0.0049382716049382715,
 'U8804': 0.0012345679012345679,
 'U14078': 0.0049382716049382715,
 'U9628': 0.006172839506172839,
 'U9721': 0.0024691358024691358,
 'U2649': 0.0024691358024691358,
 'U27759': 0.0012345679012345679,
 'U7310': 0.0012345679012345679,
 'U22859': 0.0012345679012345679,
 'U1417': 0.0012345679012345679,
 'U25611': 0.003703703703703704,
 'U24095': 0.0049382716049382715,
 'U3895': 0.009876543209876543,
 'U13005': 0.0024691358024691358,
 'U2620': 0.0012345679012345679,
 'U5976': 0.008641975308641974,
 'U4249': 0.019753086419753086,
 'U9140': 0.003703703703703704,
 'U16112': 0.003703703703703704,
 'U27776': 0.0012345679012345679,
 'U27708': 0.0024691358024691358,
 'U15284': 0.006172839506172839,
 'U24549': 0.0012345679012345679,
 'U18514': 0.024691358024691357,
 'U15272': 0.007407407407407408,
 'U3933': 0.003703703703703704,
 'U15946': 0.007407407407407408,
 'U25630': 0.003703703703703704,
 'U25632'

In [241]:
order_arr = [{'node': key, 'centr': val} for key, val in centrality_degree.items()]
order_arr = sorted(order_arr, key=itemgetter('centr'), reverse=True)
order_arr

[{'node': 'U27287', 'centr': 0.15061728395061727},
 {'node': 'U7024', 'centr': 0.09135802469135802},
 {'node': 'U1045', 'centr': 0.07160493827160494},
 {'node': 'U7972', 'centr': 0.056790123456790124},
 {'node': 'U8670', 'centr': 0.056790123456790124},
 {'node': 'U7091', 'centr': 0.05555555555555555},
 {'node': 'U4562', 'centr': 0.04567901234567901},
 {'node': 'U15267', 'centr': 0.03950617283950617},
 {'node': 'U5977', 'centr': 0.03827160493827161},
 {'node': 'U4485', 'centr': 0.03580246913580247},
 {'node': 'U22747', 'centr': 0.0345679012345679},
 {'node': 'U24045', 'centr': 0.02962962962962963},
 {'node': 'U14068', 'centr': 0.028395061728395062},
 {'node': 'U6000', 'centr': 0.027160493827160494},
 {'node': 'U18514', 'centr': 0.024691358024691357},
 {'node': 'U11566', 'centr': 0.024691358024691357},
 {'node': 'U3955', 'centr': 0.023456790123456788},
 {'node': 'U2656', 'centr': 0.022222222222222223},
 {'node': 'U7912', 'centr': 0.020987654320987655},
 {'node': 'U13995', 'centr': 0.0209

In [243]:
cnt = 0
total_len = len(graph_first)
for index, node_element in enumerate(order_arr):
    if node_element['node'] in nodes_with_triangles: 
        print(f'index { index }')
        
# with this, we can see that the most central nodes are already included in the nodes with triangles study

index 0
index 1
index 2
index 3
index 4
index 5
index 6
index 7
index 8
index 9
index 10
index 11
index 12
index 13
index 14
index 15
index 16
index 17
index 18
index 19
index 20
index 21
index 22
index 23
index 24
index 25
index 26
index 27
index 28
index 29
index 30
index 31
index 32
index 33
index 34
index 35
index 36
index 37
index 38
index 39
index 40
index 41
index 42
index 43
index 44
index 45
index 46
index 47
index 48
index 49
index 50
index 51
index 52
index 53
index 54
index 55
index 56
index 57
index 58
index 59
index 60
index 61
index 62
index 63
index 64
index 65
index 66
index 67
index 68
index 69
index 70
index 71
index 72
index 73
index 74
index 75
index 76
index 77
index 78
index 79
index 80
index 81
index 82
index 83
index 84
index 85
index 86
index 87
index 88
index 89
index 90
index 91
index 92
index 93
index 94
index 95
index 96
index 97
index 98
index 99
index 100
index 101
index 102
index 103
index 104
index 105
index 106
index 107
index 108
index 109
index 110


In [248]:
nx.attribute_mixing_dict(graph_all, 'location')

{'greater new york city area': {'san francisco bay area': 7,
  'shanghai city china': 2,
  'greater seattle area': 2,
  'greater san diego area': 2,
  'china': 5,
  'urbana-champaign illinois area': 26,
  'greater boston area': 14,
  'las vegas nevada area': 1,
  'greater new york city area': 4,
  'orange county california area': 1,
  'greater detroit area': 3,
  'lafayette indiana area': 1,
  'greater los angeles area': 1,
  'ottawa canada area': 1,
  'greater nashville area': 2,
  'united states': 1,
  'mumbai area india': 2,
  'singapore': 1,
  'washington d.c. metro area': 2,
  'chandigarh area india': 2,
  'london united kingdom': 2,
  'miami fort lauderdale area': 3,
  'gainesville florida area': 1,
  'greater atlanta area': 1,
  'bengaluru area india': 1,
  'ulm area germany': 1,
  'bangladesh': 3,
  'wichita kansas area': 1,
  'toronto canada area': 1,
  'lubbock texas area': 1,
  'british columbia canada': 1,
  'greater chicago area': 3,
  'copenhagen area denmark': 1,
  'bloo

In [249]:
nx.degree_mixing_dict(graph_all)

{1: {28: 12,
  46: 46,
  58: 39,
  2: 11,
  122: 43,
  37: 7,
  8: 9,
  29: 9,
  6: 8,
  3: 6,
  74: 66,
  45: 35,
  20: 9,
  4: 5,
  10: 8,
  23: 5,
  5: 5,
  16: 5,
  18: 2,
  7: 2,
  9: 1,
  31: 2,
  17: 3,
  32: 2,
  11: 1},
 28: {1: 12, 2: 9, 4: 3, 6: 2, 3: 2},
 9: {14: 6,
  7: 13,
  10: 12,
  8: 12,
  32: 2,
  6: 9,
  22: 2,
  31: 3,
  3: 9,
  9: 12,
  13: 4,
  5: 9,
  122: 4,
  17: 4,
  4: 9,
  1: 1,
  2: 6,
  29: 1,
  16: 4,
  11: 1,
  46: 3,
  12: 5,
  23: 1,
  58: 1,
  20: 1,
  37: 1},
 14: {8: 2,
  3: 4,
  2: 2,
  9: 6,
  18: 1,
  19: 1,
  5: 6,
  7: 6,
  4: 4,
  37: 1,
  122: 1,
  6: 17,
  12: 3,
  10: 4,
  14: 6,
  32: 3,
  17: 1,
  16: 1,
  46: 1},
 7: {16: 5,
  5: 19,
  22: 2,
  2: 8,
  6: 22,
  46: 2,
  31: 4,
  9: 13,
  10: 9,
  19: 2,
  8: 5,
  122: 4,
  14: 6,
  4: 16,
  32: 4,
  7: 10,
  3: 12,
  37: 1,
  13: 4,
  1: 2,
  23: 1,
  12: 1,
  17: 3,
  29: 2,
  20: 2,
  45: 1,
  11: 1},
 10: {20: 1,
  10: 16,
  12: 8,
  37: 1,
  4: 8,
  13: 16,
  122: 7,
  24: 6,
  7: 9

In [250]:
nx.degree_pearson_correlation_coefficient(graph_all)

-0.22480574380333795

In [252]:
nx.degree_assortativity_coefficient(graph_all)

-0.22480574380333795

In [258]:
c = list(nx.algorithms.community.k_clique_communities(graph_all, 4))
c

[frozenset({'U15267',
            'U15269',
            'U15272',
            'U15284',
            'U15286',
            'U15287',
            'U15292',
            'U15294',
            'U15296',
            'U15297',
            'U15299',
            'U15307',
            'U15308',
            'U15316',
            'U15317',
            'U15318',
            'U15321',
            'U15333',
            'U15334',
            'U15335',
            'U15336',
            'U15341',
            'U15342',
            'U15344',
            'U15349',
            'U15350',
            'U15359',
            'U8702'}),
 frozenset({'U14123', 'U15168', 'U9630', 'U9710'}),
 frozenset({'U11558',
            'U11564',
            'U3894',
            'U3895',
            'U3939',
            'U3954',
            'U3955',
            'U4005',
            'U4010'}),
 frozenset({'U18514',
            'U18516',
            'U18518',
            'U18524',
            'U18549',
            'U18560',
      

In [265]:
bays = []
nodes_in_bay_all = []
for node in graph_all.nodes:
    node_att = graph_all.nodes[node]
    location = node_att['location'][0]
    if 'bay' in location.lower():
        if location not in bays: bays.append(location)
        nodes_in_bay_all.append(node)
        #print(f'node: {node}, loc: {location}')

print(bays)

['san francisco bay area', 'green bay wisconsin area']


In [270]:
key_nodes_in_bay = []
for i, node_element in enumerate(order_arr):
    if node_element['node'] in nodes_in_bay_all:
        #print(f'index: {i}, node_el {node_element}')
        key_nodes_in_bay.append(node_element['node'])

In [280]:
for i, node_element in enumerate(order_arr):
    node = node_element['node']
    if node in nodes_in_bay_all:
        if 'location' in graph_first.nodes[node]:
            print(f'node: {node}, centrality: {node_element["centr"]}, num_neigh: {len([e for e in graph_first.neighbors(node)])}, location: {graph_first.nodes[node]["location"]}')
        else:
            print('no location')

node: U8670, centrality: 0.056790123456790124, num_neigh: 174, location: ['urbana-champaign illinois area']
node: U15267, centrality: 0.03950617283950617, num_neigh: 107, location: ['san francisco bay area']
node: U24045, centrality: 0.02962962962962963, num_neigh: 98, location: ['urbana-champaign illinois area']
node: U4568, centrality: 0.01728395061728395, num_neigh: 48, location: ['greater new york city area']
node: U27661, centrality: 0.011111111111111112, num_neigh: 22, location: ['san francisco bay area']
node: U4619, centrality: 0.009876543209876543, num_neigh: 23, location: ['san francisco bay area']
node: U16128, centrality: 0.008641975308641974, num_neigh: 27, location: ['china']
node: U14577, centrality: 0.008641975308641974, num_neigh: 16, location: ['san francisco bay area']
node: U16141, centrality: 0.008641975308641974, num_neigh: 29, location: ['china']
node: U15272, centrality: 0.007407407407407408, num_neigh: 20, location: ['san francisco bay area']
node: U27460, cent