In [1]:
import pandas as pd
import pprint as pp
import networkx as nx

In [2]:
data_dir = './data/'

In [3]:
# Data
graph = nx.read_gexf(data_dir + 'mediumLinkedin.gexf')
location = pd.read_pickle(data_dir + 'mediumLocation.pickle')
employer = pd.read_pickle(data_dir + 'mediumEmployer.pickle')
college = pd.read_pickle(data_dir + 'mediumCollege.pickle')
empty_str = '_60percent_of_empty_profile'
get_empty = True
location_empty = pd.read_pickle(data_dir + 'mediumLocation' + (empty_str if get_empty else '') + '.pickle')
employer_empty = pd.read_pickle(data_dir + 'mediumEmployer' + (empty_str if get_empty else '') + '.pickle')
college_empty = pd.read_pickle(data_dir + 'mediumCollege' + (empty_str if get_empty else '') + '.pickle')

In [4]:
# Add attributes to nodes in graph
possible_attributes = ['location', 'college', 'employer']

def addAttributes(G, attributes, attributes_name):
    nx.set_node_attributes(G, attributes, attributes_name)
        
addAttributes(graph, location_empty, 'location')
addAttributes(graph, employer_empty, 'employer')
addAttributes(graph, college_empty, 'college')

In [5]:
# Check if new pair already considered in list, if not, it add it
def checkIfInPairsNewPair(triangles_pairs, node_1, node_2):
    if node_1 == node_2: return True
    for triangles_pair in triangles_pairs:
        if node_1 in triangles_pair and node_2 in triangles_pair: return True
    return False

# For one node, count number of trianlges with neighbors
def countTrianglesFromRelatedNodes(node, related_nodes, triangles_pairs):
    triangles_count = 0
    for related_node in related_nodes:  
        #print(f'checking pairs ({node}, {related_node})')
        if not checkIfInPairsNewPair(triangles_pairs, node, related_node): 
            #print(f'found coincidence')
            triangles_pairs.append((node, related_node))
            triangles_count += 0 if len(related_nodes) == 0 else 1
        else :
            continue
    return triangles_count

# Assumes, main node is already identified
# Delivers the quantity of triangles found and the array containing the pairs 
#   being the third one, the studied node
def locateTrianglesInNode(G, neighbors, node):
    triangles_count = 0
    triangles_pairs = []
    for n in neighbors:
        # convert to list in order to be able to use its properties
        neighbors_of_n = [n for n in G.neighbors(n)] 
        neighbors_of_n.remove(node) # ignore main node (node of study)
        related = [f_n for f_n in neighbors_of_n if f_n in neighbors]
        #print(f'neighbor: {n}, related {related}')
        triangles_count += countTrianglesFromRelatedNodes(n, related, triangles_pairs)
                
    return { 'triangles_count': triangles_count, 'triangles_pairs': triangles_pairs }

In [6]:
# Obtain desired nodes, where to start searching, idealy in the nodes that have all the information

def checkIfAttributes(node, attributes):
    for att in attributes:
        if not att in node: return False
    return True

def checkWellLabeledNodes(G):
    well_labeled_nodes = []
    for node in G.nodes:
        if not checkIfAttributes(graph.nodes[node], possible_attributes): continue
        neighbors = [n for n in graph.neighbors(node)]
        completed = True
        for neighbor in neighbors:
            if not checkIfAttributes(graph.nodes[neighbor], possible_attributes): 
                completed = False
                break
        if not completed : continue
        # passed both tests, good places to start:
        well_labeled_nodes.append(node)
    return well_labeled_nodes

In [7]:
# Studying nodes, discovered 3 kinds of them: nodes with triangles, one link nodes and several link nodes
# Obtain the nodes with trianlges, and the other 2 cases

count_triangles = 0
count_non_tri_easy = 0
count_non_tri = 0
nodes_with_triangles = []
nodes_one_neighbor = []
nodes_plus_neighbors = []
for node in graph.nodes:
    neighbors = [n for n in graph.neighbors(node)]
    triangles_in_node = locateTrianglesInNode(graph, neighbors, node)
    if triangles_in_node['triangles_count'] == 0 and len(neighbors) > 1: 
        count_non_tri += 1
        nodes_plus_neighbors.append(node)
        #print('----SUP--- node: ', node , ' count: ', len(neighbors))
    elif triangles_in_node['triangles_count'] == 0 and len(neighbors) == 1:
        count_non_tri_easy += 1
        #print('----EQUAL--- node: ', node)
        nodes_one_neighbor.append(node)
    else:
        count_triangles += 1
        nodes_with_triangles.append(node)
        #print('----TRI--- qty: ', triangles_in_node['triangles_count'])

#print('# of nodes w/triangles: ', count_triangles, ' # of same with no: ', count_non_tri_easy, ' others: ', count_non_tri)
#print(count_triangles + count_non_tri_easy + count_non_tri)

In [8]:
# ---- RUN FOR AL DESIRED NODES -----

well_labeled_nodes = checkWellLabeledNodes(graph)
pipe = [node for node in well_labeled_nodes if node in nodes_with_triangles]
pipe = pipe + [node for node in graph.nodes if node not in pipe]

for node in pipe:
    neighbors = [n for n in graph.neighbors(node)]
    triangles_in_node = locateTrianglesInNode(graph, neighbors, node)

    prepare_new_values = {}
    for pair in triangles_in_node['triangles_pairs']:
        connections = getPairRelationship(pair[0], pair[1], graph, possible_attributes)
        for att, matches in connections.items():
            for match in matches:
                addValueToNode(graph, node, match, att)

NameError: name 'getPairRelationship' is not defined

In [None]:
# Get info as neeeded for evaluation
def extractFromGraph(G, attribute):
    extracted = {}
    for node in G.nodes:
        node_att = G.nodes[node]
        if attribute in node_att: extracted[node] = node_att[attribute]
    return extracted

In [None]:
def evaluation_accuracy(groundtruth, pred):
    """    Compute the accuracy of your model.

     The accuracy is the proportion of true results.

    Parameters
    ----------
    groundtruth :  : dict 
       A dict of attributes, either location, employer or college attributes. 
       key is a node, value is a list of attribute values.
    pred : dict 
       A dict of attributes, either location, employer or college attributes. 
       key is a node, value is a list of attribute values. 

    Returns
    -------
    out : float
       Accuracy.
    """
    true_positive_prediction=0   
    for p_key, p_value in pred.items():
        if p_key in groundtruth:
            # if prediction is no attribute values, e.g. [] and so is the groundtruth
            # May happen
            if not p_value and not groundtruth[p_key]:
                true_positive_prediction+=1
            # counts the number of good prediction for node p_key
            # here len(p_value)=1 but we could have tried to predict more values
            true_positive_prediction += len([c for c in p_value if c in groundtruth[p_key]])          
        # no else, should not happen: train and test datasets are consistent
    return true_positive_prediction*100/sum(len(v) for v in pred.values())

print(f'for employer: {evaluation_accuracy(employer,extractFromGraph(graph, "employer"))}')
print(f'for location: {evaluation_accuracy(location,extractFromGraph(graph, "location"))}')
print(f'for college: {evaluation_accuracy(college,extractFromGraph(graph, "college"))}')