In [1]:
import pandas as pd
import pprint as pp
import networkx as nx
from operator import itemgetter, attrgetter

In [2]:
data_dir = './data/'

In [3]:
# Data
graph = nx.read_gexf(data_dir + 'mediumLinkedin.gexf')
location = pd.read_pickle(data_dir + 'mediumLocation.pickle')
employer = pd.read_pickle(data_dir + 'mediumEmployer.pickle')
college = pd.read_pickle(data_dir + 'mediumCollege.pickle')
empty_str = '_60percent_of_empty_profile'
get_empty = True
location_empty = pd.read_pickle(data_dir + 'mediumLocation' + (empty_str if get_empty else '') + '.pickle')
employer_empty = pd.read_pickle(data_dir + 'mediumEmployer' + (empty_str if get_empty else '') + '.pickle')
college_empty = pd.read_pickle(data_dir + 'mediumCollege' + (empty_str if get_empty else '') + '.pickle')
print(data_dir + 'mediumLocation' + (empty_str if get_empty else '') + '.pickle')
nodes_taken_out = pd.read_pickle(data_dir + 'largeRemovedNodes_60percent_of_empty_profile.pickle')

./data/mediumLocation_60percent_of_empty_profile.pickle


In [4]:
# Add attributes to nodes in graph
possible_attributes = ['location', 'college', 'employer']

def addAttributes(G, attributes, attributes_name):
    nx.set_node_attributes(G, attributes, attributes_name)
        
def addAllAttributes(G):
    addAttributes(G, location_empty, 'location')
    addAttributes(G, employer_empty, 'employer')
    addAttributes(G, college_empty, 'college')

In [5]:
# Check if new pair already considered in list, if not, it add it
def checkIfInPairsNewPair(triangles_pairs, node_1, node_2):
    if node_1 == node_2: return True
    for triangles_pair in triangles_pairs:
        if node_1 in triangles_pair and node_2 in triangles_pair: return True
    return False

# For one node, count number of trianlges with neighbors
def countTrianglesFromRelatedNodes(node, related_nodes, triangles_pairs):
    triangles_count = 0
    for related_node in related_nodes:  
        #print(f'checking pairs ({node}, {related_node})')
        if not checkIfInPairsNewPair(triangles_pairs, node, related_node): 
            #print(f'found coincidence')
            triangles_pairs.append((node, related_node))
            triangles_count += 0 if len(related_nodes) == 0 else 1
    return triangles_count

# Assumes, main node is already identified
# Delivers the quantity of triangles found and the array containing the pairs 
#   being the third one, the studied node
def locateTrianglesInNode(G, neighbors, node):
    triangles_count = 0
    triangles_pairs = []
    for n in neighbors:
        # convert to list in order to be able to use its properties
        neighbors_of_n = [n for n in G.neighbors(n)] 
        neighbors_of_n.remove(node) # ignore main node (node of study)
        related = [f_n for f_n in neighbors_of_n if f_n in neighbors]
        #print(f'neighbor: {n}, related {related}')
        triangles_count += countTrianglesFromRelatedNodes(n, related, triangles_pairs)
                
    return { 'triangles_count': triangles_count, 'triangles_pairs': triangles_pairs }

In [6]:
def searchFromAttributes(G, node_1, node_2, attributes):
    common_att = {}
    node_1_dict = G.nodes[node_1]
    node_2_dict = G.nodes[node_2]
    not_found_att = []
    for att in attributes:
        if att in node_1_dict and att in node_2_dict:
            node_1_att_values = node_1_dict[att]
            node_2_att_values = node_2_dict[att]
            #print(f'found att: {att}: {node_1_att_values} -- {node_2_att_values}')
            # find matches for att between both nodes
            for val in node_1_att_values:
                for val2 in node_2_att_values:
                    prepare_val = ''.join(val.lower().split(' ')) # prevent "errors of hand"
                    prepare_val2 = ''.join(val2.lower().split(' ')) #
                    if (prepare_val == prepare_val2): 
                        if not att in common_att: common_att[att] = []
                        common_att[att].append(val)
        else:
            not_found_att.append(att)
    #if (len(not_found_att) > 0): print('not found att: ', not_found_att)
    return common_att

def getPairRelationship(node_1, node_2, G, attributes):
    connections = searchFromAttributes(G, node_1, node_2, attributes)
    # --- do something else? ---
    return connections

def addValueToNode(G, node, value, attribute):
    node_attributes = G.nodes[node]
    #print(f'node att: {node_attributes}')
    new_values = []
    existant = False
    if attribute in node_attributes:
        new_values = node_attributes[attribute][:]
        prepare_val = ''.join(value.lower().split(' '))
        for val in new_values:
            joined_val = ''.join(val.lower().split(' '))
            if joined_val == prepare_val: 
                existant = True
                break
                
    new_values.append(value)
    #print(f'at node {node} new values: {new_values}')
    if not existant: addAttributes(G, { node: new_values }, attribute)

def verifyValueInAttribute(attribute, node_attributes, value):
    if attribute in node_attributes:
        new_values = node_attributes[attribute][:]
        prepare_val = ''.join(value.lower().split(' '))
        for val in new_values:
            joined_val = ''.join(val.lower().split(' '))
            if joined_val == prepare_val: 
                return True
    return False

def verifyAttributesInNode(attributes, node_attributes):
    total = len(attributes); this_total = 0
    for att in attributes: 
        if att in node_attributes: this_total += 1
    #if (this_total != total) and this_total > 0 : print(f'error? some att already: {this_total}')
    return (this_total == total)
        
def addValuesToNode(G, node, attribute, candidate_values):
    candidates = {}
    if verifyAttributesInNode(possible_attributes, G.nodes[node]): return # no need to analyze
    for index, candidate in enumerate(candidate_values):
        if index in candidates:  candidates[index] += 1
        else: candidates[index] = 1       
        
    # Decision Rules
    values_to_add = []
    if len(candidates) == 0: 
        #print('zero candidates'); 
        return
    elif len(candidates) == 1: 
        #print('one candidate');
        values_to_add = [candidate_values[0]]
    else:
        #print('several candidates');
        weights = [v for i, v in candidates.items()]
        max_weight = max(weights)
        indexes = [ i for i, v in candidates.items() if max_weight == v ]
        values_to_add = [ candidate_values[i] for i in indexes ]
        
    addAttributes(G, { node: values_to_add }, attribute)
        
        

# ---- RUN FOR ONE NODE ---- DEBUG !!! ----
node = 'U27476'
graph = nx.read_gexf(data_dir + 'mediumLinkedin.gexf')
addAllAttributes(graph)
neighbors = [n for n in graph.neighbors(node)]
triangles_in_node = locateTrianglesInNode(graph, neighbors, node)
print(triangles_in_node)

prepare_new_values = {}
for pair in triangles_in_node['triangles_pairs']:
    connections = getPairRelationship(pair[0], pair[1], graph, possible_attributes)
    print(connections)
    for att, matches in connections.items():
        print(f'att: { att}, match: { matches}')
        addValuesToNode(graph, node, att, matches)
        #for index, match in enumerate(matches): 
            #addValueToNode(graph, node, match, att)
        
            
print(graph.nodes['U14119'])
print(graph.nodes['U14068'])
print(graph.nodes['U14068'])
print(graph.nodes['U27476'])

{'triangles_count': 18, 'triangles_pairs': [('U27661', 'U27532'), ('U27661', 'U27515'), ('U27661', 'U27541'), ('U27661', 'U27287'), ('U27532', 'U27541'), ('U27532', 'U27515'), ('U27532', 'U27287'), ('U27515', 'U27477'), ('U27515', 'U27541'), ('U27515', 'U27634'), ('U27515', 'U27525'), ('U27515', 'U27287'), ('U27477', 'U27541'), ('U27477', 'U27287'), ('U27541', 'U27634'), ('U27541', 'U27287'), ('U27525', 'U27287'), ('U27634', 'U27287')]}
{}
{}
{'college': ['shanghai jiao tong university']}
att: college, match: ['shanghai jiao tong university']
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{'label': 'U14119'}
{'label': 'U14068', 'location': ['urbana-champaign illinois area'], 'employer': ['university of illinois at urbana-champaign'], 'college': ['university of illinois at urbana-champaign', 'athens university of economics and business']}
{'label': 'U14068', 'location': ['urbana-champaign illinois area'], 'employer': ['university of illinois at urbana-champaign'], 'college': ['university 

In [7]:
# Obtain desired nodes, where to start searching, idealy in the nodes that have all the information

def checkIfAttributes(node, attributes):
    for att in attributes:
        if not att in node: return False
    return True

def checkWellLabeledNodes(G):
    well_labeled_nodes = []
    for node in G.nodes:
        if not checkIfAttributes(G.nodes[node], possible_attributes): continue
        neighbors = [n for n in G.neighbors(node)]
        completed = True
        for neighbor in neighbors:
            if not checkIfAttributes(G.nodes[neighbor], possible_attributes): 
                completed = False
                break
        if not completed : continue
        # passed both tests, good places to start:
        well_labeled_nodes.append(node)
    return well_labeled_nodes

In [8]:
# Studying nodes, discovered 3 kinds of them: nodes with triangles, one link nodes and several link nodes
# Obtain the nodes with trianlges, and the other 2 cases

count_triangles = 0
count_non_tri_easy = 0
count_non_tri = 0
nodes_with_triangles = []
nodes_one_neighbor = []
nodes_plus_neighbors = []
for node in graph.nodes:
    neighbors = [n for n in graph.neighbors(node)]
    triangles_in_node = locateTrianglesInNode(graph, neighbors, node)
    if triangles_in_node['triangles_count'] == 0 and len(neighbors) > 1: 
        count_non_tri += 1
        nodes_plus_neighbors.append(node)
        #print('----SUP--- node: ', node , ' count: ', len(neighbors))
    elif triangles_in_node['triangles_count'] == 0 and len(neighbors) == 1:
        count_non_tri_easy += 1
        #print('----EQUAL--- node: ', node)
        nodes_one_neighbor.append(node)
    else:
        count_triangles += 1
        nodes_with_triangles.append(node)
        #print('----TRI--- qty: ', triangles_in_node['triangles_count'])

#print('# of nodes w/triangles: ', count_triangles, ' # of same with no: ', count_non_tri_easy, ' others: ', count_non_tri)
#print(count_triangles + count_non_tri_easy + count_non_tri)

In [9]:
# ---- RUN FOR AL DESIRED NODES -----

def firstProposal(G):
    addAllAttributes(G)
    well_labeled_nodes = checkWellLabeledNodes(G)
    pipe = [node for node in well_labeled_nodes if node in nodes_with_triangles]
    pipe = pipe + [node for node in G.nodes if node not in pipe]

    for node in pipe:
        neighbors = [n for n in G.neighbors(node)]
        triangles_in_node = locateTrianglesInNode(G, neighbors, node)

        prepare_new_values = {}
        for pair in triangles_in_node['triangles_pairs']:
            connections = getPairRelationship(pair[0], pair[1], G, possible_attributes)
            for att, matches in connections.items():
                for match in matches:
                    addValueToNode(G, node, match, att)

In [10]:
# ---- RUN FOR AL DESIRED NODES SECOND PROPOSAL -----
def secondProposal(G):
    addAllAttributes(G)
    well_labeled_nodes = checkWellLabeledNodes(G)
    pipe = [node for node in well_labeled_nodes if node in nodes_with_triangles]
    pipe = pipe + [node for node in G.nodes if node not in pipe]

    for node in pipe:
        neighbors = [n for n in G.neighbors(node)]
        triangles_in_node = locateTrianglesInNode(G, neighbors, node)

        for pair in triangles_in_node['triangles_pairs']:
            connections = getPairRelationship(pair[0], pair[1], G, possible_attributes)
            #print(connections)
            for att, matches in connections.items():
                #print(f'att: { att}, match: { matches}')
                addValuesToNode(G, node, att, matches)


In [11]:
# Get info as neeeded for evaluation
def extractFromGraph(G, attribute):
    extracted = {}
    for node in G.nodes:
        node_att = G.nodes[node]
        if attribute in node_att: extracted[node] = node_att[attribute]
    return extracted

In [12]:
def evaluation_accuracy(groundtruth, pred):
    """    Compute the accuracy of your model.

     The accuracy is the proportion of true results.

    Parameters
    ----------
    groundtruth :  : dict 
       A dict of attributes, either location, employer or college attributes. 
       key is a node, value is a list of attribute values.
    pred : dict 
       A dict of attributes, either location, employer or college attributes. 
       key is a node, value is a list of attribute values. 

    Returns
    -------
    out : float
       Accuracy.
    """
    true_positive_prediction=0   
    for p_key, p_value in pred.items():
        if p_key in groundtruth:
            # if prediction is no attribute values, e.g. [] and so is the groundtruth
            # May happen
            if not p_value and not groundtruth[p_key]:
                true_positive_prediction+=1
            # counts the number of good prediction for node p_key
            # here len(p_value)=1 but we could have tried to predict more values
            true_positive_prediction += len([c for c in p_value if c in groundtruth[p_key]])          
        # no else, should not happen: train and test datasets are consistent
    return true_positive_prediction*100/sum(len(v) for v in pred.values())


In [13]:
count = 0
for key, lieu in extractFromGraph(graph, "location").items():
    if len(lieu) > 1 : 
        #print(lieu); 
        count += 1;
print(len(location))
print(len(extractFromGraph(graph, "location").items()))
print(count)
print(len(nodes_with_triangles))
print(len(nodes_one_neighbor))
print(len(nodes_plus_neighbors))

811
336
0
430
341
40


In [25]:


def getLeoInfo(G, att, in_dict_or_arr, groundtruth):
    los_q_quiere_leo = {}
    los_q_quiere_leo = { node: values for node, values in extractFromGraph(G, att).items() if node in in_dict_or_arr }
    print(f'# {len(los_q_quiere_leo)}')
    print(f'for {att}: {evaluation_accuracy(groundtruth,los_q_quiere_leo)}')
            
getLeoInfo(graph, 'location', nodes_taken_out, location)
getLeoInfo(graph, 'college', nodes_taken_out, college)
getLeoInfo(graph, 'employer', nodes_taken_out, employer)


# 209
for location: 100.0
# 146
for college: 100.0
# 185
for employer: 100.0


In [15]:
# have all the nodes with triangle been filled?
for node in nodes_with_triangles:
    node_att = graph_second.nodes[node]
    if not verifyAttributesInNode(possible_attributes, node_att):
        if not 'location' in node_att : print(f'missing: {node}, location')
        if not 'college' in node_att : print(f'missing: {node}, college')
        if not 'employer' in node_att : print(f'missing: {node}, employer')
        

NameError: name 'graph_second' is not defined

In [16]:
print(graph_second.nodes['U4665'])

NameError: name 'graph_second' is not defined

In [17]:
#Third Proposal dev

def updateNodeInfoInPipe(G, pipe, node, labeled_nodes, first_time = False):
    some_change = False
    neighbors = [n for n in G.neighbors(node)]
    well_labeled_neighbors = [n for n in neighbors if n in labeled_nodes]
    triangles_in_node = locateTrianglesInNode(G, neighbors, node)
    info_availability_percentage = len(well_labeled_neighbors) / len(neighbors) if triangles_in_node['triangles_count'] > 0 else 0
    node_info = {
        'node': node,
        'total_neigh': len(neighbors),
        'total_labeled_neigh': len(well_labeled_neighbors),
        'info_availability_percentage': info_availability_percentage,
        'trianlges': triangles_in_node['triangles_count'],
        'labels': len(G.nodes[node]) - 1,
    }
    if first_time: pipe.append(node_info); return True
    for node_dict in pipe:
        if node_dict['node'] == node:
            #print(node_dict)
            if not node_dict['info_availability_percentage'] == node_info['info_availability_percentage']: some_change = True; print('imrpovement')
            if node_dict['labels'] != len(G.nodes[node]) - 1: some_change = True
            node_dict['total_labeled_neigh'] = node_info['total_labeled_neigh']
            node_dict['info_availability_percentage'] = node_info['info_availability_percentage']
            node_dict['labels'] = len(G.nodes[node]) - 1
            #print(node_dict)
            break
    return some_change

graph_test = nx.read_gexf(data_dir + 'mediumLinkedin.gexf')
addAllAttributes(graph_test)
nodes_taken_out = pd.read_pickle(data_dir + 'mediumRemovedNodes_60percent_of_empty_profile.pickle') # with no labels
labeled_nodes = [n for n in graph_test.nodes if n not in nodes_taken_out]
#print(labeled_nodes)
pipe = []
for node in nodes_taken_out:
    updateNodeInfoInPipe(graph_test, pipe, node, labeled_nodes, True)
    #print(len(well_labeled_neighbors) / len(neighbors))

#order the pipe
pipe = sorted(pipe, key=itemgetter('info_availability_percentage'), reverse=True)

print(graph_test.nodes['U16152'])
print([n for n in G.neighbors('U16152')])
print(graph_test.nodes['U16112'])
print(graph_test.nodes['U16128'])
print(graph_test.nodes['U16141'])
print(graph_test.nodes['U16101'])
print(pipe[0])
print(pipe[1])

G = graph_test
pipe_test = [pipe[0], pipe[1], pipe[2], pipe[3], pipe[4]]

some_change = True
count = 0
while some_change and count < 100:
    print(f'---- count: { count }')
    some_change = False
    count += 1
    for i, node_info in enumerate(pipe):
        
        node = node_info['node']
        neighbors = [n for n in G.neighbors(node)]
        triangles_in_node = locateTrianglesInNode(G, neighbors, node)
        #print(f'----- node: {node}')
        #print(f'neigh: {neighbors}')
        #print(f'triangles: {triangles_in_node}')
        #for n in neighbors: print(G.nodes[n])
        for pair in triangles_in_node['triangles_pairs']:
            connections = getPairRelationship(pair[0], pair[1], G, possible_attributes)
            #print(f'con: {connections}')
            for att, matches in connections.items():
                #print(f'att: { att}, match: { matches}')
                addValuesToNode(G, node, att, matches)
        some_change = updateNodeInfoInPipe(G, pipe, node, labeled_nodes) or some_change
        if pipe[i]['total_neigh'] == pipe[i]['total_labeled_neigh']: del pipe[i]
        print(f'some_change: {some_change}, length: { len(pipe) }, node: {node}, att: {len(G.nodes[node])-1}, total_neigh: { pipe[i]["total_neigh"] }, total_labeled: { pipe[i]["total_labeled_neigh"] }')
    


print(graph_test.nodes['U22825'])
print(graph_test.nodes['U14109'])
print(f'count: { count }, len_nodes: { len(pipe) }')


getLeoInfo(G, 'location', nodes_taken_out, location)
getLeoInfo(G, 'college', nodes_taken_out, college)
getLeoInfo(G, 'employer', nodes_taken_out, employer)
print(f'for employer: {evaluation_accuracy(employer,extractFromGraph(G, "employer"))}')
print(f'for location: {evaluation_accuracy(location,extractFromGraph(G, "location"))}')
print(f'for college: {evaluation_accuracy(college,extractFromGraph(G, "college"))}')

{'label': 'U16152'}


NameError: name 'G' is not defined

In [18]:
def addValueToListIfExistant(matches_list, array):
    for match in matches_list:
        prepare_match = ''.join(match.lower().split(' ')) 
        in_array = False
        for element in array:
            prepare_element = ''.join(element.lower().split(' ')) 
            if prepare_element == prepare_match: 
                in_array = True
                break
        if not in_array: array.append(match)


# GRAPH
graph_test = nx.read_gexf(data_dir + 'mediumLinkedin.gexf')
addAllAttributes(graph_test)

pipe = []
for node in graph_test.nodes:
    neighbors = [n for n in graph.neighbors(node)]
    num_neigh = len(neighbors)
    
    triangles_in_node = locateTrianglesInNode(graph, neighbors, node)
    for pair in triangles_in_node['triangles_pairs']:
        connections = getPairRelationship(pair[0], pair[1], graph, possible_attributes)
        employer_conn = []
        location_conn = []
        college_conn = []
        for att, matches in connections.items():
            if 'location' == att : addValueToListIfExistant(matches, location_conn)
            if 'college' == att : addValueToListIfExistant(matches, college_conn)
            if 'employer' == att : addValueToListIfExistant(matches, employer_conn)            
    
        all_conn = { 
            'college': len(college_conn), 
            'location': len(location_conn),
            'employer': len(employer_conn),
            'total': len(college_conn) + len(location_conn) + len(employer_conn),
        }
        
        
        qty_tri = triangles_in_node['triangles_count']
        if all_conn['college'] / qty_tri < 0.5: print('not able to decide')
        # ...
        
        
        
        

not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to 

not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to decide
not able to 

In [19]:
def updateNodeInfoInPipe(G, pipe, node, labeled_nodes, first_time = False):
    some_change = False
    neighbors = [n for n in G.neighbors(node)]
    well_labeled_neighbors = [n for n in neighbors if n in labeled_nodes]
    triangles_in_node = locateTrianglesInNode(G, neighbors, node)
    info_availability_percentage = len(well_labeled_neighbors) / len(neighbors) if triangles_in_node['triangles_count'] > 0 else 0
    node_info = {
        'node': node,
        'total_neigh': len(neighbors),
        'total_labeled_neigh': len(well_labeled_neighbors),
        'info_availability_percentage': info_availability_percentage,
        'trianlges': triangles_in_node['triangles_count'],
        'labels': len(G.nodes[node]) - 1,
    }
    if first_time: pipe.append(node_info); return True
    for node_dict in pipe:
        if node_dict['node'] == node:
            if not node_dict['info_availability_percentage'] == node_info['info_availability_percentage']: some_change = True;
            if node_dict['labels'] != len(G.nodes[node]) - 1: some_change = True
            node_dict['total_labeled_neigh'] = node_info['total_labeled_neigh']
            node_dict['info_availability_percentage'] = node_info['info_availability_percentage']
            node_dict['labels'] = len(G.nodes[node]) - 1
            break
    return some_change

In [43]:
def thirdProposal(G, nodes_taken_out):
    addAllAttributes(G)
    labeled_nodes = [n for n in G.nodes if n not in nodes_taken_out]

    pipe = []
    for node in nodes_taken_out:
        updateNodeInfoInPipe(G, pipe, node, labeled_nodes, True)
    
    #order the pipe
    pipe = sorted(pipe, key=itemgetter('info_availability_percentage'), reverse=True)

    some_change = True
    count = 0
    while some_change and count < 100:
        some_change = False
        count += 1
        for i, node_info in enumerate(pipe):
            node = node_info['node']
            neighbors = [n for n in G.neighbors(node)]
            triangles_in_node = locateTrianglesInNode(G, neighbors, node)
            for pair in triangles_in_node['triangles_pairs']:
                connections = getPairRelationship(pair[0], pair[1], G, possible_attributes)
                for att, matches in connections.items():
                    addValuesToNode(G, node, att, matches)
            some_change = updateNodeInfoInPipe(G, pipe, node, labeled_nodes) or some_change
            if pipe[i]['total_neigh'] == pipe[i]['total_labeled_neigh']: 
                labeled_nodes.append(pipe[i])
                del pipe[i]


In [29]:
is_medium = True
file_pre = 'medium' if is_medium else 'large'

location = pd.read_pickle(data_dir + file_pre + 'Location.pickle')
employer = pd.read_pickle(data_dir + file_pre + 'Employer.pickle')
college = pd.read_pickle(data_dir + file_pre + 'College.pickle')
empty_str = '_60percent_of_empty_profile'
get_empty = True
location_empty = pd.read_pickle(data_dir + file_pre + 'Location' + (empty_str if get_empty else '') + '.pickle')
employer_empty = pd.read_pickle(data_dir + file_pre + 'Employer' + (empty_str if get_empty else '') + '.pickle')
college_empty = pd.read_pickle(data_dir + file_pre + 'College' + (empty_str if get_empty else '') + '.pickle')
nodes_taken_out = pd.read_pickle(data_dir + file_pre + 'RemovedNodes_60percent_of_empty_profile.pickle')
print(data_dir + file_pre + 'Location' + (empty_str if get_empty else '') + '.pickle')


graph_first = nx.read_gexf(data_dir + file_pre + 'Linkedin.gexf')
firstProposal(graph_first)
getLeoInfo(graph_first, 'location', nodes_taken_out, location)
getLeoInfo(graph_first, 'college', nodes_taken_out, college)
getLeoInfo(graph_first, 'employer', nodes_taken_out, employer)
print(f'for employer: {evaluation_accuracy(employer,extractFromGraph(graph_first, "employer"))}')
print(f'for location: {evaluation_accuracy(location,extractFromGraph(graph_first, "location"))}')
print(f'for college: {evaluation_accuracy(college,extractFromGraph(graph_first, "college"))}')
graph_second = nx.read_gexf(data_dir + file_pre + 'Linkedin.gexf')
secondProposal(graph_second)
getLeoInfo(graph_second, 'location', nodes_taken_out, location)
getLeoInfo(graph_second, 'college', nodes_taken_out, college)
getLeoInfo(graph_second, 'employer', nodes_taken_out, employer)
print(f'for employer: {evaluation_accuracy(employer,extractFromGraph(graph_second, "employer"))}')
print(f'for location: {evaluation_accuracy(location,extractFromGraph(graph_second, "location"))}')
print(f'for college: {evaluation_accuracy(college,extractFromGraph(graph_second, "college"))}')
graph_third = nx.read_gexf(data_dir + file_pre + 'Linkedin.gexf')
thirdProposal(graph_third, nodes_taken_out)
getLeoInfo(graph_third, 'location', nodes_taken_out, location)
getLeoInfo(graph_third, 'college', nodes_taken_out, college)
getLeoInfo(graph_third, 'employer', nodes_taken_out, employer)
print(f'for employer: {evaluation_accuracy(employer,extractFromGraph(graph_third, "employer"))}')
print(f'for location: {evaluation_accuracy(location,extractFromGraph(graph_third, "location"))}')
print(f'for college: {evaluation_accuracy(college,extractFromGraph(graph_third, "college"))}')

./data/largeLocation_60percent_of_empty_profile.pickle
# 4123
for location: 22.291913883709103
# 3106
for college: 15.909090909090908
# 3454
for employer: 9.982611444830857
for employer: 40.999099909990996
for location: 35.038151297144104
for college: 29.569973029667366
# 3795
for location: 49.85507246376812
# 2789
for college: 33.739691645751165
# 3037
for employer: 27.162075592568865
for employer: 81.69965870307168
for location: 74.73100749918487
for college: 59.98716508904219
# 4686
for location: 44.51557831839522
# 4014
for college: 25.7847533632287
# 4153
for employer: 21.82248520710059
for employer: 78.98320183252736
for location: 74.23701942132382
for college: 56.36443533030614


In [40]:
import pprint as pp
print(len(location))
print(len(graph_third))
print(len(nodes_taken_out))
extracted = extractFromGraph(graph_third, "location")
print(len(extracted))

for key, values in extracted.items():
    if (len(values) > 1): print(values)

29035
14032
8626
10092


In [None]:
graph_third = nx.read_gexf(data_dir + file_pre + 'Linkedin.gexf')
thirdProposal(graph_third, nodes_taken_out)
getLeoInfo(graph_third, 'location', nodes_taken_out, location)
getLeoInfo(graph_third, 'college', nodes_taken_out, college)
getLeoInfo(graph_third, 'employer', nodes_taken_out, employer)
evaluation_employer = evaluation_accuracy(employer,extractFromGraph(graph_third, "employer"))
evaluation_location = evaluation_accuracy(location,extractFromGraph(graph_third, "location"))
evaluation_college = evaluation_accuracy(college,extractFromGraph(graph_third, "college"))


pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass


In [None]:
evaluation_employer