In [2]:
import networkx as nx
# https://networkx.github.io/documentation/stable/reference/index.html
import matplotlib.pyplot as plt
from matplotlib import pylab
import numpy as np
import pandas as pd
from collections import Counter

# Which nodes' profile do we have to fill?

In [3]:
# Sub-problem: fill the profiles of 'empty' nodes
df_empty = pd.read_csv('empty.csv', sep='\t', header='infer')

In [4]:
df_empty.shape

(475, 1)

In [5]:
# Get a list from a Series
empty = df_empty['name'].drop_duplicates().tolist()
len(empty)

475

# Profil data understanding

In [6]:
# We use pandas dataframes to load attributes
# Nodes are characterized by a list of colleges, a list of employers, a list of location
# The order means nothing. Nothing allow us to determine the current employer...
df_e = pd.read_csv('employer_with_60percent_of_nodes_remoded.csv', sep='\t', header='infer')
df_e.head()

Unnamed: 0,name,employer
0,U1313,discovery education
1,U1313,ctb mcgraw-hill
2,U1313,university of charleston university of south c...
3,U1313,south carolina department of education
4,U1313,chesapeake va and sumter sc


In [7]:
len(df_e[['name']])

923

In [8]:
# How many users have 1 and more declared employers?
len(df_e[['name']].drop_duplicates())

297

In [9]:
df_l = pd.read_csv('location_with_60percent_of_nodes_remoded.csv', sep='\t', header='infer')
df_l.head()

Unnamed: 0,name,location
0,U1313,norfolk virginia area
1,U8804,bologna area italy
2,U2649,urbana-champaign illinois area
3,U7310,greater seattle area
4,U22859,bengaluru area india


In [10]:
len(df_l[['name']])

336

In [11]:
len(df_l[['name']].drop_duplicates())

336

In [12]:
# load the graph
# The graph is an extraction from LinkedIn Social Network
G = nx.read_gexf("mediumLinkedin.gexf")
print("%d nodes have no employer attributes among the %d users in the graph" % (G.number_of_nodes()-len(df_e[['name']].drop_duplicates()), G.number_of_nodes()))

514 nodes have no employer attributes among the 811 users in the graph


In [13]:
# How many employers / user?
df_e.groupby('name').count().describe()

Unnamed: 0,employer
count,297.0
mean,3.107744
std,1.976682
min,1.0
25%,1.0
50%,3.0
75%,4.0
max,13.0


In [14]:
df_e['employer'].value_counts()

university of illinois at urbana-champaign    76
microsoft                                     15
google                                        15
university of illinois at chicago              5
measured progress                              5
                                              ..
camp jrf                                       1
undisclosed                                    1
ara                                            1
pearson                                        1
women's resource center                        1
Name: employer, Length: 723, dtype: int64

In [15]:
# Boolean row selection by values in a column
df_e.loc[df_e['employer'].isin(['google','google inc']),:].head()


Unnamed: 0,name,employer
30,U3895,google
126,U15289,google
136,U24080,google
145,U24046,google
161,U7151,google


# Exploration of the graph : relational data understanding, focus on homophily

In [16]:
# networkx short summary of information for the graph g
print(nx.info(G))

Name: 
Type: Graph
Number of nodes: 811
Number of edges: 1597
Average degree:   3.9383


# Strategy 0 to fill the empty profiles = your baseline: naive method

The assumption is that two connected nodes are likely to share the same attribute values. Here we choose the most frequently used attribute value among the neighbors.

In [17]:
naive_predicted_values={}

In [18]:
def naive_method(graph, empty, df):
    """   Predict the missing attribute with a simple but effective
    relational classifier. 
    
    The assumption is that two connected nodes are 
    likely to share the same attribute value. Here we chose the most frequently
    used attribute by the neighbors
    
    Parameters
    ----------
    graph : graph
       A networkx graph
    empty : list
       The nodes with empty attributes 
    df : pandas dataframe 
       Either location, employer or college dataframe. 

    Returns
    -------
    predicted_values : dict 
       A dict of attributes, either location, employer or college attributes. 
       key is a node (from empty), value is a list of attribute values. Here 
       only 1 value in the list.
     """
    predicted_values={}
    for n in empty:
        nbrs_attr_values=[] 
        for nbr in graph.neighbors(n):
            # attribute values for the node nbr
            attr = df_e.loc[df_e.name == nbr ,'employer'].values
            if attr.size > 0: #array not empty
                for val in attr:
                    nbrs_attr_values.append(val)
        predicted_values[n]=[]
        if nbrs_attr_values: # non empty list
            # count the number of occurrence each value and returns a dict
            cpt=Counter(nbrs_attr_values)
            # take the most represented attribute value among neighbors
            a,nb_occurrence=max(cpt.items(), key=lambda t: t[1])
            predicted_values[n].append(a)
    return predicted_values


In [19]:
naive_predicted_values = naive_method(G, empty, df_e)

# Strategy 1 to fill the empty profiles

In [20]:
predicted_values_1={}

In [21]:
def strategy_1(graph, empty, df):
    predicted_values={}
    page_rank=nx.pagerank(G)
    closeness_list=sorted(page_rank.items(),key=lambda x:x[1],reverse=True)
    
    #loseness_centrality=nx.closeness_centrality(G)
    #loseness_list=sorted(closeness_centrality.items(),key=lambda x:x[1],reverse=True)

    for i in range (len(closeness_list)):
        node = closeness_list[i][0]
        if node not in empty:
            employer=df_e.loc[df_e.name == node ,'employer'].values
            
            for nbr in graph.neighbors(node):
                if nbr in empty:
                    
                    predicted_values[nbr]=[]
                    
                    predicted_values[nbr].append(df_e.loc[df_e.name == node ,'employer'].values)
            
        else:
            nbrs_attr_values=[] 
            for nbr in graph.neighbors(node):
                # attribute values for the node nbr
                attr = df_e.loc[df_e.name == nbr ,'employer'].values
                if attr.size > 0: #array not empty
                    for val in attr:
                        nbrs_attr_values.append(val)
            predicted_values[node]=[]
            if nbrs_attr_values: # non empty list
                # count the number of occurrence each value and returns a dict
                cpt=Counter(nbrs_attr_values)
                # take the most represented attribute value among neighbors
                a,nb_occurrence=max(cpt.items(), key=lambda t: t[1])
                predicted_values[node].append(a)
            
            
    return predicted_values
    

In [22]:
predicted_values_1 = strategy_1(G, empty, df_e)

# Strategy 2 to fill the empty profiles

In [23]:
predicted_values_2={}

In [24]:
def strategy_2(graph, empty, df):
    predicted_values={}
    
    closeness_centrality=nx.closeness_centrality(G)
    closeness_list=sorted(closeness_centrality.items(),key=lambda x:x[1],reverse=True)

    for i in range (len(closeness_list)):
        node = closeness_list[i][0]
        if node not in empty:
            employer=df_e.loc[df_e.name == node ,'employer'].values
            
            for nbr in graph.neighbors(node):
                if nbr in empty:
                    
                    predicted_values[nbr]=[]
                    a,nb_occurrence=max(Counter(employer).items(), key=lambda t: t[1])
                    predicted_values[node].append(a)
                    
                    predicted_values[nbr].append(df_e.loc[df_e.name == node ,'employer'].values)
            
        else:
            nbrs_attr_values=[] 
            for nbr in graph.neighbors(node):
                # attribute values for the node nbr
                attr = df_e.loc[df_e.name == nbr ,'employer'].values
                if attr.size > 0: #array not empty
                    for val in attr:
                        nbrs_attr_values.append(val)
            predicted_values[node]=[]
            if nbrs_attr_values: # non empty list
                # count the number of occurrence each value and returns a dict
                cpt=Counter(nbrs_attr_values)
                # take the most represented attribute value among neighbors
                a,nb_occurrence=max(cpt.items(), key=lambda t: t[1])
                predicted_values[node].append(a)
            
            
    return predicted_values
    

In [25]:
predicted_values_2 = strategy_1(G, empty, df_e)

# Evaluation


In [26]:
df_e_truth = pd.read_csv('./groundtruth/employer.csv', sep='\t', header='infer')
df_e_truth.head()

Unnamed: 0,name,employer
0,U21998,illinois college advising corps
1,U21998,victoria amplifiers
2,U21998,university of illinois at chicago
3,U27476,ibm
4,U27476,nyse euronext


In [27]:
df_e_truth.loc[df_e_truth.name == 'U21998',df_e_truth.columns[1]].values

array(['illinois college advising corps', 'victoria amplifiers',
       'university of illinois at chicago'], dtype=object)

In [28]:
df_l_truth = pd.read_csv('./groundtruth/location.csv', sep='\t', header='infer')


In [29]:
def evaluation_accuracy(groundtruth, pred):
    """    Compute the accuracy of your model.

     The accuracy is the proportion of true results.

    Parameters
    ----------
    groundtruth : pandas dataframe 
       Either location, employer or college dataframe. 
    pred : dict 
       A dict of attributes, either location, employer or college attributes. 
       key is a node, value is a list of attribute values. 

    Returns
    -------
    out : float
       Accuracy.
    """
    true_positive_prediction=0
    predicted=0
    for p_key, p_value in pred.items():
        attr = groundtruth.loc[groundtruth.name == p_key ,groundtruth.columns[1]].values
        # if prediction is empty, e.g. [], and so is the groundtruth
        # May happen, we count it as a true prediction
        if not p_value and attr.size == 0:
            true_positive_prediction+=1
            predicted+=1
        # counts the number of good prediction for node p_key
        # here len(p_value)=1 but we could have tried to predict more values
        # if p_value == [], we add 0
        true_positive_prediction += len([c for c in p_value if c in attr])
        predicted += len([c for c in p_value])
    return true_positive_prediction*100/predicted
 

In [30]:
result=evaluation_accuracy(df_e_truth, naive_predicted_values)
print("%f%% of the predictions are true" % result)
print("Very poor result!!! Try to do better!!!!")

27.397260% of the predictions are true
Very poor result!!! Try to do better!!!!


In [31]:
result_1 = evaluation_accuracy(df_e_truth, predicted_values_1)
print("%f%% of the predictions are true" % result_1)

21.575342% of the predictions are true




In [1]:
result_loc = evaluation_accuracy(df_e_truth, predicted_values_2)
print("%f%% of the predictions are true" % result_loc)

NameError: name 'evaluation_accuracy' is not defined