In [46]:
import os
import pickle5 as pickle
import json
import utils
import numpy as np
import pandas as pd
import networkx as nx

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(font_scale=2)
sns.set_style('whitegrid')

from tqdm import tqdm
from sklearn import metrics

%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Helper Functions

In [50]:
def process_df(df, G):
    '''
    Restricts the input dataframe to cell nodes whose degree in `G` is greater than 1.

    Arguments
    -------
        df (pandas dataframe): dataframe with one row per graph node. Must contain the
        columns 'node' (node identifier) and 'node_type'.

        G (networkx graph): Input graph corresponding to the dataframe. Only
        `G.degree[node]` is read, so every cell node in `df` must exist in `G`.

    Returns
    -------
    A new dataframe holding only the rows of `df` whose 'node_type' is 'cell' and whose
    node has degree greater than 1. The result is an explicit copy, so callers can add
    or modify columns on it without touching `df` and without triggering pandas'
    SettingWithCopyWarning.
    '''
    # Filter down to only cell nodes
    cell_df = df[df['node_type'] == 'cell']

    # Keep a list (not a set) so the count printed below matches the original semantics
    nodes_with_degree_greater_than_1 = [n for n in cell_df['node'].tolist() if G.degree[n] > 1]
    print('There are', len(nodes_with_degree_greater_than_1), 'cell nodes with a degree greater than 1')

    # .copy() detaches the result from the chain of slices above; downstream code
    # (e.g. adding a groundtruth column) assigns into the returned frame.
    return cell_df.loc[cell_df['node'].isin(nodes_with_degree_greater_than_1)].copy()

def get_num_meanings_groundtruth(df, G):
    '''
    Given a dataframe with the 'is_homograph' groundtruth, find the groundtruth number of
    meanings for each node.

    Every row starts with 1 meaning. For rows flagged as homographs the value is replaced by
    the number of items returned by `utils.get_cell_node_column_names(G, node)`.

    Note that `df` is modified in place (the column 'num_meanings_groundtruth' is added to it)
    and the same dataframe object is returned.
    '''
    # Default: a node that is not a homograph has a single meaning
    df['num_meanings_groundtruth'] = 1

    homograph_rows = df[df['is_homograph'] == True]
    progress = tqdm(homograph_rows.iterrows(), total=homograph_rows.shape[0])

    # Overwrite the default for each homograph, addressing rows by their index label
    for row_label, homograph in progress:
        column_names = utils.get_cell_node_column_names(G, homograph['node'])
        df.loc[row_label, 'num_meanings_groundtruth'] = len(column_names)

    return df

def get_clustering_evaluation_score(true_labels, predicted_labels, measure='adj_rand_index'):
    '''
    Given a list of the `true_labels` and the `predicted_labels` corresponding to each member in the cluster,
    evaluate the cluster quality using the specified `measure`.

    Notice that the true and predicted labels are first converted to integer IDs (one ID per distinct
    label, assigned in sorted order) before being handed to sklearn.metrics.

    Arguments
    -------
        true_labels (list): groundtruth label of each member

        predicted_labels (list): predicted label of each member, aligned with `true_labels`

        measure (str): one of 'adj_rand_index', 'adj_mutual_info' or 'norm_mutual_info'

    Returns
    -------
    The score computed by the sklearn function matching `measure`

    Raises
    -------
    ValueError if `measure` is not one of the supported names
    '''
    # Validate first: previously an unknown measure fell through the if/elif chain
    # and failed with an unrelated UnboundLocalError on `score`.
    supported_measures = ('adj_rand_index', 'adj_mutual_info', 'norm_mutual_info')
    if measure not in supported_measures:
        raise ValueError(
            "Unsupported measure " + repr(measure) + "; expected one of " + ", ".join(supported_measures)
        )

    true_label_to_id_dict = {label: label_id for label_id, label in enumerate(sorted(set(true_labels)))}
    predicted_label_to_id_dict = {label: label_id for label_id, label in enumerate(sorted(set(predicted_labels)))}

    true_labels_ids = [true_label_to_id_dict[x] for x in true_labels]
    predicted_labels_ids = [predicted_label_to_id_dict[x] for x in predicted_labels]

    if measure == 'adj_rand_index':
        scorer = metrics.adjusted_rand_score
    elif measure == 'adj_mutual_info':
        scorer = metrics.adjusted_mutual_info_score
    else:
        # Only 'norm_mutual_info' is left after the validation above
        scorer = metrics.normalized_mutual_info_score

    return scorer(labels_true=true_labels_ids, labels_pred=predicted_labels_ids)

def get_predicted_and_gt_semantic_types_for_node(node, graph, cell_node_to_semantic_type_dict):
    '''
    Look up the groundtruth and the predicted semantic types of a single cell node.

    Arguments
    -------
        node: cell node to look up

        graph: graph handed to `utils.graph_helpers.get_cell_node_column_names`

        cell_node_to_semantic_type_dict: mapping indexed by cell node that holds the predicted types

    Returns
    -------
    Tuple `(gt_types, pred_types)`: the result of `utils.graph_helpers.get_cell_node_column_names`
    for the node, followed by the entry stored for the node in `cell_node_to_semantic_type_dict`
    '''
    # Tuple elements are evaluated left to right: groundtruth lookup first, then the prediction
    return (
        utils.graph_helpers.get_cell_node_column_names(graph, node),
        cell_node_to_semantic_type_dict[node],
    )

# Synthetic Benchmark Large (Number of Meanings)

In [51]:
output_path = 'output/synthetic_benchmark_large/'
g_path = 'graph_representations/synthetic_benchmark_large/bipartite.graph'

# NOTE(security): pickle.load can execute arbitrary code while deserializing.
# Only load files that were produced by this project.
with open(output_path + "graph_stats_with_groundtruth_df.pickle", "rb") as fh:
    df = pickle.load(fh)

# Load the graph through a context manager so the file handle is closed deterministically
with open(g_path, "rb") as fh:
    G = pickle.load(fh)

# Keep only the cell nodes with degree greater than 1
df = process_df(df, G)

# Compute the groundtruth for the number of meanings for each homograph
df = get_num_meanings_groundtruth(df, G)

# Homographs only, those with the most groundtruth meanings first
df_homs = df[df['is_homograph']==True].sort_values(by='num_meanings_groundtruth', ascending=False)
df_homs

100%|██████████| 180/180 [00:00<00:00, 4859.86it/s]

There are 6502 cell nodes with a degree greater than 1





Unnamed: 0,node,node_type,betweenness_centrality,is_homograph,num_meanings_groundtruth
300,Lincoln,cell,4.879061e-03,True,3
546,Virginia,cell,1.466842e-03,True,3
1401,Aurora,cell,1.690384e-03,True,3
2012,Montana,cell,2.380734e-03,True,3
258,Cuba,cell,1.537346e-03,True,2
...,...,...,...,...,...
1839,AZ,cell,6.340709e-07,True,2
1840,TN,cell,6.340709e-07,True,2
1873,MG,cell,6.235608e-04,True,2
1884,Georgia,cell,1.850850e-04,True,2


In [52]:
# Predicted semantic types per column node and per cell node, plus the predicted homographs,
# all read from the pickles stored under `output_path`
column_node_to_semantic_type_dict = pd.read_pickle(f"{output_path}column_node_to_semantic_type_dict.pickle")
cell_node_to_semantic_type_dict = pd.read_pickle(f"{output_path}cell_node_to_semantic_type_dict.pickle")
predicted_homographs = pd.read_pickle(f"{output_path}predicted_homographs.pickle")

# Groundtruth homographs are the nodes of the homograph dataframe
gt_homographs = set(df_homs['node'])

## Homograph Number of Meanings Evaluation 

In [53]:
# Columns filled in below; they start as NaN so rows that are never written stay visible
score_measures = ['adj_rand_index', 'adj_mutual_info', 'norm_mutual_info']
for new_column in ['num_meanings_sherlock'] + score_measures:
    df_homs[new_column] = np.nan

for row_label, homograph in df_homs.iterrows():
    homograph_node = homograph['node']

    # Number of meanings = how many semantic types were assigned to the cell node
    df_homs.loc[row_label, 'num_meanings_sherlock'] = int(len(cell_node_to_semantic_type_dict[homograph_node]))

    # Collect, for every column node attached to the homograph, its groundtruth column name
    # and its predicted semantic type; the two lists are aligned element by element
    attached_col_nodes = list(utils.graph_helpers.get_attribute_of_instance(G, homograph_node))
    gt_col_names = [G.nodes[col_node]['column_name'] for col_node in attached_col_nodes]
    predicted_types = [column_node_to_semantic_type_dict[col_node] for col_node in attached_col_nodes]

    # Score the predicted grouping against the groundtruth with each clustering measure
    for measure_name in score_measures:
        df_homs.loc[row_label, measure_name] = get_clustering_evaluation_score(
            true_labels=gt_col_names, predicted_labels=predicted_types, measure=measure_name
        )

df_homs['is_num_meanings_correct'] = df_homs['num_meanings_groundtruth'] == df_homs['num_meanings_sherlock']
df_homs

Unnamed: 0,node,node_type,betweenness_centrality,is_homograph,num_meanings_groundtruth,num_meanings_sherlock,adj_rand_index,adj_mutual_info,norm_mutual_info,is_num_meanings_correct
300,Lincoln,cell,4.879061e-03,True,3,3.0,1.000000,1.000000,1.00000,True
546,Virginia,cell,1.466842e-03,True,3,3.0,1.000000,1.000000,1.00000,True
1401,Aurora,cell,1.690384e-03,True,3,3.0,1.000000,1.000000,1.00000,True
2012,Montana,cell,2.380734e-03,True,3,2.0,0.705882,0.727608,0.81329,False
258,Cuba,cell,1.537346e-03,True,2,2.0,1.000000,1.000000,1.00000,True
...,...,...,...,...,...,...,...,...,...,...
1839,AZ,cell,6.340709e-07,True,2,2.0,1.000000,1.000000,1.00000,True
1840,TN,cell,6.340709e-07,True,2,2.0,1.000000,1.000000,1.00000,True
1873,MG,cell,6.235608e-04,True,2,2.0,1.000000,1.000000,1.00000,True
1884,Georgia,cell,1.850850e-04,True,2,2.0,1.000000,1.000000,1.00000,True


In [32]:
# Fraction of homographs whose predicted number of meanings equals the groundtruth.
# The mean of a boolean column is exactly that fraction, and unlike value_counts()[True]
# it does not raise a KeyError when no prediction is correct.
print("Number of meanings precision:", df_homs['is_num_meanings_correct'].mean())

Number of meanings precision: 0.7944444444444444


In [55]:
# Report the mean of every clustering score over all groundtruth homographs
for score_label, score_column in (
    ("Average adjusted rand index:", 'adj_rand_index'),
    ("Average adjusted mutual information:", 'adj_mutual_info'),
    ("Average normalized mutual information:", 'norm_mutual_info'),
):
    print(score_label, df_homs[score_column].mean())

Average adjusted rand index: 0.7816993464052286
Average adjusted mutual information: 0.4873755996616282
Average normalized mutual information: 0.8104067016486789


## Homograph Detection Evaluation

In [14]:
# Homographs that are both in the groundtruth and predicted (true positives)
true_positives = len(gt_homographs & predicted_homographs)

# Each ratio is guarded so that an empty prediction set, an empty groundtruth set or
# zero overlap yields 0.0 instead of raising ZeroDivisionError
precision = true_positives / len(predicted_homographs) if predicted_homographs else 0.0
recall = true_positives / len(gt_homographs) if gt_homographs else 0.0
f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1_score)

Precision: 0.1192
Recall: 0.8277777777777777
F1-Score: 0.2083916083916084


In [15]:
# Groundtruth homographs that are missing from the predicted set (false negatives)
gt_not_predicted_by_sherlock = gt_homographs.difference(predicted_homographs)
print("GT homographs not predicted by Sherlock:", gt_not_predicted_by_sherlock)

GT homographs not predicted by Sherlock: {'Mariner', 'Fredi', 'Focus', 'Harland', 'Laurence', 'Say', 'Starbuck', 'Charity', 'Tracy', 'Alleyn', 'Franklyn', 'Calley', 'Marguerite', 'Barrie', 'Rosie', 'Equinox', 'Crossfire', 'Gantz', 'Hugo', 'Townsend', 'Amy', 'Valerie', 'Magnolia', 'Lettuce', 'Smith', 'Verna', 'Palm', 'Mirage', 'Ransom', 'Glen', 'Pathfinder'}


In [17]:
# Show groundtruth vs. predicted semantic types for every missed homograph
for missed_homograph in gt_not_predicted_by_sherlock:
    gt_types, pred_types = get_predicted_and_gt_semantic_types_for_node(
        missed_homograph, G, cell_node_to_semantic_type_dict
    )
    print(missed_homograph, "  \t\t GT types:", gt_types, '\t\t predicted types:', pred_types)

Mariner   		 GT types: ['last_name', 'car_model'] 		 predicted types: {'name'}
Fredi   		 GT types: ['last_name', 'first_name'] 		 predicted types: {'name'}
Focus   		 GT types: ['movie_title', 'car_model'] 		 predicted types: {'name'}
Harland   		 GT types: ['last_name', 'first_name'] 		 predicted types: {'name'}
Laurence   		 GT types: ['last_name', 'first_name'] 		 predicted types: {'name'}
Say   		 GT types: ['last_name', 'first_name'] 		 predicted types: {'name'}
Starbuck   		 GT types: ['movie_title', 'last_name'] 		 predicted types: {'name'}
Charity   		 GT types: ['first_name', 'plant_name'] 		 predicted types: {'name'}
Tracy   		 GT types: ['last_name', 'first_name'] 		 predicted types: {'name'}
Alleyn   		 GT types: ['last_name', 'first_name'] 		 predicted types: {'name'}
Franklyn   		 GT types: ['movie_title', 'first_name'] 		 predicted types: {'name'}
Calley   		 GT types: ['last_name', 'first_name'] 		 predicted types: {'name'}
Marguerite   		 GT types: ['last_name', 'firs

In [18]:
# Predicted homographs that are not in the groundtruth (false positives), as a list so it can be sliced
false_positive_nodes = predicted_homographs - gt_homographs
non_homographs_predicted_by_sherlock = list(false_positive_nodes)
print("Non-GT homographs predicted by Sherlock:", non_homographs_predicted_by_sherlock[:20])

Non-GT homographs predicted by Sherlock: ['Placemat - Scallop, White', 'Janine', 'Darcee', 'Nicola', 'Piping Jelly - All Colours', 'Athene', 'Bread - Rolls, Corn', 'Brig', 'Nut - Macadamia', 'Wine - Red, Metus Rose', 'Centidel', 'Marybeth', 'Tomatoes - Plum, Canned', 'Juice - Lime', 'Flavouring - Rum', 'Marcelline', 'Salmon Atl.whole 8 - 10 Lb', 'Tart Shells - Sweet, 4', 'Brainsphere', 'Tagopia']


In [19]:
# Show groundtruth vs. predicted semantic types for the first 20 false positives
first_false_positives = non_homographs_predicted_by_sherlock[:20]
for false_positive in first_false_positives:
    gt_types, pred_types = get_predicted_and_gt_semantic_types_for_node(
        false_positive, G, cell_node_to_semantic_type_dict
    )
    print(false_positive, "  \t\t GT types:", gt_types, '\t\t predicted types:', pred_types)

Placemat - Scallop, White   		 GT types: ['grocery'] 		 predicted types: {'description', 'product'}
Janine   		 GT types: ['first_name'] 		 predicted types: {'name', 'sex'}
Darcee   		 GT types: ['first_name'] 		 predicted types: {'name', 'sex'}
Nicola   		 GT types: ['first_name'] 		 predicted types: {'name', 'sex'}
Piping Jelly - All Colours   		 GT types: ['grocery'] 		 predicted types: {'description', 'product'}
Athene   		 GT types: ['first_name'] 		 predicted types: {'name', 'sex'}
Bread - Rolls, Corn   		 GT types: ['grocery'] 		 predicted types: {'description', 'product'}
Brig   		 GT types: ['first_name'] 		 predicted types: {'name', 'sex'}
Nut - Macadamia   		 GT types: ['grocery'] 		 predicted types: {'description', 'product'}
Wine - Red, Metus Rose   		 GT types: ['grocery'] 		 predicted types: {'description', 'product'}
Centidel   		 GT types: ['company_name'] 		 predicted types: {'creator', 'location'}
Marybeth   		 GT types: ['first_name'] 		 predicted types: {'name', 's