In [2]:
import os
import pickle5 as pickle
import json
import utils
import numpy as np
import pandas as pd
import networkx as nx

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(font_scale=2)
sns.set_style('whitegrid')

from tqdm import tqdm
from sklearn import metrics

%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Helper Functions

In [3]:
def process_df(df, G):
    '''
    Processes the input dataframe so that it only contains cell nodes with degree greater than 1.

    Arguments
    -------
        df (pandas dataframe): dataframe with BC for each node in the graph 

        G (networkx graph): Input graph corresponding to the dataframe
       
    Returns
    -------
    Updated dataframe
    '''
    # Filter down to only cell nodes with degree greater than 1
    df = df[df['node_type'] == 'cell']
    cell_nodes = df['node'].tolist()
    nodes_with_degree_greater_than_1 = [n for n in cell_nodes if G.degree[n] > 1]
    df = df.loc[df['node'].isin(nodes_with_degree_greater_than_1)]
    print('There are', len(nodes_with_degree_greater_than_1), 'cell nodes with a degree greater than 1')

    return df

def get_num_meanings_groundtruth(df, G):
    '''
    Given a dataframe with the 'is_homograph' groundtruth, find the groundtruth number of meanings
    for each graph

    Returns an updated dataframe with the new column 'num_meanings_groundtruth'
    '''

    df['num_meanings_groundtruth'] = 1

    df_with_homographs = df[df['is_homograph'] == True]

    # Assign the groundtruth number of meanings for each homograph in the dataframe
    for idx, row in tqdm(df_with_homographs.iterrows(), total=df_with_homographs.shape[0]):
        df.loc[idx, 'num_meanings_groundtruth'] = len(utils.get_cell_node_column_names(G, row['node']))

    return df

def get_clustering_evaluation_score(true_labels, predicted_labels, measure='adj_rand_index'):
    '''
    Given a list of the `true_labels` and the `predicted_labels` corresponding to each member in the cluster,
    evaluate the cluster quality using the specified `measure`.

    Notice that the true and predicted labels are strings and will be first converted to integers IDs as required
    by sklearn.metrics.cluster
    '''
    true_label_to_id_dict = dict([(y,x) for x,y in enumerate(sorted(set(true_labels)))])
    predicted_label_to_id_dict = dict([(y,x) for x,y in enumerate(sorted(set(predicted_labels)))])

    true_labels_ids = [true_label_to_id_dict[x] for x in true_labels]
    predicted_labels_ids = [predicted_label_to_id_dict[x] for x in predicted_labels]

    if measure == 'adj_rand_index':
        score = metrics.adjusted_rand_score(labels_true=true_labels_ids, labels_pred=predicted_labels_ids)
    elif measure == 'adj_mutual_info':
        score = metrics.adjusted_mutual_info_score(labels_true=true_labels_ids, labels_pred=predicted_labels_ids)
    elif measure == 'norm_mutual_info':
        score = metrics.normalized_mutual_info_score(labels_true=true_labels_ids, labels_pred=predicted_labels_ids)

    return score

def get_predicted_and_gt_semantic_types_for_node(node, graph, cell_node_to_semantic_type_dict):
    gt_types = utils.graph_helpers.get_cell_node_column_names(graph, node)
    pred_types = cell_node_to_semantic_type_dict[node]
    return gt_types, pred_types

# Synthetic Benchmark Large (Number of Meanings)

In [51]:
output_path = 'output/synthetic_benchmark_large/'
g_path = 'graph_representations/synthetic_benchmark_large/bipartite.graph'

with open(output_path + "graph_stats_with_groundtruth_df.pickle", "rb") as fh:
    df = pickle.load(fh)
G = pickle.load(open(g_path, "rb"))

# Remove nodes with degree 1 from the dataframe
df = process_df(df, G)

# Compute the groundtruth for the number of meanings for each homograph
df = get_num_meanings_groundtruth(df, G)
df_homs = df[df['is_homograph']==True].sort_values(by='num_meanings_groundtruth', ascending=False)
df_homs

100%|██████████| 180/180 [00:00<00:00, 4660.77it/s]

There are 6502 cell nodes with a degree greater than 1





Unnamed: 0,node,node_type,betweenness_centrality,is_homograph,num_meanings_groundtruth
300,Lincoln,cell,4.879061e-03,True,3
546,Virginia,cell,1.466842e-03,True,3
1401,Aurora,cell,1.690384e-03,True,3
2012,Montana,cell,2.380734e-03,True,3
258,Cuba,cell,1.537346e-03,True,2
...,...,...,...,...,...
1839,AZ,cell,6.340709e-07,True,2
1840,TN,cell,6.340709e-07,True,2
1873,MG,cell,6.235608e-04,True,2
1884,Georgia,cell,1.850850e-04,True,2


In [52]:
column_node_to_semantic_type_dict = pd.read_pickle(output_path+'column_node_to_semantic_type_dict.pickle')
cell_node_to_semantic_type_dict = pd.read_pickle(output_path+'cell_node_to_semantic_type_dict.pickle')
predicted_homographs = pd.read_pickle(output_path+'predicted_homographs.pickle')
gt_homographs = set(df_homs['node'])

## Homograph Number of Meanings Evaluation 

In [53]:
df_homs['num_meanings_sherlock'] = np.nan
df_homs['adj_rand_index'] = np.nan
df_homs['adj_mutual_info'] = np.nan
df_homs['norm_mutual_info'] = np.nan

for idx, row in df_homs.iterrows():
    # Get number of meanings based on how many semantic types were assigned to it
    df_homs.loc[idx, 'num_meanings_sherlock'] = int(len(cell_node_to_semantic_type_dict[row['node']]))

    # For each homograph in `df_homs` extract the groundtruth and predicted labels to evaluate the
    # clustering quality (i.e., adjusted_rand_score)
    gt_col_names = []
    predicted_types = []
    for col_node in utils.graph_helpers.get_attribute_of_instance(G, row['node']):
        gt_col_names.append(G.nodes[col_node]['column_name'])
        predicted_types.append(column_node_to_semantic_type_dict[col_node])

    adj_rand_index_score = get_clustering_evaluation_score(true_labels=gt_col_names, predicted_labels=predicted_types, measure='adj_rand_index')
    adj_mutual_info_score = get_clustering_evaluation_score(true_labels=gt_col_names, predicted_labels=predicted_types, measure='adj_mutual_info')
    norm_mutual_info_score = get_clustering_evaluation_score(true_labels=gt_col_names, predicted_labels=predicted_types, measure='norm_mutual_info')
    df_homs.loc[idx, 'adj_rand_index'] = adj_rand_index_score
    df_homs.loc[idx, 'adj_mutual_info'] = adj_mutual_info_score
    df_homs.loc[idx, 'norm_mutual_info'] = norm_mutual_info_score

df_homs['is_num_meanings_correct'] = df_homs['num_meanings_groundtruth'] == df_homs['num_meanings_sherlock']
df_homs

Unnamed: 0,node,node_type,betweenness_centrality,is_homograph,num_meanings_groundtruth,num_meanings_sherlock,adj_rand_index,adj_mutual_info,norm_mutual_info,is_num_meanings_correct
300,Lincoln,cell,4.879061e-03,True,3,3.0,1.000000,1.000000,1.00000,True
546,Virginia,cell,1.466842e-03,True,3,3.0,1.000000,1.000000,1.00000,True
1401,Aurora,cell,1.690384e-03,True,3,3.0,1.000000,1.000000,1.00000,True
2012,Montana,cell,2.380734e-03,True,3,2.0,0.705882,0.727608,0.81329,False
258,Cuba,cell,1.537346e-03,True,2,2.0,1.000000,1.000000,1.00000,True
...,...,...,...,...,...,...,...,...,...,...
1839,AZ,cell,6.340709e-07,True,2,2.0,1.000000,1.000000,1.00000,True
1840,TN,cell,6.340709e-07,True,2,2.0,1.000000,1.000000,1.00000,True
1873,MG,cell,6.235608e-04,True,2,2.0,1.000000,1.000000,1.00000,True
1884,Georgia,cell,1.850850e-04,True,2,2.0,1.000000,1.000000,1.00000,True


In [54]:
print("Number of meanings precision:", df_homs['is_num_meanings_correct'].value_counts()[True] / len(df_homs.index))

Number of meanings precision: 0.7944444444444444


In [55]:
print("Average adjusted rand index:", df_homs['adj_rand_index'].mean())
print("Average adjusted mutual information:", df_homs['adj_mutual_info'].mean())
print("Average normalized mutual information:", df_homs['norm_mutual_info'].mean())

Average adjusted rand index: 0.7816993464052286
Average adjusted mutual information: 0.4873755996616282
Average normalized mutual information: 0.8104067016486789


## Homograph Detection Evaluation

In [56]:
precision = len(gt_homographs & predicted_homographs) / len(predicted_homographs)
recall = len(gt_homographs & predicted_homographs) / len(gt_homographs)
f1_score = (2* precision * recall) / (precision + recall)

print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1_score)

Precision: 0.1192
Recall: 0.8277777777777777
F1-Score: 0.2083916083916084


In [57]:
gt_not_predicted_by_sherlock = gt_homographs - predicted_homographs
print("GT homographs not predicted by Sherlock:", gt_not_predicted_by_sherlock)

GT homographs not predicted by Sherlock: {'Lettuce', 'Pathfinder', 'Townsend', 'Amy', 'Charity', 'Laurence', 'Focus', 'Tracy', 'Equinox', 'Mariner', 'Mirage', 'Harland', 'Ransom', 'Barrie', 'Marguerite', 'Rosie', 'Hugo', 'Crossfire', 'Valerie', 'Magnolia', 'Smith', 'Verna', 'Fredi', 'Gantz', 'Starbuck', 'Glen', 'Alleyn', 'Say', 'Calley', 'Franklyn', 'Palm'}


In [58]:
for value in gt_not_predicted_by_sherlock:
    gt_types, pred_types = get_predicted_and_gt_semantic_types_for_node(node=value, graph=G, cell_node_to_semantic_type_dict=cell_node_to_semantic_type_dict)
    print(value, "  \t\t GT types:", gt_types, '\t\t predicted types:', pred_types)

Lettuce   		 GT types: ['last_name', 'plant_name'] 		 predicted types: {'name'}
Pathfinder   		 GT types: ['movie_title', 'car_model'] 		 predicted types: {'name'}
Townsend   		 GT types: ['first_name', 'last_name'] 		 predicted types: {'name'}
Amy   		 GT types: ['first_name', 'last_name'] 		 predicted types: {'name'}
Charity   		 GT types: ['first_name', 'plant_name'] 		 predicted types: {'name'}
Laurence   		 GT types: ['first_name', 'last_name'] 		 predicted types: {'name'}
Focus   		 GT types: ['movie_title', 'car_model'] 		 predicted types: {'name'}
Tracy   		 GT types: ['first_name', 'last_name'] 		 predicted types: {'name'}
Equinox   		 GT types: ['movie_title', 'car_model'] 		 predicted types: {'name'}
Mariner   		 GT types: ['car_model', 'last_name'] 		 predicted types: {'name'}
Mirage   		 GT types: ['movie_title', 'car_model'] 		 predicted types: {'name'}
Harland   		 GT types: ['first_name', 'last_name'] 		 predicted types: {'name'}
Ransom   		 GT types: ['movie_title', 'f

In [59]:
non_homographs_predicted_by_sherlock = list(predicted_homographs - gt_homographs)
print("Non-GT homographs predicted by Sherlock:", non_homographs_predicted_by_sherlock[:20])

Non-GT homographs predicted by Sherlock: ['Tagfeed', 'Wine - Savigny - Les - Beaune', 'Shrimp - Baby, Warm Water', 'Talyah', 'Island Oasis - Magarita Mix', 'Bertie', 'Fudge - Chocolate Fudge', 'Edgepulse', 'Eugenio', 'Grand Marnier', 'Soup - Tomato Mush. Florentine', 'Eazzy', 'Realcube', 'Garwin', 'Thoughtbridge', 'Neile', 'Marzipan 50/50', 'Daisy', 'Oodoo', 'Red Desert (Deserto rosso, Il)']


In [60]:
for value in non_homographs_predicted_by_sherlock[:20]:
    gt_types, pred_types = get_predicted_and_gt_semantic_types_for_node(node=value, graph=G, cell_node_to_semantic_type_dict=cell_node_to_semantic_type_dict)
    print(value, "  \t\t GT types:", gt_types, '\t\t predicted types:', pred_types)

Tagfeed   		 GT types: ['company_name'] 		 predicted types: {'creator', 'location', 'name'}
Wine - Savigny - Les - Beaune   		 GT types: ['grocery'] 		 predicted types: {'description', 'product'}
Shrimp - Baby, Warm Water   		 GT types: ['grocery'] 		 predicted types: {'description', 'product'}
Talyah   		 GT types: ['first_name'] 		 predicted types: {'sex', 'name'}
Island Oasis - Magarita Mix   		 GT types: ['grocery'] 		 predicted types: {'description', 'product'}
Bertie   		 GT types: ['first_name'] 		 predicted types: {'sex', 'name'}
Fudge - Chocolate Fudge   		 GT types: ['grocery'] 		 predicted types: {'description', 'product'}
Edgepulse   		 GT types: ['company_name'] 		 predicted types: {'creator', 'location', 'name'}
Eugenio   		 GT types: ['first_name'] 		 predicted types: {'sex', 'name'}
Grand Marnier   		 GT types: ['grocery'] 		 predicted types: {'description', 'product'}
Soup - Tomato Mush. Florentine   		 GT types: ['grocery'] 		 predicted types: {'description', 'product

In [61]:
gt_types, pred_types = get_predicted_and_gt_semantic_types_for_node(node="Montana", graph=G, cell_node_to_semantic_type_dict=cell_node_to_semantic_type_dict)
print(value, "  \t\t GT types:", gt_types, '\t\t predicted types:', pred_types)

Red Desert (Deserto rosso, Il)   		 GT types: ['movie_title', 'state', 'car_model'] 		 predicted types: {'state', 'name'}


# D4-Education Dataset

In [81]:
output_path = 'output/D4-Education/'
g_path = 'graph_representations/D4-Education/bipartite.graph'

with open(output_path+'input_nodes.json') as f:
  input_nodes = json.load(f)['input_nodes']

with open(output_path + "graph_stats_with_groundtruth_df.pickle", "rb") as fh:
    df = pickle.load(fh)
G = pickle.load(open(g_path, "rb"))

# Remove nodes with degree 1 from the dataframe
df = process_df(df, G)

# Use the 100 highest BC values as a small GT
df_homs = df[df['node'].isin(input_nodes)]
df_homs

There are 98 cell nodes with a degree greater than 1


Unnamed: 0,node,node_type,betweenness_centrality,dense_rank,is_homograph,num_meanings_groundtruth
45335,Chemistry,cell,10.858448,14.0,True,4
37275,--,cell,5.823986,20.0,True,3
62626,Early College,cell,5.68875,21.0,True,5
1062913,166 ESSEX STREET,cell,5.050365,23.0,True,3
1396771,Please contact the school for more information.,cell,4.9193,24.0,True,4
879122,CEDAR STREET,cell,3.471863,32.0,True,2
426115,EDGEWOOD AVENUE,cell,3.471863,32.0,True,2
634170,"CUSHMAN & WAKEFIELD, INC.",cell,2.972433,33.0,True,2
963488,26 COURT STREET,cell,2.902326,34.0,True,2
54531,Kelly,cell,2.867587,35.0,True,2


In [82]:
# Fix the ground truth
df_homs.loc[df_homs['node']=='Early College', 'is_homograph']=False
df_homs.loc[df_homs['node']=='166 ESSEX STREET', 'is_homograph']=False
df_homs.loc[df_homs['node']=='CEDAR STREET', 'is_homograph']=False
df_homs.loc[df_homs['node']=='EDGEWOOD AVENUE', 'is_homograph']=False
df_homs.loc[df_homs['node']=='Academic Comprehensive Program', 'is_homograph']=False
df_homs.loc[df_homs['node']=='64 AVENUE', 'is_homograph']=False

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [84]:
column_node_to_semantic_type_dict = pd.read_pickle(output_path+'column_node_to_semantic_type_dict.pickle')
cell_node_to_semantic_type_dict = pd.read_pickle(output_path+'cell_node_to_semantic_type_dict.pickle')
predicted_homographs = pd.read_pickle(output_path+'predicted_homographs.pickle')
gt_homographs = set(df_homs[df_homs['is_homograph']==True]['node'])

In [85]:
print("Sherlock predicted:", len(predicted_homographs), 'unique homographs over cell values found in more than one column')

Sherlock predicted: 44818 unique homographs over cell values found in more than one column


## Homograph Number of Meanings Evaluation

In [86]:
df_homs['num_meanings_sherlock'] = np.nan
df_homs['adj_rand_index'] = np.nan
df_homs['adj_mutual_info'] = np.nan
df_homs['norm_mutual_info'] = np.nan

for idx, row in df_homs.iterrows():
    # Get number of meanings based on how many semantic types were assigned to it
    df_homs.loc[idx, 'num_meanings_sherlock'] = int(len(cell_node_to_semantic_type_dict[row['node']]))

    # For each homograph in `df_homs` extract the groundtruth and predicted labels to evaluate the
    # clustering quality (i.e., adjusted_rand_score)
    gt_col_names = []
    predicted_types = []
    for col_node in utils.graph_helpers.get_attribute_of_instance(G, row['node']):
        gt_col_names.append(G.nodes[col_node]['column_name'])
        predicted_types.append(column_node_to_semantic_type_dict[col_node])

    adj_rand_index_score = get_clustering_evaluation_score(true_labels=gt_col_names, predicted_labels=predicted_types, measure='adj_rand_index')
    adj_mutual_info_score = get_clustering_evaluation_score(true_labels=gt_col_names, predicted_labels=predicted_types, measure='adj_mutual_info')
    norm_mutual_info_score = get_clustering_evaluation_score(true_labels=gt_col_names, predicted_labels=predicted_types, measure='norm_mutual_info')
    df_homs.loc[idx, 'adj_rand_index'] = adj_rand_index_score
    df_homs.loc[idx, 'adj_mutual_info'] = adj_mutual_info_score
    df_homs.loc[idx, 'norm_mutual_info'] = norm_mutual_info_score

df_homs['is_num_meanings_correct'] = df_homs['num_meanings_groundtruth'] == df_homs['num_meanings_sherlock']
df_homs

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using

Unnamed: 0,node,node_type,betweenness_centrality,dense_rank,is_homograph,num_meanings_groundtruth,num_meanings_sherlock,adj_rand_index,adj_mutual_info,norm_mutual_info,is_num_meanings_correct
45335,Chemistry,cell,10.858448,14.0,True,4,3.0,0.761905,0.761905,0.90485,False
37275,--,cell,5.823986,20.0,True,3,3.0,1.0,1.0,1.0,True
62626,Early College,cell,5.68875,21.0,False,5,4.0,0.470588,0.512599,0.774958,False
1062913,166 ESSEX STREET,cell,5.050365,23.0,False,3,1.0,0.0,0.0,0.0,False
1396771,Please contact the school for more information.,cell,4.9193,24.0,True,4,4.0,1.0,0.0,1.0,True
879122,CEDAR STREET,cell,3.471863,32.0,False,2,1.0,0.0,0.0,0.0,False
426115,EDGEWOOD AVENUE,cell,3.471863,32.0,False,2,1.0,0.0,0.0,0.0,False
634170,"CUSHMAN & WAKEFIELD, INC.",cell,2.972433,33.0,True,2,1.0,0.0,0.0,0.0,False
963488,26 COURT STREET,cell,2.902326,34.0,True,2,1.0,0.0,0.0,0.0,False
54531,Kelly,cell,2.867587,35.0,True,2,1.0,0.0,0.0,0.0,False


In [87]:
print("Number of meanings precision:", df_homs['is_num_meanings_correct'].value_counts()[True] / len(df_homs.index))

Number of meanings precision: 0.3103448275862069


In [67]:
print("Average adjusted rand index:", df_homs['adj_rand_index'].mean())
print("Average adjusted mutual information:", df_homs['adj_mutual_info'].mean())
print("Average normalized mutual information:", df_homs['norm_mutual_info'].mean())

Average adjusted rand index: 0.38349592710647473
Average adjusted mutual information: 0.21668200187039124
Average normalized mutual information: 0.5213024991115666


## Homograph Detection Evaluation

In [88]:
intersection = gt_homographs & predicted_homographs
homographs_from_gt_not_found = gt_homographs - predicted_homographs
print("Intersection size:", len(intersection))
print("Number of GT homographs not found:", len(homographs_from_gt_not_found))
print("GT homographs not found:", homographs_from_gt_not_found)

Intersection size: 15
Number of GT homographs not found: 8
GT homographs not found: {'Expected as the school grows', 'CUSHMAN & WAKEFIELD, INC.', 'John', '2001152.0', '26 COURT STREET', 'Kelly', 'Moses', 'Dominick'}


In [89]:
for value in homographs_from_gt_not_found:
    gt_types, pred_types = get_predicted_and_gt_semantic_types_for_node(node=value, graph=G, cell_node_to_semantic_type_dict=cell_node_to_semantic_type_dict)
    print(value, "  \t\t GT types:", gt_types, '\t\t predicted types:', pred_types)

Expected as the school grows   		 GT types: ['online_language_courses', 'online_ap_courses'] 		 predicted types: {'language'}
CUSHMAN & WAKEFIELD, INC.   		 GT types: ['corporationname', 'prequalified_vendor_name'] 		 predicted types: {'company'}
John   		 GT types: ['prequalified_vendor_contact_person', 'firstname'] 		 predicted types: {'name'}
2001152.0   		 GT types: ['s4_ac_name_fsf_hs', 'bin'] 		 predicted types: {'address'}
26 COURT STREET   		 GT types: ['businessstreetname', 'prequalified_vendor_address'] 		 predicted types: {'address'}
Kelly   		 GT types: ['lastname', 'sculptor'] 		 predicted types: {'name'}
Moses   		 GT types: ['firstname', 'name'] 		 predicted types: {'name'}
Dominick   		 GT types: ['prequalified_vendor_contact_person', 'firstname'] 		 predicted types: {'name'}


In [90]:
for value in df_homs['node']:
    gt_types, pred_types = get_predicted_and_gt_semantic_types_for_node(node=value, graph=G, cell_node_to_semantic_type_dict=cell_node_to_semantic_type_dict)
    print(value, "  \t\t GT types:", gt_types, '\t\t predicted types:', pred_types)

Chemistry   		 GT types: ['online_ap_courses', 'core_course_ms_core_and_9_12_only_', 'name', 'fileorder'] 		 predicted types: {'category', 'language', 'name'}
--   		 GT types: ['p_s_234_282_grennwich_st', 'prequalified_vendor_phone_number', 'corporationname'] 		 predicted types: {'result', 'duration', 'company'}
Early College   		 GT types: ['academicopportunities1', 'program1', 'program', 'program_highlights', 'academicopportunities3'] 		 predicted types: {'description', 'education', 'requirement', 'name'}
166 ESSEX STREET   		 GT types: ['businessstreetname', 'address', 'street_address'] 		 predicted types: {'address'}
Please contact the school for more information.   		 GT types: ['online_language_courses', 'school_sports', 'open_house_info', 'email'] 		 predicted types: {'description', 'address', 'notes', 'language'}
CEDAR STREET   		 GT types: ['businessstreetname', 'street_address'] 		 predicted types: {'address'}
EDGEWOOD AVENUE   		 GT types: ['businessstreetname', 'street_add