# Wikidata evaluation

Evaluate the EKE methodology w.r.t the baseline (i.e TF-IDF)

## Define filepaths

In [1]:
root_dir = '../..'
data_dir = 'data'
corpus_dir = 'corpus'
rank_dir = 'terms'

In [2]:
version = 'v2'

## Load ground truth

Load each dataset

In [3]:
import os
import pandas as pd

In [4]:
wikidata_dir = 'cities_wikidata'

In [5]:
datasets_df = []

for filename in os.listdir(os.path.join(root_dir, data_dir, corpus_dir, wikidata_dir)):
    filepath = os.path.join(root_dir, data_dir, corpus_dir, wikidata_dir, filename)
    entry_df = pd.read_excel(filepath, index_col=0)
    datasets_df.append(entry_df)

len(datasets_df)

329

Merge all datasets into a single DataFrame

In [6]:
df = pd.concat(datasets_df, axis=0, ignore_index=True)
df = df[['entity', 'target']].drop_duplicates()
df.head()

Unnamed: 0,entity,target
0,Q42053,"Isfahan (City in Iran, Isfahan province, cente..."
17,Q1367759,Isfahan Province (Province of Iran)
18,Q25433162,Isfahan (electoral district in Iran)
30,Q21998562,Isfahan (1967 jazz song by Duke Ellington)
45,Q1282082,"Isfahan County (county in Isfahan, Iran)"


In [7]:
gt_dict = pd.Series(df['target'].values,index=df['entity']).to_dict()

In [8]:
del datasets_df
del df

## Load rankings

### LIME-based

In [9]:
lime_filename = f'ranking_wikidata_nn_{version}.xlsx'
lime_filepath = os.path.join(root_dir, data_dir, rank_dir, lime_filename)

In [10]:
lime_df = pd.read_excel(lime_filepath, index_col=0)
# Replace '_' character with whitespaces
lime_df['term'] = lime_df['term'].map(lambda x: x.replace('_', ' '))
lime_df.head()

Unnamed: 0,label,term,weight,rank,count
0,Q1005682,hamburg,6.433465,1.0,11
1,Q1005682,states,2.278421,2.454545,11
2,Q1005682,513,0.418916,3.0,2
3,Q1005682,minnesota,1.838184,3.1,10
4,Q1005682,united,2.012429,3.636364,11


In [11]:
lime_df.shape

(3829, 5)

### Baseline

In [12]:
baseline_filename = f'ranking_wikidata_baseline_{version}.xlsx'
baseline_filepath = os.path.join(root_dir, data_dir, rank_dir, baseline_filename)

In [13]:
baseline_df = pd.read_excel(baseline_filepath, index_col=0)
# Replace '_' character with whitespaces
baseline_df['term'] = baseline_df['term'].map(lambda x: x.replace('_', ' '))
baseline_df.head()

Unnamed: 0,label,term
0,Q1754,stockholm
1,Q1754,sthlm
2,Q1754,city
3,Q1754,sweden
4,Q1754,capital


In [14]:
baseline_df.shape

(895050, 2)

## Compute precision and recall

In [15]:
src_dir = 'src'

In [16]:
import sys
sys.path.append(os.path.join(root_dir, src_dir))

from training import TrainingCorpus

array(['stockholm', 'sthlm', 'city', ..., '鄂尔多斯沙漠', '鄂爾多斯', '鄂爾多斯沙漠'],
      dtype=object)

In [24]:
baseline_df.term.values.shape

(895050,)

In [45]:
def compute_fscore(p, r, beta):
    try:
        return (1 + beta**2) * p * r/(beta**2 * p + r)
    except:
        return 0

In [47]:
def compute_metrics(entity_id, gt_dict, terms):
    size = terms.shape[0]
    target_tokens = TrainingCorpus.tokenize(gt_dict[entity_id].lower())
    target_tokens_len = len(target_tokens)
    
    tokens_count = 0
    tokens_found = []
    
    precision_list = []
    recall_list = []
    f1score_list = []
    f2score_list = []
    f05score_list = []
    
    for i in range(size):
        selected_terms = terms[i].split()
        
        for term in selected_terms:
            tokens_count += 1
            if term in target_tokens and term not in tokens_found:
                tokens_found.append(term)
        
        precision = len(tokens_found)/tokens_count
        precision_list.append(precision)
        
        recall = len(tokens_found)/target_tokens_len
        recall_list.append(recall)

        f1score = compute_fscore(precision, recall, 1)
        f1score_list.append(f1score)
        
        f2score = compute_fscore(precision, recall, 2)
        f2score_list.append(f2score)
        
        f05score = compute_fscore(precision, recall, 0.5)
        f05score_list.append(f05score)
    
    # debug only
    print(precision_list)
    print(recall_list)
    print(f1score_list)
    print(f2score_list)
    print(f05score_list)
    
    metrics_dict = {}
    metrics_dict['entity'] = entity_id
    metrics_dict['k_recall'] = -1
    metrics_dict['max_f1scores'] = [-1] * size
    metrics_dict['k_f1score'] = -1
    metrics_dict['max_f2scores'] = [-1] * size
    metrics_dict['max_f05scores'] = [-1] * size
    metrics_dict['max_recall'] = -1
    metrics_dict['max_precision'] = -1
    
    # compute k_recall
    k_recall_threshold = 0.9
    not_found = True
    i = 0
    while i < size and not_found:
        if recall_list[i] > 0.9:
            not_found = False
            metrics_dict['k_recall'] = i + 1 # starts from 0
        i += 1
    
    # compute max_f1scores
    max_f1score = -1
    for i in range(size):
        if f1score_list[i] > max_f1score:
            max_f1score = f1score_list[i]
        metrics_dict['max_f1scores'][i] = max_f1score
    
    # compute k_f1score
    metrics_dict['k_f1score'] = max(list(range(size)), key=lambda i: f1score_list[i])
    
    # compute max_f2scores
    max_f2score = -1
    for i in range(size):
        if f2score_list[i] > max_f2score:
            max_f2score = f2score_list[i]
        metrics_dict['max_f2scores'][i] = max_f2score
    
    # compute max_f05scores
    max_f05score = -1
    for i in range(size):
        if f05score_list[i] > max_f05score:
            max_f05score = f05score_list[i]
        metrics_dict['max_f05scores'][i] = max_f05score
    
    # compute max_recall
    p_threshold = 0.9
    selected_idxs = [idx for idx, p in enumerate(precision_list) if p > p_threshold]
    metrics_dict['max_recall'] = max([recall_list[idx] for idx in selected_idxs])
    
    # compute max_precision
    r_threshold = 0.9
    selected_idxs = [idx for idx, r in enumerate(recall_list) if r > r_threshold]
    metrics_dict['max_precision'] = max([precision_list[idx] for idx in selected_idxs])
    
    return metrics_dict 

In [32]:
test_df = baseline_df[baseline_df['label'] == 'Q1754'].head(10)
test_df

Unnamed: 0,label,term
0,Q1754,stockholm
1,Q1754,sthlm
2,Q1754,city
3,Q1754,sweden
4,Q1754,capital
5,Q1754,baltic
6,Q1754,ericsson
7,Q1754,fourteen islands
8,Q1754,globe
9,Q1754,hall


In [42]:
TrainingCorpus.tokenize(gt_dict['Q1754'].lower())

['stockholm', 'capital', 'sweden']

TODO:
 - test function
 - plot
 - plot for alaska

In [48]:
compute_metrics('Q1754', gt_dict, test_df['term'].values)

[1.0, 0.5, 0.3333333333333333, 0.5, 0.6, 0.5, 0.42857142857142855, 0.3333333333333333, 0.3, 0.2727272727272727]
[0.3333333333333333, 0.3333333333333333, 0.3333333333333333, 0.6666666666666666, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
[0.5, 0.4, 0.3333333333333333, 0.5714285714285715, 0.7499999999999999, 0.6666666666666666, 0.6, 0.5, 0.4615384615384615, 0.42857142857142855]
[0.3846153846153846, 0.3571428571428571, 0.3333333333333333, 0.625, 0.8823529411764706, 0.8333333333333334, 0.7894736842105262, 0.7142857142857143, 0.6818181818181818, 0.6521739130434783]
[0.7142857142857143, 0.45454545454545453, 0.3333333333333333, 0.5263157894736842, 0.6521739130434783, 0.5555555555555556, 0.48387096774193544, 0.3846153846153846, 0.3488372093023256, 0.3191489361702128]


{'entity': 'Q1754',
 'k_recall': 5,
 'max_f1scores': [0.5,
  0.5,
  0.5,
  0.5714285714285715,
  0.7499999999999999,
  0.7499999999999999,
  0.7499999999999999,
  0.7499999999999999,
  0.7499999999999999,
  0.7499999999999999],
 'k_f1score': 4,
 'max_f2scores': [0.3846153846153846,
  0.3846153846153846,
  0.3846153846153846,
  0.625,
  0.8823529411764706,
  0.8823529411764706,
  0.8823529411764706,
  0.8823529411764706,
  0.8823529411764706,
  0.8823529411764706],
 'max_f05scores': [0.7142857142857143,
  0.7142857142857143,
  0.7142857142857143,
  0.7142857142857143,
  0.7142857142857143,
  0.7142857142857143,
  0.7142857142857143,
  0.7142857142857143,
  0.7142857142857143,
  0.7142857142857143],
 'max_recall': 0.3333333333333333,
 'max_precision': 0.6}

In [46]:
compute_fscore(0, 0, 1)

0