# Novelty analysis
In this notebook, we'll look at the trade-offs between novelty and accuracy in semantic matching.

In [1]:
import logging
logging.getLogger().setLevel(logging.INFO)

import takco
import pandas as pd

# Parse the DBpedia configuration file.
conf = takco.config.parse('resources/config-dbpedia.toml')
# Load the tables matching the '1-link' output glob of the baseline run.
tables = takco.TableSet.load('output/t2d-v2-baseline-2/1-link/*')

# Build the 't2d-v2' object from the parsed config and score the loaded tables
# against it, restricted to the key column.
# (presumably 't2d-v2' is the annotated gold standard -- TODO confirm)
t2dv2 = takco.config.build('t2d-v2', conf)
scored_tables = tables.score(t2dv2, keycol_only=True)
# NOTE(review): the return value of persist() is discarded here; confirm that
# it persists in place rather than returning a new persisted collection.
scored_tables.tables.persist()

# Preview only the tables whose 'gold' mapping contains at least one truthy value.
takco.preview( t for t in scored_tables if any(t.get('gold', {}).values()) )

INFO:root:Loading data from resources/t2d_fix.csv
INFO:root:Read 512 tables from data/t2d-v2/tables
INFO:root:Read 512 entity tables from data/t2d-v2/instance
INFO:root:Loaded 514 annotated tables


?,0,1,2
∈,Book,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Unnamed: 1_level_2,author,Unnamed: 3_level_2
Unnamed: 0_level_3,Title,Author,Source
,Adventures of Huckleberry Finn  ❌  💡,Mark Twain,ALA [11]
,The Adventures of Super Diaper Baby  ❌  💡,Dav Pilkey,ALA [47]
,The Adventures of Tom Sawyer  ❌  💡,Mark Twain,ALA
,Alice series  ❌  💡,Phyllis Reynolds Naylor,ALA [2]
,All the King's Men  ❌  💡,Robert Penn Warren,Rad

?,0,1,2
∈,Unnamed: 1_level_1,Newspaper,Unnamed: 3_level_1
Unnamed: 0_level_2,#,Media,MIX
,1,Dainik Jagran  ❌  💡,27.5
,2,Dainik Bhaskar  ❌  💡,14.0
,3,Aajtak TV,7.0
,4,CNN Editions (International)  ❓,6.0
,5,Dinakaran  ❌  💡,5.0

?,0,1,2,3,4,5,6
∈,Unnamed: 1_level_1,Unnamed: 2_level_1,Building,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,location,floorCount💡,Unnamed: 6_level_2,openingDate💡
Unnamed: 0_level_3,#,Geb?ude,Geb?ude,Stadt,Etagen,H?he,Jahr
,1,,Burj Khalifa  ❌  💡,Dubai,163,2.717 ft,2010
,2,,Makkah Clock Royal Tower [Abraj Al Bait]  ❌  💡,Mekka,95,1.972 ft,2012
,3,,Taipei 101  ❌  💡,Taipei,101,1.671 ft,2004
,4,,Shanghai World Financial Center  ❌  💡,Shanghai,101,1.614 ft,2008
,5,,International Commerce Centre [Union Square]  ❌  💡,Hong Kong,118,1.588 ft,2010

?,0,1,2,3,4
∈,Unnamed: 1_level_1,Company,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Unnamed: 1_level_2,Unnamed: 2_level_2,industry,Unnamed: 4_level_2,Unnamed: 5_level_2
Unnamed: 0_level_3,Rank,Company,Industry,Temkin Experience Rating (TER),Company TER vs Industry TER
,1,Sam's Club  ❌  💡,Retailer,85%,13.0
,2,Publix  ❌  💡,Grocery Chain,81%,4.9
,3,A credit union,Bank,80%,14.5
,3,Chick-fil-A  ❌  💡,Fast Food Chain,80%,6.2
,3,Subway  ❌  💡,Fast Food Chain,80%,6.4

?,0,1,2,3,4,5,6
∈,Mountain,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Unnamed: 0_level_2,PEAK,RANKING,MAP,GUIDE,GRID REF,ALT (ft),ALT (m)
,Allen Crags  ❌  💡,43,SW,E,NY 236 085,2572,784
,Angletarn Pikes  ❌  💡,143,NE,FE,NY 414 148,1857,566
,Ard Crags  ❌  💡,142,NW,NW,NY 207 197,1860,567
,Armboth Fell  ❌  💡,182,NW,C,NY 297 159,1570,479
,Arnison Crag  ❌  💡,194,NE,E,NY 394 150,1424,434

?,0,1,2,3,4
∈,Unnamed: 1_level_1,Country,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,frenchName💡,Unnamed: 2_level_2,region💡,capital,timeZone💡
Unnamed: 0_level_3,A,Nom en anglais,Endroit,Capitale,Heure
,Afghanistan,Afghanistan  ❌  💡,Asie,Kabul,4.5
,Afrique du Sud,South Afrique  💡,Afrique,Pretoria,2.0
,Albanie,Albania  ❌  💡,Europe,Tirane,1.0
,Alderney (UK) voir les Anglo-Normandes,Alderney  ❌  💡,Europe,,0.0
,Algrie,Algeria  ❌  💡,Afrique,Algiers,1.0

?,0,1,2,3,4,5,6
∈,Unnamed: 1_level_1,Country,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Unnamed: 0_level_2,Unnamed: 1_level_2,Country Name:,Population,Area (Sq. Km.),Population Density (Sq. Km.),Area (Sq. Mi.),Population Density (Sq. Mi.)
,36,China  ❌  💡,1339190000,9596960.0,139.54,3705405.45,361.42
,77,India  ❌  💡,1184639000,3287590.0,360.34,1269345.07,933.27
,183,United States of America  ❌  💡,309975000,9629091.0,32.19,3717811.29,83.38
,78,Indonesia  ❌  💡,234181400,1919440.0,122.01,741099.62,315.99
,24,Brazil  ❌  💡,193364000,8511965.0,22.72,3286486.71,58.84

?,0,1,2,3,4,5
∈,VideoGame,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,Unnamed: 1_level_2,publisher,releaseDate💡,releaseDate💡,Unnamed: 5_level_2,Unnamed: 6_level_2
Unnamed: 0_level_3,Title,Publisher,EU Release Date,AU Release Date,PEGI,ACB
,Donkey Kong Country  ❌  💡,Nintendo,2006-12-08,2006-12-07,7,G
,F-Zero  ❌  💡,Nintendo,2006-12-08,2006-12-07,3,G
,SimCity  ❌  💡,Nintendo,2006-12-29,2006-12-29,3,G
,Super Castlevania IV  ❌  💡,Konami,2006-12-29,2006-12-29,3,PG
,Street Fighter II: The World Warrior  ❌  💡,Capcom,2007-01-19,2007-01-19,12,PG

?,0,1,2,3,4
∈,Unnamed: 1_level_1,RadioStation,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Unnamed: 1_level_2,Unnamed: 2_level_2,programmeFormat💡,city  ❓,Unnamed: 5_level_2
Unnamed: 0_level_3,Dial Location,Call Letters,Format,Address,Telephone
,AM 790,KABC (ABC Radio Networks)  ❌  💡,News/Talk,"3321 S La Cienega Blvd, Los Angeles 90016",(310) 840-4900
,AM 900,KALI AM  ❌  💡,Spanish News/Talk,"747 E Green St, Pasadena 91101",(626) 844-8882
,AM 1300,KAZN (Asian Radio)  ❌  💡,Chinese Variety,"747 E Green St, Pasadena 91101",(626) 568-1300
,AM 1580,KBLA  ❌  💡,Spanish News/Talk,"123 Figueroa St, #101A, Los Angeles 90012",(213) 628-8700
,AM 740,KBRT (K-Bright)  ❌  💡,Religious Talk,"3183-D Airway Ave, Costa Mesa 92626",(714) 754-4450

?,0,1,2
∈,Unnamed: 1_level_1,Hospital,Unnamed: 3_level_1
Unnamed: 0_level_2,Local Health Boards,Hospital name,Link Surgeons
,Abertawe Bro Morgannwg University LHB,Morriston Hospital (Swansea)  ❌  💡,Roger Morgan
,,Singleton Hospital (Swansea)  ❌  💡,Roger Morgan
,,Princess of Wales Hospital (Bridgend)  ❌  💡,Roger Morgan
,Aneurin Bevan LHB,Neville Hall Hospital (Abergavenny)  ❌  💡,Richard Blackett
,,Royal Gwent Hospital (Newport)  ❌  💡,Ahmed Shandall


In [21]:
# Build the 'dbpedia_t2ksubset' object from the parsed config.
# (presumably the knowledge base that novelty is measured against -- TODO confirm)
db = takco.config.build('dbpedia_t2ksubset', conf)

# Turn the scored tables into triples and run the novelty step against `db`.
novelty_tables = scored_tables.triples().novelty(db)
# NOTE(review): the return value of persist() is discarded here; confirm that
# it persists in place rather than returning a new persisted collection.
novelty_tables.tables.persist()

# Count the resulting tables by iterating over the whole set.
len(list(novelty_tables))



235

In [22]:
# Build the evaluation report for the novelty tables, restricted to the key column.
report = novelty_tables.report(keycol_only=True)

# Render the 'scores' section as a captioned table; orient='index' makes each
# top-level key of the dict one row.
display(pd.DataFrame.from_dict(report['scores'], orient='index').style.set_caption('Predictions:'))

def reform_dict(dictionary, t=tuple(), reform=None):
    """Flatten a nested dict into one dict keyed by tuples of the key path.

    A value is descended into only while it is a dict whose values are all
    dicts themselves. The first value that fails that test is stored as a
    leaf under the tuple of keys leading to it. An empty dict value passes
    the test vacuously and therefore contributes no entry.

    Args:
        dictionary: The nested mapping to flatten.
        t: Tuple of keys already traversed; used as the prefix of every
            path produced from ``dictionary``.
        reform: Dict that collects the flattened entries. A fresh dict is
            created when omitted, so separate calls never share state.

    Returns:
        ``reform``, mapping each key-path tuple to its leaf value.
    """
    # A ``{}`` default would be created once and shared by every call,
    # leaking entries from earlier calls into later results.
    if reform is None:
        reform = {}
    for key, val in dictionary.items():
        path = t + (key,)
        if isinstance(val, dict) and all(isinstance(v, dict) for v in val.values()):
            # Pass the accumulator explicitly so nested levels write into it.
            reform_dict(val, path, reform)
        else:
            reform[path] = val
    return reform

# Flatten the nested 'novelty' section into tuple-keyed rows and show it as the
# cell's output. The former bare `display()` call rendered nothing and was dropped.
pd.DataFrame.from_dict(reform_dict(report['novelty']), orient='index').style.set_caption('Extractions:')

INFO:root:Collected 26104 gold and 24061 pred for task entities
INFO:root:Collected 434 gold and 157 pred for task properties
INFO:root:Collected 235 gold and 235 pred for task classes


Unnamed: 0,precision,recall,f1-score,support,predictions
entities,0.86767,0.799762,0.832333,26104,24061
properties,0.77707,0.281106,0.41286,434,157
classes,0.740426,0.740426,0.740426,235,235


Unnamed: 0,Unnamed: 1,Unnamed: 2,tp,fn,fp,precision,recall,f1
dbpedia_t2ksubset,label,existing,9471,1027,728,0.92862,0.902172,0.915205
dbpedia_t2ksubset,label,attnovel,2549,2424,1030,0.71221,0.512568,0.596118
dbpedia_t2ksubset,label,valnovel,351,468,349,0.501429,0.428571,0.462146
dbpedia_t2ksubset,class,existing,5676,2190,3530,0.616554,0.721587,0.664948
dbpedia_t2ksubset,class,attnovel,1886,1464,1332,0.586078,0.562985,0.5743
dbpedia_t2ksubset,class,valnovel,62,1546,432,0.125506,0.038557,0.058991
dbpedia_t2ksubset,property,existing,3100,580,994,0.757206,0.842391,0.79753
dbpedia_t2ksubset,property,attnovel,2858,2450,2204,0.564599,0.538433,0.551205
dbpedia_t2ksubset,property,valnovel,3422,412,2257,0.602571,0.89254,0.719437
