In [1]:
import re
import xapian
import pandas as pd
import numpy as np
import csv

In [2]:
pd.options.display.max_colwidth = 100

In [8]:
DBPATH = "ner_index"
#SOURCES = ['dev', 'train']
SOURCES = ['dev']
SEARCH_RESULT_PATH = "data/search_results_v2.csv"

In [9]:
!xapian-delve $DBPATH

UUID = fbb19864-3553-4f7e-bd55-ffd6bdaa19a8
number of documents = 5365422
average document length = 174.506
document length lower bound = 1
document length upper bound = 68701
highest document id ever used = 5365422
has positional information = true
revision = 628
currently open for writing = false


In [10]:
!xapian-delve -r 40000 -d $DBPATH

Data for record #40000:
{"doc_id": "1992_Los_Angeles_Raiders_season", "shard": "001", "text": "The 1992 Los Angeles Raiders season was their 33rd in the National Football League -LRB- NFL -RRB- . They were unable to improve upon their previous season 's output of 9 -- 7 , winning only seven games . This was the first time in three seasons the team failed to qualify for the playoffs .", "keywords": ["(NFL", "1992_Los_Angeles_Raiders_season", "1992", "33rd", "Los_Angeles_Raiders", "National_Football_League", "NFL"]}
Term List for record #40000: 1992 33rd 7 9 K(nfl K1992 K1992_los_angeles_raiders_season K33rd Klos_angeles_raiders Knational_football_league Knfl Q1992_Los_Angeles_Raiders_season Zangel Zfail Zfirst Zfootbal Zfor Zgame Zimprov Zin Zleagu Zlos Zlrb Znation Znfl Zof Zonli Zoutput Zplayoff Zprevious Zqualifi Zraider Zrrb Zs Zseason Zseven Zteam Zthe Ztheir Zthey Zthis Zthree Ztime Zto Zunabl Zupon Zwas Zwere Zwin angeles failed first football for games improve in league los lr

In [11]:
import spacy
import en_core_web_lg

nlp = en_core_web_lg.load()
def parse_with_spacy(text):
    doc = nlp(text)
    record = dict(
        named_entities = [{'entity': span.text, 'label': span.label_, 'root': span.root.text} for span in doc.ents],
        noun_phrases = [{'noun_phrase': span.text, 'root': span.root.text} for span in doc.noun_chunks]
    )
    return record

def preprocess_ner(s):
    s = re.sub("^(The|the|A|a|An|an)\s", "", s)
    s = re.sub("\s", "_", s) 
    return s

def obtain_nouns(v):
    ents = {preprocess_ner(item['entity']) for item in v['named_entities']}
    nps = {preprocess_ner(item['noun_phrase']) for item in v['noun_phrases']}
    return ents.union(nps)

# Retrieve documents relevant to claims

In [12]:
def get_doc_id(match):
    for term in match.document.termlist():
        term = term.term.decode("utf-8") 
        m = re.match("Q(.*)", term)
        if m:
            return m[1]
    return None

In [13]:
# Prepare enquiry object

# Open the database we're going to search.
db = xapian.Database(DBPATH)

# Set up a QueryParser with a stemmer and suitable prefixes
queryparser = xapian.QueryParser()
queryparser.set_stemmer(xapian.Stem("en"))
queryparser.set_stemming_strategy(queryparser.STEM_SOME)
queryparser.add_prefix('keywords', 'K')

# Use an Enquire object on the database to run the query
enquire = xapian.Enquire(db)    

In [63]:
query = 'keywords:"1992_Los_Angeles_Raiders_season nfl" first time in three seasons the team failed'


query = queryparser.parse_query(query)
print(query)
enquire.set_query(query)
matches = enquire.get_mset(0, pagesize)

query_results = []
for match in matches:
    result = dict(
        claim_id = claim_id,
        found_doc = get_doc_id(match),
        rank = match.rank + 1,
        percentage = match.percent,
        weight = match.weight,            
    )
    query_results.append(result)
query_results

Query(((K1992_los_angeles_raiders_season@1 PHRASE 2 Knfl@2) OR (Zfirst@3 OR Ztime@4 OR Zin@5 OR Zthree@6 OR Zseason@7 OR Zthe@8 OR Zteam@9 OR Zfail@10)))


[{'claim_id': 266,
  'found_doc': '1948_New_York_Yankees_-LRB-AAFC-RRB-_season',
  'percentage': 80,
  'rank': 1,
  'weight': 13.67887837346255},
 {'claim_id': 266,
  'found_doc': '1976_St._Louis_Cardinals_-LRB-NFL-RRB-_season',
  'percentage': 79,
  'rank': 2,
  'weight': 13.674872179375331},
 {'claim_id': 266,
  'found_doc': '1955_Detroit_Lions_season',
  'percentage': 79,
  'rank': 3,
  'weight': 13.637192076563455},
 {'claim_id': 266,
  'found_doc': '1992_Los_Angeles_Raiders_season',
  'percentage': 76,
  'rank': 4,
  'weight': 13.070145719873858},
 {'claim_id': 266,
  'found_doc': '1984_New_York_Cosmos_season',
  'percentage': 73,
  'rank': 5,
  'weight': 12.526454740185518},
 {'claim_id': 266,
  'found_doc': '1950_Philadelphia_Eagles_season',
  'percentage': 72,
  'rank': 6,
  'weight': 12.327565268136006},
 {'claim_id': 266,
  'found_doc': '1936_Chicago_Cardinals_season',
  'percentage': 71,
  'rank': 7,
  'weight': 12.176479813690499},
 {'claim_id': 266,
  'found_doc': '1970–71

In [14]:
claim_df = pd.read_json('data/claims_lm.json').sort_index()

In [15]:
mask = claim_df.source.isin(SOURCES)
claim_df = claim_df[mask]

In [19]:
%%time
claim_df['search_words'] = claim_df.claim.apply(parse_with_spacy).apply(obtain_nouns)
#(claim_df.np_phrase + claim_df.np_roots).apply(lambda x: list(np.unique(x)))

CPU times: user 37.4 s, sys: 158 ms, total: 37.5 s
Wall time: 37.6 s


In [20]:
claim_df.head()

Unnamed: 0,claim,source,named_entities,noun_phrases,entity_count,entity_types,entity_types_count,np_count,np_phrase,np_roots,search_words
12,Carlos Santana disbanded Santana in 1965.,dev,"[{'entity': 'Carlos Santana', 'label': 'PERSON', 'root': 'Santana'}, {'entity': 'Santana', 'labe...","[{'noun_phrase': 'Carlos Santana', 'root': 'Santana'}, {'noun_phrase': 'Santana', 'root': 'Santa...",3,"[PERSON, DATE]",2,2,"[Carlos Santana, Santana]",[Santana],"{Santana, Carlos_Santana, 1965}"
70,David Packouz was born in February of 1982.,dev,"[{'entity': 'David Packouz', 'label': 'PERSON', 'root': 'Packouz'}, {'entity': 'February of 1982...","[{'noun_phrase': 'David Packouz', 'root': 'Packouz'}, {'noun_phrase': 'February', 'root': 'Febru...",2,"[PERSON, DATE]",2,2,"[David Packouz, February]","[February, Packouz]","{February_of_1982, February, David_Packouz}"
97,Craig David is a pop music performer.,dev,"[{'entity': 'Craig David', 'label': 'PERSON', 'root': 'David'}]","[{'noun_phrase': 'Craig David', 'root': 'David'}, {'noun_phrase': 'a pop music performer', 'root...",1,[PERSON],1,2,"[a pop music performer, Craig David]","[performer, David]","{pop_music_performer, Craig_David}"
98,Craig David is a performer that does pop music.,dev,"[{'entity': 'Craig David', 'label': 'PERSON', 'root': 'David'}]","[{'noun_phrase': 'Craig David', 'root': 'David'}, {'noun_phrase': 'a performer', 'root': 'perfor...",1,[PERSON],1,3,"[pop music, a performer, Craig David]","[performer, David, music]","{Craig_David, performer, pop_music}"
158,Wish Upon was released in France.,dev,"[{'entity': 'France', 'label': 'GPE', 'root': 'France'}]","[{'noun_phrase': 'France', 'root': 'France'}]",1,[GPE],1,1,[France],[France],{France}


In [21]:
claim_df.shape

(5001, 11)

In [22]:
search_column = 'search_words'

In [25]:
%%time
fields = ['claim_id', 'found_doc', 'rank', 'percentage', 'weight']
pagesize = 100
i = 0
results = []

with open(SEARCH_RESULT_PATH, 'w') as csvFile:
    writer = csv.DictWriter(csvFile, fieldnames=fields)
    writer.writeheader()

    for claim_id, claim in claim_df[search_column].items():
        if i % 100 == 0:
            print(i // 100, claim_id)
        i += 1

        claim = 'keywords:"{}"'.format(" ".join(claim))
        
        #if isinstance(claim, list):
        #    claim = ' '.join(claim)
        #print(claim)
        
        
        query = queryparser.parse_query(claim)
        print(query)
        enquire.set_query(query)
        matches = enquire.get_mset(0, pagesize)

        query_results = []
        for match in matches:
            result = dict(
                claim_id = claim_id,
                found_doc = get_doc_id(match),
                rank = match.rank + 1,
                percentage = match.percent,
                weight = match.weight,            
            )
            query_results.append(result)
        writer.writerows(query_results)
        results += query_results
csvFile.close()

0 12
Query((Ksantana@1 PHRASE 3 Kcarlos_santana@2 PHRASE 3 K1965@3))
Query((Kfebruary_of_1982@1 PHRASE 3 Kfebruary@2 PHRASE 3 Kdavid_packouz@3))
Query((Kpop_music_performer@1 PHRASE 2 Kcraig_david@2))
Query((Kcraig_david@1 PHRASE 3 Kperformer@2 PHRASE 3 Kpop_music@3))
Query(Kfrance@1)
Query((Kstate_of_palestine@1 PHRASE 5 Kwestern_asia@2 PHRASE 5 Kterritory@3 PHRASE 5 Kpalestine@4 PHRASE 5 Kstate@5))
Query((Kastronaut@1 PHRASE 2 Kjohn_krasinski@2))
Query((Ksheryl_lee@1 PHRASE 4 Kamerican@2 PHRASE 4 Kamerican_romantic_comedy@3 PHRASE 4 Kdrama_film@4))
Query((Ksheryl_lee@1 PHRASE 5 Kamerican_romantic_comedy@2 PHRASE 5 Kdrama_film@3 PHRASE 5 Kamerican@4 PHRASE 5 Kaward@5))
Query((Kandrew_kevin_walker@1 PHRASE 2 Kdirector@2))
Query((Knuuk@1 PHRASE 3 Kseat@2 PHRASE 3 Kgovernment@3))
Query((Ksinger@1 PHRASE 2 Kpaul_nicholls@2))
Query((Ksoul_food@1 PHRASE 3 Konly_film@2 PHRASE 3 K1997@3))
Query((Ksaxony@1 PHRASE 2 Konly_a_town@2))
Query((Ktottenham_hotspur_f@1 PHRASE 5 Kc@2 PHRASE 5 Kfootball

3 14977
Query((Kblack_canary@1 PHRASE 3 Ktelevision@2 PHRASE 3 Kcharacter@3))
Query((Kaaron_burr@1 PHRASE 3 Knew_jersey@2 PHRASE 3 Kalexander_hamilton@3))
Query((Kaaron_burr@1 PHRASE 4 Kseaside_heights@2 PHRASE 4 Kalexander_hamilton@3 PHRASE 4 Knew_jersey@4))
Query((Kwildfang@1 PHRASE 3 Kjuly_2010@2 PHRASE 3 Kjuly@3))
Query((Kvocal_group_hall_of_fame@1 PHRASE 4 Kfame@2 PHRASE 4 Kvocal_group_hall@3 PHRASE 4 Kjoe_walsh@4))
Query((Ktheir_flagship_event@1 PHRASE 5 Kstarrcade@2 PHRASE 5 Kwcw@3 PHRASE 5 Knwa@4 PHRASE 5 Kdecade@5))
Query((Kcelebrity@1 PHRASE 2 Kkendall_jenner@2))
Query((Kmarjorie_gross@1 PHRASE 4 Kwriter@2 PHRASE 4 Kcbs_television_program@3 PHRASE 4 Kcbs@4))
Query((Kgreat_britain@1 PHRASE 3 Kit@2 PHRASE 3 Kgin@3))
Query((Kenglish@1 PHRASE 3 Ktim_roth@2 PHRASE 3 Kenglish_actor@3))
Query((Kperson@1 PHRASE 2 Kkendall_jenner@2))
Query((K1927_@1 PHRASE 5 K_1941@2 PHRASE 5 K1927_@3 PHRASE 5 K_1941_mount_rushmore@4 PHRASE 5 Kmount_rushmore@5))
Query((Keight_men@1 PHRASE 7 Kxhamster'

Query((Kfirst@1 PHRASE 4 Khalsey@2 PHRASE 4 Kher_first_recording_contract@3 PHRASE 4 K2014@4))
Query((Kcaribbean@1 PHRASE 5 Kconcacaf_champions_league@2 PHRASE 5 Kfootball_clubs@3 PHRASE 5 Kit@4 PHRASE 5 Kmany_people@5))
Query((Khindi_language_film@1 PHRASE 2 Kvedam@2))
Query((Kall@1 PHRASE 4 Kfemale_cast@2 PHRASE 4 Khenry_viii@3 PHRASE 4 Ktv_serial@4))
Query((Ksinger@1 PHRASE 2 Keric_church@2))
Query((K22@1 PHRASE 5 Kclose_to_22_beats@2 PHRASE 5 Kheart@3 PHRASE 5 Kminute@4 PHRASE 5 Kresting_rate@5))
Query((Kconcacaf_champions_league@1 PHRASE 3 Kfootball_clubs@2 PHRASE 3 Kevil_realm@3))
Query(K21st_century@1)
Query((Kidea@1 PHRASE 2 Kadvertising@2))
Query((Kideology@1 PHRASE 2 Kadvertising@2))
Query((Khulu@1 PHRASE 2 Ktrollhunters@2))
Query((Kjoe_dirt_2@1 PHRASE 5 K_beautiful_loser@2 PHRASE 5 Kdavid_spade@3 PHRASE 5 Kbeautiful_loser@4 PHRASE 5 Kjoe_dirt@5))
Query((Kmethod@1 PHRASE 3 Ksilence@2 PHRASE 3 Kmorse_code@3))
Query((Knew_york_knicks@1 PHRASE 6 Kknicks@2 PHRASE 6 Knew_york@3 PH

Query((K40,000_sheets@1 PHRASE 4 K40,000@2 PHRASE 4 Kbangladesh@3 PHRASE 4 Kconcert@4))
Query((Kbret_easton_ellis@1 PHRASE 4 Khe@2 PHRASE 4 Kcanyons@3 PHRASE 4 Kscreenplay@4))
Query((Ksikkim@1 PHRASE 3 Keastern_himalaya@2 PHRASE 3 Kpart@3))
Query((Kme@1 PHRASE 2 Kreign@2))
Query((Klarge_human_settlement@1 PHRASE 3 Knew_jersey@2 PHRASE 3 Kcamden@3))
Query((Knovel@1 PHRASE 2 Kjackpot@2))
Query((K14_ad@1 PHRASE 2 Kaugustus@2))
Query((Kfrance@1 PHRASE 2 Kaugustus@2))
Query((Kvedam@1 PHRASE 2 Kfilm@2))
Query((Krabies@1 PHRASE 2 Kbrain@2))
Query((Kseohyun@1 PHRASE 2 Kperformer@2))
Query((Kharris_jayaraj@1 PHRASE 2 Kidaho@2))
Query((Kparcel@1 PHRASE 4 Kjohn_deighton@2 PHRASE 4 Kgold@3 PHRASE 4 Kland@4))
Query((Khermione_granger@1 PHRASE 2 Knoel_fisher@2))
Query((Kdaggering@1 PHRASE 2 Kjamaica@2))
Query((Kdaggering@1 PHRASE 2 Kjamaica@2))
Query((Kfidel_castro@1 PHRASE 3 Kbrother@2 PHRASE 3 Kraúl_castro@3))
Query((Kactor@1 PHRASE 5 Kadventures@2 PHRASE 5 K1961@3 PHRASE 5 Kpluto@4 PHRASE 5 Knash

Query((Kapple_ii@1 PHRASE 3 Kapple@2 PHRASE 3 Ksteve_wozniak@3))
Query(Kangela_bassett@1)
Query((Kjapanese@1 PHRASE 3 Kmultiple_japanese_actors@2 PHRASE 3 Kmud_stars@3))
Query((Krequired_course@1 PHRASE 3 Kcalifornia@2 PHRASE 3 Kpsych@3))
Query((Kit@1 PHRASE 6 Klondon@2 PHRASE 6 Kvictoria_palace_theatre@3 PHRASE 6 Kculture@4 PHRASE 6 Kcentral_london_railway_terminus@5 PHRASE 6 Kplace@6))
Query((Kplaywright@1 PHRASE 3 Kdancer@2 PHRASE 3 Kjanelle_monáe@3))
Query((Kdaily_show@1 PHRASE 2 Kpolitical_figures@2))
Query((Kregion@1 PHRASE 4 Kmacedonia@2 PHRASE 4 Krepublic@3 PHRASE 4 Krepublic_of_macedonia@4))
Query((Khell@1 PHRASE 2 Kripon_college@2))
Query((Kcity@1 PHRASE 2 Kbahamas@2))
Query((Knovel@1 PHRASE 2 Kwallace@2))
Query((Kmusic@1 PHRASE 2 Kdavid_packouz@2))
Query((Kmuscarinic_acetylcholine_receptors@1 PHRASE 3 Kbiological_units@2 PHRASE 3 Kmuscarinic@3))
Query((Kfloridian@1 PHRASE 2 Kann_richards@2))
Query((Kharold_macmillan@1 PHRASE 2 Ksenator@2))
Query((Kjennifer_lopez@1 PHRASE 2 K

Query((Khundred_years'_war@1 PHRASE 2 Kedwardian_era_war@2))
Query((Zkeyword@1 OR (what@2 OR new@3) OR (_scooby@4 PHRASE 2 doo@5) OR (taylor_lautner@6 OR 3@7 OR 3_different_voice_roles@8)))
Query((Kher_acting_career@1 PHRASE 2 Kmiranda_otto@2))
Query((Kstage@1 PHRASE 2 Ksayyeshaa@2))
Query((Kman@1 PHRASE 2 Kcolombiana@2))
Query((Kgal_gadot@1 PHRASE 5 Kbar_refaeli@2 PHRASE 5 Kisrael@3 PHRASE 5 Khighest_earning_actress@4 PHRASE 5 Kmodels@5))
Query((Kapple_ii@1 PHRASE 2 Ksteve_wozniak@2))
Query((Kjoe_walsh@1 PHRASE 2 Knew_museum@2))
Query((Krob_sheridan@1 PHRASE 2 Kseptember@2))
Query(Kabc@1)
Query((Krutgers_university@1 PHRASE 4 Knew_jersey@2 PHRASE 4 Kcamden@3 PHRASE 4 Khome@4))
Query((Konly_location@1 PHRASE 4 Krutgers_university@2 PHRASE 4 Knew_jersey@3 PHRASE 4 Kcamden@4))
Query((Kother_girl_groups@1 PHRASE 4 Ksongs@2 PHRASE 4 Klittle_mix@3 PHRASE 4 Kdna@4))
Query((Kfred_armisen@1 PHRASE 3 Kacting@2 PHRASE 3 Kprofession@3))
Query((Kjames_jones@1 PHRASE 3 Kall@2 PHRASE 3 Kstar_game@3)

Query((Kjim_morrison@1 PHRASE 4 Kjohn_buscema@2 PHRASE 4 Kstan_lee@3 PHRASE 4 Kprowler@4))
Query((Kturkish_citizens@1 PHRASE 6 Karmenian_genocide@2 PHRASE 6 Kwho@3 PHRASE 6 Karmenians@4 PHRASE 6 Kextermination@5 PHRASE 6 Kturkish@6))
Query((Kyara_shahidi@1 PHRASE 2 Kdog@2))
Query((Kbret_easton_ellis@1 PHRASE 3 Kscreenplay@2 PHRASE 3 Kcanyons@3))
Query((Kmusic@1 PHRASE 3 Kform@2 PHRASE 3 Kdaggering@3))
Query((Kdarwin@1 PHRASE 7 K2008_film@2 PHRASE 7 Kaustralia@3 PHRASE 7 Kproduction@4 PHRASE 7 Klocation@5 PHRASE 7 Kplace@6 PHRASE 7 K2008@7))
Query((Kbret_easton_ellis@1 PHRASE 5 K2013@2 PHRASE 5 Khe@3 PHRASE 5 Kscreenplay@4 PHRASE 5 K2013_film@5))
Query((Kretreat@1 PHRASE 4 Kpermafrost@2 PHRASE 4 Kexpected_outcome@3 PHRASE 4 Kglobal_warming@4))
Query((Kmember@1 PHRASE 4 Kron_weasley@2 PHRASE 4 Khufflepuff@3 PHRASE 4 Khufflepuff_house@4))
Query((Khis_injury@1 PHRASE 4 K1515@2 PHRASE 4 Kfrance@3 PHRASE 4 Kfrancis_i@4))
Query((Kdirector@1 PHRASE 2 Ksidse_babett_knudsen@2))
Query((Kother_sta

Query((Khigh_grossing_television_series@1 PHRASE 2 Kwilhelmina_slater@2))
Query((Kharold_macmillan@1 PHRASE 4 Kfebruary@2 PHRASE 4 Kfebruary_20@3 PHRASE 4 K_1894@4))
Query((Kkesha@1 PHRASE 3 Kliving@2 PHRASE 3 Ksongs@3))
Query((K2013@1 PHRASE 3 Kseries_finale@2 PHRASE 3 Kit@3))
Query((Kjohn_f@1 PHRASE 2 K_kennedy@2))
Query((Kmiranda_otto@1 PHRASE 4 Kactress_gracie_otto@2 PHRASE 4 Konly_sister@3 PHRASE 4 Kgracie_otto@4))
Query((Kits_main_flavour@1 PHRASE 3 Kjuniper_berries@2 PHRASE 3 Kgin@3))
Query((K1993@1 PHRASE 2 Kingushetia@2))
Query((K2001@1 PHRASE 5 Kbernardo_bertolucci@2 PHRASE 5 K2001_film@3 PHRASE 5 Keva_green@4 PHRASE 5 Kher_film@5))
Query(Kblack_canary@1)
Query((Kchinese@1 PHRASE 2 Kluis_fonsi@2))
Query((Kmuscarinic_acetylcholine_receptors@1 PHRASE 3 Kmuscarinic@2 PHRASE 3 Kcell_complexes@3))
Query((K2001@1 PHRASE 2 Kjoe_walsh@2))
Query((Kmars@1 PHRASE 2 Kaleister_crowley@2))
Query((Kjanet_leigh@1 PHRASE 2 Kperson@2))
Query((Kaward@1 PHRASE 2 Kanushka_sharma@2))
Query((Kraees

Query((Kmusic_judges@1 PHRASE 2 Kartpop@2))
Query((Kunited_states@1 PHRASE 7 Kone@2 PHRASE 7 Konly_gaga's_fifth_consecutive_number@3 PHRASE 7 Kone_record@4 PHRASE 7 Kgaga@5 PHRASE 7 Kartpop@6 PHRASE 7 Kfifth@7))
Query((Kartpop@1 PHRASE 2 Kcopies@2))
Query((Kunited_states_billboard_200@1 PHRASE 5 Knumber_three@2 PHRASE 5 Kunited_states_billboard@3 PHRASE 5 Knumber@4 PHRASE 5 Kartpop@5))
Query((K2009@1 PHRASE 9 Kunited_states@2 PHRASE 9 Kgaga's_second_consecutive_number@3 PHRASE 9 Kone_record@4 PHRASE 9 Kgaga@5 PHRASE 9 Ksecond@6 PHRASE 9 Kartpop@7 PHRASE 9 Knumber@8 PHRASE 9 Kone@9))
Query((Kabout_258,000@1 PHRASE 8 Kfirst@2 PHRASE 8 Kweek_sales@3 PHRASE 8 Kartpop@4 PHRASE 8 K1999@5 PHRASE 8 Kabout_258,000_copies@6 PHRASE 8 Kfirst@7 PHRASE 8 Kweek@8))
Query((Kabout_757,000_cats@1 PHRASE 3 Kartpop@2 PHRASE 3 Kabout_757,000@3))
Query((Kunited_states@1 PHRASE 7 Kgaga's_second_consecutive_number@2 PHRASE 7 Kone_record@3 PHRASE 7 Ksecond@4 PHRASE 7 Knumber@5 PHRASE 7 Kone@6 PHRASE 7 Kartpop@

Query((Kian_brennan@1 PHRASE 3 Kapril@2 PHRASE 3 Kapril_1978@3))
Query((Kdetroit@1 PHRASE 2 Kian_brennan@2))
Query((Kian_brennan's_year@1 PHRASE 5 Kbirth@2 PHRASE 5 K1978@3 PHRASE 5 Kyear_of_birth@4 PHRASE 5 Kian_brennan's@5))
Query(Kian_brennan@1)
Query((Kmedia@1 PHRASE 2 Kian_brennan@2))
Query((Kshakespearean_actor@1 PHRASE 2 Kian_brennan@2))
Query((Kmedia_creation@1 PHRASE 3 Kian_brennan@2 PHRASE 3 Kno_role@3))
Query((Keminem@1 PHRASE 4 Kyou@2 PHRASE 4 Klove_the_way@3 PHRASE 4 Kway@4))
Query((Kyou@1 PHRASE 7 K100@2 PHRASE 7 Klove_the_way@3 PHRASE 7 Kbillboard@4 PHRASE 7 Kway@5 PHRASE 7 Kseven_weeks@6 PHRASE 7 Kbillboard_hot@7))
Query((Kyou@1 PHRASE 6 Keminem's_worst@2 PHRASE 6 Kselling_single@3 PHRASE 6 Keminem@4 PHRASE 6 Klove_the_way@5 PHRASE 6 Kway@6))
Query((Kyou@1 PHRASE 6 Kway@2 PHRASE 6 Kfive@3 PHRASE 6 Kfive_grammy_nominations@4 PHRASE 6 Klove_the_way@5 PHRASE 6 Kgrammy@6))
Query((Klgbt_community@1 PHRASE 3 Kroland_emmerich@2 PHRASE 3 Klgbt@3))
Query((Kcampaigner@1 PHRASE 2 

Query(Kanthology@1)
Query((Kannie_james@1 PHRASE 4 Kmotel_life@2 PHRASE 4 Kdakota_fanning@3 PHRASE 4 Kfilm@4))
Query((Kcherie_currie@1 PHRASE 5 Kamerican_film@2 PHRASE 5 Kdakota_fanning@3 PHRASE 5 Kamerican@4 PHRASE 5 Krunaways@5))
Query((Kchinese@1 PHRASE 3 Kchinese_film@2 PHRASE 3 Kdakota_fanning@3))
Query((Kjane@1 PHRASE 4 Kdakota_fanning@2 PHRASE 4 Ktwilight_saga@3 PHRASE 4 Kfilm@4))
Query((Kcaptain_america@1 PHRASE 3 Kdakota_fanning@2 PHRASE 3 Kfilm@3))
Query((Kfilm@1 PHRASE 3 Kdakota_fanning@2 PHRASE 3 Kcoraline@3))
Query((Kdracula@1 PHRASE 3 Kdakota_fanning@2 PHRASE 3 Kfilm@3))
Query((Ktelevision_film@1 PHRASE 2 Kdakota_fanning@2))
Query((Kamerican@1 PHRASE 3 Kamerican_film_roles@2 PHRASE 3 Kdakota_fanning@3))
Query((K2009@1 PHRASE 4 K2009_film@2 PHRASE 4 Kdakota_fanning@3 PHRASE 4 Kcoraline@4))
Query((Kmale_sibling@1 PHRASE 2 Ktamerlan_tsarnaev@2))
Query((Ktamerlan_tsarnaev@1 PHRASE 2 Kminivan@2))
Query((Ktamerlan_tsarnaev@1 PHRASE 3 Kpolice@2 PHRASE 3 Ktheater_production@3))
Q

Query(((Kmissy@1 PHRASE 3 Kmel_b@2 PHRASE 3 Kmissy_@3) OR misdemeanor@4 OR (_elliott@5 PHRASE 2 misdemeanor@6) OR _elliott@7))
Query((Kmel_b@1 PHRASE 4 Ksong@2 PHRASE 4 Kvirgin_records@3 PHRASE 4 K2007@4))
Query((K2013@1 PHRASE 2 Keasy_a@2))
Query((Keasy_a@1 PHRASE 3 K2010_novel@2 PHRASE 3 K2010@3))
Query((Keasy_a@1 PHRASE 3 Kstrictly_canadian_teen_comedy_novel@2 PHRASE 3 Kcanadian@3))
Query((Keasy_a@1 PHRASE 3 Kwill_gluck@2 PHRASE 3 K2010@3))
Query((Kamerican_teen_comedy_poem@1 PHRASE 3 Kamerican@2 PHRASE 3 Keasy_a@3))
Query((Keasy_a@1 PHRASE 3 K2010_food_dish@2 PHRASE 3 K2010@3))
Query((Kbert_v@1 PHRASE 3 K_royal@2 PHRASE 3 Keasy_a@3))
Query((K2009@1 PHRASE 3 K2009_film@2 PHRASE 3 Keasy_a@3))
Query((Keasy_a@1 PHRASE 2 Kedgar_allen_poe@2))
Query((Keasy_a@1 PHRASE 3 Kunited_states@2 PHRASE 3 Kteen_comedy_movie@3))
Query((Keasy_a@1 PHRASE 2 Kstork@2))
Query((Keasy_a@1 PHRASE 3 Kfilm@2 PHRASE 3 K2010@3))
Query((Kfrench@1 PHRASE 4 Kmy_french@2 PHRASE 4 Kamerican_rapper's_album@3 PHRASE 4 

# Analyse Found Documents

## Read search results

In [26]:
results_df = pd.read_csv(SEARCH_RESULT_PATH)

In [27]:
results_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7070 entries, 0 to 7069
Data columns (total 5 columns):
claim_id      7070 non-null int64
found_doc     7070 non-null object
rank          7070 non-null int64
percentage    7070 non-null int64
weight        7070 non-null float64
dtypes: float64(1), int64(3), object(1)
memory usage: 276.2+ KB


In [28]:
#results_df.sort_values('percentage', ascending=False).head(100)
results_df.sort_values('weight', ascending=False).head(10)

Unnamed: 0,claim_id,found_doc,rank,percentage,weight
1746,39018,Duane_Chapman,1,37,35.223637
5230,148204,Duane_Chapman,1,37,35.223637
3527,93250,2007_Champ_Car_season,1,50,28.555285
3528,93250,Zero4_Champ,2,49,28.201177
3529,93250,Emilio_Falla,3,49,28.199538
3530,93250,2007_Atlantic_Championship,4,49,28.134959
3531,93250,Mont-Tremblant_Champ_Car_Grand_Prix,5,49,28.046805
3532,93250,Champ_Island,6,49,28.028061
3533,93250,2004_Champ_Car_season,7,48,27.859703
3534,93250,Champ_-LRB-food-RRB-,8,48,27.838128


In [77]:
mask = results_df['rank'] <= 1

In [78]:
func = lambda x: set(x)
found_docs_df = results_df[mask].pivot_table(index='claim_id', values='found_doc', aggfunc=func)
found_docs_df.head()

Unnamed: 0_level_0,found_doc
claim_id,Unnamed: 1_level_1
158,{Zoviet_France}
823,{...De_Piel_Negra}
1831,"{5,6-Dihydroxycytosine}"
6876,{Adam_Gerrond_McDougall}
8504,{Briana_Loves_Jenna}


## Read claims

In [79]:
labelled_claims_df = pd.read_json('data/l_claims.json').sort_index()
mask = labelled_claims_df.source.isin(SOURCES)
labelled_claims_df = labelled_claims_df[mask]

In [80]:
labelled_claims_df = labelled_claims_df.join(found_docs_df)

In [81]:
labelled_claims_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5001 entries, 12 to 229319
Data columns (total 9 columns):
claim                      5001 non-null object
evidence                   5001 non-null object
label                      5001 non-null object
source                     5001 non-null object
evidence_docs              5001 non-null object
evidence_doc_count         5001 non-null int64
evidence_sentence_count    5001 non-null int64
evidence_set_str           5001 non-null object
found_doc                  136 non-null object
dtypes: int64(2), object(7)
memory usage: 550.7+ KB


In [82]:
def func(x):
    if isinstance(x['found_doc'], set):
        return set(x['evidence_docs']).difference(x['found_doc'])
    return set()

labelled_claims_df['missed_docs'] = labelled_claims_df.apply(func, axis=1)
labelled_claims_df['missed_count'] = labelled_claims_df['missed_docs'].apply(len)

In [83]:
labelled_claims_df[labelled_claims_df.evidence_doc_count > 10].head()

Unnamed: 0,claim,evidence,label,source,evidence_docs,evidence_doc_count,evidence_sentence_count,evidence_set_str,found_doc,missed_docs,missed_count
32815,Eric Church is a singer.,"[[Drink_in_My_Hand, 0], [Like_a_Wrecking_Ball, 0], [The_Only_Way_I_Know, 0], [Eric_Church, 0], [...",SUPPORTS,dev,"[Drink_in_My_Hand, Like_a_Wrecking_Ball, Love_Your_Love_the_Most, Two_Pink_Lines, Cold_One, Give...",15,22,"{'Drink_in_My_Hand', 'Like_a_Wrecking_Ball', 'Love_Your_Love_the_Most', 'Two_Pink_Lines', 'Cold_...",,{},0
34228,Eric Church is unable to write songs.,"[[Hell_on_the_Heart, 0], [Drink_in_My_Hand, 0], [Eric_Church, 9], [Smoke_a_Little_Smoke, 0], [Ta...",REFUTES,dev,"[Drink_in_My_Hand, Like_a_Wrecking_Ball, Love_Your_Love_the_Most, Homeboy_-LRB-Eric_Church_song-...",14,18,"{'Drink_in_My_Hand', 'Like_a_Wrecking_Ball', 'Love_Your_Love_the_Most', 'Homeboy_-LRB-Eric_Churc...",,{},0
79208,Arizona is not a part of the United States.,"[[Arizona, 1], [Phoenix,_Arizona, 0], [Western_United_States, 0], [Arizona, 16], [Arizona, 9], [...",REFUTES,dev,"[Alpine,_Arizona, U.S._state, Grand_Canyon, Four_Corners, List_of_U.S._states_and_territories_by...",11,22,"{'Alpine,_Arizona', 'U.S._state', 'Grand_Canyon', 'Four_Corners', 'List_of_U.S._states_and_terri...",,{},0
194922,Stripes had a person appear in it.,"[[Bill_Paxton, 3], [John_Larroquette, 0], [Stripes_-LRB-film-RRB-, 0], [Bill_Murray, 6], [Joe_Fl...",SUPPORTS,dev,"[Timothy_Busfield, Stripes_-LRB-film-RRB-, Bill_Paxton, Judge_Reinhold, John_Larroquette, Harold...",16,52,"{'Timothy_Busfield', 'Stripes_-LRB-film-RRB-', 'Bill_Paxton', 'Judge_Reinhold', 'John_Larroquett...",,{},0


In [84]:
recall_df = labelled_claims_df.pivot_table(index='missed_count', columns='source',# margins=True,
                                                values='claim', aggfunc='count', fill_value=0)
recall_df.head(10)

source,dev
missed_count,Unnamed: 1_level_1
0,4909
1,85
2,5
3,1
5,1


In [85]:
(recall_df / recall_df.sum() * 100).head(1)

source,dev
missed_count,Unnamed: 1_level_1
0,98.160368


## Evidence found

In [67]:
found_df = labelled_claims_df[(labelled_claims_df.missed_count == 0) & (labelled_claims_df.label != 'NOT ENOUGH INFO')]

In [54]:
found_df.evidence_docs.astype(str).value_counts().to_frame()

Unnamed: 0,evidence_docs
['Murda_Beatz'],13
['AMGTV'],13
['James_VI_and_I'],12
['The_Endless_River'],12
['Harris_Jayaraj'],11
['A_View_to_a_Kill'],11
['Invasion_literature'],11
['Luis_Fonsi'],11
['Despicable_Me_2'],11
['Richard_Dawkins'],11


## Evidence missed

In [55]:
missed_df = labelled_claims_df[labelled_claims_df.missed_count > 0]

In [56]:
missed_df = missed_df.join(claim_df[['named_entities', 'np_phrase']])

In [57]:
cols = ['claim', 'named_entities', 'np_phrase', 'evidence_docs',
       'evidence_doc_count', 'found_doc', 'missed_docs', 'missed_count']
missed_df.sort_values(['missed_count', 'evidence_set_str'], ascending=[False, True])[cols].head(100)

Unnamed: 0,claim,named_entities,np_phrase,evidence_docs,evidence_doc_count,found_doc,missed_docs,missed_count
117862,Humphrey Bogart does acting.,"[{'entity': 'Humphrey Bogart', 'label': 'PERSON', 'root': 'Bogart'}]",[Humphrey Bogart],"[To_Have_and_Have_Not_-LRB-film-RRB-, The_Harder_They_Fall, Leading_man, Humphrey_Bogart, Academ...",5,"{Bold_Venture, San_Quentin_-LRB-1937_film-RRB-, Broadway's_Like_That, Crime_School, Up_the_River...","{Academy_Award_for_Best_Actor, The_Harder_They_Fall, Leading_man, To_Have_and_Have_Not_-LRB-film...",5
71959,Jennifer Lopez made a single.,"[{'entity': 'Jennifer Lopez', 'label': 'PERSON', 'root': 'Lopez'}]",[Jennifer Lopez],"[Como_Ama_una_Mujer, Jennifer_Lopez, J.Lo_-LRB-album-RRB-]",3,"{Feel_the_Light, Follow_the_Leader_-LRB-Wisin_&_Yandel_song-RRB-, J.Lo_by_Jennifer_Lopez, Jennif...","{Jennifer_Lopez, Como_Ama_una_Mujer, J.Lo_-LRB-album-RRB-}",3
166853,Drake Bell doesn't sing.,"[{'entity': 'Drake Bell', 'label': 'PERSON', 'root': 'Bell'}]",[Drake Bell],"[Found_a_Way, Drake_Bell]",2,"{List_of_songs_recorded_by_Drake_Bell, College_-LRB-2008_film-RRB-, I_Know_-LRB-Drake_Bell_song-...","{Found_a_Way, Drake_Bell}",2
58926,Rabies is infectious.,[],[Rabies],"[Rabies, Viral_disease]",2,"{Rafivirumab, Rabies_-LRB-Ruoska_album-RRB-, Jan_Salter, Rabies_-LRB-2010_film-RRB-, Rabies_-LRB...","{Rabies, Viral_disease}",2
105361,Croatia is lawless.,"[{'entity': 'Croatia', 'label': 'GPE', 'root': 'Croatia'}]",[Croatia],"[Sovereign_state, Croatia]",2,"{1919_in_Croatia, 1913_in_Croatia, 1957_World_Women's_Handball_Championship, 1931_in_Croatia, 19...","{Croatia, Sovereign_state}",2
150925,The dress was online.,[],[The dress],"[The_dress, Social_networking_service]",2,"{How_to_Dress_Well, Khmer_clothing, Dress_-LRB-film-RRB-, John_Luard, List_of_Say_Yes_to_the_Dre...","{The_dress, Social_networking_service}",2
159580,Dan O'Bannon died.,"[{'entity': 'Dan O'Bannon', 'label': 'PERSON', 'root': 'O'Bannon'}]",[Dan O'Bannon],[Dan_O'Bannon],1,"{The_Long_Tomorrow_-LRB-comics-RRB-, List_of_accolades_received_by_the_Alien_franchise, Invaders...",{Dan_O'Bannon},1
14019,Tatum O'Neal is childless.,"[{'entity': 'Tatum O'Neal', 'label': 'PERSON', 'root': 'O'Neal'}]",[Tatum O'Neal],[Tatum_O'Neal],1,"{Little_Darlings, Nickelodeon_-LRB-film-RRB-, Circle_of_Two, Last_Will_-LRB-film-RRB-, The_Scoun...",{Tatum_O'Neal},1
108481,Always was directed by Gandhi.,"[{'entity': 'Gandhi', 'label': 'PERSON', 'root': 'Gandhi'}]",[Gandhi],[Always_-LRB-1989_film-RRB-],1,"{Ciclos_-LRB-Gandhi_album-RRB-, Bios_-LRB-album-RRB-, Geetha_Gandhi, Buddhan_Yesu_Gandhi, Federi...",{Always_-LRB-1989_film-RRB-},1
118414,Always is a film.,[],[a film],[Always_-LRB-1989_film-RRB-],1,"{Film_&_History, BAFTA_Award_for_Best_Actor_in_a_Leading_Role, Women_in_Film_and_Television_Inte...",{Always_-LRB-1989_film-RRB-},1


In [43]:
missed_df.missed_docs.astype(str).value_counts().to_frame()

Unnamed: 0,missed_docs
{'Wish_Upon'},6
{'Monster'},4
{'The_Wallace_-LRB-poem-RRB-'},3
{'Mel_B'},2
{'Duane_Chapman'},2
{'Always_-LRB-1989_film-RRB-'},2
{'Monk'},2
{'Knocked_Up'},2
{'Simón_Bolívar'},2
{'Live_Through_This'},2
