In [1]:
import re
import xapian
import pandas as pd
import numpy as np
import csv

In [2]:
pd.options.display.max_colwidth = 100

In [3]:
DBPATH = "full_index"
SOURCES = ['dev']
SEARCH_RESULT_PATH = "data/search_results_v2.csv"

In [4]:
!xapian-delve $DBPATH

UUID = b940719a-3fb5-4d57-8b01-3186790b1ade
number of documents = 3833466
average document length = 170.939
document length lower bound = 1
document length upper bound = 31154
highest document id ever used = 3833466
has positional information = true
revision = 392
currently open for writing = false


In [6]:
!xapian-delve -r 1833129 -d $DBPATH

Data for record #1833129:

Term List for record #1833129: 17 2004 2007 3 QKim_Hyde Skim_hyde ZSkim_hyd Za Zand Zappear Zaustralian Zaway Zby Zcharact Zchris Zdepart Zfebruari Zfiction Zfirst Zfrom Zhe Zhemsworth Zhis Zhome Zhyde Zjonathan Zjuli Zkim Zkimber Zmade Zon Zopera Zplay Zscreen Zsoap Zthe Zwas a and appearance australian away by character chris departed february fictional first from he hemsworth his home hyde jonathan july kim kimberly made on opera played screen soap the was


# Retrieve documents relevant to claims

In [7]:
def get_doc_id(match):
    for term in match.document.termlist():
        term = term.term.decode("utf-8") 
        m = re.match("Q(.*)", term)
        if m:
            return m[1]
    return None

In [8]:
# Prepare enquiry object

# Open the database we're going to search.
db = xapian.Database(DBPATH)

# Set up a QueryParser with a stemmer and suitable prefixes
queryparser = xapian.QueryParser()
queryparser.set_stemmer(xapian.Stem("en"))
queryparser.set_stemming_strategy(queryparser.STEM_SOME)

# Use an Enquire object on the database to run the query
enquire = xapian.Enquire(db)    

In [9]:
claim_df = pd.read_json('claims.json').sort_index()

In [11]:
mask = claim_df.source.isin(SOURCES)
claim_df = claim_df[mask]

In [41]:
claim_df['search_words'] = (claim_df.np_phrase + claim_df.np_roots).apply(lambda x: list(np.unique(x)))

In [42]:
claim_df.head()

Unnamed: 0,claim,source,named_entities,noun_phrases,entity_count,entity_types,entity_types_count,np_count,np_phrase,np_roots,search_words
12,Carlos Santana disbanded Santana in 1965.,dev,"[{'entity': 'Carlos Santana', 'label': 'PERSON', 'root': 'Santana'}, {'entity': 'Santana', 'labe...","[{'noun_phrase': 'Carlos Santana', 'root': 'Santana'}, {'noun_phrase': 'Santana', 'root': 'Santa...",3,"[PERSON, DATE]",2,2,"[Carlos Santana, Santana]",[Santana],"[Carlos Santana, Santana]"
70,David Packouz was born in February of 1982.,dev,"[{'entity': 'David Packouz', 'label': 'PERSON', 'root': 'Packouz'}, {'entity': 'February of 1982...","[{'noun_phrase': 'David Packouz', 'root': 'Packouz'}, {'noun_phrase': 'February', 'root': 'Febru...",2,"[PERSON, DATE]",2,2,"[David Packouz, February]","[February, Packouz]","[David Packouz, February, Packouz]"
97,Craig David is a pop music performer.,dev,"[{'entity': 'Craig David', 'label': 'PERSON', 'root': 'David'}]","[{'noun_phrase': 'Craig David', 'root': 'David'}, {'noun_phrase': 'a pop music performer', 'root...",1,[PERSON],1,2,"[a pop music performer, Craig David]","[David, performer]","[Craig David, David, a pop music performer, performer]"
98,Craig David is a performer that does pop music.,dev,"[{'entity': 'Craig David', 'label': 'PERSON', 'root': 'David'}]","[{'noun_phrase': 'Craig David', 'root': 'David'}, {'noun_phrase': 'a performer', 'root': 'perfor...",1,[PERSON],1,3,"[music, a performer, Craig David]","[David, music, performer]","[Craig David, David, a performer, music, performer]"
158,Wish Upon was released in France.,dev,"[{'entity': 'Wish Upon', 'label': 'ORG', 'root': 'Upon'}, {'entity': 'France', 'label': 'GPE', '...","[{'noun_phrase': 'Wish Upon', 'root': 'Upon'}, {'noun_phrase': 'France', 'root': 'France'}]",2,"[GPE, ORG]",2,2,"[France, Wish Upon]","[Upon, France]","[France, Upon, Wish Upon]"


In [43]:
claim_df.shape

(5001, 11)

In [44]:
search_column = 'search_words'

In [45]:
%%time
fields = ['claim_id', 'found_doc', 'rank', 'percentage', 'weight']
pagesize = 100
i = 0
results = []

with open(SEARCH_RESULT_PATH, 'w') as csvFile:
    writer = csv.DictWriter(csvFile, fieldnames=fields)
    writer.writeheader()

    for claim_id, claim in claim_df[search_column].items():
        if i % 100 == 0:
            print(i // 100, claim_id)
        i += 1

        if isinstance(claim, list):
            claim = ' '.join(claim)
        #print(claim)
        query = queryparser.parse_query(claim)
        enquire.set_query(query)
        matches = enquire.get_mset(0, pagesize)

        query_results = []
        for match in matches:
            result = dict(
                claim_id = claim_id,
                found_doc = get_doc_id(match),
                rank = match.rank + 1,
                percentage = match.percent,
                weight = match.weight,            
            )
            query_results.append(result)
        writer.writerows(query_results)
        results += query_results
csvFile.close()

0 12
1 5075
2 10327
3 14977
4 21009
5 26008
6 31013
7 36371
8 41265
9 46636
10 52175
11 56774
12 61504
13 65940
14 70695
15 75309
16 80488
17 86116
18 91114
19 95061
20 100083
21 104729
22 108567
23 112990
24 118044
25 122887
26 126963
27 131422
28 135962
29 140536
30 144903
31 150017
32 155807
33 159715
34 165245
35 168015
36 172464
37 175478
38 179741
39 182456
40 185736
41 191436
42 194760
43 197392
44 201388
45 203369
46 207379
47 212780
48 217682
49 224381
50 229319
CPU times: user 4min 19s, sys: 50.4 s, total: 5min 10s
Wall time: 5min 29s


# Analyse Found Documents

## Read search results

In [46]:
results_df = pd.read_csv(SEARCH_RESULT_PATH)

In [47]:
results_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 497052 entries, 0 to 497051
Data columns (total 5 columns):
claim_id      497052 non-null int64
found_doc     497052 non-null object
rank          497052 non-null int64
percentage    497052 non-null int64
weight        497052 non-null float64
dtypes: float64(1), int64(3), object(1)
memory usage: 19.0+ MB


In [66]:
#results_df.sort_values('percentage', ascending=False).head(100)
results_df.sort_values('weight', ascending=False).head(10)

Unnamed: 0,claim_id,found_doc,rank,percentage,weight
153769,77743,List_of_accolades_received_by_Bajirao_Mastani,1,82,151.74591
153770,77743,Chunsa_Film_Art_Awards,2,79,145.858403
153771,77743,List_of_accolades_received_by_Arrival,3,78,145.301275
153772,77743,List_of_accolades_received_by_Nocturnal_Animals,4,78,144.363208
153773,77743,List_of_accolades_received_by_Kaminey,5,76,141.118086
153774,77743,List_of_accolades_received_by_Kahaani,6,73,134.654159
153775,77743,List_of_accolades_received_by_Hacksaw_Ridge,7,72,134.340591
153776,77743,List_of_accolades_received_by_Aadukalam,8,71,131.269173
153777,77743,List_of_accolades_received_by_Dookudu,9,70,129.774258
153778,77743,Boston_Society_of_Film_Critics,10,70,129.707984


In [79]:
mask = results_df['rank'] <= 100

In [80]:
func = lambda x: set(x)
found_docs_df = results_df[mask].pivot_table(index='claim_id', values='found_doc', aggfunc=func)
found_docs_df.head()

Unnamed: 0_level_0,found_doc
claim_id,Unnamed: 1_level_1
12,"{Tito_Santana, Jorge_Santana, Santana_Srinivasa_Temple, Scarlett_Santana, Fabral, Santana_do_Par..."
70,"{David_Geffen, Karel_David, Cleveland_Classic_2013, Arvind_Ethan_David, Stan_David, Athanase_Dav..."
97,"{Omar_Hakim, David_Scott_Stone, Tu_te_reconnaîtras, David_Haberfeld, David_di_Donatello, Canadia..."
98,"{David_Scott_Stone, Tyler_LeVander, David_Haberfeld, David_di_Donatello, Canadian_Electronic_Ens..."
158,"{How_I_Married_My_High_School_Crush, Catenay, South_African_identity_card, Maurice_Herzog, Kings..."


## Read claims

In [81]:
labelled_claims_df = pd.read_json('data/l_claims.json').sort_index()
mask = labelled_claims_df.source.isin(SOURCES)
labelled_claims_df = labelled_claims_df[mask]

In [82]:
labelled_claims_df = labelled_claims_df.join(found_docs_df)

In [83]:
labelled_claims_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5001 entries, 12 to 229319
Data columns (total 9 columns):
claim                      5001 non-null object
evidence                   5001 non-null object
label                      5001 non-null object
source                     5001 non-null object
evidence_docs              5001 non-null object
evidence_doc_count         5001 non-null int64
evidence_sentence_count    5001 non-null int64
evidence_set_str           5001 non-null object
found_doc                  4986 non-null object
dtypes: int64(2), object(7)
memory usage: 550.7+ KB


In [84]:
def func(x):
    if isinstance(x['found_doc'], set):
        return set(x['evidence_docs']).difference(x['found_doc'])
    return set()

labelled_claims_df['missed_docs'] = labelled_claims_df.apply(func, axis=1)
labelled_claims_df['missed_count'] = labelled_claims_df['missed_docs'].apply(len)

In [85]:
labelled_claims_df[labelled_claims_df.evidence_doc_count > 10].head()

Unnamed: 0,claim,evidence,label,source,evidence_docs,evidence_doc_count,evidence_sentence_count,evidence_set_str,found_doc,missed_docs,missed_count
32815,Eric Church is a singer.,"[[Drink_in_My_Hand, 0], [Like_a_Wrecking_Ball, 0], [The_Only_Way_I_Know, 0], [Eric_Church, 0], [...",SUPPORTS,dev,"[Drink_in_My_Hand, Like_a_Wrecking_Ball, Love_Your_Love_the_Most, Two_Pink_Lines, Cold_One, Give...",15,22,"{'Drink_in_My_Hand', 'Like_a_Wrecking_Ball', 'Love_Your_Love_the_Most', 'Two_Pink_Lines', 'Cold_...","{Kliros, Mark_Middleton, Emma_Lucy_Gates_Bowen, Live_at_the_First_Baptist_Church, Emma_Abbott_Me...","{Drink_in_My_Hand, Like_Jesus_Does, Raise_'Em_Up, The_Only_Way_I_Know, Springsteen_-LRB-song-RRB...",8
34228,Eric Church is unable to write songs.,"[[Hell_on_the_Heart, 0], [Drink_in_My_Hand, 0], [Eric_Church, 9], [Smoke_a_Little_Smoke, 0], [Ta...",REFUTES,dev,"[Drink_in_My_Hand, Like_a_Wrecking_Ball, Love_Your_Love_the_Most, Homeboy_-LRB-Eric_Church_song-...",14,18,"{'Drink_in_My_Hand', 'Like_a_Wrecking_Ball', 'Love_Your_Love_the_Most', 'Homeboy_-LRB-Eric_Churc...","{List_of_Hillsong_songs, Guys_Like_Me, Dikļi_parish, Church_of_the_Poison_Mind, Konevitsan_kirko...","{Eric_Church, The_Outsiders_-LRB-Eric_Church_song-RRB-, Talladega_-LRB-song-RRB-, Homeboy_-LRB-E...",4
79208,Arizona is not a part of the United States.,"[[Arizona, 1], [Phoenix,_Arizona, 0], [Western_United_States, 0], [Arizona, 16], [Arizona, 9], [...",REFUTES,dev,"[Alpine,_Arizona, U.S._state, Grand_Canyon, Four_Corners, List_of_U.S._states_and_territories_by...",11,22,"{'Alpine,_Arizona', 'U.S._state', 'Grand_Canyon', 'Four_Corners', 'List_of_U.S._states_and_terri...","{Ponometia_altera, 390th, Tradescantia_pinetorum, Thunderbird_School_of_Global_Management, Dyoto...","{Southern_Arizona, Alpine,_Arizona, Grand_Canyon, Northern_Arizona, List_of_U.S._states_and_terr...",10
194922,Stripes had a person appear in it.,"[[Bill_Paxton, 3], [John_Larroquette, 0], [Stripes_-LRB-film-RRB-, 0], [Bill_Murray, 6], [Joe_Fl...",SUPPORTS,dev,"[Timothy_Busfield, Stripes_-LRB-film-RRB-, Bill_Paxton, Judge_Reinhold, John_Larroquette, Harold...",16,52,"{'Timothy_Busfield', 'Stripes_-LRB-film-RRB-', 'Bill_Paxton', 'Judge_Reinhold', 'John_Larroquett...","{Misao_Okawa, In_Person, 20_Dates, Subsolid_personality, Panglima_Gagah_Berani, First_person, Gu...","{Bill_Paxton, Sean_Young, P._J._Soles, Warren_Oates, Timothy_Busfield, Harold_Ramis, Ivan_Reitma...",16


In [86]:
recall_df = labelled_claims_df.pivot_table(index='missed_count', columns='source',# margins=True,
                                                values='claim', aggfunc='count', fill_value=0)
recall_df.head(10)

source,dev
missed_count,Unnamed: 1_level_1
0,3676
1,1146
2,124
3,23
4,18
5,5
6,5
8,1
9,1
10,1


In [87]:
(recall_df / recall_df.sum() * 100).head(1)

source,dev
missed_count,Unnamed: 1_level_1
0,73.505299


## Evidence found

In [88]:
found_df = labelled_claims_df[(labelled_claims_df.missed_count == 0) & (labelled_claims_df.label != 'NOT ENOUGH INFO')]

In [89]:
found_df.evidence_docs.astype(str).value_counts().to_frame()

Unnamed: 0,evidence_docs
['The_Endless_River'],12
['Invasion_literature'],11
['Harris_Jayaraj'],11
['Edgar_Wright'],11
['Despicable_Me_2'],11
['Carlos_Santana'],11
['A_View_to_a_Kill'],11
['Murda_Beatz'],11
['Richard_Dawkins'],11
['John_Dolmayan'],10


## Evidence missed

In [107]:
missed_df = labelled_claims_df[labelled_claims_df.missed_count > 0]

In [108]:
missed_df = missed_df.join(claim_df[['named_entities', 'np_phrase']])

In [111]:
cols = ['claim', 'named_entities', 'np_phrase', 'evidence_docs',
       'evidence_doc_count', 'found_doc', 'missed_docs', 'missed_count']
missed_df.sort_values(['missed_count', 'evidence_set_str'])[cols].head(100)

Unnamed: 0,claim,named_entities,np_phrase,evidence_docs,evidence_doc_count,found_doc,missed_docs,missed_count
193444,Captain America's shield is used by the superhero name Captain America.,"[{'entity': 'America', 'label': 'GPE', 'root': 'America'}, {'entity': 'Captain America', 'label'...","[Captain America, Captain America's shield, the superhero name]",[Captain_America's_shield],1,"{The_Capture_of_Captain_America, Keith_Carmody, Captain_Barbell, Captain_America_and_the_Falcon,...",{Captain_America's_shield},1
193449,Captain America's shield is incapable of being a defensive equipment.,"[{'entity': 'America', 'label': 'GPE', 'root': 'America'}]","[a defensive equipment, Captain America's shield]",[Captain_America's_shield],1,"{Operation_Defensive_Shield, Electromagnetic_compatibility, 3d_Light_Antiaircraft_Missile_Battal...",{Captain_America's_shield},1
193450,Superhero name Captain America uses the Captain America's shield.,"[{'entity': 'Superhero', 'label': 'ORG', 'root': 'Superhero'}, {'entity': 'Captain America', 'la...","[the Captain America's shield, Superhero name Captain America]",[Captain_America's_shield],1,"{The_Capture_of_Captain_America, Keith_Carmody, Captain_Barbell, Captain_America_and_the_Falcon,...",{Captain_America's_shield},1
193458,Captain America's shield is used as a type of equipment.,"[{'entity': 'America', 'label': 'GPE', 'root': 'America'}]","[equipment, Captain America's shield, a type]",[Captain_America's_shield],1,"{Type_98_20_mm_AAG_Tank, Australian_Shield, Michael_Shields, Keith_Carmody, Thyreophoroi, Capuan...",{Captain_America's_shield},1
159563,Dan O'Bannon worked in he film industry.,"[{'entity': 'Dan O'Bannon', 'label': 'PERSON', 'root': 'O'Bannon'}]","[Dan O'Bannon, he film industry]",[Dan_O'Bannon],1,"{List_of_Defiance_episodes, Midde_Rama_Rao, Farscape, 1995_NBA_draft, Wayne_Townsend, The_Resurr...",{Dan_O'Bannon},1
159567,"Dan O'Bannon was born on September 20th, 1955.","[{'entity': 'Dan O'Bannon', 'label': 'PERSON', 'root': 'O'Bannon'}, {'entity': 'September 20th, ...","[September 20th, Dan O'Bannon]",[Dan_O'Bannon],1,"{List_of_Defiance_episodes, Twentieth_Army, Florica_Musicescu, Farscape, 1995_NBA_draft, Wayne_T...",{Dan_O'Bannon},1
159573,Dan O'Bannon was a German.,"[{'entity': 'Dan O'Bannon', 'label': 'PERSON', 'root': 'O'Bannon'}, {'entity': 'German', 'label'...",[Dan O'Bannon],[Dan_O'Bannon],1,"{Dan_Healy, List_of_Defiance_episodes, Dan_Hunter, Dan_Schneider, Farscape, 1995_NBA_draft, Wayn...",{Dan_O'Bannon},1
159574,Dan O'Bannon worked primarily in two genres.,"[{'entity': 'Dan O'Bannon', 'label': 'PERSON', 'root': 'O'Bannon'}, {'entity': 'two', 'label': '...","[Dan O'Bannon, two genres]",[Dan_O'Bannon],1,"{List_of_Defiance_episodes, Farscape, 1995_NBA_draft, Wayne_Townsend, The_Resurrected, Metalcore...",{Dan_O'Bannon},1
159575,"Dan O'Bannon died on December 17th, 2009.","[{'entity': 'Dan O'Bannon', 'label': 'PERSON', 'root': 'O'Bannon'}, {'entity': 'December 17th, 2...","[December 17th, Dan O'Bannon]",[Dan_O'Bannon],1,"{List_of_Defiance_episodes, Farscape, 1995_NBA_draft, Fan_Tingyu, The_Resurrected, 1956_BC_Lions...",{Dan_O'Bannon},1
159580,Dan O'Bannon died.,"[{'entity': 'Dan O'Bannon', 'label': 'PERSON', 'root': 'O'Bannon'}]",[Dan O'Bannon],[Dan_O'Bannon],1,"{Dan_Healy, List_of_Defiance_episodes, Dan_Hunter, Dan_Schneider, Farscape, 1995_NBA_draft, Wayn...",{Dan_O'Bannon},1


In [91]:
missed_df.missed_docs.astype(str).value_counts().to_frame()

Unnamed: 0,missed_docs
{'Excuse_My_French_-LRB-album-RRB-'},11
{'CHiPs_-LRB-film-RRB-'},11
{'Bones_-LRB-TV_series-RRB-'},10
{'Vedam_-LRB-film-RRB-'},10
{'Southpaw_-LRB-film-RRB-'},10
{'The_Gifted_-LRB-TV_series-RRB-'},10
{'Meteora_-LRB-album-RRB-'},9
{'The_Disaster_Artist_-LRB-film-RRB-'},9
{'Miracle_at_St._Anna'},9
{'The_Wallace_-LRB-poem-RRB-'},9
