In [1]:
from tqdm import tqdm
import pandas as pd
import os
import spacy
import xapian

### <p style="color:orange;">Load the lookup dataset (large file; takes about 2-3 minutes)</p>

In [2]:
# Load the pre-built lookup table; getPageText() reads its 'pageTitle' and 'data' columns.
# NOTE(review): unpickling can execute arbitrary code embedded in the file --
# only load this pickle if it comes from a trusted source.
finalDF = pd.read_pickle('data/lookupDataset.pkl')

In [3]:
# Dev-set JSON file, later read into `devset` with pd.read_json.
devsetPath = 'data/devset.json'
# Presumably the directory of raw wiki page text files; not referenced in the visible cells -- TODO confirm.
datapath = 'data/wiki-pages-text/'
# Xapian index location, passed to xapian-delve and to search().
dbpath = 'index/xIndex'

In [4]:
# Print summary statistics for the Xapian index (IPython expands $dbpath before running the shell command).
!xapian-delve $dbpath

UUID = e43d8d88-5f65-4750-8169-e5cfa8350b82
number of documents = 5396106
average document length = 210.05
document length lower bound = 1
document length upper bound = 71671
highest document id ever used = 5396106
has positional information = true
revision = 540
currently open for writing = false


In [15]:
# Maximum number of matches requested from Xapian for a single query.
TOP_RESULTS_LIMIT = 100

def search(dbpath, querystring, offset=0, pagesize=10, just_one_mode=False):
    """Run a free-text query against a Xapian index and return the ranked matches.

    Parameters
    ----------
    dbpath : str
        Location of the Xapian database to open.
    querystring : str
        Raw query text. It is parsed with an English stemmer using the
        ``STEM_SOME`` strategy.
    offset : int, default 0
        Number of top-ranked matches to skip before results are collected.
        (Previously accepted but ignored; the default of 0 keeps the old
        behaviour.)
    pagesize : int, default 10
        Currently NOT applied: up to ``TOP_RESULTS_LIMIT`` matches are always
        requested. The parameter is kept for signature compatibility, since
        enforcing its default of 10 would shrink the result set for existing
        callers.
    just_one_mode : bool, default False
        When True, return the raw ``xapian.MSet`` instead of a DataFrame.

    Returns
    -------
    xapian.MSet or pandas.DataFrame
        With ``just_one_mode=True`` the MSet itself. Otherwise a DataFrame
        indexed by 1-based ``Rank`` with columns ``Percent``, ``Doc_ID`` and
        ``Document_Title`` (the document data decoded as UTF-8).
    """
    # The database handle is deliberately left open: a returned MSet fetches
    # its documents lazily and still needs it.
    database = xapian.Database(dbpath)

    parser = xapian.QueryParser()
    parser.set_stemmer(xapian.Stem("english"))
    parser.set_database(database)
    parser.set_stemming_strategy(xapian.QueryParser.STEM_SOME)

    enquire = xapian.Enquire(database)
    enquire.set_query(parser.parse_query(querystring))

    # First argument is the index of the first match to return, so `offset`
    # is honoured here instead of a hard-coded 0.
    matches = enquire.get_mset(offset, TOP_RESULTS_LIMIT)
    if just_one_mode:
        return matches

    # m.rank is 0-based within the full result set, hence the +1 for display.
    records = [
        [m.rank + 1, m.percent, m.docid, m.document.get_data().decode('utf-8')]
        for m in matches
    ]
    return pd.DataFrame.from_records(
        records,
        columns=['Rank', 'Percent', 'Doc_ID', 'Document_Title'],
        index='Rank',
    )

In [16]:
def getPageText(pageTitle):
    """Return the 'data' value of the first `finalDF` row whose 'pageTitle' equals `pageTitle`.

    Raises IndexError when no row has that title.
    """
    titleMask = finalDF['pageTitle'] == pageTitle
    pageData = finalDF.loc[titleMask, 'data']
    return pageData.values[0]

### <p style="color:orange;">Search</p>

In [17]:
# Example query; the trailing expression displays the DataFrame returned by search().
query = "Murda Beatz's"
search(dbpath, query)

Unnamed: 0_level_0,Percent,Doc_ID,Document_Title
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,50,4806635,The_Sagas_Of...
2,44,3404975,Murda_Muzik
3,39,609409,Back_on_Road
4,39,3404973,Murda_-LRB-Candyland_song-RRB-
5,39,4983491,U.S.A._-LRB-Aiight_Then-RRB-


### <p style="color:orange;">Get the page text from the loaded data, i.e. the sentences for that page title</p>

In [19]:
# Look up the stored text for the top-ranked title returned by the search above.
getPageText('The_Sagas_Of...')

"The single `` Its Murda '' appeared in the Soundtrack for the movie Kidulthood .\n"

In [22]:
# Read the dev set (one record per top-level JSON key) and move those keys
# out of the row index into a regular 'index' column.
devset = pd.read_json(devsetPath, orient='index').reset_index()
devset.head()

Unnamed: 0,index,claim,evidence,label
0,91198,Colin Kaepernick became a starting quarterback...,[],NOT ENOUGH INFO
1,194462,Tilda Swinton is a vegan.,[],NOT ENOUGH INFO
2,137334,Fox 2000 Pictures released the film Soul Food.,"[[Soul_Food_-LRB-film-RRB-, 0]]",SUPPORTS
3,166626,Anne Rice was born in New Jersey.,[],NOT ENOUGH INFO
4,111897,Telemundo is a English-language television net...,"[[Telemundo, 5], [Telemundo, 4], [Telemundo, 1...",REFUTES
