In [1]:
from tqdm import tqdm
import pandas as pd
import os
import xapian
import swifter
import re
import spacy
import nltk
import unicodedata
from nltk.corpus import stopwords
from spacy.lang.en.stop_words import STOP_WORDS



In [2]:
# Never truncate long cell contents when rendering DataFrames.
# `None` is the supported "no limit" value; the legacy sentinel -1 is
# deprecated since pandas 1.0 and rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)

Load lookup dataset -- Takes about 2-3 minutes, big file

In [39]:
# JSON file with the development-set claims; read below with orient='index'.
devsetPath = 'data/devset.json'
# Directory of the raw wiki page text. Not referenced by any other visible cell.
datapath = 'data/wiki-pages-text/'

Cleaning & helper functions

In [17]:
# Load the large English spaCy pipeline once; NEtag() and getPropN() reuse it.
nlp = spacy.load("en_core_web_lg")

In [18]:
# Union of the NLTK and spaCy English stop-word lists, kept as a set so that
# the membership test in clean() is cheap.
cachedStopWords = set(stopwords.words('english')).union(STOP_WORDS)

In [5]:
def clean(string):
    """Return `string` without stop words and without repeated tokens.

    The input is split on whitespace. Each distinct token is kept once, in
    the order of its first occurrence, unless it appears in
    `cachedStopWords` (the comparison is case-sensitive).

    Args:
        string: Text to filter.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order. A plain
    # set() would scramble the words, and differently on every interpreter run.
    unique_words = dict.fromkeys(string.split())
    return ' '.join(word for word in unique_words if word not in cachedStopWords)

In [20]:
def NEtag(string, keywordMode=True):
    """Extract named entities from `string` with the spaCy pipeline.

    The text is passed through unicodes() before tagging.

    Args:
        string: Text to tag.
        keywordMode: When truthy, return only the entity texts; otherwise
            return (entity text, entity label) tuples.

    Returns:
        A list of entity texts, or a list of (text, label) tuples.
    """
    entities = nlp(unicodes(string)).ents
    if not keywordMode:
        return [(entity.text, entity.label_) for entity in entities]
    return [entity.text for entity in entities]

In [9]:
def nouns(string):
    """Return the set of tokens in `string` whose NLTK POS tag starts with 'N'."""
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(string))
    return {token for token, tag in tagged_tokens if tag[0] == 'N'}

In [157]:
import codecs

def slashescape(err):
    '''
    codecs error handler. err is a UnicodeDecodeError instance. Returns
    a tuple with a replacement for the undecodable part of the input
    and the position where decoding should continue.

    Every offending unit is rendered as a backslash escape with at least
    two hex digits (e.g. b'\\xff' -> '\\xff', b'\\x0a' -> '\\x0a'). The
    offending span may cover several bytes (for example a truncated
    multi-byte UTF-8 sequence), in which case one escape is emitted per byte.
    '''
    offending = err.object[err.start:err.end]
    # Iterating bytes yields ints; iterating str (as an encode error would
    # supply) yields 1-character strings, which still need ord().
    codes = (unit if isinstance(unit, int) else ord(unit) for unit in offending)
    repl = u''.join(u'\\x{:02x}'.format(code) for code in codes)
    return (repl, err.end)

# Register the handler under the name 'slashescape' so it can be selected
# through the `errors` argument of decode calls.
codecs.register_error('slashescape', slashescape)

def unicodes(string):
    """Return `string` converted to Unicode normalization form NFKD."""
    return unicodedata.normalize("NFKD", string)

In [68]:
def getPropN(string):
    """Return the proper nouns of `string` as one lower-cased string.

    Tokens that spaCy tags as 'PROPN' are kept in document order, joined
    by single spaces, and the result is lower-cased.
    """
    proper_nouns = []
    for token in nlp(string):
        if token.pos_ == 'PROPN':
            proper_nouns.append(token.text)
    return ' '.join(proper_nouns).lower()

Search

In [3]:
# Filesystem location of the Xapian index opened in the next cell.
DBPATH = 'index/indexRN'

In [4]:
# Open the index and create the query parser; both are module-level objects
# that search() uses on every call.
database = xapian.Database(DBPATH)
queryparser = xapian.QueryParser()

In [5]:
# Give the parser an English stemmer for query terms.
queryparser.set_stemmer(xapian.Stem("en"))

In [6]:
# Map the query field `keywords:` to the term prefix 'K'.
queryparser.add_prefix('keywords', 'K')

In [7]:
# Map the query field `about:` to the term prefix 'B'.
queryparser.add_prefix('about', 'B')

Optional

In [83]:
def search(claim, pagesize=10, show_query=False):
    """Retrieve the top-ranked index documents for a claim.

    The claim is expanded into a Xapian query string with three parts: the
    claim's nouns as a quoted phrase on the ``keywords`` field, its proper
    nouns as a quoted phrase on the ``about`` field, and the raw claim text.

    Args:
        claim: Claim sentence to look up.
        pagesize: Maximum number of matches to return.
        show_query: When truthy, print the claim, the query string and the
            parsed query for debugging.

    Returns:
        A list of dicts in the order Xapian returns the matches, each with
        the keys ``claim``, ``lookedupDoc`` (the document data as returned
        by Xapian), ``rank`` (1-based), ``percentage`` and ``weight``.
    """
    if show_query:
        print('CLAIM:', claim, '\n')

    enquire = xapian.Enquire(database)
    # nouns() returns a set; sort it so the phrase below, and therefore the
    # ranking, is identical on every run instead of depending on set order.
    keywords = sorted(nouns(claim))
    about = getPropN(claim)

    # Variations tried for `keywords`:
    #   keywords = NEtag(claim)
    #   keywords = nouns(claim)
    #   keywords = set(get_subject_topic(claim)).union(NEtag(claim))

    # NOTE(review): the ANDs sit inside a quoted phrase, so the parser reads
    # them as the literal term "and", not as boolean operators. Left as is
    # because changing the query shape would change retrieval results.
    query_string = 'keywords:"{}" about:"{}" {}'.format(" AND ".join(keywords), about, claim)

    # Variations tried for the query string:
    #   query = 'keywords:"{}" {}'.format(" AND ".join(keywords), claim)
    #   query = 'keywords:{}'.format(" keywords:".join(keywords))
    #   query = ' + '.join(clean(claim).split())
    #   query = claim

    if show_query:
        # print('NAMED ENTITIES:', NEtag(claim), '\n')
        print('FINAL QUERY:', query_string, '\n')

    # Parse the query and process it on the Database
    parsed_query = queryparser.parse_query(query_string)

    if show_query:
        print('PARSED QUERY:', parsed_query, '\n')

    enquire.set_query(parsed_query)
    matches = enquire.get_mset(0, pagesize)

    return [
        dict(
            claim=claim,
            lookedupDoc=match.document.get_data(),
            rank=match.rank + 1,
            percentage=match.percent,
            weight=match.weight,
        )
        for match in matches
    ]

In [60]:
def getPageText(pageTitle):
    """Return the `data` value of the first `finalDF` row with this page title.

    Reads the notebook-level `finalDF` frame, so that frame must exist when
    the function is called. Raises IndexError when no row matches.
    """
    matching_rows = finalDF[finalDF['pageTitle'] == pageTitle]
    page_data = list(matching_rows['data'].values)
    return page_data[0]

# Dirty Work - Test Evaluation

In [70]:
# Smoke test: run a single claim through search() with query printing on and
# list [document, percentage, weight formatted to two decimals] for each hit.
query = 'Veeru Devgan only works in Hollywood.'
[[item['lookedupDoc'], item['percentage'], '{:.2f}'.format(item['weight'])] for item in search(query,show_query=True)]

CLAIM: Veeru Devgan only works in Hollywood. 

FINAL QUERY: keywords:"Devgan AND Hollywood AND Veeru" about:"veeru devgan hollywood" Veeru Devgan only works in Hollywood. 

PARSED QUERY: Query(((Kdevgan@1 PHRASE 5 Kand@2 PHRASE 5 Khollywood@3 PHRASE 5 Kand@4 PHRASE 5 Kveeru@5) OR (Bveeru@6 PHRASE 3 Bdevgan@7 PHRASE 3 Bhollywood@8) OR (veeru@9 OR devgan@10 OR Zonli@11 OR Zwork@12 OR Zin@13 OR hollywood@14))) 



[[b'Hindustan_Ki_Kasam_-LRB-1999_film-RRB-', 28, '32.07'],
 [b'Devgan', 28, '31.95'],
 [b'Anil_Devgan', 28, '31.45'],
 [b'Veeru_Devgan', 27, '30.54'],
 [b'Dil_Kya_Kare', 26, '29.47']]

# Evaluation based on devset

In [71]:
# Load the dev set (one record per top-level JSON key), then move that key
# out of the index into a regular 'index' column.
# NOTE(review): JSON is normally UTF-8; confirm that ISO-8859-1 is intended,
# otherwise non-ASCII text will be mis-decoded.
devset = pd.read_json(devsetPath, orient='index', encoding='ISO-8859-1')
devset.reset_index(inplace=True)

In [72]:
# Replace the evidence of every claim with the documents found by search(),
# stored as [document, 0] pairs (the second element is a constant 0).
devset['evidence'] = devset['claim'].swifter.apply(lambda x: [[item['lookedupDoc'], 0] for item in search(x)])

HBox(children=(IntProgress(value=0, description='Pandas Apply', max=5001, style=ProgressStyle(description_widt…

In [74]:
# Turn the 'index' column created by reset_index() back into the frame's index.
devset.set_index('index', inplace=True)

In [75]:
def decoder(evidence):
    """Convert the bytes document names in one evidence list to str.

    search() returns document data as bytes, so the [document, 0] pairs
    stored in the `evidence` column hold bytes names at this point.

    NOTE(review): `decoder` was not defined in any visible cell. This
    definition assumes the names are UTF-8 encoded; confirm against the indexer.

    Args:
        evidence: List of [document name, position] pairs.

    Returns:
        A new list of [str name, position] pairs. Names that are already
        str are passed through unchanged.
    """
    return [
        [
            # backslashreplace keeps undecodable bytes visible instead of raising.
            title.decode('utf-8', 'backslashreplace') if isinstance(title, bytes) else title,
            position,
        ]
        for title, position in evidence
    ]


devset['evidence'] = devset['evidence'].apply(decoder)

In [76]:
# Write the frame, including the looked-up evidence, to JSON keyed by the index.
devset.to_json('data/mydevset10_NE_About_5.json', orient='index')

# Pipeline

Required: build a global dictionary that contains every title-to-sentence mapping.

In [166]:
# NOTE: unpickling can execute arbitrary code; only load this file if it
# comes from a trusted source.
finalDF = pd.read_pickle('data/lookupDataset.pkl')

In [168]:
# Normalise the stored page titles with unicodes() (NFKD); extractText()
# applies the same normalisation to the titles it looks up.
finalDF['pageTitle'] = finalDF['pageTitle'].apply(unicodes)

In [170]:
# Build a plain dict mapping each page title to its `data` entry.
globalDictionary = pd.Series(finalDF.data.values,index=finalDF.pageTitle).to_dict()
# Drop the frame now that the dictionary holds its content. getPageText()
# still reads finalDF and will raise NameError if called after this line.
del finalDF

Read data JSON

In [123]:
# Reload the dev set from disk (one record per top-level JSON key) and move
# that key into a regular 'index' column. This rebinds `devset`, discarding
# any earlier in-memory changes.
# NOTE(review): JSON is normally UTF-8; confirm that ISO-8859-1 is intended.
devset = pd.read_json(devsetPath, orient='index', encoding='ISO-8859-1')
devset.reset_index(inplace=True)

Unstack the dataframe

In [124]:
# One output row per (claim, evidence item): a claim with several evidence
# entries is repeated once per entry, and a claim whose evidence list is
# empty contributes no row here.
df = pd.DataFrame([(tup.index, tup.claim, d, tup.label) for tup in devset.itertuples() for d in tup.evidence], columns=['index', 'claim', 'evidence', 'label'])

Concatenate the **NOT ENOUGH INFO** rows to the unstacked dataframe

In [125]:
# Rebind `devset` to the unstacked rows followed by the NOT ENOUGH INFO rows
# of the original frame. Row labels of both inputs are kept (no
# ignore_index), so index labels may repeat in the result.
devset = pd.concat([df, devset[devset['label']=='NOT ENOUGH INFO']])

Get text

In [184]:
def extractText(pageTitle):
    """Look up the stored text for one evidence entry.

    Args:
        pageTitle: Either an empty sequence or a pair whose first element
            is a page title and whose second element selects an item inside
            that page's `globalDictionary` entry.

    Returns:
        The selected item, or an empty list when `pageTitle` is empty.
    """
    if len(pageTitle) == 0:
        return []
    # Titles in globalDictionary are NFKD-normalised, so normalise the key too.
    title = unicodes(pageTitle[0])
    position = pageTitle[1]
    return globalDictionary[title][position]

In [185]:
# Resolve each evidence entry to its text via extractText().
devset['evidence_text'] = devset['evidence'].apply(extractText)

Lookup from index based on claims

In [187]:
# Number of index hits requested per claim in the lookup below.
top_N_results = 10

In [None]:
# For every claim, store the document data of up to top_N_results hits
# returned by search().
devset['lookup_evidence_title'] = devset['claim'].swifter.apply(lambda x: [item['lookedupDoc'] for item in search(x, pagesize=top_N_results)])

HBox(children=(IntProgress(value=0, description='Pandas Apply', max=7369, style=ProgressStyle(description_widt…

In [154]:
# FIXME(review): apply() is called without a function, so this cell cannot run
# as written. The intended transform and target column are not recoverable
# from the notebook; supply the function or delete the cell.
devset['evidence'] = devset['evidence'].apply()

NameError: name 'decode' is not defined