In [1]:
import json

import swifter
from tqdm import tqdm



Get data

In [2]:
# Directory that holds the wiki text shard files read by the cells below.
datapath = 'data/wiki-pages-text/'

In [3]:
import os
# Path of every entry found in the data directory, in sorted order.
filesInDataPath = sorted(
    os.path.join(datapath, entryName) for entryName in os.listdir(datapath)
)

**`spaCy`** - Helper Functions
https://spacy.io/usage/spacy-101#annotations-ner

In [4]:
import spacy
# Load the spaCy pipeline once; POStag and NEtag below both call this object.
nlp = spacy.load("en_core_web_lg")

def POStag(string):
    """Return a list of (token text, part-of-speech tag) pairs for `string`."""
    taggedTokens = []
    for token in nlp(string):
        taggedTokens.append((token.text, token.pos_))
    return taggedTokens

def NEtag(string, keywordMode=True):
    """Extract the named entities that the pipeline finds in `string`.

    When `keywordMode` is true only the entity texts are returned; otherwise
    each entity is returned as an (entity text, entity label) pair.
    """
    entities = nlp(string).ents
    if not keywordMode:
        return [(entity.text, entity.label_) for entity in entities]
    return [entity.text for entity in entities]

In [5]:
import pandas as pd
def shardToDF(shardPath):
    """Parse one wiki shard file into a DataFrame.

    Every line is split on its first two spaces into
    ``<pageTitle> <sentenceNo> <pageText>``.

    Parameters
    ----------
    shardPath : str
        Path of the shard file to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``pageTitle``, ``sentenceNo`` and ``pageText``, one row per
        line that has all three fields. ``sentenceNo`` is an ``int`` when the
        token parses as one and is otherwise kept as the raw string.
        ``pageText`` has its bracket placeholders replaced and carries no
        trailing newline.
    """
    # Placeholder tokens and the bracket each one stands for, applied in order.
    bracketTokens = (
        ('-LRB- ', '('),
        (' -RRB-', ')'),
        ('-LSB- ', '['),
        (' -RSB-', ']'),
    )
    rows = []
    # Explicit encoding so parsing does not depend on the platform default.
    # NOTE(review): assumes the shards are UTF-8 -- confirm against the data set.
    with open(shardPath, 'r', encoding='utf-8') as openedFile:
        for line in openedFile:
            fields = line.rstrip('\n').split(' ', 2)
            if len(fields) != 3:
                # Blank or truncated line: there is no complete record to keep.
                continue
            pageTitle, sentenceNo, pageText = fields
            try:
                sentenceNo = int(sentenceNo)
            except ValueError:
                # Keep the raw token; a non-numeric sentence number must not
                # prevent the text clean-up below.
                pass
            for token, bracket in bracketTokens:
                pageText = pageText.replace(token, bracket)
            rows.append([pageTitle, sentenceNo, pageText])
    return pd.DataFrame.from_records(
        rows, columns=['pageTitle', 'sentenceNo', 'pageText'])

Create information DF for all shards

In [6]:
# Parse every shard first and concatenate once at the end: calling pd.concat
# inside the loop re-copies all previously accumulated rows on every iteration.
# Row labels restart at 0 for each shard because ignore_index is not set.
shardFrames = [shardToDF(shardPath) for shardPath in tqdm(filesInDataPath)]
if shardFrames:
    finalDF = pd.concat(shardFrames)
else:
    # pd.concat raises on an empty list; fall back to an empty frame instead.
    finalDF = pd.DataFrame()

100%|██████████| 109/109 [03:01<00:00,  2.34s/it]


Aggregating based on page titles

In [7]:
def concatenateFunction(x):
    """Join the text pieces of one group into a single space-separated string."""
    return ' '.join(x)


# One row per page title, holding all of that page's text joined together.
aggregation_functions = dict(pageText=concatenateFunction)
newDF = finalDF.groupby('pageTitle').agg(aggregation_functions)

Precomputation: find keywords (named entities) for each page; this will take a long time

In [None]:
# Named-entity keywords for every page (NEtag in its default keyword mode),
# followed by pickling the resulting frame to disk.
newDF['keywords'] = newDF['pageText'].swifter.apply(NEtag)
newDF.to_pickle('data/nerDataset.pkl')

HBox(children=(IntProgress(value=0, description='Pandas Apply', max=5396106, style=ProgressStyle(description_w…

### _`dump`_
contains the aggregated DataFrame as a NumPy record array (one record per page, with the page title as the first field)

In [None]:
# Record array of the aggregated frame; the pageTitle index becomes the first field.
dump = newDF.to_records(index=True)

# XAPIAN

In [None]:
import xapian

In [None]:
# Location of the Xapian database opened by the next cell.
dbpath = 'index/xapian_index_v2'

Open (or create) the Xapian database

In [None]:
# Open the database at dbpath for writing, creating it if it does not exist yet.
x_db = xapian.WritableDatabase(dbpath, xapian.DB_CREATE_OR_OPEN)

Set termgenerator for indexing

In [None]:
# Term generator used by the indexing loop below to turn page text into terms.
index = xapian.TermGenerator()

Set Stemmer

In [None]:
# Use the English stemmer when generating terms.
index.set_stemmer(xapian.Stem('en'))

Build Index

In [None]:
# Index every aggregated page. Fields are read by name so that additional
# columns in `dump` (such as `keywords`) do not break the record unpacking.
try:
    for record in tqdm(dump, total=len(dump)):
        pageTitle = record['pageTitle']
        pageText = record['pageText']

        # ID term for this page, passed to replace_document below.
        indexCounter = u"Q" + pageTitle

        # Set the data that we want to store. Document data has to be a
        # string, so the fields are serialised as JSON.
        xapianDoc = xapian.Document()
        dataDict = {'pageTitle': pageTitle,
                    'pageText': pageText}
        xapianDoc.set_data(json.dumps(dataDict))

        # Index book-keeping
        index.set_document(xapianDoc)
        index.index_text(pageText)
        index.increase_termpos()

        # The document itself must carry the ID term; otherwise
        # replace_document finds nothing to replace on a re-run and the page
        # is added a second time.
        xapianDoc.add_boolean_term(indexCounter)
        x_db.replace_document(indexCounter, xapianDoc)

    # Commit once after the loop instead of once per document.
    x_db.commit()
finally:
    # Release the database even when indexing fails part-way through.
    x_db.close()