In [1]:
from tqdm import tqdm

Get data

**`spaCy`** - Helper Functions
https://spacy.io/usage/spacy-101#annotations-ner

In [2]:
import re
import spacy
import unicodedata
import nltk

from nltk.corpus import stopwords
from spacy.lang.en.stop_words import STOP_WORDS

Load pre-trained model

Caching Stop Words

In [3]:
# Combine NLTK's English stop words with spaCy's STOP_WORDS into a single set,
# built once here so the helpers below can reuse it.
cachedStopWords = set(stopwords.words('english'))
cachedStopWords.update(STOP_WORDS)

In [4]:
def unicodes(string):
    """Return `string` as Windows-1252 bytes after NFD normalization.

    The text is decomposed to NFD first and then encoded with the 'ignore'
    error handler, so every character that has no Windows-1252 encoding
    (including the combining marks produced by the decomposition) is
    silently dropped instead of raising.

    Args:
        string: The text to convert.

    Returns:
        The encoded text as a `bytes` object.
    """
    return unicodedata.normalize("NFD", string).encode('WINDOWS-1252', errors='ignore')

In [5]:
def clean(string):
    """Remove stop words and duplicate words from a whitespace-separated string.

    Words are compared exactly as they appear (no case folding) against
    `cachedStopWords`. Each remaining word is kept once, in the order of its
    first occurrence, so the result is the same on every run (iterating a
    `set` of strings would yield an arbitrary order that can differ between
    interpreter sessions).

    Args:
        string: Text to clean; split on any whitespace.

    Returns:
        The surviving words joined by single spaces ('' if none survive).
    """
    seen = set()
    kept = []
    for word in string.split():
        # Skip stop words and anything already emitted.
        if word in seen or word in cachedStopWords:
            continue
        seen.add(word)
        kept.append(word)
    return ' '.join(kept)

In [7]:
def NEtag(string, keywordMode=True):
    """Extract the named entities that the `nlp` pipeline finds in `string`.

    Args:
        string: Text to run through `nlp`.
        keywordMode: When truthy, return only each entity's text; otherwise
            return (text, label) pairs.

    Returns:
        A list of entity texts, or a list of (entity text, entity label)
        tuples, in the order given by the document's `ents`.

    NOTE(review): `nlp` is a module-level name that no visible cell defines;
    presumably a loaded spaCy pipeline - confirm it is created before this
    function is called.
    """
    entities = nlp(string).ents
    if not keywordMode:
        return [(entity.text, entity.label_) for entity in entities]
    return [entity.text for entity in entities]

In [8]:
def nouns(string):
    """Return the distinct tokens of `string` whose POS tag begins with 'N'.

    The text is tokenized with `nltk.word_tokenize` and tagged with
    `nltk.pos_tag`; a token is kept when the first character of its tag
    is 'N'.

    Args:
        string: Text to tokenize and tag.

    Returns:
        A set of the matching token strings.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(string))
    return {token for token, tag in tagged_tokens if tag[0] == 'N'}

Data wrangling

In [9]:
import pandas as pd

Create information DF for all shards

Aggregating based on Page_Titles

Precomputation: find the keywords for each line (this will take a long time)

### _`dump`_
contains the aggregated DataFrame converted to a NumPy record array (one record per row)

In [10]:
# Load the pickled DataFrame from disk.
# NOTE(review): unpickling can execute arbitrary code - only load this file
# if it comes from a trusted source.
newDF = pd.read_pickle('data/nerDataset.pkl')

In [11]:
# Convert the DataFrame to a NumPy record array. index=True (the default)
# keeps the DataFrame index as the first field of every record.
dump = newDF.to_records(index=True)

In [12]:
# Drop the DataFrame name now that `dump` holds the records, so the frame
# can be reclaimed if nothing else references it.
del newDF

# XAPIAN

In [13]:
import xapian

In [14]:
# Filesystem path of the Xapian database (relative to the working directory).
dbpath = 'index/indexRN'

Xapian build Index

In [15]:
# Open the database at `dbpath` for writing, creating it if it does not exist yet.
x_db = xapian.WritableDatabase(dbpath, xapian.DB_CREATE_OR_OPEN)

Set up the term generator used for indexing

In [16]:
# Term generator that turns text into index terms on the current document.
index = xapian.TermGenerator()

Set Stemmer

In [17]:
# Use Xapian's English ('en') stemmer for text passed through the term generator.
index.set_stemmer(xapian.Stem('en'))

Build Index

In [18]:
# Build the Xapian index: one document per record in `dump`.
#
# For every (pageTitle, pageText, keywords) record this
#   * stores the (possibly truncated) title as the document data,
#   * adds whole-keyword terms under prefix "K" and title-part terms under "B",
#   * runs the page text, title parts and keywords through the term generator,
#   * writes the document under the unique id term "Q<n>" (n = 1-based position).
noKeywords = []
count = 0

# Length limits, in characters, applied before text is handed to Xapian.
# NOTE(review): presumably chosen to stay below Xapian's maximum term length - confirm.
TITLE_TRUNCATE_THRESHOLD = 240  # titles longer than this are shortened ...
TITLE_TRUNCATED_LENGTH = 150    # ... to this many characters
MAX_TERM_CHARS = 150            # keywords / title parts this long or longer get no exact term

with tqdm(total=len(dump)) as pbar:
    for count, (pageTitle, pageText, keywords) in enumerate(dump, start=1):
        if len(pageTitle) > TITLE_TRUNCATE_THRESHOLD:
            pageTitle = pageTitle[:TITLE_TRUNCATED_LENGTH]
        terInfo = pageTitle.split('_')
        indexCounter = u"Q" + str(count)

        # Set the data that we want to store
        xapianDoc = xapian.Document()
        xapianDoc.set_data(unicodes(pageTitle))

        # replace_document() looks documents up by this term but does not add
        # it to the document itself. Without it nothing ever matches, and
        # re-running this cell on an existing database would append duplicate
        # documents instead of replacing them.
        xapianDoc.add_boolean_term(indexCounter)

        # Named Entities are present in keywords + Adding nouns
        keywords = set(keywords).union(nouns(pageText))

        # Adding dimension: exact (lower-cased) terms for keywords ("K") and
        # for the underscore-separated parts of the title ("B").
        # NOTE(review): a keyword made up only of characters that `unicodes`
        # drops is stored as the bare prefix.
        for keyword in keywords:
            if len(keyword) < MAX_TERM_CHARS:
                xapianDoc.add_term(b"K" + unicodes(keyword.lower()))
        for keyword in terInfo:
            if len(keyword) < MAX_TERM_CHARS:
                xapianDoc.add_term(b"B" + unicodes(keyword.lower()))
        index.set_document(xapianDoc)

        # Indexing based on the page text first, then (after a position gap)
        # on the title parts and keywords under their prefixes.
        index.index_text(unicodes(pageText))
        index.increase_termpos()
        for keyword in set(terInfo):
            index.index_text(unicodes(keyword), 1, "B")
        for keyword in keywords:
            index.index_text(unicodes(keyword), 1, "K")

        x_db.replace_document(indexCounter, xapianDoc)
        pbar.update(1)
    x_db.commit()

100%|██████████| 5396106/5396106 [17:32:05<00:00, 85.48it/s]      


In [19]:
# Close the writable database handle now that indexing is finished.
x_db.close()

In [20]:
# Print summary statistics for the database at `dbpath` using the xapian-delve CLI.
!xapian-delve $dbpath

UUID = d563065e-3d4f-47ec-8f5d-3f07e2e8e8fe
number of documents = 5396106
average document length = 291.765
document length lower bound = 3
document length upper bound = 71217
highest document id ever used = 5396106
has positional information = true
revision = 540
currently open for writing = false
