In [1]:
import spacy
import os
import xapian
import re
import io
import shutil
import datetime
from tqdm import tqdm as tqdm

In [2]:
dbpath = 'xapIndex_SenID_Text'
datapath = 'wiki-pages-text/'

# Shard Paths

In [None]:
filesInDataPath = sorted([datapath + fileName for fileName in os.listdir(datapath)], reverse=True)
filesInDataPath = sorted(filesInDataPath)

In [None]:
assert(filesInDataPath[0]=='wiki-pages-text/wiki-001.txt')

Xapian build Index

In [None]:
x_db = xapian.WritableDatabase(dbpath, xapian.DB_CREATE_OR_OPEN)

Set termgenerator for indexing

In [None]:
index = xapian.TermGenerator()

Set Stemmer

In [None]:
index.set_stemmer(xapian.Stem('en'))

Build Index

In [None]:
indexCounter = 0
print('Time started at', datetime.datetime.now().time())
with tqdm(total=len(filesInDataPath)) as pbar:
    for shardFile in filesInDataPath:
        with open(shardFile, 'r') as openedFile:
            print('Now processing:', shardFile)
            for line in openedFile:
                indexCounter += 1
                # Processing line
                docID, sentenceID, text = line.split(' ', 2)
                try:
                    sentenceID = int(sentenceID)
                except Exception:
                    pass
                xapianDoc = xapian.Document()
                xapianDoc.set_data(text + ',,,' + docID +',,,' + str(sentenceID))
                index.set_document(xapianDoc)
                index.index_text(text)
                index.increase_termpos()
                x_db.replace_document(indexCounter, xapianDoc)
        x_db.commit()
        pbar.update(1)
x_db.close()

Check DB statistics

In [7]:
!xapian-delve $dbpath

UUID = aec569e4-d627-4bcd-91f6-b56749d28c15
number of documents = 25248397
average document length = 35.7647
document length lower bound = 1
document length upper bound = 7800
highest document id ever used = 25248397
has positional information = true


Search DB

In [3]:
TOP_RESULTS_LIMIT = 15

In [66]:
def search(dbpath, querystring, offset=0, pagesize=10):
    
    database = xapian.Database(dbpath)
    enquire = xapian.Enquire(database)
    query_string = querystring

    qp = xapian.QueryParser()
    stemmer = xapian.Stem("english")
    qp.set_stemmer(stemmer)
    qp.set_database(database)
    
    qp.set_stemming_strategy(xapian.QueryParser.STEM_SOME)
    query = qp.parse_query(query_string)
    #print "Parsed query is: %s" % str(query)

    # Find the top results for the query.
    enquire.set_query(query)
    matches = enquire.get_mset(0, TOP_RESULTS_LIMIT)

    # Display the results.
    #print(%i results found." % matches.get_matches_estimated()
    #print "Results 1-%i:" % matches.size()
    
    text_dictionary = {}
    for m in matches:
        print('RANK:', m.rank + 1)
        print('PERCENTAGE MATCH:', m.percent)
        print('DOC ID:', m.docid)
        print('DOC TXT:', m.document.get_data())
        decoded_text = m.document.get_data().decode('utf-8')
        text_dictionary[(decoded_text.split(",,,")[1],decoded_text.split(",,,")[2])] =decoded_text.split(",,,")[0]
                                                                                  

    return text_dictionary

In [67]:
%%time
query = "When in rome do as romans do"
matches = search(dbpath, query)

RANK: 1
PERCENTAGE MATCH: 100
DOC ID: 24496184
DOC TXT: b"`` When in Rome , do as the Romans do '' , a saying attributed to Ambrose .\n,,,When_in_Rome,,,3"
RANK: 2
PERCENTAGE MATCH: 93
DOC ID: 24431417
DOC TXT: b"That reply is said to have brought about the saying `` When in Rome , do as the Romans do . ''\n,,,When_in_Rome,_do_as_the_Romans_do,,,6"
RANK: 3
PERCENTAGE MATCH: 92
DOC ID: 24431413
DOC TXT: b'When in Rome , do as the Romans do -LRB- often shortened to when in Rome ... -RRB- or a later version when in Rome , do as the Pope does , a proverb attributed to Saint Ambrose , means that it is advisable to follow the conventions of the area in which you are residing or visiting .\n,,,When_in_Rome,_do_as_the_Romans_do,,,0'
RANK: 4
PERCENTAGE MATCH: 85
DOC ID: 24714837
DOC TXT: b"When in Rome Do as The Vandals is the first album by the Huntington Beach punk rock band The Vandals , released in 1984 by National Trust Records , Its title is a play on the phrase `` When in Rome , do as th

In [72]:
import pandas as pd

feature_db = pd.DataFrame({"claim": query_list,"evidence":tuple(matches.keys())})
feature_db
#.split("\n,,,")[1].split(",,,")[0]

Unnamed: 0,claim,evidence
0,When in rome do as romans do,"(When_in_Rome, 3)"
1,When in rome do as romans do,"(When_in_Rome,_do_as_the_Romans_do, 6)"
2,When in rome do as romans do,"(When_in_Rome,_do_as_the_Romans_do, 0)"
3,When in rome do as romans do,"(When_in_Rome_Do_as_The_Vandals, 0)"
4,When in rome do as romans do,"(Greta_Rana, 11)"
5,When in rome do as romans do,"(Peace_thru_Vandalism, 3)"
6,When in rome do as romans do,"(Anapodoton, 5)"
7,When in rome do as romans do,"(When_in_Rome,_do_as_the_Romans_do, 5)"
8,When in rome do as romans do,"(Roman_Party, 1)"
9,When in rome do as romans do,"(No_Apologies_-LRB-The_Eyeliners_album-RRB-, 1)"


In [73]:
feature_db['average_word_length'] = feature_db['claim'].apply(lambda x: (len(x)/len(x.split(' '))))
feature_db

Unnamed: 0,claim,evidence,average_word_length
0,When in rome do as romans do,"(When_in_Rome, 3)",4.0
1,When in rome do as romans do,"(When_in_Rome,_do_as_the_Romans_do, 6)",4.0
2,When in rome do as romans do,"(When_in_Rome,_do_as_the_Romans_do, 0)",4.0
3,When in rome do as romans do,"(When_in_Rome_Do_as_The_Vandals, 0)",4.0
4,When in rome do as romans do,"(Greta_Rana, 11)",4.0
5,When in rome do as romans do,"(Peace_thru_Vandalism, 3)",4.0
6,When in rome do as romans do,"(Anapodoton, 5)",4.0
7,When in rome do as romans do,"(When_in_Rome,_do_as_the_Romans_do, 5)",4.0
8,When in rome do as romans do,"(Roman_Party, 1)",4.0
9,When in rome do as romans do,"(No_Apologies_-LRB-The_Eyeliners_album-RRB-, 1)",4.0


In [98]:
def jaccard_coefficient(claim,evidence_list):
    intersection = len(list(set(claim).intersection(evidence_list)))
    print(list(set(claim).intersection(evidence_list)))
    union = (len(claim) + len(evidence_list)) - intersection
    return (1 - float(intersection / union))

In [91]:
feature_db['jaccard_similarity'] = list(map(lambda x,y: jaccard_coefficient(x,list(y)),feature_db['claim'],matches.values()))


['i', 'o', 'm', ' ', 'W', 's', 'r', 'n', 'd', 'a', 'e', 'h']
['i', 'o', 'm', ' ', 'W', 's', 'r', 'n', 'd', 'a', 'e', 'h']
['i', 'o', 'm', ' ', 'W', 's', 'r', 'n', 'd', 'a', 'e', 'h']
['i', 'o', 'm', ' ', 'W', 's', 'r', 'n', 'd', 'a', 'e', 'h']
['i', 'o', 'm', ' ', 'W', 's', 'r', 'n', 'd', 'a', 'e', 'h']
['i', 'o', 'm', ' ', 'W', 's', 'r', 'n', 'd', 'a', 'e', 'h']
['i', 'o', 'm', ' ', 'W', 's', 'r', 'n', 'd', 'a', 'e', 'h']
['o', 'i', 'm', ' ', 'W', 's', 'r', 'n', 'd', 'a', 'e', 'h']
['i', 'o', 'm', ' ', 's', 'r', 'n', 'd', 'a', 'e', 'h']
['i', 'o', 'm', ' ', 'W', 's', 'r', 'n', 'd', 'a', 'e', 'h']
['o', 'i', 'm', ' ', 'W', 's', 'r', 'n', 'd', 'a', 'e', 'h']
['i', 'o', 'm', ' ', 'W', 's', 'r', 'n', 'd', 'a', 'e', 'h']
['o', 'i', 'm', ' ', 's', 'r', 'n', 'd', 'a', 'e', 'h']
['i', 'o', 'm', ' ', 'W', 's', 'r', 'n', 'd', 'a', 'e', 'h']
['i', 'o', 'm', ' ', 's', 'r', 'n', 'd', 'a', 'e', 'h']


In [101]:
import warnings
warnings.filterwarnings('ignore')
feature_db

Unnamed: 0,claim,evidence,average_word_length,jaccard_similarity,spacy_similarity
0,When in rome do as romans do,"(When_in_Rome, 3)",4.0,0.869565,"[0.1127464875765488, 0.1127464875765488, 0.172..."
1,When in rome do as romans do,"(When_in_Rome,_do_as_the_Romans_do, 6)",4.0,0.891892,"[0.43955122683169295, 0.128560090572638, 0.051..."
2,When in rome do as romans do,"(When_in_Rome,_do_as_the_Romans_do, 0)",4.0,0.96,"[0.4589058202202472, 0.128560090572638, 0.1224..."
3,When in rome do as romans do,"(When_in_Rome_Do_as_The_Vandals, 0)",4.0,0.967213,"[0.4589058202202472, 0.128560090572638, 0.1224..."
4,When in rome do as romans do,"(Greta_Rana, 11)",4.0,0.956835,"[0.3149799664281064, 0.2798722897597501, 0.051..."
5,When in rome do as romans do,"(Peace_thru_Vandalism, 3)",4.0,0.876289,"[0.43955122683169295, 0.128560090572638, 0.122..."
6,When in rome do as romans do,"(Anapodoton, 5)",4.0,0.946903,"[0.3149799664281064, 0.12405169137778345, 0.17..."
7,When in rome do as romans do,"(When_in_Rome,_do_as_the_Romans_do, 5)",4.0,0.925466,"[0.43955122683169295, 0.128560090572638, 0.122..."
8,When in rome do as romans do,"(Roman_Party, 1)",4.0,0.960714,"[0.3149799664281064, 0.16262031465318408, 0.17..."
9,When in rome do as romans do,"(No_Apologies_-LRB-The_Eyeliners_album-RRB-, 1)",4.0,0.923567,"[0.23479753925819682, 0.23862515052215402, 0.1..."


In [102]:
import spacy
nlp = spacy.load('en_core_web_sm')
def spacy_similarity(sentence1, sentence_list):
    s1 = nlp(sentence1)
    similarity_scores = []
    for sentence in sentence_list:
        s2 = nlp(sentence)
        similarity_scores.append(s1.similarity(s2))
    return similarity_scores

In [104]:
feature_db['spacy_similarity'] = list(map(lambda x,y: spacy_similarity(x,y),feature_db['claim'],matches.values()))
feature_db['spacy_similarity']

0     [0.1127464875765488, 0.1127464875765488, 0.172...
1     [0.43955122683169295, 0.128560090572638, 0.051...
2     [0.4589058202202472, 0.128560090572638, 0.1224...
3     [0.4589058202202472, 0.128560090572638, 0.1224...
4     [0.3149799664281064, 0.2798722897597501, 0.051...
5     [0.43955122683169295, 0.128560090572638, 0.122...
6     [0.3149799664281064, 0.12405169137778345, 0.17...
7     [0.43955122683169295, 0.128560090572638, 0.122...
8     [0.3149799664281064, 0.16262031465318408, 0.17...
9     [0.23479753925819682, 0.23862515052215402, 0.1...
10    [0.43955122683169295, 0.128560090572638, 0.122...
11    [0.38096696980092926, 0.12247122522584558, 0.0...
12    [0.38096696980092926, 0.05196985430490302, 0.2...
13    [0.43955122683169295, 0.128560090572638, 0.122...
14    [0.0500609534784563, 0.1721324828701981, 0.314...
Name: spacy_similarity, dtype: object

In [39]:
import pandas as pd
evidence_df = pd.DataFrame(matches).transpose().reset_index()

In [41]:
#evidence_df = evidence_df.reset_index()
evidence_df.columns = ["index","text"]

TypeError: a bytes-like object is required, not 'str'