In [1]:
from bs4 import BeautifulSoup
import re
import os
from PorterStemmer import * 
from tqdm.notebook import tqdm
import snappy
import mmap
import math
import json
ps = PorterStemmer()

In [2]:
directory = 'subset'

In [3]:
delim = '''[ ',(){}.:;"’`\n]'''
def getTokensFromText(text):
    #words = re.split('''[ ',.:_;"’`\n]''',text)
    words = re.split(delim, text)
    res = []
    for w in words:
        if(len(w)>0):
            res.append(ps.stem(w.lower(),0,len(w)-1))
    return res

In [4]:
def processStopwords(filepath):
    stopwords = {}
    text = ''
    file = open(filepath,"r")
    lines = file.readlines()
    for line in lines:
        text += (line + " ")
    res = re.split(delim,text)
    temp = []
    for w in res:
        if len(w)>0:
            temp.append(w.lower())
    res = temp
    for w in res:
        stopwords[w] = 1
    return stopwords

In [5]:
stopwords = processStopwords('stopwords')

In [6]:
def checkStopwords(word,stopwords):
    if stopwords.get(word) is not None:
        return True
    else:
        return False

In [7]:
def getXMLtags(filepath):
    file = open(filepath,"r")
    lines = file.readlines()
    text = ''
    for line in lines:
        text += (line + " ")
    res = re.split(delim,text)
    temp = []
    for w in res:
        if len(w)>0:
            temp.append(w.lower())
    res = temp
    return res

In [57]:
# maps docnum to list of tokens for all files in directory
def process_directory(path,filearr):
    data = {}
    for filename in tqdm(filearr):
        full_path = os.path.join(path,filename)
        file = open(full_path, "r")
        contents = file.read()
        soup = BeautifulSoup(contents, 'html.parser')
        docs = soup.find_all('doc')

        for doc in docs:
            docnum = doc.find(xmlTags[0]).get_text().strip()
            res = []
            for tag in xmlTags:
                if tag == xmlTags[0]:
                    continue
                fields = doc.find_all(tag)
                for field in fields:
                    text = field.get_text()
                    words = re.split(delim, text)
                    for w in words:
                        if(len(w) > 0):
                            res.append(ps.stem(w.lower(), 0, len(w)-1))
            data[docnum] = res
    return data

In [59]:
xmlTags = getXMLtags('xmltags')

In [9]:
# gets vocabulary from the data and returns it in the form of a set
def getVocab(vocab, data,stopwords):
    tokens = []
    for doc,token_list in data.items():
        tokens += token_list
    tokens = set(tokens)
    for w in stopwords:
        tokens.discard(w)
    vocab.update(tokens)
    return vocab

In [10]:
def dec_to_binary(n):
    return bin(n).replace("0b", "")
def bin_to_dec(n):
    return int(n,2)

In [11]:
def gapEncodeList(arr):
    carry = arr[0]
    temp = [carry]
    for i in range(1,len(arr)):
        temp.append(arr[i]-arr[i-1])
    return temp
def undoGapEncode(arr):
    temp = [arr[0]]
    for i in range(1,len(arr)):
        prev = temp[-1]
        temp.append(arr[i]+prev)
    return temp

In [12]:
# generates inverted index from the given data
def getInvIdx(data):
    invidx = {}
    for doc, token_list in data.items():
        doc = docID_to_int[doc]
        for token in token_list:
            if token in vocab:
                if token in invidx.keys():
                    if(invidx[token][-1]!=doc):
                        invidx[token].append(doc)
                else:
                    invidx[token] = [doc]
    return invidx

In [13]:
def getIntersection(postings):
    ans = postings[0]
    for posting in postings:
        tempans = []
        n1 = len(ans)
        n2 = len(posting)
        i, j = 0, 0
        while i != n1 and j != n2:
            if ans[i] == posting[j]:
                tempans.append(ans[i])
                i += 1
                j += 1
            elif ans[i] < posting[j]:
                i += 1
            else:
                j += 1
        ans = tempans
    return ans

In [14]:
def doGapEncoding(invidx):
    for token, posting_list in invidx.items():
        invidx[token] = gapEncodeList(posting_list)
    return invidx

In [15]:
def chunkstring(string, length):
    return [string[0+i:length+i] for i in range(0, len(string), length)]

In [16]:
def chunks(l, n):
    return [l[i:i + n] for i in range(0, len(l), n)] 

In [17]:
def padding(x):
    n = len(x)
    return '0'*(8-n) + x

In [18]:
# does c2 encoding on a number
def c2_encoding(x):
    if(x == 1):
        return "0"

    def l(x):
        return len(dec_to_binary(x))

    def U(n):
        if(n <= 0):
            return '0'
        return '1' * (n-1) + '0'

    def lsb(a, b):
        binary = dec_to_binary(a)
        return binary[-b:]
    lx = l(x)
    llx = l(lx)
    t1 = U(llx)
    t2 = lsb(lx, llx-1)
    t3 = lsb(x, lx-1)
    return t1+t2+t3


# does c2 encoding on a list of numbers
def c2_encode_list(arr):
    ans = ''
    for e in arr:
        ans += c2_encoding(e)
    return ans


# does c2 decoding on a list of numbers
def c2_decode(x):
    i = 0
    n = len(x)
    res = []
    while(i < n):
        t1 = ''
        while(x[i] != '0'):
            t1 += x[i]
            i += 1
        llx = len(t1) + 1
        i += 1
        lx_bin = '1' + x[i:i+llx-1]
        i += (llx-1)
        lx = bin_to_dec(lx_bin)
        num_bin = '1' + x[i:i+lx-1]
        i += (lx-1)
        num = bin_to_dec(num_bin)
        res.append(num)
    return res

In [19]:
# makes posting list file for C2 compression
def dumpINV(invidx,i):
    dictionary = {}
    fname = f"index{i}.idx"
    file = open(os.path.join("tempfiles",fname), "wb")
    offset = 0
    for term, posting_list in invidx.items():
        allbytes = c2_encode_list(posting_list)
        sz = len(allbytes)
        skipbits = 0
        if(sz % 8 != 0):
            allbytes = '0'*(8-sz % 8) + allbytes
            skipbits = (8-sz % 8)
        chunks = chunkstring(allbytes, 8)
        temp = []
        for chunk in chunks:
            temp.append(bin_to_dec(chunk))
        file.write(bytearray(temp))
        length = len(chunks)
        dictionary[term] = [offset, length, skipbits]
        offset += length
    file.close()
    return dictionary

In [20]:
# get posting list using C2
def getPosting(reader,token,tempdict):
    start = tempdict[token][0]
    sz = tempdict[token][1]
    skipbits = tempdict[token][2]
    reader.seek(start)
    allbytes = ''
    for i in range(0, sz):
        temp = reader.read(1)
        val = int.from_bytes(temp, "big")
        allbytes += padding(dec_to_binary(val))
    i = skipbits
    allbytes = allbytes[i:]
    doclist = c2_decode(allbytes)
    return undoGapEncode(doclist)

In [21]:
#data = process_directory(directory,os.listdir(directory))
allfiles = os.listdir(directory)

In [22]:
def mapDocIDs(docIDs,numdocs):
    for doc in docIDs:
        docID_to_int[doc] = numdocs
        int_to_docID[numdocs] = doc
        numdocs+=1
    return numdocs

In [60]:
FILES_PER_SPLIT = 2
int_to_docID = {}
numfiles = len(chunks(allfiles,FILES_PER_SPLIT))
numdocs = 1
docID_to_int = {}
int_to_docID = {}
vocab = set()
for i,chunked in enumerate(chunks(allfiles,FILES_PER_SPLIT)):
    data = process_directory(directory,chunked)
    numdocs = mapDocIDs(data,numdocs)
    vocab = getVocab(vocab,data,stopwords)
    invidx = getInvIdx(data)
    invidx = doGapEncoding(invidx)
    tempdict = dumpINV(invidx,i)
    with open(os.path.join("tempfiles",f"dict{i}.json"), "w") as outfile:
        json.dump(tempdict,outfile)

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [24]:
def loadDictFiles():
    res = []
    for i in range(numfiles):
        with open(os.path.join("tempfiles",f"dict{i}.json"), "r") as jsonfile:
            tempinvidx = json.load(jsonfile)
            res.append(tempinvidx)
    return res

In [25]:
def loadBinFiles():
    res = []
    for i in range(numfiles):
        f = open(os.path.join("tempfiles",f"index{i}.idx"),"rb")
        res.append(f)
    return res

In [26]:
def closeBinFiles(allidx):
    for i in range(numfiles):
        allidx[i].close()

In [37]:
invidx = {}
alldicts = loadDictFiles()
allidx = loadBinFiles()
for w in vocab:
    postings = []
    for i,tempdict in enumerate(alldicts):
        if w in tempdict:
            #f = open(f"index{i}.idx","rb")
            reader = allidx[i]
            postings.extend(getPosting(reader,w,alldicts[i]))
            #f.close()
    invidx[w] = postings

In [62]:
arr = invidx["african"]
for e in arr:
    print(int_to_docID[e])

AP880504-0011
AP880504-0014
AP880504-0111
AP880504-0037
AP880504-0019
AP880504-0070
AP880504-0018
AP880504-0016


In [63]:
def dumpC2():
    alldicts = loadDictFiles()
    allidx = loadBinFiles()
    dictionary = {}
    fname =  'tempc2.idx'
    file = open(fname, "wb")
    offset = 0
    for w in vocab:
        postings = []
        for i,tempdict in enumerate(alldicts):
            if w in tempdict:
                reader = allidx[i]
                postings.extend(getPosting(reader,w,alldicts[i]))
        postings = gapEncodeList(postings)
        allbytes = c2_encode_list(postings)
        sz = len(allbytes)
        skipbits = 0
        if(sz % 8 != 0):
            allbytes = '0'*(8-sz % 8) + allbytes
            skipbits = (8-sz % 8)
        chunks = chunkstring(allbytes, 8)
        temp = []
        for chunk in chunks:
            temp.append(bin_to_dec(chunk))
        file.write(bytearray(temp))
        length = len(chunks)
        dictionary[w] = [offset, length, skipbits]
        offset += length
    closeBinFiles(allidx)
    return dictionary

In [64]:
dictc2 = dumpC2()

In [65]:
dictc2["sanction"]

[64418, 9, 1]

In [66]:
def getPostings_c2(tokens, dictionary, filename):
    f = open(filename, "rb")
    res = []
    for token in tokens:
        start = dictionary[token][0]
        sz = dictionary[token][1]
        skipbits = dictionary[token][2]
        f.seek(start)
        allbytes = ''
        for i in range(0,sz):
            temp = f.read(1)
            val = int.from_bytes(temp,"big")
            allbytes+=padding(dec_to_binary(val))
        i = (skipbits)
        allbytes = allbytes[i:]
        doclist = c2_decode(allbytes)
        res.append(undoGapEncode(doclist))
    return res

In [74]:
postings = getPostings_c2(getTokensFromText("South African Sanctions"),dictc2,'tempc2.idx')
docids = getIntersection(postings)
for d in docids:
    print(int_to_docID[d])

AP880504-0011
AP880504-0025
AP880504-0173


In [71]:
getTokensFromText("africans")

['african']

In [72]:
def delTempFiles():
    for i in range(numfiles):
        if os.path.exists(f"dict{i}.json"):
            os.remove(f"dict{i}.json")
        if os.path.exists(f"index{i}.idx"):
            os.remove(f"index{i}.idx")

In [189]:
delTempFiles()

PermissionError: [WinError 32] The process cannot access the file because it is being used by another process: 'index0.idx'