In [1]:
from bs4 import BeautifulSoup
import re
import os
from PorterStemmer import * 
from tqdm.notebook import tqdm
import snappy
import mmap
import math
import json
# Single PorterStemmer instance; the tokenizing code in this notebook calls ps.stem on it.
ps = PorterStemmer()

In [2]:
# Directory whose files are parsed and indexed (passed to process_directory).
directory = 'debug'

In [3]:
# Regex character class of token separators: space, newline, quote characters,
# round and curly brackets, and the punctuation marks , . : ;
delim = '''[ ',(){}.:;"’`\n]'''
def getTokensFromText(text):
    """Split ``text`` on ``delim`` and return the lower-cased, stemmed tokens.

    Empty fragments produced by adjacent delimiters are dropped. Every other
    fragment is lower-cased and handed to ``ps.stem``.

    Args:
        text: Raw text to tokenize.

    Returns:
        List of stemmed tokens in order of appearance (duplicates are kept).
    """
    res = []
    for w in re.split(delim, text):
        if not w:
            continue
        lowered = w.lower()
        # The end index has to describe the string that is actually passed to
        # the stemmer: lower-casing may change the length of a word
        # (for example '\u0130'.lower() is two characters long).
        res.append(ps.stem(lowered, 0, len(lowered) - 1))
    return res

In [4]:
def processStopwords(filepath):
    """Read a stopword file and return its words as a lookup dictionary.

    The file content is split on spaces, newlines and the punctuation listed
    in the regular expression below; empty fragments are ignored and every
    word is lower-cased.

    Args:
        filepath: Path of the text file that contains the stopwords.

    Returns:
        dict mapping each lower-cased stopword to ``1``.
    """
    # The context manager closes the file even if reading fails.
    with open(filepath, "r") as file:
        text = file.read()
    stopwords = {}
    for w in re.split('''[ ',.:_;"’`\n]''', text):
        if len(w) > 0:
            stopwords[w.lower()] = 1
    return stopwords

In [5]:
# Load the stopword lookup from the file named 'stopwords' (path is relative to the working directory).
stopwords = processStopwords('stopwords')

In [6]:
def checkStopwords(word,stopwords):
    """Tell whether ``word`` has a non-None entry in the ``stopwords`` mapping.

    Args:
        word: Word to look up.
        stopwords: Mapping that offers ``get`` (for example the dict returned
            by ``processStopwords``).

    Returns:
        True if ``stopwords.get(word)`` is not None, otherwise False.
    """
    return stopwords.get(word) is not None

In [7]:
def getXMLtags(filepath):
    """Read a file and return its lower-cased, ``delim``-separated words.

    Args:
        filepath: Path of the text file to read.

    Returns:
        List of the non-empty fragments obtained by splitting the file
        content on ``delim``, each lower-cased, in file order.
    """
    # The context manager closes the file even if reading fails.
    with open(filepath, "r") as file:
        text = file.read()
    return [w.lower() for w in re.split(delim, text) if len(w) > 0]

In [8]:
# maps docnum to list of tokens for all files in directory
def process_directory(dir_path):
    """Tokenize every document found in the files of ``dir_path``.

    Each file is parsed with BeautifulSoup's ``html.parser``. For every
    ``doc`` element the stripped text of its ``docno`` element is used as the
    key, and the tokens of all its ``text`` elements are concatenated into
    one list. A document whose ``docno`` was already seen overwrites the
    earlier entry.

    Args:
        dir_path: Directory whose entries are all opened as text files.

    Returns:
        dict mapping document number (str) to its list of stemmed tokens.

    Raises:
        AttributeError: if a ``doc`` element has no ``docno`` child.
    """
    data = {}
    for filename in tqdm(os.listdir(dir_path)):
        full_path = os.path.join(dir_path, filename)
        # Close each file as soon as it has been read instead of leaking one
        # open handle per input file.
        with open(full_path, "r") as file:
            contents = file.read()
        soup = BeautifulSoup(contents, 'html.parser')

        for doc in soup.find_all('doc'):
            docnum = doc.find('docno').get_text().strip()
            tokens = []
            for field in doc.find_all('text'):
                # Reuse the notebook's tokenizer so documents and queries are
                # always split and stemmed the same way.
                tokens.extend(getTokensFromText(field.get_text()))
            data[docnum] = tokens
    return data

In [9]:
# Parse every file under `directory` into a {docno: [stemmed tokens]} mapping.
data = process_directory(directory)

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




In [10]:
# gets vocabulary from the data and returns it in the form of a set
def getVocab(data,stopwords):
    """Collect the distinct tokens of all documents, minus the stopwords.

    Args:
        data: Mapping of document name to its list of tokens.
        stopwords: Iterable of words to leave out (a dict contributes its keys).

    Returns:
        set of every token that occurs in ``data`` and is not in ``stopwords``.
    """
    vocabulary = set()
    for token_list in data.values():
        vocabulary.update(token_list)
    vocabulary.difference_update(stopwords)
    return vocabulary

In [11]:
# takes a list of document names and maps them to integers and returns the map
def mapDocIDs(docIDs):
    """Number the given document names consecutively, starting at 1.

    Args:
        docIDs: Iterable of document names (iterating a dict yields its keys).

    Returns:
        Tuple ``(docID_to_int, int_to_docID)``: the name -> number mapping and
        its inverse number -> name mapping.
    """
    docID_to_int = {}
    int_to_docID = {}
    for number, doc in enumerate(docIDs, start=1):
        docID_to_int[doc] = number
        int_to_docID[number] = doc
    return docID_to_int, int_to_docID

def makeDocIdMap(full_path):
    """Build the document-number maps for the documents of a single file.

    The file is parsed with BeautifulSoup's ``html.parser``; the stripped
    text of the ``docno`` element of every ``doc`` element is collected in
    file order and passed to ``mapDocIDs``.

    Args:
        full_path: Path of the file to parse.

    Returns:
        The ``(docID_to_int, int_to_docID)`` tuple produced by ``mapDocIDs``.

    Raises:
        AttributeError: if a ``doc`` element has no ``docno`` child.
    """
    # The context manager closes the file even if reading fails.
    with open(full_path, "r") as file:
        contents = file.read()
    soup = BeautifulSoup(contents, 'html.parser')
    docIDs = [doc.find('docno').get_text().strip() for doc in soup.find_all('doc')]
    return mapDocIDs(docIDs)

In [12]:
# Number the documents in the iteration order of `data` (numbers start at 1).
docID_to_int, int_to_docID = mapDocIDs(data)
# Every distinct token of the collection except the stopwords.
vocab = getVocab(data,stopwords)

In [13]:
# generates inverted index from the given data
def getInvIdx(data):
    """Build the inverted index: token -> list of document numbers.

    Reads the module-level ``docID_to_int`` (document name -> number) and
    ``vocab`` (tokens to index). Tokens outside ``vocab`` are skipped. A
    document number is appended to a token's list only if it differs from
    the list's last entry, so repeated occurrences inside one document
    produce a single posting.

    Args:
        data: Mapping of document name to its list of tokens.

    Returns:
        dict mapping each indexed token to its list of document numbers.
    """
    invidx = {}
    for doc_name, token_list in data.items():
        doc_number = docID_to_int[doc_name]
        for token in token_list:
            if token not in vocab:
                continue
            postings = invidx.get(token)
            if postings is None:
                invidx[token] = [doc_number]
            elif postings[-1] != doc_number:
                postings.append(doc_number)
    return invidx

In [14]:
def gapEncodeList(arr):
    """Return the gap encoding of ``arr`` without modifying it.

    The first element is kept as is; every following element is replaced by
    its difference to the element before it.

    Args:
        arr: Sequence of numbers (for posting lists: ascending document numbers).

    Returns:
        New list ``[arr[0], arr[1]-arr[0], arr[2]-arr[1], ...]``; an empty
        list for empty input.
    """
    if len(arr) == 0:
        # Nothing to encode; avoids an IndexError on arr[0].
        return []
    gaps = [arr[0]]
    for i in range(1, len(arr)):
        gaps.append(arr[i] - arr[i - 1])
    return gaps

In [15]:
def undoGapEncode(arr):
    """Invert ``gapEncodeList``: turn a list of gaps back into absolute values.

    Args:
        arr: Gap-encoded sequence (first element absolute, the rest
            differences to the previous value).

    Returns:
        New list of running sums ``[arr[0], arr[0]+arr[1], ...]``; an empty
        list for empty input. ``arr`` itself is not modified.
    """
    if len(arr) == 0:
        # Nothing to decode; avoids an IndexError on arr[0].
        return []
    values = [arr[0]]
    for i in range(1, len(arr)):
        values.append(values[-1] + arr[i])
    return values

In [16]:
def dec_to_binary(n):
    """Return the binary digits of integer ``n`` as a string without the '0b' prefix."""
    prefixed = bin(n)
    return prefixed.replace("0b", "")

In [17]:
def bin_to_dec(n):
    """Parse the string ``n`` as a base-2 number and return it as an int."""
    return int(n, base=2)

In [18]:
# Build the inverted index; getInvIdx reads the globals docID_to_int and vocab.
invidx = getInvIdx(data)

In [19]:
# Sanity check: show the posting lists (absolute document numbers, not yet
# gap-encoded at this point) of the stemmed query terms.
res = []
for token in getTokensFromText("South African Sanctions"):
    res.append((invidx[token]))
res

[[1,
  7,
  8,
  9,
  14,
  30,
  31,
  49,
  55,
  60,
  67,
  69,
  81,
  98,
  103,
  131,
  142,
  159,
  169,
  175,
  195,
  204,
  214,
  216,
  229,
  234,
  235,
  239,
  244,
  252,
  270,
  272,
  284,
  304,
  333,
  334,
  339,
  340,
  347,
  349,
  350,
  369,
  383,
  402,
  406,
  411,
  414,
  417,
  426,
  434,
  435,
  444,
  445,
  452,
  456,
  459,
  466,
  469,
  475],
 [9, 19, 60, 108, 235, 284, 304, 434],
 [56, 79, 434]]

In [20]:
def doGapEncoding(invidx):
    """Gap-encode every posting list of ``invidx`` in place.

    Args:
        invidx: Mapping of token to posting list; each value is replaced by
            the result of ``gapEncodeList``.

    Returns:
        The same (now modified) ``invidx`` object.
    """
    # Only existing keys are reassigned, so the dict keeps its size while
    # it is being iterated.
    for token in invidx.keys():
        invidx[token] = gapEncodeList(invidx[token])
    return invidx

In [21]:
# Replace every posting list by its gap encoding.
# NOTE: doGapEncoding rewrites the dict in place, so running this cell a
# second time would gap-encode the already encoded lists again.
invidx = doGapEncoding(invidx)

In [22]:
# find list of common docs from multiple lists
def getIntersection(postings):
    """Return the elements common to all the given sorted lists.

    The lists are merged pairwise with two cursors, which relies on every
    list being sorted in ascending order.

    Args:
        postings: List of sorted posting lists.

    Returns:
        New list with the values present in every posting list; an empty
        list when ``postings`` is empty.
    """
    if len(postings) == 0:
        # No lists at all: nothing can be common (avoids IndexError below).
        return []
    # Copy so the caller's first list is never returned or aliased.
    ans = list(postings[0])
    for posting in postings[1:]:
        if not ans:
            # The intersection is already empty; further merging cannot add to it.
            break
        common = []
        i, j = 0, 0
        n1, n2 = len(ans), len(posting)
        while i < n1 and j < n2:
            if ans[i] == posting[j]:
                common.append(ans[i])
                i += 1
                j += 1
            elif ans[i] < posting[j]:
                i += 1
            else:
                j += 1
        ans = common
    return ans

In [23]:
def padding(x):
    """Left-pad the string ``x`` with '0' to a width of 8 characters.

    Strings that are already 8 or more characters long are returned unchanged.
    """
    return x.rjust(8, '0')

In [24]:
def chunkstring(string, length):
    """Cut ``string`` into consecutive pieces of ``length`` characters.

    The last piece is shorter when ``len(string)`` is not a multiple of
    ``length``; an empty string yields an empty list.
    """
    pieces = []
    for start in range(0, len(string), length):
        pieces.append(string[start:start + length])
    return pieces

In [25]:
def c2_encoding(x):
    """Encode the positive integer ``x`` as a variable-length bit string.

    With ``lx`` the number of binary digits of ``x`` and ``llx`` the number
    of binary digits of ``lx``, the code is the concatenation of

    * ``llx - 1`` ones followed by a single zero (``llx`` in unary),
    * the binary digits of ``lx`` without its leading 1,
    * the binary digits of ``x`` without its leading 1.

    The value 1 is encoded as the single bit "0".

    Args:
        x: Integer >= 1.

    Returns:
        str made of the characters '0' and '1'.

    Raises:
        ValueError: if ``x`` is smaller than 1.
    """
    if x < 1:
        raise ValueError("c2_encoding expects a positive integer, got %r" % (x,))
    if x == 1:
        return "0"
    # Digit counts are taken from the exact binary representation rather
    # than floor(log2(x)) + 1: the float logarithm rounds up for large values
    # such as 2**60 - 1 and would then report one digit too many.
    x_bits = bin(x)[2:]
    lx = len(x_bits)
    lx_bits = bin(lx)[2:]
    llx = len(lx_bits)
    unary_llx = '1' * (llx - 1) + '0'
    # Both numbers start with a 1 bit, which the decoder re-adds, so only
    # the remaining digits are stored.
    return unary_llx + lx_bits[1:] + x_bits[1:]

In [26]:
def c2_encode_list(arr):
    """Concatenate the ``c2_encoding`` bit strings of all elements of ``arr``.

    Returns:
        One str of '0'/'1' characters; the empty string for empty input.
    """
    return ''.join(map(c2_encoding, arr))

In [27]:
def c2_decode(x):
    """Decode a concatenation of ``c2_encoding`` codes back into numbers.

    Each code is read in three steps: a run of non-'0' characters ended by a
    '0' gives ``llx``; the next ``llx - 1`` bits (with a leading 1 put back)
    give ``lx``; the next ``lx - 1`` bits (with a leading 1 put back) give
    the number itself.

    Args:
        x: Bit string made of '0' and '1' characters.

    Returns:
        List of the decoded integers in order.
    """
    decoded = []
    pos = 0
    total = len(x)
    while pos < total:
        # Unary part: count the characters in front of the terminating '0'.
        run = 0
        while x[pos] != '0':
            run += 1
            pos += 1
        pos += 1  # step over the terminating '0'
        llx = run + 1

        # Length field: llx - 1 stored bits, leading 1 restored.
        length_end = pos + llx - 1
        lx = bin_to_dec('1' + x[pos:length_end])
        pos = length_end

        # Value field: lx - 1 stored bits, leading 1 restored.
        value_end = pos + lx - 1
        decoded.append(bin_to_dec('1' + x[pos:value_end]))
        pos = value_end
    return decoded

In [28]:
def dumpFiles_c2(invidx, filename="debugc2.idx"):
    """Write the compressed posting lists to a binary file.

    Each posting list is encoded with ``c2_encode_list``; the bit string is
    left-padded with zeros to a whole number of bytes and written out. The
    number of padding bits is recorded because a leading '0' would otherwise
    be decoded as a value.

    Args:
        invidx: Mapping of term to its (gap-encoded) posting list.
        filename: Path of the index file to create or overwrite. Defaults to
            "debugc2.idx".

    Returns:
        dict mapping each term to ``[offset, length, skipbits]``: the byte
        offset of its data in the file, the number of bytes, and the number
        of padding bits at the start of the first byte.
    """
    dictionary = {}
    offset = 0
    # The context manager closes the file even if encoding a list fails.
    with open(filename, "wb") as file:
        for term, posting_list in invidx.items():
            bits = c2_encode_list(posting_list)
            # Zero when the code already fills whole bytes.
            skipbits = (8 - len(bits) % 8) % 8
            bits = '0' * skipbits + bits
            payload = bytearray(bin_to_dec(chunk) for chunk in chunkstring(bits, 8))
            file.write(payload)
            length = len(payload)
            dictionary[term] = [offset, length, skipbits]
            offset += length
    return dictionary

In [29]:
# Write the compressed index to disk and keep the term -> [offset, length, skipbits] dictionary.
dictionary_c2 = dumpFiles_c2(invidx)

In [30]:
# Inspect the [offset, length, skipbits] entry of one term.
dictionary_c2["african"]

[28777, 10, 0]

In [31]:
def getPostings_c2(tokens, dictionary, filename):
    """Read and decode the posting lists of ``tokens`` from an index file.

    For every token the bytes described by its dictionary entry are read,
    expanded to a bit string, stripped of the leading padding bits, decoded
    with ``c2_decode`` and converted from gaps back to document numbers.

    Args:
        tokens: Terms to look up.
        dictionary: Mapping of term to ``[offset, length, skipbits]``.
        filename: Path of the index file.

    Returns:
        List with one posting list (document numbers) per token, in the
        order of ``tokens``.

    Raises:
        KeyError: if a token has no entry in ``dictionary``.
    """
    res = []
    # The context manager closes the index file when done or on error.
    with open(filename, "rb") as f:
        for token in tokens:
            entry = dictionary[token]
            start = entry[0]
            sz = entry[1]
            skipbits = entry[2]
            f.seek(start)
            # One read for the whole list; iterating bytes yields ints.
            raw = f.read(sz)
            allbytes = ''.join(padding(dec_to_binary(byte)) for byte in raw)
            doclist = c2_decode(allbytes[skipbits:])
            res.append(undoGapEncode(doclist))
    return res
# Read the posting lists of the stemmed query terms back from the index file.
getPostings_c2(getTokensFromText("south african sanction"),dictionary_c2,'debugc2.idx')

[[1,
  7,
  8,
  9,
  14,
  30,
  31,
  49,
  55,
  60,
  67,
  69,
  81,
  98,
  103,
  131,
  142,
  159,
  169,
  175,
  195,
  204,
  214,
  216,
  229,
  234,
  235,
  239,
  244,
  252,
  270,
  272,
  284,
  304,
  333,
  334,
  339,
  340,
  347,
  349,
  350,
  369,
  383,
  402,
  406,
  411,
  414,
  417,
  426,
  434,
  435,
  444,
  445,
  452,
  456,
  459,
  466,
  469,
  475],
 [9, 19, 60, 108, 235, 284, 304, 434],
 [56, 79, 434]]

In [32]:
# Conjunctive query: intersect the posting lists of all query terms and
# print the names of the documents that contain every term.
postings = getPostings_c2(getTokensFromText("South African Sanctions"),dictionary_c2,'debugc2.idx')
docids = getIntersection(postings)
for d in docids:
    print(int_to_docID[d])

AP880214-0053


In [33]:
def reconstruct(dictionary,filename):
    """Decode the gap lists of all terms from an index file.

    Unlike ``getPostings_c2`` the decoded lists are returned as stored,
    i.e. still gap-encoded.

    Args:
        dictionary: Mapping of term to ``[offset, length, skipbits]``.
        filename: Path of the index file.

    Returns:
        dict mapping every term of ``dictionary`` to its decoded gap list.
    """
    reconidx = {}
    # The context manager closes the index file when done or on error.
    with open(filename, "rb") as f:
        for token in dictionary.keys():
            entry = dictionary[token]
            start = entry[0]
            sz = entry[1]
            skipbits = entry[2]
            f.seek(start)
            # One read for the whole list; iterating bytes yields ints.
            raw = f.read(sz)
            allbytes = ''.join(padding(dec_to_binary(byte)) for byte in raw)
            reconidx[token] = c2_decode(allbytes[skipbits:])
    return reconidx

In [34]:
# Decode the gap list of every term back from the index file.
reconidx = reconstruct(dictionary_c2,"debugc2.idx")

In [35]:
# Round-trip check: the lists decoded from disk must equal the in-memory gap-encoded index.
invidx == reconidx

True

In [36]:
# Turn the in-memory gap lists of the query terms back into document numbers.
res = []
for token in getTokensFromText("South African Sanctions"):
    res.append(undoGapEncode(invidx[token]))
res

[[1,
  7,
  8,
  9,
  14,
  30,
  31,
  49,
  55,
  60,
  67,
  69,
  81,
  98,
  103,
  131,
  142,
  159,
  169,
  175,
  195,
  204,
  214,
  216,
  229,
  234,
  235,
  239,
  244,
  252,
  270,
  272,
  284,
  304,
  333,
  334,
  339,
  340,
  347,
  349,
  350,
  369,
  383,
  402,
  406,
  411,
  414,
  417,
  426,
  434,
  435,
  444,
  445,
  452,
  456,
  459,
  466,
  469,
  475],
 [9, 19, 60, 108, 235, 284, 304, 434],
 [56, 79, 434]]

In [37]:
import random
import numpy as np
# Randomized round-trip check of the codec: encoding and then decoding a
# pair of values from 1..24 must give the pair back. Any counterexample is
# printed; no output means every trial passed.
# 1000 * 25 * 10 trials, the same count the nested loops used to produce.
for _ in range(1000 * 25 * 10):
    randomlist = random.choices(range(1, 25),k=2)
    var = (c2_decode(c2_encode_list(randomlist)) == randomlist)
    if not var:
        print(randomlist)