In [1]:
from bs4 import BeautifulSoup
import re
import os
from PorterStemmer import * 
from tqdm.notebook import tqdm
import snappy
import mmap
import math
import json
ps = PorterStemmer()

In [2]:
path = 'data/ap880214'
#directory = 'tipster-ap-frac'
directory = 'debug'

In [3]:
# for filename in os.listdir(directory):
#     process_file(os.path.join(directory,filename))

In [4]:
delim = '''[ ',(){}.:;"’`\n]'''
def getTokensFromText(text):
    """Tokenise ``text`` and return the list of stemmed, lower-cased tokens.

    The text is split on the module-level ``delim`` pattern; empty fragments
    (produced by adjacent delimiters) are dropped and every remaining piece is
    lower-cased and passed through the module-level Porter stemmer ``ps``.
    """
    return [
        ps.stem(piece.lower(), 0, len(piece) - 1)
        for piece in re.split(delim, text)
        if len(piece) > 0
    ]

In [5]:
def processStopwords(filepath):
    """Load a stopword file and return a dict mapping each stopword to 1.

    Args:
        filepath: path of a text file containing stopwords separated by
            whitespace and/or punctuation.

    Returns:
        dict whose keys are the lower-cased stopwords; every value is 1, so
        the dict is effectively used as a set.
    """
    # The context manager guarantees the handle is closed even if reading fails.
    with open(filepath, "r") as file:
        text = "".join(line + " " for line in file)
    stopwords = {}
    for w in re.split('''[ ',.:_;"’`\n]''', text):
        if len(w) > 0:
            stopwords[w.lower()] = 1
    return stopwords

In [6]:
stopwords = processStopwords('stopwords')

In [7]:
def checkStopwords(word,stopwords):
    """Return True when ``word`` maps to a non-None value in ``stopwords``, else False."""
    return stopwords.get(word) is not None

In [8]:
def getXMLtags(filepath):
    """Read a tag-list file and return its entries as lower-cased strings.

    Args:
        filepath: path of a text file listing tag names separated by
            whitespace and/or punctuation.

    Returns:
        list of the non-empty, lower-cased names in file order.
    """
    # The context manager guarantees the handle is closed even if reading fails.
    with open(filepath, "r") as file:
        text = "".join(line + " " for line in file)
    return [w.lower() for w in re.split('''[ ',.:_;"’`\n]''', text) if len(w) > 0]

In [9]:
getXMLtags('xmltags')

['docno', 'head', 'text']

In [10]:
# maps docnum to list of tokens for all files in directory
def process_directory(dir_path):
    """Parse every file in ``dir_path`` and map each document number to its tokens.

    Each file is parsed with BeautifulSoup; for every ``<doc>`` element the
    text of all ``<text>`` children is split on punctuation/whitespace,
    lower-cased and stemmed with the module-level ``ps`` stemmer.

    Args:
        dir_path: directory whose entries are all treated as corpus files.

    Returns:
        dict mapping the stripped ``<docno>`` text to the list of stemmed tokens.
    """
    data = {}
    for filename in tqdm(os.listdir(dir_path)):
        full_path = os.path.join(dir_path, filename)
        # Close each file as soon as it is read; the original leaked one
        # handle per corpus file.
        with open(full_path, "r") as file:
            contents = file.read()
        soup = BeautifulSoup(contents, 'html.parser')

        for doc in soup.find_all('doc'):
            docnum = doc.find('docno').get_text().strip()
            res = []
            for field in doc.find_all('text'):
                words = re.split('''[ ',.:_;"’`\n]''', field.get_text())
                for w in words:
                    if len(w) > 0:
                        res.append(ps.stem(w.lower(), 0, len(w) - 1))
            data[docnum] = res
    return data

In [11]:
# maps docnum to list of tokens in a file
def process_file(full_path):
    """Parse one corpus file and map each document number to its tokens.

    For every ``<doc>`` element the text of all ``<text>`` children is split
    on punctuation/whitespace, lower-cased and stemmed with the module-level
    ``ps`` stemmer.

    Args:
        full_path: path of the corpus file.

    Returns:
        dict mapping the stripped ``<docno>`` text to the list of stemmed tokens.
    """
    # Close the file as soon as it is read; the original never closed it.
    with open(full_path, "r") as file:
        contents = file.read()
    soup = BeautifulSoup(contents, 'html.parser')
    data = {}
    for doc in soup.find_all('doc'):
        docnum = doc.find('docno').get_text().strip()
        res = []
        for field in doc.find_all('text'):
            words = re.split('''[ ',.:_;"’`\n]''', field.get_text())
            for w in words:
                if len(w) > 0:
                    res.append(ps.stem(w.lower(), 0, len(w) - 1))
        data[docnum] = res
    return data

In [12]:
data = process_file(path)

In [20]:
#data = process_directory(directory)

In [21]:
allw = data['AP880214-0053']
getTokensFromText("South African Sanctions")
'sanction' in allw

True

In [13]:
# gets vocabulary from the data and returns it in the form of a set
def getVocab(data,stopwords):
    """Return the set of all tokens appearing in ``data`` minus the stopwords.

    ``data`` maps document ids to token lists; ``stopwords`` is any iterable
    of words to exclude (iterating a dict yields its keys).
    """
    vocabulary = set()
    for token_list in data.values():
        vocabulary.update(token_list)
    vocabulary.difference_update(stopwords)
    return vocabulary

In [14]:
# takes a list of document names and maps them to integers and returns the map
def mapDocIDs(docIDs):
    """Number the given document names from 1 and return both lookup directions.

    Returns:
        (docID_to_int, int_to_docID): name -> number and number -> name dicts,
        numbered in iteration order of ``docIDs``.
    """
    docID_to_int = {}
    int_to_docID = {}
    for number, name in enumerate(docIDs, start=1):
        docID_to_int[name] = number
        int_to_docID[number] = name
    return docID_to_int, int_to_docID

def makeDocIdMap(full_path):
    """Build the document-name/integer maps for every ``<doc>`` in one file.

    Args:
        full_path: path of the corpus file.

    Returns:
        The ``(docID_to_int, int_to_docID)`` pair produced by ``mapDocIDs``
        for the stripped ``<docno>`` values, in file order.
    """
    # Close the file as soon as it is read; the original never closed it.
    with open(full_path, "r") as file:
        contents = file.read()
    soup = BeautifulSoup(contents, 'html.parser')
    docIDs = [doc.find('docno').get_text().strip() for doc in soup.find_all('doc')]
    return mapDocIDs(docIDs)

In [15]:
docID_to_int, int_to_docID = mapDocIDs(data)
vocab = getVocab(data,stopwords)

In [16]:
vocab

{'terror',
 'fought',
 '$82',
 'lose',
 '1945',
 'pre-packag',
 'nose',
 'guarante',
 'temperatur',
 'eyewit',
 'holist',
 'immun',
 'injuri',
 'one-hour',
 'uninterest',
 'welfar',
 'subcommitte',
 'breast',
 'sirri',
 '200',
 'nomine',
 'citat',
 'extrem',
 'worri',
 'nowher',
 'hylton',
 'administr',
 'tax-cut',
 've',
 'also',
 'lawsuit',
 'card',
 'success',
 'reach',
 'panama',
 'grab',
 'ever',
 'feather-clad',
 'corazon',
 'rig',
 'daili',
 'leon',
 'adjust',
 'rosco',
 'freedom',
 'isra',
 'iran-contra',
 'taylor',
 'mexico',
 'thorough',
 'rosemari',
 'suggest',
 'smokeout',
 'firebomb',
 'disturb',
 '39-year-old',
 'weekend',
 'chime',
 'mission',
 'paramount',
 'captor',
 'song',
 'war',
 'moran',
 'luci',
 'technician',
 'abc-washington',
 'stuff',
 '(republican',
 'west',
 '(in',
 'arrub',
 'shift',
 '149',
 'sleep',
 'owen',
 'certain',
 'postpon',
 'poison',
 'cold',
 'farm',
 'chief',
 'via',
 'combat',
 'pro',
 'occup',
 'apprais',
 'act',
 '1964',
 '2020',
 'juliu',


In [17]:
# generates inverted index from the given data
def getInvIdx(data, doc_ids=None, vocabulary=None):
    """Build an inverted index (token -> list of integer doc ids) from ``data``.

    Args:
        data: dict mapping document name to its list of tokens.
        doc_ids: dict mapping document name to integer id. Defaults to the
            notebook-level ``docID_to_int`` so existing one-argument calls
            keep working.
        vocabulary: container of tokens to index; tokens outside it are
            skipped. Defaults to the notebook-level ``vocab``.

    Returns:
        dict mapping each indexed token to the list of integer doc ids in
        which it occurs, in the iteration order of ``data``. A document is
        appended once per token: repeated occurrences inside the same
        document are collapsed by comparing against the last entry.
    """
    # Falling back to the globals preserves the original behaviour; passing
    # the maps explicitly removes the hidden dependency on notebook state.
    if doc_ids is None:
        doc_ids = docID_to_int
    if vocabulary is None:
        vocabulary = vocab
    invidx = {}
    for doc, token_list in data.items():
        doc_num = doc_ids[doc]
        for token in token_list:
            if token not in vocabulary:
                continue
            postings = invidx.setdefault(token, [])
            if not postings or postings[-1] != doc_num:
                postings.append(doc_num)
    return invidx

In [18]:
def gapEncodeList(arr):
    """Return the gap (delta) encoding of ``arr`` as a new list.

    The first element is kept as-is and every later element is replaced by
    its difference from the predecessor. ``arr`` itself is not modified.
    An empty input yields an empty list (the original raised IndexError).
    """
    if len(arr) == 0:
        return []
    gaps = [arr[0]]
    for i in range(1, len(arr)):
        gaps.append(arr[i] - arr[i - 1])
    return gaps
#     for i in range(1,len(arr)):
#         temp = arr[i]
#         arr[i] -= carry
#         carry = temp
#     return arr

In [19]:
def undoGapEncode(arr):
    """Invert ``gapEncodeList``: return the running sums of ``arr`` as a new list.

    An empty input yields an empty list (the original raised IndexError).
    """
    if len(arr) == 0:
        return []
    restored = [arr[0]]
    for i in range(1, len(arr)):
        restored.append(arr[i] + restored[-1])
    return restored
#     for i,item in enumerate(arr):
#         if i==0:
#             continue
#         arr[i] +=arr[i-1]
#     return arr

In [20]:
undoGapEncode(gapEncodeList([1,3,6,8,14,15]))

[1, 3, 6, 8, 14, 15]

In [21]:
def dec_to_binary(n):
    """Return the binary digits of integer ``n`` as a string without the ``0b`` prefix."""
    return format(n, "b")

In [22]:
def bin_to_dec(n):
    """Parse the string of binary digits ``n`` and return its integer value."""
    return int(n, base=2)

In [23]:
invidx = getInvIdx(data)

In [34]:
invidx["sanction"]

[53]

In [24]:
def doGapEncoding(invidx):
    """Gap-encode every posting list of ``invidx`` in place and return the same dict.

    Running this twice on the same index encodes the gaps again, so it must
    be applied exactly once to a freshly built index.
    """
    for token in invidx:
        # Replacing values (not adding/removing keys) is safe while iterating.
        invidx[token] = gapEncodeList(invidx[token])
    return invidx

In [25]:
invidx = doGapEncoding(invidx)

In [26]:
# find list of common docs from multiple lists
def getIntersection(postings):
    """Return the ids common to every sorted posting list in ``postings``.

    Args:
        postings: list of posting lists, each sorted in ascending order.

    Returns:
        A new sorted list with the ids present in all posting lists. An
        empty ``postings`` yields ``[]`` (the original raised IndexError).
    """
    if len(postings) == 0:
        return []
    # Copy so the caller's first list is never aliased by the result.
    ans = list(postings[0])
    # Start from the second list: intersecting the first with itself is a no-op.
    for posting in postings[1:]:
        common = []
        i, j = 0, 0
        n1, n2 = len(ans), len(posting)
        # Classic two-pointer merge over two ascending lists.
        while i != n1 and j != n2:
            if ans[i] == posting[j]:
                common.append(ans[i])
                i += 1
                j += 1
            elif ans[i] < posting[j]:
                i += 1
            else:
                j += 1
        ans = common
    return ans

In [27]:
def c0_encode(x):
    """Return ``x`` as a binary string left-padded with zeros to at least 32 digits."""
    return dec_to_binary(x).zfill(32)

def c0_encode_list(arr):
    """Concatenate the ``c0_encode`` bit strings of every element of ``arr``."""
    return "".join(c0_encode(value) for value in arr)

def c0_decode(data):
    """Split the bit string ``data`` into 32-character words and return their integer values."""
    return [bin_to_dec(word) for word in chunkstring(data, 32)]

In [47]:
def dumpFiles_c0(invidx):
    """Write every posting list to ``debugc0.idx`` using the fixed-width c0 code.

    Args:
        invidx: dict mapping term to its (gap-encoded) posting list.

    Returns:
        dict mapping each term to ``[offset, length]``: the byte offset of
        its postings inside the file and their length in bytes.
    """
    dictionary = {}
    offset = 0
    # The context manager closes the file even if encoding a term fails.
    with open("debugc0.idx", "wb") as file:
        for term, posting_list in invidx.items():
            allbytes = c0_encode_list(posting_list)
            chunks = chunkstring(allbytes, 8)
            file.write(bytearray(bin_to_dec(chunk) for chunk in chunks))
            length = len(chunks)
            dictionary[term] = [offset, length]
            offset += length
    return dictionary

In [48]:
dictionary_c0 = dumpFiles_c0(invidx)

In [52]:
def getPostings_c0(tokens, dictionary, filename):
    """Read and decode the c0-encoded posting list of every token.

    Args:
        tokens: query terms; each must be a key of ``dictionary``.
        dictionary: term -> ``[offset, length]`` in bytes, as returned by
            ``dumpFiles_c0``.
        filename: path of the index file written by ``dumpFiles_c0``.

    Returns:
        list with one decoded (gap-decoded) posting list per token, in order.

    Raises:
        KeyError: if a token is not present in ``dictionary``.
    """
    res = []
    # The original never closed the file; ``with`` releases it on every path.
    with open(filename, "rb") as f:
        for token in tokens:
            start = dictionary[token][0]
            sz = dictionary[token][1]
            f.seek(start)
            # One read per posting list instead of one read per byte.
            raw = f.read(sz)
            allbytes = "".join(padding(dec_to_binary(val)) for val in raw)
            doclist = c0_decode(allbytes)
            res.append(undoGapEncode(doclist))
    return res
getPostings_c0(getTokensFromText("south african sanction"),dictionary_c0,'debugc0.idx')

[[2, 21, 25, 30, 33, 36, 45, 53, 54, 63, 64, 71, 75, 78, 85, 88, 94],
 [53],
 [53]]

In [7]:
def vbencode_number(number):
    """Variable-byte encode ``number`` into a list of 8-character bit strings.

    The value is split into base-128 digits, most significant first. Every
    digit except the last has 128 added (high bit set); the last digit is
    left below 128 and zero-padded to eight characters.
    """
    groups = []
    remaining = number
    while True:
        groups.insert(0, remaining % 128)
        if remaining < 128:
            break
        remaining = remaining // 128

    final = len(groups) - 1
    encoded = [
        dec_to_binary(group + 128) if position < final else dec_to_binary(group)
        for position, group in enumerate(groups)
    ]
    # Only the terminating byte can be shorter than eight digits.
    encoded[-1] = encoded[-1].rjust(8, '0')
    return encoded

In [8]:
vbencode_number(111119)


['10000110', '11100100', '00001111']

In [9]:
def vbencode(numbers):
    """Variable-byte encode each number and return the flat list of byte strings."""
    return [byte for number in numbers for byte in vbencode_number(number)]

In [10]:
vbencode([111119,4])

['10000110', '11100100', '00001111', '00000100']

In [11]:
def vbdecode(stream):
    """Decode a sequence of variable-byte bit strings back into integers.

    A byte whose first character is '1' contributes its remaining digits and
    continues the current number; any other byte contributes all its digits
    and terminates the number.
    """
    numbers = []
    accumulator = 0
    for byte in stream:
        continues = byte[0] == '1'
        payload = byte[1:] if continues else byte
        accumulator = 128 * accumulator + bin_to_dec(payload)
        if not continues:
            numbers.append(accumulator)
            accumulator = 0
    return numbers

In [12]:
vbdecode(vbencode([111119,4]))

[111119, 4]

In [41]:
invidx["bush"]

[5, 1, 16, 31, 8, 1, 13, 4, 1, 14]

In [42]:
def dumpFiles(invidx):
    """Write every posting list to ``debug.idx`` using variable-byte encoding.

    Args:
        invidx: dict mapping term to its (gap-encoded) posting list.

    Returns:
        dict mapping each term to ``[offset, length]``: the byte offset of
        its postings inside the file and their length in bytes.
    """
    dictionary = {}
    offset = 0
    # The context manager closes the file even if encoding a term fails.
    with open("debug.idx", "wb") as file:
        for term, posting_list in invidx.items():
            # Every encoded item is a single byte (0-255), so byte order is
            # irrelevant and ``bytes`` can be built directly. This removes
            # the reliance on ``sys``, which this notebook never imports
            # explicitly.
            payload = bytes(bin_to_dec(enc) for enc in vbencode(posting_list))
            file.write(payload)
            length = len(payload)
            dictionary[term] = [offset, length]
            offset += length
    return dictionary

In [43]:
dictionary = dumpFiles(invidx)

In [129]:
def dumpDicts(dictionary, int_to_docID):
    """Serialise the term dictionary and the doc-id map to ``sample.json``.

    The stored object also carries a ``"compression"`` field fixed at 0.
    JSON object keys are always strings, so the integer keys of
    ``int_to_docID`` come back as strings when the file is loaded.
    """
    payload = {"compression": 0, "dictionary": dictionary, "int_to_docID": int_to_docID}
    with open("sample.json", "w") as outfile:
        json.dump(payload, outfile)
dumpDicts(dictionary,int_to_docID)

In [133]:
with open("sample.json", "r") as jsonfile:
    tempdict = json.load(jsonfile)

In [28]:
def padding(x):
    """Left-pad the string ``x`` with '0' to eight characters; longer input is returned unchanged."""
    return x.rjust(8, '0')

In [29]:
padding(dec_to_binary(3))

'00000011'

In [30]:
def chunkstring(string, length):
    """Split ``string`` into consecutive pieces of ``length`` characters.

    The final piece is shorter when ``len(string)`` is not a multiple of
    ``length``; an empty string yields an empty list.
    """
    pieces = []
    for start in range(0, len(string), length):
        pieces.append(string[start:start + length])
    return pieces

In [47]:
def getPostings(tokens, dictionary, filename):
    """Read and decode the variable-byte posting list of every token.

    Args:
        tokens: query terms; each must be a key of ``dictionary``.
        dictionary: term -> ``[offset, length]`` in bytes, as returned by
            ``dumpFiles``.
        filename: path of the index file written by ``dumpFiles``.

    Returns:
        list with one decoded (gap-decoded) posting list per token, in order.

    Raises:
        KeyError: if a token is not present in ``dictionary``.
    """
    res = []
    # The original never closed the file; ``with`` releases it on every path.
    with open(filename, "rb") as f:
        for token in tokens:
            start = dictionary[token][0]
            sz = dictionary[token][1]
            f.seek(start)
            # One read per posting list instead of one read per byte.
            raw = f.read(sz)
            # Each stored byte becomes one 8-character bit string.
            chunks = [padding(dec_to_binary(val)) for val in raw]
            doclist = vbdecode(chunks)
            res.append(undoGapEncode(doclist))
    return res

In [48]:
postings = getPostings(getTokensFromText("South African Sanction"),dictionary,'debug.idx')
docids = getIntersection(postings)
for d in docids:
   print(int_to_docID[d])

AP880214-0053


In [41]:
postings = getPostings(getTokensFromText("Bush secretary"),dictionary,'debug.idx')
docids = getIntersection(postings)
for d in docids:
   print(int_to_docID[d])

NameError: name 'dictionary' is not defined

In [34]:
#vbencode([170])
#vbdecode([b'10000001', b'00101010'])
ll = b'10000001'
bin_to_dec('10101010')

170

In [31]:
def c2_encoding(x):
    """Encode the positive integer ``x`` as a self-delimiting bit string.

    With ``l(n)`` the number of binary digits of ``n``, the code is:
      * ``l(l(x))`` in unary (``l(l(x)) - 1`` ones followed by a zero),
      * the binary digits of ``l(x)`` without the leading 1,
      * the binary digits of ``x`` without the leading 1.
    ``x == 1`` therefore encodes to the single bit ``"0"``.

    Args:
        x: integer >= 1.

    Returns:
        The code as a string of '0'/'1' characters.

    Raises:
        ValueError: if ``x`` is smaller than 1.
    """
    if x < 1:
        raise ValueError("c2_encoding requires a positive integer, got %r" % (x,))
    # Lengths are taken from the exact binary expansion. The original used
    # floor(log2(x)) + 1, which is computed in floating point and is off by
    # one for large values just below a power of two.
    x_bits = format(x, "b")
    lx = len(x_bits)
    lx_bits = format(lx, "b")
    llx = len(lx_bits)
    unary = '1' * (llx - 1) + '0'
    # Both binary strings start with '1', so dropping the first character
    # keeps exactly the low-order digits the decoder re-prefixes with '1'.
    return unary + lx_bits[1:] + x_bits[1:]

In [32]:
def c2_encode_list(arr):
    """Concatenate the ``c2_encoding`` bit strings of every element of ``arr``."""
    return "".join(c2_encoding(value) for value in arr)

In [17]:
#c2_encoding(1)
#c2_encoding(119) + c2_encoding(1)+ c2_encoding(69)
c2_decode(c2_encode_list([10]))

NameError: name 'c2_decode' is not defined

In [33]:
def c2_decode(x):
    """Decode a concatenation of ``c2_encoding`` codes into a list of integers.

    Each code is read as: a unary run of ones ended by '0' (giving the
    length of the length field), the length field without its leading 1,
    and the value without its leading 1.
    """
    decoded = []
    pos = 0
    total = len(x)
    while pos < total:
        # Unary prefix: count ones up to the terminating zero.
        ones = 0
        while x[pos] != '0':
            ones += 1
            pos += 1
        pos += 1
        llx = ones + 1
        # The next llx-1 digits are the length of the value, minus its leading 1.
        lx = bin_to_dec('1' + x[pos:pos + llx - 1])
        pos += llx - 1
        # The next lx-1 digits are the value itself, minus its leading 1.
        decoded.append(bin_to_dec('1' + x[pos:pos + lx - 1]))
        pos += lx - 1
    return decoded

In [19]:
c2_decode('11011110111011011000100')

[119, 1, 68]

In [55]:
def dumpFiles_c2(invidx):
    """Write every posting list to ``debugc2.idx`` using the c2 code.

    Each term's bit string is left-padded with zeros to a whole number of
    bytes; the number of pad bits is recorded so the reader can skip them.

    Args:
        invidx: dict mapping term to its (gap-encoded) posting list.

    Returns:
        dict mapping each term to ``[offset, length, skipbits]``: byte
        offset in the file, length in bytes, and leading pad bits to skip.
    """
    dictionary = {}
    offset = 0
    # The context manager closes the file even if encoding a term fails.
    with open("debugc2.idx", "wb") as file:
        for term, posting_list in invidx.items():
            allbytes = c2_encode_list(posting_list)
            # Pad bits actually added: 0 when already byte-aligned. The
            # original recorded 8 in that case, making the reader drop a
            # whole byte of real data.
            skipbits = (-len(allbytes)) % 8
            allbytes = '0' * skipbits + allbytes
            chunks = chunkstring(allbytes, 8)
            file.write(bytearray(bin_to_dec(chunk) for chunk in chunks))
            length = len(chunks)
            dictionary[term] = [offset, length, skipbits]
            offset += length
    return dictionary

In [66]:
def dumpFiles_c2(invidx):
    """Write every posting list to ``debugc2.idx`` using the c2 code.

    Each term's bit string is left-padded with zeros to a whole number of
    bytes; the number of pad bits (0 when already aligned) is recorded so
    the reader can skip them.

    Args:
        invidx: dict mapping term to its (gap-encoded) posting list.

    Returns:
        dict mapping each term to ``[offset, length, skipbits]``: byte
        offset in the file, length in bytes, and leading pad bits to skip.
    """
    dictionary = {}
    offset = 0
    # The context manager closes the file even if encoding a term fails.
    with open("debugc2.idx", "wb") as file:
        for term, posting_list in invidx.items():
            allbytes = c2_encode_list(posting_list)
            sz = len(allbytes)
            skipbits = 0
            if sz % 8 != 0:
                skipbits = 8 - sz % 8
                allbytes = '0' * skipbits + allbytes
            chunks = chunkstring(allbytes, 8)
            file.write(bytearray(bin_to_dec(chunk) for chunk in chunks))
            length = len(chunks)
            dictionary[term] = [offset, length, skipbits]
            offset += length
    return dictionary

In [141]:
c2_encode_list([1])

'0'

In [56]:
dictionary_c2 = dumpFiles_c2(invidx)

In [63]:
def getPostings_c2(tokens, dictionary, filename):
    """Read and decode the c2-encoded posting list of every token.

    Args:
        tokens: query terms; each must be a key of ``dictionary``.
        dictionary: term -> ``[offset, length, skipbits]`` as returned by
            ``dumpFiles_c2``.
        filename: path of the index file written by ``dumpFiles_c2``.

    Returns:
        list with one decoded (gap-decoded) posting list per token, in order.

    Raises:
        KeyError: if a token is not present in ``dictionary``.
    """
    res = []
    # The original never closed the file; ``with`` releases it on every path.
    with open(filename, "rb") as f:
        for token in tokens:
            start = dictionary[token][0]
            sz = dictionary[token][1]
            skipbits = dictionary[token][2]
            f.seek(start)
            # One read per posting list instead of one read per byte.
            raw = f.read(sz)
            allbytes = "".join(padding(dec_to_binary(val)) for val in raw)
            # Drop the alignment zeros that were prepended when writing.
            doclist = c2_decode(allbytes[skipbits:])
            res.append(undoGapEncode(doclist))
    return res
getPostings_c2(getTokensFromText("south african sanction"),dictionary_c2,'debugc2.idx')

[[2, 21, 25, 30, 33, 36, 45, 53, 54, 63, 64, 71, 75, 78, 85, 88, 94],
 [53],
 [53]]

In [62]:
dictionary_c2['african']

[14708, 2, 6]

In [65]:
postings = getPostings_c2(getTokensFromText("South African Sanctions"),dictionary_c2,'debugc2.idx')
docids = getIntersection(postings)
for d in docids:
    print(int_to_docID[d])

AP880214-0053


In [20]:
def c3_encoding(text):
    """Snappy-compress ``text`` and return the compressed bytes.

    Bytes-like input is compressed as-is. Anything else is converted with
    ``str()`` first. The original applied ``str()`` unconditionally, so a
    bytes argument was compressed as its ``b'...'`` repr rather than its
    actual content.
    """
    if not isinstance(text, (bytes, bytearray)):
        text = str(text)
    return snappy.compress(text)

In [375]:
def c3_encode_list(arr):
    """Variable-byte encode ``arr``, join the bit strings and snappy-compress the result."""
    return c3_encoding("".join(vbencode(arr)))

In [376]:
c3_encode_list([194,7,55])

b' \x0410\x05\x01\x0410\x05\t\x05\x0500011100110111'

In [377]:
def c3_decode_list(text):
    """Invert ``c3_encode_list``: decompress, split into 8-character bytes, variable-byte decode."""
    bit_string = snappy.decompress(text).decode('utf-8')
    return vbdecode(chunkstring(bit_string, 8))

In [378]:
c3_decode_list(c3_encode_list([194,7,55]))

[194, 7, 55]

In [53]:
# def dumpFiles_c3(invidx):
#     dictionary = {}
#     file = open("indexfile_c3_nav.idx","wb")
#     offset = 0
#     cont = []
#     for term, posting_list in invidx.items():
#         temp = c2_encode_list(posting_list)
#         cont.append(temp)
#         invidx[term] = []
#         length = len(temp)
#         dictionary[term] = [offset,length]
#         offset += length
#     allbytes = "".join(cont)
#     sz = len(allbytes)
#     if(sz%8!=0):
#         allbytes = '0'*(8-sz%8) + allbytes
#     chunks = chunkstring(allbytes,8)
#     temp = []
#     for chunk in chunks:
#         temp.append(bin_to_dec(chunk))
#     file.write(snappy.compress(bytearray(temp)))
#     file.close()
#     return dictionary, (8-sz%8)

In [75]:
def c3encodehelper(arr):
    """Return the elements of ``arr`` as a comma-separated string."""
    return ",".join(map(str, arr))

In [80]:
def c3decodehelper(string):
    """Parse a comma-separated string of integers into a list of ints."""
    return [int(piece) for piece in string.split(",")]

In [95]:
def dumpFiles_c3(invidx):
    """Write all posting lists, snappy-compressed as one blob, to ``indexfile_c3.idx``.

    Every posting list is rendered as comma-separated decimal text; the
    texts are concatenated without separators and compressed together.

    Args:
        invidx: dict mapping term to its (gap-encoded) posting list.
            NOTE: every posting list in ``invidx`` is replaced by an empty
            list as it is consumed, so the index is unusable afterwards.

    Returns:
        dict mapping each term to ``[offset, length]``, the character
        position and length of its text inside the *uncompressed* blob.
    """
    dictionary = {}
    offset = 0
    cont = []
    for term, posting_list in invidx.items():
        temp = c3encodehelper(posting_list)
        cont.append(temp)
        # Destructive on purpose in the original (presumably to release
        # memory on large corpora) — kept so callers see the same side effect.
        invidx[term] = []
        length = len(temp)
        dictionary[term] = [offset, length]
        offset += length
    allbytes = "".join(cont)
    # The context manager closes the file even if compression/writing fails.
    with open("indexfile_c3.idx", "wb") as file:
        file.write(snappy.compress(allbytes))
    return dictionary

In [96]:
c3dictionary = dumpFiles_c3(invidx)

In [97]:
def getPostings_c3(tokens, dictionary, filename):
    """Decode the posting list of every token from a snappy-compressed text index.

    Args:
        tokens: query terms; each must be a key of ``dictionary``.
        dictionary: term -> ``[offset, length]`` positions inside the
            uncompressed blob, as returned by the matching ``dumpFiles_c3``.
        filename: path of the compressed index file.

    Returns:
        list with one decoded (gap-decoded) posting list per token, in order.

    Raises:
        KeyError: if a token is not present in ``dictionary``.
    """
    # The original never closed the file; ``with`` releases it on every path.
    with open(filename, "rb") as f:
        encoded = f.read()
    decoded = snappy.decompress(encoded)
    res = []
    for token in tokens:
        start = dictionary[token][0]
        end = start + dictionary[token][1]
        subset = decoded[start:end]
        # Decode bytes properly instead of stripping the b'...' wrapper from
        # their repr, which the original did with two str.replace calls.
        if isinstance(subset, (bytes, bytearray)):
            subset = bytes(subset).decode("utf-8")
        docList = c3decodehelper(subset)
        res.append(undoGapEncode(docList))
    return res

In [98]:
postings = getPostings_c3(getTokensFromText("Bush secretary"),c3dictionary,'indexfile_c3.idx')
docids = getIntersection(postings)
for d in docids:
    print(int_to_docID[d])

AP880214-0006
AP880214-0075


In [380]:
a= bytearray([1,2,3])
comp = snappy.compress(a)
decomp = snappy.decompress(comp)
print(a==decomp)

True


In [58]:
c3dictionary,origoff = dumpFiles_c3(invidx)

In [59]:
c3dictionary

{'more': [0, 236],
 'than': [236, 208],
 '150': [444, 29],
 'former': [473, 107],
 'offic': [580, 161],
 'of': [741, 177],
 'the': [918, 173],
 'overthrown': [1091, 1],
 'south': [1092, 107],
 'vietnames': [1199, 1],
 'govern': [1200, 184],
 'have': [1384, 241],
 'been': [1625, 224],
 'releas': [1849, 155],
 'from': [2004, 226],
 'a': [2230, 179],
 're-educ': [2409, 1],
 'camp': [2410, 45],
 'after': [2455, 235],
 '13': [2690, 80],
 'year': [2770, 220],
 'detent': [2990, 1],
 'offici': [2991, 177],
 'vietnam': [3168, 12],
 'new': [3180, 238],
 'agenc': [3418, 131],
 'report': [3549, 206],
 'saturdai': [3755, 93],
 'hanoi': [3848, 1],
 'monitor': [3849, 64],
 'in': [3913, 175],
 'bangkok': [4088, 1],
 'did': [4089, 176],
 'not': [4265, 230],
 'give': [4495, 158],
 'specif': [4653, 50],
 'figur': [4703, 138],
 'but': [4841, 257],
 'said': [5098, 197],
 'those': [5295, 171],
 'freed': [5466, 20],
 'fridai': [5486, 192],
 'includ': [5678, 197],
 'an': [5875, 230],
 'ex-cabinet': [6105, 1],

In [450]:
#undoGapEncode(invidx['more'])

In [60]:
def getPostings_c3(tokens, dictionary, filename):
    """Decode c2-coded posting lists from a snappy-compressed bit-packed index.

    The whole file is decompressed and expanded to a string of bits; each
    term's code is the slice ``[offset + origoff, offset + origoff + length)``
    of that string, where ``origoff`` is the notebook-level count of leading
    pad bits returned alongside the dictionary by the matching dump function.

    Args:
        tokens: query terms; each must be a key of ``dictionary``.
        dictionary: term -> ``[offset, length]`` in bits.
        filename: path of the compressed index file.

    Returns:
        list with one decoded (gap-decoded) posting list per token, in order.

    Raises:
        KeyError: if a token is not present in ``dictionary``.
    """
    # The original never closed the file; ``with`` releases it on every path.
    with open(filename, "rb") as f:
        encoded = f.read()
    decoded = snappy.decompress(encoded)
    # join() avoids rebuilding an ever-growing string once per byte.
    fullstr = "".join(padding(dec_to_binary(x)) for x in decoded)
    res = []
    for token in tokens:
        start = dictionary[token][0] + origoff
        end = start + dictionary[token][1]
        docList = c2_decode(fullstr[start:end])
        res.append(undoGapEncode(docList))
    return res

In [61]:
postings = getPostings_c3(getTokensFromText("Bush secretary"),c3dictionary,'indexfile_c3_nav.idx')
docids = getIntersection(postings)
for d in docids:
    print(int_to_docID[d])

AP880212-0005
AP880212-0011
AP880212-0080


In [54]:
# TODO: check that every query token is in the dictionary before looking up its postings; error handling is not implemented yet.

In [65]:
def c4_encoding(x, k):
    """Encode ``x`` with divisor ``b = 2**k`` as unary quotient + binary remainder.

    With ``q = (x - 1) // b`` and ``r = (x - 1) - q * b`` the code is ``q``
    ones followed by a zero, then ``r`` in binary left-padded to ``k``
    digits. For ``k == 0`` the remainder still contributes one '0' digit,
    exactly as in the original implementation.

    Args:
        x: integer to encode (posting gaps, so expected to be >= 1).
        k: non-negative integer exponent of the divisor.

    Returns:
        The code as a string of '0'/'1' characters.
    """
    b = pow(2, k)
    # Integer floor division is exact. The original used
    # math.floor((x - 1) / b), whose float division gives a wrong quotient
    # once x no longer fits in a double's 53-bit mantissa.
    q = (x - 1) // b
    r = x - q*b - 1
    # Unary part: q ones and a terminating zero; empty when q + 1 <= 0.
    t1 = '1' * q + '0' if q + 1 > 0 else ''
    t2 = format(r, "b").rjust(k, '0')
    return t1 + t2

In [36]:
c4_encoding(30,6)

'0011101'

In [66]:
def c4_encode_list(arr, k):
    """Concatenate the ``c4_encoding`` bit strings (parameter ``k``) of every element of ``arr``."""
    return "".join(c4_encoding(value, k) for value in arr)

In [24]:
for x in [50,150,250,400,800,2000]:
    for i in [3,4,5,6,7,8,9]:
        #print(x,i,len(c4_encode_list([1,1,3,30,x],i)))
        continue

In [38]:
def c4_decode(stream,k):
    """Decode a concatenation of ``c4_encoding`` codes with parameter ``k``.

    Each code is a run of ones ended by '0' (the quotient) followed by ``k``
    binary digits (the remainder); the value is ``q * 2**k + r + 1``. The
    whole of ``stream`` is consumed, so it must not carry padding bits.
    """
    divisor = pow(2, k)
    values = []
    pos = 0
    while pos < len(stream):
        # Unary quotient: count ones up to the terminating zero.
        quotient = 0
        while stream[pos] != '0':
            quotient += 1
            pos += 1
        pos += 1
        remainder_bits = stream[pos:pos + k]
        pos += k
        remainder = bin_to_dec(remainder_bits)
        values.append(quotient * divisor + remainder + 1)
    return values

In [67]:
def c4_decode_withpad(stream, k, numbits):
    """Decode quotient/remainder codes from the first ``numbits`` bits of ``stream``.

    Each code is a unary quotient (ones ended by '0') followed by ``k``
    remainder digits; the decoded value is ``q * 2**k + r + 1``. Bits at
    index ``numbits`` and beyond are never interpreted as the start of a
    code, so ``stream`` may carry trailing padding.

    Args:
        stream: string of '0'/'1' characters.
        k: remainder width in bits.
        numbits: number of leading bits of ``stream`` that hold real codes.

    Returns:
        list of decoded integers.
    """
    i = 0
    b = pow(2, k)
    res = []
    # Width of an all-zero code, which is decoded as the value 1 below.
    # For k == 0 two bits are used rather than one — presumably because the
    # encoder still emits one remainder digit when k == 0; verify against
    # c4_encoding.
    if k == 0:
        num_bits_in_one = 2
    else:
        num_bits_in_one = k+1
    while(i < numbits):
        # Fast path: consume consecutive all-zero codes (each is the value 1)
        # as long as a full code still fits inside the first numbits bits.
        while(True):
            if(i+num_bits_in_one-1 < numbits):
                if(stream[i:i+num_bits_in_one] == ("0"*num_bits_in_one)):
                    res.append(1)
                    i += num_bits_in_one
                else:
                    break
            else:
                break
        # The fast path may have consumed everything that was left.
        if(i >= numbits):
            break
        # General path: count the ones of the unary quotient...
        q = 0
        while(stream[i] != '0'):
            q += 1
            i += 1
        # ...skip its terminating zero...
        i += 1
        # ...and read the k-digit remainder.
        # NOTE(review): with k == 0 this slice is empty, and bin_to_dec('')
        # looks like it would raise — confirm k == 0 is never used with
        # values other than 1.
        r_bin = stream[i:i+k]
        i += k
        r = bin_to_dec(r_bin)
        x = q*b + r + 1
        # print(x)
        res.append(x)
    return res

In [68]:
c4_decode_withpad('000000000000000000010001110100',6,28)

[1, 1, 3, 30]

In [69]:
len('0000000000000000000100011101')

28

In [70]:
c4_decode('0000000000000000000100011101',6)

[1, 1, 3, 30]

In [71]:
temp = [2, 5, 7, 11, 27, 35, 39, 54, 55, 73, 80, 82, 87, 102]
maxmm = temp[-1]
k = int(math.log2(maxmm))
allbytes = c4_encode_list(temp,k)
numbits = len(allbytes)
allbytes +='0'
c4_decode_withpad(allbytes,k,numbits)

[2, 5, 7, 11, 27, 35, 39, 54, 55, 73, 80, 82, 87, 102]

In [31]:
c4_decode('0000001000010000001100001010001101001000100100110011010101101101000100010001111100100011001011010100101',6)

[2, 5, 7, 11, 27, 35, 39, 54, 55, 73, 80, 82, 87, 102]

In [32]:
c4_decode('000000000000100011101',6)

[1, 3, 30]

In [58]:
def c2_decode_aux(x):
    """Decode only the first c2 code of bit string ``x``.

    Args:
        x: string of '0'/'1' characters beginning with a c2 code.

    Returns:
        ``(num, i)``: the decoded integer and the number of characters the
        code occupied, i.e. the index at which the rest of ``x`` starts.

    Raises:
        ValueError: if ``x`` is empty (the original failed with an
            UnboundLocalError on the return statement).
    """
    if len(x) == 0:
        raise ValueError("c2_decode_aux needs a non-empty bit string")
    # The original wrapped this in a ``while`` that always broke after one
    # pass; a straight-line body states the intent directly.
    i = 0
    # Unary prefix: ones up to the terminating zero give the width of the
    # length field.
    while x[i] != '0':
        i += 1
    llx = i + 1
    i += 1
    # Length field, stored without its leading 1.
    lx = int('1' + x[i:i + llx - 1], 2)
    i += llx - 1
    # Value, stored without its leading 1.
    num = int('1' + x[i:i + lx - 1], 2)
    i += lx - 1
    return num, i

In [48]:
def c4_helper(arr):
    """Return the index of the first smallest element of ``arr``."""
    return min(range(len(arr)), key=arr.__getitem__)

In [110]:
def dumpFiles_c4(invidx):
    """Write every posting list to ``indexfile_c4_new.idx`` using the c4 code.

    For each term four candidate values of the parameter ``k`` are tried and
    the one needing the fewest whole bytes is kept (the first candidate wins
    ties). The chosen ``k`` is stored in front of the postings as a c2 code,
    and the bit string is right-padded with zeros to a byte boundary.

    Args:
        invidx: dict mapping term to its non-empty (gap-encoded) posting list.

    Returns:
        dict mapping each term to ``[offset, numbits]``: byte offset of its
        record in the file and the number of meaningful bits (header
        included, padding excluded).
    """
    dictionary = {}
    offset = 0
    # The context manager closes the file even if encoding a term fails.
    with open("indexfile_c4_new.idx", "wb") as file:
        for term, posting_list in invidx.items():
            maxm = max(posting_list)
            k = int(math.log2(maxm))
            if k < 5:
                candidates = [1, 2, 3, 4]
            else:
                candidates = [k-4, k-3, k-2, k-1]
            allbytes = None
            best_cost = None
            for candidate in candidates:
                encoded = c2_encoding(candidate) + c4_encode_list(posting_list, candidate)
                # Size rounded up to whole bytes. The original added a full
                # 8 bits when the length was already a multiple of 8, which
                # skewed the comparison between candidates.
                cost = len(encoded) + (-len(encoded)) % 8
                if best_cost is None or cost < best_cost:
                    best_cost = cost
                    # Keep the winner so it is not encoded a second time.
                    allbytes = encoded
            numbits = len(allbytes)
            allbytes = allbytes + '0' * ((-numbits) % 8)
            chunks = chunkstring(allbytes, 8)
            file.write(bytearray(bin_to_dec(chunk) for chunk in chunks))
            dictionary[term] = [offset, numbits]
            offset += len(chunks)
    return dictionary

In [111]:
c4Dictionary = dumpFiles_c4(invidx)

In [112]:
c4Dictionary

{'prime': [0, 29],
 'minist': [4, 47],
 'rajiv': [10, 3],
 'gandhi': [11, 3],
 'induct': [12, 3],
 '13': [13, 53],
 'peopl': [20, 110],
 'into': [34, 90],
 'hi': [46, 122],
 'council': [62, 47],
 'of': [68, 193],
 'on': [93, 187],
 'sundai': [117, 131],
 'and': [134, 193],
 'reshuffl': [159, 3],
 'the': [160, 193],
 'portfolio': [185, 3],
 'exist': [186, 22],
 'is': [189, 140],
 '12th': [207, 3],
 'sinc': [208, 81],
 'wa': [219, 171],
 'sworn': [241, 23],
 'offic': [244, 92],
 'in': [256, 189],
 'januari': [280, 31],
 '1985': [284, 51],
 'come': [291, 68],
 'week': [300, 92],
 'befor': [312, 96],
 'parliament': [324, 8],
 's': [325, 174],
 'annual': [347, 38],
 'budget': [352, 47],
 'session': [358, 34],
 'due': [363, 29],
 'to': [367, 193],
 'start': [392, 68],
 'told': [401, 80],
 'report': [411, 100],
 'expans': [424, 3],
 'meant': [425, 22],
 'give': [428, 76],
 'more': [438, 104],
 'represent': [451, 3],
 'variou': [452, 23],
 'region': [455, 52],
 'countri': [462, 77],
 'social':

In [113]:
def getPostings_c4(tokens, dictionary, filename):
    """Read and decode the c4-encoded posting list of every token.

    Each record starts with the parameter ``k`` as a c2 code, followed by
    the c4 codes of the gaps and zero padding up to a byte boundary.

    Args:
        tokens: query terms; each must be a key of ``dictionary``.
        dictionary: term -> ``[offset, numbits]`` as returned by
            ``dumpFiles_c4`` (byte offset, meaningful bits).
        filename: path of the index file written by ``dumpFiles_c4``.

    Returns:
        list with one decoded (gap-decoded) posting list per token, in order.

    Raises:
        KeyError: if a token is not present in ``dictionary``.
    """
    res = []
    # The original never closed the file; ``with`` releases it on every path.
    with open(filename, "rb") as f:
        for token in tokens:
            start = dictionary[token][0]
            numbits = dictionary[token][1]
            f.seek(start)
            # Bytes covering numbits bits, rounded up.
            numbytes = (numbits + 7) // 8
            raw = f.read(numbytes)
            allbytes = "".join(padding(dec_to_binary(val)) for val in raw)
            # Header: the k used for this term and how many bits it took.
            k, jump = c2_decode_aux(allbytes)
            doclist = c4_decode_withpad(allbytes[jump:], k, numbits - jump)
            res.append(undoGapEncode(doclist))
    return res

In [114]:
postings = getPostings_c4(getTokensFromText("Bush secretary"),c4Dictionary,'indexfile_c4_new.idx')
docids = getIntersection(postings)
for d in docids:
    print(int_to_docID[d])

AP880214-0006
AP880214-0075


In [43]:
with open('fullc1.dict', "r") as jsonfile:
        tempp = json.load(jsonfile)

In [44]:
tempp['int_to_docID']['1']

'AP880212-0001'

In [49]:
def c3_encoding(text):
    """Snappy-compress ``text`` and return the compressed bytes.

    Bytes-like input is compressed as-is. Anything else is converted with
    ``str()`` first. The original applied ``str()`` unconditionally, so a
    bytes argument was compressed as its ``b'...'`` repr rather than its
    actual content.
    """
    if not isinstance(text, (bytes, bytearray)):
        text = str(text)
    return snappy.compress(text)

In [51]:
jsonobj = json.dumps(invidx)

In [56]:
def dumpFiles_c3(invidx):
    """Write ``invidx`` as snappy-compressed UTF-8 JSON to ``indexfile_c3_new.idx``.

    Decompressing the file yields the exact bytes of ``json.dumps(invidx)``.
    The original passed already-encoded bytes through ``str()``, so the file
    held the compressed ``b'...'`` repr instead of JSON, and it never closed
    the file handle, so the data was not guaranteed to reach disk.
    """
    payload = json.dumps(invidx).encode("utf-8")
    with open("indexfile_c3_new.idx", "wb") as file:
        file.write(snappy.compress(payload))

In [57]:
dumpFiles_c3(invidx)

In [55]:
with open("invidx.json", "w") as outfile:
    json.dump(invidx,outfile)

In [58]:
def check():
    """Return True when the JSON bytes of the notebook-level ``invidx`` survive a snappy round trip."""
    original = json.dumps(invidx).encode('utf-8')
    restored = snappy.decompress(snappy.compress(original))
    return original == restored

In [59]:
check()

True

In [62]:
def c3dumptest(invidx):
    """Write a snappy-compressed concatenation of all posting values to ``c3tempp``.

    The decimal digits of every posting are concatenated without any
    separator, so the output cannot be decoded back into posting lists.
    """
    # join() avoids rebuilding an ever-growing string once per element.
    temp = "".join(str(elem) for postings in invidx.values() for elem in postings)
    compressed = snappy.compress(temp.encode('utf-8'))
    # The context manager closes the file even if the write fails.
    with open('c3tempp', "wb") as f:
        f.write(compressed)

In [63]:
c3dumptest(invidx)