In [1]:
from bs4 import BeautifulSoup
import re
import os
from PorterStemmer import * 
from tqdm.notebook import tqdm
import snappy
import mmap
# Shared stemmer instance; process_directory, process_file and
# getTokensFromText all call ps.stem(...) on this module-level object.
ps = PorterStemmer()

In [2]:
# `path` is the single corpus file that the cells below tokenize and index.
# `directory` is only referenced by the commented-out process_directory calls.
path = 'tipster-ap-frac/ap880212'
directory = 'tipster-ap-frac'

In [None]:
# for filename in os.listdir(directory):
#     process_file(os.path.join(directory,filename))

In [3]:
def process_directory(dir_path):
    """Tokenize every document of every file found in ``dir_path``.

    Each directory entry is read as text and parsed with BeautifulSoup's
    ``html.parser``. For every ``<doc>`` element, the text of all of its
    ``<text>`` children is split on the delimiter class below, empty pieces
    are discarded, and each remaining word is lower-cased and stemmed with
    the module-level ``ps``.

    Args:
        dir_path: Directory whose entries are all treated as corpus files.

    Returns:
        dict mapping the stripped ``<docno>`` text of each document to the
        list of its stemmed tokens. If the same docno is seen again, the
        later token list replaces the earlier one.
    """
    data = {}
    for filename in tqdm(os.listdir(dir_path)):
        full_path = os.path.join(dir_path, filename)
        # The context manager closes the handle right after reading; the
        # previous code left one open file per corpus file.
        with open(full_path, "r") as file:
            contents = file.read()
        soup = BeautifulSoup(contents, 'html.parser')

        for doc in soup.find_all('doc'):
            docnum = doc.find('docno').get_text().strip()
            res = []
            for field in doc.find_all('text'):
                words = re.split('''[ ',.:_;"’`\n]''', field.get_text())
                for w in words:
                    if len(w) > 0:
                        # stem() takes the inclusive end index of the word.
                        res.append(ps.stem(w.lower(), 0, len(w) - 1))
            data[docnum] = res
    return data

In [4]:
def process_file(full_path):
    """Tokenize every document contained in a single corpus file.

    The file is parsed with BeautifulSoup's ``html.parser``. For every
    ``<doc>`` element, the text of all of its ``<text>`` children is split
    on the delimiter class below, empty pieces are discarded, and each
    remaining word is lower-cased and stemmed with the module-level ``ps``.

    Args:
        full_path: Path of the corpus file to read.

    Returns:
        dict mapping the stripped ``<docno>`` text of each document to the
        list of its stemmed tokens.
    """
    # The context manager guarantees the handle is closed; it used to be
    # left open for the lifetime of the kernel.
    with open(full_path, "r") as file:
        contents = file.read()
    soup = BeautifulSoup(contents, 'html.parser')

    data = {}
    for doc in soup.find_all('doc'):
        docnum = doc.find('docno').get_text().strip()
        res = []
        for field in doc.find_all('text'):
            words = re.split('''[ ',.:_;"’`\n]''', field.get_text())
            for w in words:
                if len(w) > 0:
                    # stem() takes the inclusive end index of the word.
                    res.append(ps.stem(w.lower(), 0, len(w) - 1))
        data[docnum] = res
    return data

In [5]:
# docno -> stemmed token list for the single file at `path`.
data = process_file(path)

In [48]:
#full_data = process_directory(directory)

In [6]:
def getVocab(data):
    """Return the set of distinct tokens that occur in any document.

    Args:
        data: dict mapping a document name to its list of tokens.

    Returns:
        set containing every token found in any of the token lists.
    """
    vocabulary = set()
    for token_list in data.values():
        vocabulary.update(token_list)
    return vocabulary

In [7]:
def mapDocIDs(docIDs):
    """Number document names consecutively, starting at 1.

    Args:
        docIDs: Iterable of document names (iterating a dict yields its keys).

    Returns:
        Tuple ``(docID_to_int, int_to_docID)``: the name -> number mapping
        and its inverse number -> name mapping.
    """
    docID_to_int, int_to_docID = {}, {}
    for number, name in enumerate(docIDs, start=1):
        docID_to_int[name] = number
        int_to_docID[number] = name
    return docID_to_int, int_to_docID

def makeDocIdMap(full_path):
    """Build the document-name <-> integer maps for one corpus file.

    The file is parsed with BeautifulSoup's ``html.parser`` and the stripped
    ``<docno>`` text of every ``<doc>`` element is collected in file order.

    Args:
        full_path: Path of the corpus file to read.

    Returns:
        The ``(docID_to_int, int_to_docID)`` tuple produced by ``mapDocIDs``
        for the collected document names.
    """
    # Close the handle as soon as the contents are in memory (it used to
    # stay open).
    with open(full_path, "r") as file:
        contents = file.read()
    soup = BeautifulSoup(contents, 'html.parser')
    docIDs = [doc.find('docno').get_text().strip() for doc in soup.find_all('doc')]
    return mapDocIDs(docIDs)

In [8]:
# Module-level state: getInvIdx reads `docID_to_int` and `vocab` as globals,
# so this cell has to run before the index is built. Passing the `data` dict
# to mapDocIDs numbers its keys (the document names) in insertion order.
docID_to_int, int_to_docID = mapDocIDs(data)
vocab = getVocab(data)

In [9]:
def getInvIdx(data):
    """Build an inverted index (token -> list of integer document ids).

    Relies on two module-level names: ``docID_to_int`` to translate document
    names into integers, and ``vocab`` to filter the tokens that get indexed.

    Args:
        data: dict mapping a document name to its list of tokens.

    Returns:
        dict mapping each indexed token to the ids of the documents that
        contain it, in the order the documents are visited. A document id is
        added at most once per token.
    """
    index = {}
    for doc_name, token_list in data.items():
        doc_number = docID_to_int[doc_name]
        for token in token_list:
            if token not in vocab:
                continue
            postings = index.setdefault(token, [])
            # Documents are handled one at a time, so a repeat of the current
            # document can only ever be the last entry of the list.
            if not postings or postings[-1] != doc_number:
                postings.append(doc_number)
    return index

In [10]:
def gapEncodeList(arr):
    """Encode a list as its first element followed by offsets from it.

    Note that, despite the name, every later element is stored as
    ``item - arr[0]`` (its distance from the FIRST element), not as the gap
    to its predecessor. ``undoGapEncode`` reverses exactly this scheme, so
    the two must be changed together if true consecutive gaps are wanted.

    Args:
        arr: Sequence of numbers (e.g. a posting list of document ids).

    Returns:
        A new list ``[arr[0], arr[1] - arr[0], arr[2] - arr[0], ...]``;
        an empty list when ``arr`` is empty. ``arr`` is not modified.
    """
    # An empty list has no first element; previously this raised IndexError.
    if len(arr) == 0:
        return []
    first = arr[0]
    return [first] + [item - first for item in arr[1:]]

In [11]:
def undoGapEncode(arr):
    """Reverse ``gapEncodeList``: add the first element to every later one.

    The list is modified IN PLACE and the same object is returned, so do not
    pass a list that must keep its encoded form.

    Args:
        arr: List produced by ``gapEncodeList`` (first value, then offsets
            from that first value).

    Returns:
        ``arr`` itself, now holding the decoded values; an empty list is
        returned unchanged.
    """
    # Nothing to decode; previously an empty list raised IndexError.
    if len(arr) == 0:
        return arr
    first = arr[0]
    for position in range(1, len(arr)):
        arr[position] += first
    return arr

In [12]:
def dec_to_binary(n):
    """Return the binary digits of integer ``n`` as a string without ``0b``."""
    prefixed = bin(n)
    digits = prefixed.replace("0b", "")
    return digits

In [13]:
def bin_to_dec(n):
    """Parse the string ``n`` as a base-2 number and return it as an int."""
    value = int(n, base=2)
    return value

In [25]:
# In-memory inverted index for the documents in `data` (plain doc ids at this point).
invidx = getInvIdx(data)

In [15]:
def doGapEncoding(invidx):
    """Replace every posting list of ``invidx`` with its ``gapEncodeList`` form.

    The dictionary is updated in place (each key is rebound to a newly
    encoded list) and the same dictionary object is returned.

    Args:
        invidx: dict mapping a token to its posting list.

    Returns:
        ``invidx`` itself, with every value encoded.
    """
    # Only existing keys are rebound, so the dict keeps its size while we iterate.
    for token in invidx:
        invidx[token] = gapEncodeList(invidx[token])
    return invidx

In [35]:
# NOTE: doGapEncoding rewrites the posting lists of the dict it receives, so
# running this cell a second time encodes already-encoded lists. Rebuild
# `invidx` with getInvIdx(data) before re-running it.
invidx = doGapEncoding(invidx)

In [28]:
def getIntersection(postings):
    """Return the document ids common to every posting list.

    Uses a two-pointer merge, so each posting list is expected to be sorted
    in ascending order.

    Args:
        postings: List of posting lists (lists of comparable document ids).

    Returns:
        A new list with the ids present in all posting lists, in ascending
        order; an empty list when ``postings`` itself is empty.
    """
    # No lists at all: nothing in common (previously an IndexError).
    if not postings:
        return []

    # Start from a copy of the first list so the caller's list is never aliased.
    common = list(postings[0])
    for posting in postings[1:]:
        if not common:
            # Intersection is already empty; further merging cannot add ids.
            break
        merged = []
        i, j = 0, 0
        n1, n2 = len(common), len(posting)
        while i < n1 and j < n2:
            if common[i] == posting[j]:
                merged.append(common[i])
                i += 1
                j += 1
            elif common[i] < posting[j]:
                i += 1
            else:
                j += 1
        common = merged
    return common

In [29]:
def getTokensFromText(text):
    """Split ``text`` into lower-cased, stemmed tokens.

    The text is split on the delimiter class below, empty pieces are
    dropped, and every remaining word is lower-cased and stemmed with the
    module-level ``ps``.

    Args:
        text: String to tokenize (e.g. a query).

    Returns:
        List of stemmed tokens in order of appearance.
    """
    pieces = re.split('''[ ',.:_;"’`\n]''', text)
    return [
        ps.stem(piece.lower(), 0, len(piece) - 1)
        for piece in pieces
        if len(piece) > 0
    ]

In [30]:
def vbencode_number(number):
    """Variable-byte encode one number as a list of 8-character bit strings.

    The number is cut into base-128 groups, most significant first. Every
    group except the last gets 128 added (its top bit set); the last group
    keeps a leading 0 bit and is left-padded with zeros to 8 characters.

    Args:
        number: Integer to encode.

    Returns:
        List of ``'0'``/``'1'`` strings, one per encoded byte.
    """
    groups = []
    remaining = number
    while True:
        groups.append(remaining % 128)
        if remaining < 128:
            break
        remaining = remaining // 128
    # Groups were collected least-significant first; emit most-significant first.
    groups.reverse()

    encoded = [dec_to_binary(group + 128) for group in groups[:-1]]
    encoded.append(dec_to_binary(groups[-1]).rjust(8, '0'))
    return encoded

In [20]:
# Sanity check: a three-byte value (the first two bytes carry the leading 1 bit).
vbencode_number(111119)


['10000110', '11100100', '00001111']

In [31]:
def vbencode(numbers):
    """Variable-byte encode a sequence of numbers into one flat byte stream.

    Args:
        numbers: Iterable of integers.

    Returns:
        List with the bit strings returned by ``vbencode_number`` for each
        number, concatenated in input order.
    """
    return [byte for number in numbers for byte in vbencode_number(number)]

In [32]:
# Sanity check: the encodings of both numbers are concatenated into one stream.
vbencode([111119,4])

['10000110', '11100100', '00001111', '00000100']

In [33]:
def vbdecode(stream):
    """Decode a variable-byte stream of bit strings back into numbers.

    A chunk whose first character is ``'1'`` contributes its remaining
    characters as the next base-128 group of the current number; any other
    chunk contributes its full value and completes the number.

    Args:
        stream: Iterable of bit strings as produced by ``vbencode``.

    Returns:
        List of decoded integers. Groups that are not followed by a
        completing chunk are not emitted.
    """
    decoded = []
    accumulator = 0
    for chunk in stream:
        is_continuation = chunk[0] == '1'
        payload = chunk[1:] if is_continuation else chunk
        accumulator = 128 * accumulator + bin_to_dec(payload)
        if not is_continuation:
            decoded.append(accumulator)
            accumulator = 0
    return decoded

In [23]:
# Round-trip check: decoding the encoded stream returns the original numbers.
vbdecode(vbencode([111119,4]))

[111119, 4]

In [36]:
# Displays the whole index as the cell output (one line per posting entry);
# the output is very long.
invidx

{'more': [1,
  1,
  2,
  4,
  6,
  10,
  11,
  14,
  17,
  18,
  19,
  22,
  27,
  28,
  29,
  37,
  40,
  44,
  47,
  48,
  49,
  51,
  53,
  55,
  58,
  61,
  62,
  63,
  65,
  66,
  69,
  73,
  75,
  77,
  79,
  80,
  85,
  87,
  89,
  91,
  94,
  96,
  99,
  100,
  101,
  102,
  103,
  105,
  106,
  107,
  108,
  109,
  110,
  111,
  114,
  115,
  117,
  119,
  120,
  121,
  122,
  124,
  125,
  126,
  127,
  128,
  130,
  131,
  133,
  134,
  135,
  136,
  138,
  139,
  143,
  145,
  148,
  152,
  155,
  158,
  162,
  163,
  165,
  166,
  167,
  170],
 'than': [1,
  1,
  4,
  10,
  11,
  14,
  16,
  17,
  18,
  19,
  27,
  29,
  31,
  44,
  47,
  48,
  49,
  51,
  53,
  55,
  62,
  63,
  66,
  69,
  73,
  75,
  77,
  78,
  79,
  80,
  81,
  85,
  87,
  91,
  92,
  95,
  96,
  100,
  101,
  102,
  105,
  107,
  108,
  109,
  110,
  111,
  118,
  119,
  120,
  121,
  122,
  123,
  127,
  128,
  131,
  133,
  134,
  135,
  136,
  139,
  142,
  143,
  145,
  148,
  152,
  155,
  158,


In [37]:
def dumpFiles(invidx):
    """Write the variable-byte encoded index to ``indexfile.idx``.

    Every posting list is encoded with ``vbencode`` and written as text:
    each encoded byte is stored as 8 ``'0'``/``'1'`` characters. Offsets and
    lengths in the returned dictionary are therefore counted in characters
    of that file.

    Args:
        invidx: dict mapping a term to its (gap-encoded) posting list.

    Returns:
        dict mapping each term to ``[offset, length]`` of its encoded
        posting list inside ``indexfile.idx``.
    """
    dictionary = {}
    offset = 0
    # Open with "w", not "a": offsets are counted from 0, so the file must
    # start empty. Appending to a file left by an earlier run would make
    # every offset point at the old run's data.
    with open("indexfile.idx", "w") as file:
        for term, posting_list in invidx.items():
            allbytes = "".join(vbencode(posting_list))
            file.write(allbytes)
            length = len(allbytes)
            dictionary[term] = [offset, length]
            offset += length
    return dictionary

In [38]:
# Writes indexfile.idx in the working directory and keeps the
# term -> [offset, length] lookup table in memory.
dictionary = dumpFiles(invidx)

In [39]:
# Displays the full term -> [offset, length] table as the cell output.
dictionary

{'more': [0, 856],
 'than': [856, 744],
 '150': [1600, 48],
 'former': [1648, 192],
 'offic': [1840, 376],
 'of': [2216, 1680],
 'the': [3896, 1704],
 'overthrown': [5600, 8],
 'south': [5608, 184],
 'vietnames': [5792, 8],
 'govern': [5800, 488],
 'have': [6288, 1080],
 'been': [7368, 968],
 'releas': [8336, 344],
 'from': [8680, 1304],
 'a': [9984, 1664],
 're-educ': [11648, 8],
 'camp': [11656, 56],
 'after': [11712, 992],
 '13': [12704, 136],
 'year': [12840, 792],
 'detent': [13632, 8],
 'offici': [13640, 520],
 'vietnam': [14160, 16],
 'new': [14176, 1176],
 'agenc': [15352, 240],
 'report': [15592, 800],
 'saturdai': [16392, 128],
 'hanoi': [16520, 8],
 'monitor': [16528, 80],
 'in': [16608, 1696],
 'bangkok': [18304, 8],
 'did': [18312, 384],
 'not': [18696, 1040],
 'give': [19736, 288],
 'specif': [20024, 64],
 'figur': [20088, 416],
 'but': [20504, 1200],
 'said': [21704, 1520],
 'those': [23224, 400],
 'freed': [23624, 24],
 'fridai': [23648, 1048],
 'includ': [24696, 656],


In [40]:
def chunkstring(string, length):
    """Cut ``string`` into consecutive pieces of ``length`` characters.

    Args:
        string: Sequence to cut.
        length: Size of each piece; the last piece may be shorter.

    Returns:
        List of the pieces in order (empty for an empty ``string``).
    """
    pieces = []
    for start in range(0, len(string), length):
        pieces.append(string[start:start + length])
    return pieces

In [41]:
def getPostings(tokens, dictionary, filename):
    """Read and decode the posting lists of ``tokens`` from a VB index file.

    For every token the ``[offset, length]`` entry of ``dictionary`` selects
    a slice of the file; the slice is a run of ``'0'``/``'1'`` characters
    that is cut into 8-character chunks, variable-byte decoded and passed
    through ``undoGapEncode``.

    Args:
        tokens: Iterable of (stemmed) terms to look up.
        dictionary: dict mapping a term to ``[offset, length]``.
        filename: Path of the index file the offsets refer to.

    Returns:
        List with one decoded posting list per token, in token order.

    Raises:
        KeyError: If a token has no entry in ``dictionary``.
    """
    res = []
    # Read-only access is all a lookup needs, and both context managers make
    # sure the file and the mapping are released (they used to leak per call).
    with open(filename, "rb") as f, \
            mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm:
        for token in tokens:
            start = dictionary[token][0]
            end = start + dictionary[token][1]
            # The slice holds ASCII '0'/'1' characters; decode it directly
            # instead of stripping the quotes and prefix from repr(bytes).
            allbytes = mm[start:end].decode("ascii")
            chunks = chunkstring(allbytes, 8)
            doclist = vbdecode(chunks)
            res.append(undoGapEncode(doclist))
    return res

In [42]:
# Conjunctive query against the variable-byte index: stem the query terms,
# fetch their posting lists, intersect them, and print the names of the
# documents that contain every term.
postings = getPostings(getTokensFromText("Bush secretary"),dictionary,'indexfile.idx')
docids = getIntersection(postings)
for d in docids:
    print(int_to_docID[d])

AP880212-0005
AP880212-0011
AP880212-0080


In [163]:
#vbencode([170])
#vbdecode([b'10000001', b'00101010'])
# Scratch check of bin_to_dec; `ll` is not used by any other cell shown in this notebook.
ll = b'10000001'
bin_to_dec('10101010')

170

In [55]:
def c2_encoding(x):
    """Encode one positive integer as a self-delimiting bit string.

    Layout for ``x >= 2`` (``x == 1`` is the single bit ``"0"``):

    1. the bit length of the bit length of ``x``, in unary
       (``n - 1`` ones followed by a zero);
    2. the bit length of ``x`` in binary without its leading 1;
    3. ``x`` in binary without its leading 1.

    Args:
        x: Integer to encode; must be at least 1.

    Returns:
        String of ``'0'``/``'1'`` characters.

    Raises:
        ValueError: If ``x`` is smaller than 1. Such values have no code in
            this scheme; they used to produce a bit string that
            ``c2_decode`` could not turn back into ``x``.
    """
    if x < 1:
        raise ValueError("c2_encoding requires an integer >= 1, got %r" % (x,))
    if x == 1:
        return "0"

    x_bits = format(x, "b")
    length_bits = format(len(x_bits), "b")
    # Unary code of the number of bits needed to write len(x_bits).
    unary = "1" * (len(length_bits) - 1) + "0"
    # Both binary fields drop their leading 1, which the decoder restores.
    return unary + length_bits[1:] + x_bits[1:]

In [59]:
def c2_encode_list(arr):
    """Concatenate the ``c2_encoding`` of every element of ``arr``.

    Args:
        arr: Iterable of integers to encode.

    Returns:
        One string holding all codes back to back (empty for empty input).
    """
    return ''.join(c2_encoding(element) for element in arr)

In [60]:
#c2_encoding(1)
#c2_encoding(119) + c2_encoding(1)+ c2_encoding(69)
# Sanity check: the codes of 119, 1 and 69 concatenated into one bit string.
c2_encode_list([119,1,69])

'11011110111011011000101'

In [45]:
def c2_decode(x):
    """Decode a concatenation of ``c2_encoding`` codes back into integers.

    For each code: count the characters before the next ``'0'`` (unary
    part), read that many characters and prefix them with ``'1'`` to get the
    bit length of the value, then read ``bit length - 1`` characters and
    prefix them with ``'1'`` to get the value itself.

    Args:
        x: String of ``'0'``/``'1'`` characters.

    Returns:
        List of decoded integers (empty for an empty string).
    """
    decoded = []
    pos = 0
    total = len(x)
    while pos < total:
        # Unary prefix: everything up to the terminating '0'.
        prefix_len = 0
        while x[pos] != '0':
            prefix_len += 1
            pos += 1
        pos += 1  # step over the terminating '0'

        # The length field is stored without its leading 1 bit.
        length_bits = '1' + x[pos:pos + prefix_len]
        pos += prefix_len
        value_width = bin_to_dec(length_bits) - 1

        # The value is stored without its leading 1 bit as well.
        value_bits = '1' + x[pos:pos + value_width]
        pos += value_width
        decoded.append(bin_to_dec(value_bits))
    return decoded

In [58]:
# Round-trip check: decodes the bit string produced for [119, 1, 69] above.
c2_decode('11011110111011011000101')

[119, 1, 69]

In [63]:
def dumpFiles_c2(invidx):
    """Write the C2-encoded index to ``indexfile_c2.idx``.

    Every posting list is encoded with ``c2_encode_list`` and written as a
    run of ``'0'``/``'1'`` characters. Offsets and lengths in the returned
    dictionary are counted in characters of that file.

    Args:
        invidx: dict mapping a term to its (gap-encoded) posting list.

    Returns:
        dict mapping each term to ``[offset, length]`` of its encoded
        posting list inside ``indexfile_c2.idx``.
    """
    dictionary = {}
    offset = 0
    # Open with "w", not "a": offsets are counted from 0, so the file must
    # start empty. Appending to a file left by an earlier run would make
    # every offset point at the old run's data.
    with open("indexfile_c2.idx", "w") as file:
        for term, posting_list in invidx.items():
            allbytes = c2_encode_list(posting_list)
            file.write(allbytes)
            length = len(allbytes)
            dictionary[term] = [offset, length]
            offset += length
    return dictionary

In [64]:
# Writes indexfile_c2.idx in the working directory and keeps its
# term -> [offset, length] lookup table in memory.
dictionary_c2 = dumpFiles_c2(invidx)

In [65]:
def getPostings_c2(tokens, dictionary, filename):
    """Read and decode the posting lists of ``tokens`` from a C2 index file.

    For every token the ``[offset, length]`` entry of ``dictionary`` selects
    a slice of the file; the slice is a run of ``'0'``/``'1'`` characters
    that is decoded with ``c2_decode`` and passed through ``undoGapEncode``.

    Args:
        tokens: Iterable of (stemmed) terms to look up.
        dictionary: dict mapping a term to ``[offset, length]``.
        filename: Path of the index file the offsets refer to.

    Returns:
        List with one decoded posting list per token, in token order.

    Raises:
        KeyError: If a token has no entry in ``dictionary``.
    """
    res = []
    # Read-only access is all a lookup needs, and both context managers make
    # sure the file and the mapping are released (they used to leak per call).
    with open(filename, "rb") as f, \
            mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm:
        for token in tokens:
            start = dictionary[token][0]
            end = start + dictionary[token][1]
            # The slice holds ASCII '0'/'1' characters; decode it directly
            # instead of stripping the quotes and prefix from repr(bytes).
            allbytes = mm[start:end].decode("ascii")
            doclist = c2_decode(allbytes)
            res.append(undoGapEncode(doclist))
    return res

In [66]:
# Same conjunctive query as before, answered from the C2-encoded index;
# prints the names of the documents that contain every query term.
postings = getPostings_c2(getTokensFromText("Bush secretary"),dictionary_c2,'indexfile_c2.idx')
docids = getIntersection(postings)
for d in docids:
    print(int_to_docID[d])

AP880212-0005
AP880212-0011
AP880212-0080


In [67]:
def c3_encoding(text):
    """Convert ``text`` to ``str`` and return ``snappy.compress`` of it."""
    as_text = str(text)
    compressed = snappy.compress(as_text)
    return compressed

In [68]:
def c3_encode_list(arr):
    """Variable-byte encode ``arr`` and compress the resulting bit string.

    The bit strings returned by ``vbencode`` are concatenated into one
    string, which is then handed to ``c3_encoding``.

    Args:
        arr: Iterable of integers (e.g. a gap-encoded posting list).

    Returns:
        Whatever ``c3_encoding`` returns for the concatenated bit string.
    """
    bit_string = ''.join(vbencode(arr))
    return c3_encoding(bit_string)

In [69]:
# Sanity check: compressed form of the variable-byte bit string for three numbers.
c3_encode_list([194,7,55])

b' \x0410\x05\x01\x0410\x05\t\x05\x0500011100110111'

In [70]:
def c3_decode_list(text):
    """Decompress ``text`` and variable-byte decode the recovered bit string.

    Args:
        text: Compressed data as produced by ``c3_encode_list``.

    Returns:
        List of integers obtained by decompressing with snappy, decoding the
        result as UTF-8, cutting it into 8-character chunks and running
        ``vbdecode`` on them.
    """
    raw = snappy.decompress(text)
    bit_string = raw.decode('utf-8')
    return vbdecode(chunkstring(bit_string, 8))

In [71]:
# Round-trip check: decoding the compressed list returns the original numbers.
c3_decode_list(c3_encode_list([194,7,55]))

[194, 7, 55]

In [72]:
def dumpFiles_c3(invidx):
    """Write the snappy-compressed index to ``indexfile_c3.idx``.

    Every posting list is encoded with ``c3_encode_list`` and the resulting
    bytes are written back to back into a binary file that is created or
    truncated on every call.

    Args:
        invidx: dict mapping a term to its (gap-encoded) posting list.

    Returns:
        dict mapping each term to ``[offset, length]`` (in bytes) of its
        encoded posting list inside ``indexfile_c3.idx``.
    """
    dictionary = {}
    offset = 0
    # The context manager closes the file even if encoding or writing raises.
    with open("indexfile_c3.idx", "wb") as file:
        for term, posting_list in invidx.items():
            allbytes = c3_encode_list(posting_list)
            file.write(allbytes)
            length = len(allbytes)
            dictionary[term] = [offset, length]
            offset += length
    return dictionary

In [73]:
# Writes indexfile_c3.idx in the working directory and keeps its
# term -> [offset, length] lookup table in memory.
c3dictionary = dumpFiles_c3(invidx)

In [74]:
# Displays the full term -> [offset, length] table of the snappy index as the cell output.
c3dictionary

{'more': [0, 325],
 'than': [325, 300],
 '150': [625, 39],
 'former': [664, 90],
 'offic': [754, 164],
 'of': [918, 614],
 'the': [1532, 637],
 'overthrown': [2169, 10],
 'south': [2179, 92],
 'vietnames': [2271, 10],
 'govern': [2281, 200],
 'have': [2481, 420],
 'been': [2901, 389],
 'releas': [3290, 155],
 'from': [3445, 488],
 'a': [3933, 603],
 're-educ': [4536, 10],
 'camp': [4546, 41],
 'after': [4587, 388],
 '13': [4975, 71],
 'year': [5046, 329],
 'detent': [5375, 10],
 'offici': [5385, 221],
 'vietnam': [5606, 18],
 'new': [5624, 443],
 'agenc': [6067, 121],
 'report': [6188, 310],
 'saturdai': [6498, 71],
 'hanoi': [6569, 10],
 'monitor': [6579, 51],
 'in': [6630, 635],
 'bangkok': [7265, 10],
 'did': [7275, 173],
 'not': [7448, 420],
 'give': [7868, 138],
 'specif': [8006, 48],
 'figur': [8054, 172],
 'but': [8226, 472],
 'said': [8698, 576],
 'those': [9274, 177],
 'freed': [9451, 23],
 'fridai': [9474, 407],
 'includ': [9881, 274],
 'an': [10155, 442],
 'ex-cabinet': [105

In [75]:
def getPostings_c3(tokens, dictionary, filename):
    """Read and decode the posting lists of ``tokens`` from a snappy index file.

    For every token the ``[offset, length]`` entry of ``dictionary`` selects
    a byte slice of the file; the slice is decoded with ``c3_decode_list``
    and passed through ``undoGapEncode``.

    Args:
        tokens: Iterable of (stemmed) terms to look up.
        dictionary: dict mapping a term to ``[offset, length]`` in bytes.
        filename: Path of the index file the offsets refer to.

    Returns:
        List with one decoded posting list per token, in token order.

    Raises:
        KeyError: If a token has no entry in ``dictionary``.
    """
    res = []
    # Read-only access is all a lookup needs, and both context managers make
    # sure the file and the mapping are released (they used to leak per call).
    with open(filename, "rb") as f, \
            mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm:
        for token in tokens:
            start = dictionary[token][0]
            end = start + dictionary[token][1]
            # Slicing the mapping copies the bytes, so the result stays valid
            # after the mapping is closed.
            allbytes = mm[start:end]
            doclist = c3_decode_list(allbytes)
            res.append(undoGapEncode(doclist))
    return res

In [76]:
# Same conjunctive query, answered from the snappy-compressed index;
# prints the names of the documents that contain every query term.
postings = getPostings_c3(getTokensFromText("Bush secretary"),c3dictionary,'indexfile_c3.idx')
docids = getIntersection(postings)
for d in docids:
    print(int_to_docID[d])

AP880212-0005
AP880212-0011
AP880212-0080
