In [83]:
from bs4 import BeautifulSoup
import re
import os
from PorterStemmer import * 
from tqdm.notebook import tqdm
import snappy
import mmap
# Single stemmer instance shared by every tokenizing function in this notebook
# (process_directory, process_file and getTokensFromText all read this global).
ps = PorterStemmer()

In [3]:
# Single collection file that the cells below tokenize and index.
path = 'tipster-ap-frac/ap880212'
# Directory containing the collection files; only referenced by the
# commented-out process_directory / process_file loops in this notebook.
directory = 'tipster-ap-frac'

In [None]:
# for filename in os.listdir(directory):
#     process_file(os.path.join(directory,filename))

In [4]:
# maps docnum to list of tokens for all files in directory
def process_directory(dir_path):
    """Tokenize every document of every file listed in ``dir_path``.

    Each file is parsed with BeautifulSoup's ``html.parser``. For every
    ``doc`` element, the stripped text of its ``docno`` child is used as the
    key, and the value is the list of lower-cased, stemmed tokens collected
    from all of its ``text`` children.

    Parameters
    ----------
    dir_path : str
        Directory whose entries are each opened and parsed as a collection file.

    Returns
    -------
    dict
        Maps document number -> list of stemmed tokens. When a document
        number appears more than once, the last occurrence overwrites the
        earlier ones.
    """
    data = {}
    for filename in tqdm(os.listdir(dir_path)):
        full_path = os.path.join(dir_path, filename)
        # Close each file as soon as it has been read; the original left one
        # open handle per file in the directory.
        with open(full_path, "r") as file:
            contents = file.read()
        soup = BeautifulSoup(contents, 'html.parser')

        for doc in soup.find_all('doc'):
            docnum = doc.find('docno').get_text().strip()
            res = []
            for field in doc.find_all('text'):
                words = re.split('''[ ',.:_;"’`\n]''', field.get_text())
                for w in words:
                    if len(w) > 0:
                        # Compute the end index from the string actually
                        # handed to the stemmer, not from the original word.
                        lowered = w.lower()
                        res.append(ps.stem(lowered, 0, len(lowered) - 1))
            data[docnum] = res
    return data

In [5]:
# maps docnum to list of tokens in a file
def process_file(full_path):
    """Tokenize every document contained in a single collection file.

    The file is parsed with BeautifulSoup's ``html.parser``. For every
    ``doc`` element, the stripped text of its ``docno`` child is used as the
    key, and the value is the list of lower-cased, stemmed tokens collected
    from all of its ``text`` children.

    Parameters
    ----------
    full_path : str
        Path of the file to read.

    Returns
    -------
    dict
        Maps document number -> list of stemmed tokens. When a document
        number appears more than once, the last occurrence overwrites the
        earlier ones.
    """
    # Context manager guarantees the handle is closed (the original leaked it).
    with open(full_path, "r") as file:
        contents = file.read()
    soup = BeautifulSoup(contents, 'html.parser')

    data = {}
    for doc in soup.find_all('doc'):
        docnum = doc.find('docno').get_text().strip()
        res = []
        for field in doc.find_all('text'):
            words = re.split('''[ ',.:_;"’`\n]''', field.get_text())
            for w in words:
                if len(w) > 0:
                    # Compute the end index from the string actually handed
                    # to the stemmer, not from the original word.
                    lowered = w.lower()
                    res.append(ps.stem(lowered, 0, len(lowered) - 1))
        data[docnum] = res
    return data

In [6]:
# Tokenize the single sample file; `data` maps each document number to its list of stemmed tokens.
data = process_file(path)

In [48]:
#full_data = process_directory(directory)

In [7]:
# gets vocabulary from the data and returns it in the form of a set
def getVocab(data):
    """Collect the distinct tokens appearing in any document.

    ``data`` maps a document identifier to an iterable of tokens; the keys
    are ignored. Returns a ``set`` holding every token seen.
    """
    vocabulary = set()
    for token_list in data.values():
        vocabulary.update(token_list)
    return vocabulary

In [8]:
# takes a list of document names and maps them to integers and returns the map
def mapDocIDs(docIDs):
    """Number the given document names 0, 1, 2, ... in iteration order.

    Returns a pair ``(name_to_int, int_to_name)`` of dictionaries. If a name
    is repeated, ``name_to_int`` keeps the position of its last occurrence
    while ``int_to_name`` still has one entry per position.
    """
    name_to_int = {}
    int_to_name = {}
    for position, name in enumerate(docIDs):
        name_to_int[name] = position
        int_to_name[position] = name
    return name_to_int, int_to_name

def makeDocIdMap(full_path):
    """Build the document-number <-> integer maps for one collection file.

    The file is parsed with BeautifulSoup's ``html.parser``; the stripped
    ``docno`` text of every ``doc`` element is collected in document order
    and passed to ``mapDocIDs``.

    Parameters
    ----------
    full_path : str
        Path of the file to read.

    Returns
    -------
    tuple
        Whatever ``mapDocIDs`` returns for the collected document numbers
        (a ``(docID_to_int, int_to_docID)`` pair of dicts).
    """
    # Context manager guarantees the handle is closed (the original leaked it).
    with open(full_path, "r") as file:
        contents = file.read()
    soup = BeautifulSoup(contents, 'html.parser')
    docIDs = [doc.find('docno').get_text().strip() for doc in soup.find_all('doc')]
    return mapDocIDs(docIDs)

In [179]:
# Iterating the `data` dict yields its keys (the document numbers), so each
# document gets a sequential integer ID in the dict's iteration order.
docID_to_int, int_to_docID = mapDocIDs(data)
# Set of distinct stemmed tokens across `data`; getInvIdx reads this global.
vocab = getVocab(data)

In [10]:
# generates inverted index from the given data
def getInvIdx(data, doc_ids=None, vocabulary=None):
    """Build an inverted index: token -> list of integer document IDs.

    Documents are visited in the iteration order of ``data``. A document's
    ID is appended to a token's postings list the first time the token is
    seen in that document; later repeats inside the same document are
    skipped by comparing with the last ID already stored.

    Parameters
    ----------
    data : dict
        Maps document number -> list of tokens.
    doc_ids : dict, optional
        Maps document number -> integer ID. Defaults to the notebook-level
        ``docID_to_int`` so existing one-argument calls keep working.
    vocabulary : container, optional
        Tokens outside this container are ignored. Defaults to the
        notebook-level ``vocab``.

    Returns
    -------
    dict
        Maps token -> list of integer document IDs, in the order documents
        were visited.

    Raises
    ------
    KeyError
        If a document number in ``data`` is missing from ``doc_ids``.
    """
    # Fall back to the notebook globals only when the caller did not pass
    # explicit mappings; this keeps the hidden dependency opt-in.
    if doc_ids is None:
        doc_ids = docID_to_int
    if vocabulary is None:
        vocabulary = vocab

    invidx = {}
    for doc, token_list in data.items():
        doc_int = doc_ids[doc]
        for token in token_list:
            if token not in vocabulary:
                continue
            postings = invidx.setdefault(token, [])
            # All tokens of one document are processed together, so checking
            # the last stored ID is enough to avoid duplicates.
            if not postings or postings[-1] != doc_int:
                postings.append(doc_int)
    return invidx

In [11]:
def dec_to_binary(n):
    """Return the binary digits of integer ``n`` as a string without the ``0b`` prefix.

    A leading minus sign, if any, is preserved.
    """
    # bin() yields "<sign>0b<digits>"; keep what is on either side of "0b".
    sign, _, digits = bin(n).partition("0b")
    return sign + digits

In [60]:
def bin_to_dec(n):
    """Parse ``n``, a string of binary digits, and return its integer value."""
    value = int(n, base=2)
    return value

In [20]:
# Build the in-memory inverted index (token -> list of integer doc IDs) for the sample file.
invidx = getInvIdx(data)

In [172]:
# find list of common docs from multiple lists
def getIntersection(postings):
    """Return the IDs common to every postings list.

    Uses a two-pointer merge, so each postings list must be sorted in
    ascending order.

    Parameters
    ----------
    postings : sequence of lists
        One sorted list of document IDs per query term.

    Returns
    -------
    list
        A new list with the IDs present in all input lists. An empty
        ``postings`` sequence yields ``[]`` (the original raised IndexError).
    """
    if not postings:
        return []

    # Copy so the caller's first list is never returned by reference.
    ans = list(postings[0])
    # Start from the second list: intersecting the first with itself is a no-op.
    for posting in postings[1:]:
        if not ans:
            # Nothing can be added back once the running result is empty.
            break
        merged = []
        i, j = 0, 0
        n1, n2 = len(ans), len(posting)
        while i < n1 and j < n2:
            if ans[i] == posting[j]:
                merged.append(ans[i])
                i += 1
                j += 1
            elif ans[i] < posting[j]:
                i += 1
            else:
                j += 1
        ans = merged
    return ans

In [174]:
def getTokensFromText(text):
    """Split ``text`` on the delimiter set and return the stemmed, lower-cased tokens.

    Empty pieces produced by consecutive delimiters are dropped. Stemming is
    done with the notebook-level ``ps`` stemmer.
    """
    pieces = re.split('''[ ',.:_;"’`\n]''', text)
    return [
        ps.stem(piece.lower(), 0, len(piece) - 1)
        for piece in pieces
        if len(piece) > 0
    ]

In [30]:
# takes a num input and gives c1 encoding
def c1_encode(num):
    """Encode a non-negative integer as a string of 8-bit groups.

    The binary form of ``num`` is left-padded with zeros to a multiple of
    7 bits and cut into 7-bit groups. Each group is prefixed with a flag bit:
    ``'1'`` for every group except the last, ``'0'`` for the last.

    Parameters
    ----------
    num : int
        Non-negative integer to encode.

    Returns
    -------
    str
        String of ``'0'``/``'1'`` characters whose length is a multiple of 8.

    Raises
    ------
    ValueError
        If ``num`` is negative.
    """
    if num < 0:
        raise ValueError("c1_encode expects a non-negative integer")

    bits = bin(num)[2:]
    # -len % 7 is 0 when the length is already a multiple of 7; the original
    # `7 - n % 7` added a whole extra all-zero group in that case.
    pad = -len(bits) % 7
    bits = '0' * pad + bits

    groups = [bits[k:k + 7] for k in range(0, len(bits), 7)]
    last = len(groups) - 1
    return ''.join(
        ('0' if idx == last else '1') + group
        for idx, group in enumerate(groups)
    )

In [109]:
# takes a c1 encoded binary string input and gives back the encoded integer
def c1_decode(binary):
    """Decode one c1-encoded number.

    The input is read as consecutive 8-character groups. The first character
    of each group is a flag (``'1'`` on every group except the last, ``'0'``
    on the last); the remaining seven characters of all groups are
    concatenated and parsed as a binary number.

    Parameters
    ----------
    binary : str
        String of ``'0'``/``'1'`` characters holding exactly one encoded number.

    Returns
    -------
    int
        The decoded non-negative integer.

    Raises
    ------
    ValueError
        If the input is empty, its length is not a multiple of 8, or a flag
        bit does not match the group's position.
    """
    n = len(binary)
    if n == 0 or n % 8 != 0:
        raise ValueError("c1_decode expects a non-empty string whose length is a multiple of 8")

    payload = ''
    last_start = n - 8
    for start in range(0, n, 8):
        expected_flag = '0' if start == last_start else '1'
        if binary[start] != expected_flag:
            raise ValueError("malformed c1 encoding: unexpected flag bit at position %d" % start)
        payload += binary[start + 1:start + 8]
    return int(payload, 2)

In [56]:
def vbencode_number(number):
    """Variable-byte encode one non-negative integer.

    The number is split into base-128 digits, most significant first. Every
    digit except the last has 128 added (high bit set); the last digit keeps
    its high bit clear. Each resulting value is rendered as an 8-character
    string of ``'0'``/``'1'``.

    Parameters
    ----------
    number : int
        Non-negative integer to encode.

    Returns
    -------
    list of str
        One 8-character binary string per encoded byte.

    Raises
    ------
    ValueError
        If ``number`` is negative (the original silently produced the
        encoding of a different value in that case).
    """
    if number < 0:
        raise ValueError("vbencode_number expects a non-negative integer")

    # Collect base-128 digits least-significant first, then reverse once;
    # this avoids repeated insert(0, ...) shifting.
    digits = []
    while True:
        digits.append(number % 128)
        if number < 128:
            break
        number //= 128
    digits.reverse()

    # Set the high bit on every byte except the final one.
    for i in range(len(digits) - 1):
        digits[i] += 128

    return [format(value, '08b') for value in digits]

In [57]:
# Sanity check: a value large enough to need three encoded bytes.
vbencode_number(111119)


['10000110', '11100100', '00001111']

In [58]:
def vbencode(numbers):
    """Variable-byte encode a sequence of integers into one flat list.

    Each number is encoded with ``vbencode_number`` and the resulting
    8-character strings are concatenated in input order.
    """
    return [chunk for value in numbers for chunk in vbencode_number(value)]

In [59]:
# Sanity check: encoding a list concatenates the per-number byte strings.
vbencode([111119,4])

['10000110', '11100100', '00001111', '00000100']

In [73]:
def vbdecode(stream):
    """Decode a sequence of 8-character binary strings back into integers.

    A chunk whose first character is ``'1'`` contributes its remaining bits
    and continues the current number; any other chunk contributes all of its
    bits and completes the number. A trailing run of continuation chunks
    with no terminating chunk is not emitted.
    """
    decoded = []
    acc = 0
    for chunk in stream:
        terminates = chunk[0] != '1'
        payload = chunk if terminates else chunk[1:]
        acc = acc * 128 + bin_to_dec(payload)
        if terminates:
            decoded.append(acc)
            acc = 0
    return decoded

In [74]:
# Round-trip check: decoding the encoded stream should give back the input list.
vbdecode(vbencode([111119,4]))

[111119, 4]

In [81]:
# Inspect the inverted index built above.
# NOTE(review): this displays the entire dict; consider showing only a few entries.
invidx

{'more': [0,
  1,
  2,
  4,
  6,
  10,
  11,
  14,
  17,
  18,
  19,
  22,
  27,
  28,
  29,
  37,
  40,
  44,
  47,
  48,
  49,
  51,
  53,
  55,
  58,
  61,
  62,
  63,
  65,
  66,
  69,
  73,
  75,
  77,
  79,
  80,
  85,
  87,
  89,
  91,
  94,
  96,
  99,
  100,
  101,
  102,
  103,
  105,
  106,
  107,
  108,
  109,
  110,
  111,
  114,
  115,
  117,
  119,
  120,
  121,
  122,
  124,
  125,
  126,
  127,
  128,
  130,
  131,
  133,
  134,
  135,
  136,
  138,
  139,
  143,
  145,
  148,
  152,
  155,
  158,
  162,
  163,
  165,
  166,
  167,
  170],
 'than': [0,
  1,
  4,
  10,
  11,
  14,
  16,
  17,
  18,
  19,
  27,
  29,
  31,
  44,
  47,
  48,
  49,
  51,
  53,
  55,
  62,
  63,
  66,
  69,
  73,
  75,
  77,
  78,
  79,
  80,
  81,
  85,
  87,
  91,
  92,
  95,
  96,
  100,
  101,
  102,
  105,
  107,
  108,
  109,
  110,
  111,
  118,
  119,
  120,
  121,
  122,
  123,
  127,
  128,
  131,
  133,
  134,
  135,
  136,
  139,
  142,
  143,
  145,
  148,
  152,
  155,
  158,


In [76]:
def dumpFiles(invidx, filename="indexfile.idx"):
    """Write all postings lists to disk and return the lookup dictionary.

    Each postings list is variable-byte encoded with ``vbencode`` and the
    resulting ``'0'``/``'1'`` strings are written back to back as text, so
    every encoded byte occupies eight characters in the file.

    Parameters
    ----------
    invidx : dict
        Maps term -> list of integer document IDs.
    filename : str, optional
        Output path. Defaults to ``"indexfile.idx"``, the path the original
        hard-coded.

    Returns
    -------
    dict
        Maps term -> ``[offset, length]``, both counted in characters from
        the start of the file.
    """
    dictionary = {}
    offset = 0
    # "w" truncates any previous index. The original used "a", so re-running
    # the cell kept appending while offsets restarted at 0, leaving the
    # returned offsets pointing at data from an earlier run.
    with open(filename, "w", encoding="ascii") as file:
        for term, posting_list in invidx.items():
            allbytes = "".join(vbencode(posting_list))
            file.write(allbytes)
            length = len(allbytes)
            dictionary[term] = [offset, length]
            offset += length
    return dictionary

In [79]:
# Write the postings to indexfile.idx and keep the term -> [offset, length] lookup table.
dictionary = dumpFiles(invidx)

In [95]:
# Inspect the term -> [offset, length] table returned by dumpFiles.
dictionary

{'more': [0, 856],
 'than': [856, 744],
 '150': [1600, 48],
 'former': [1648, 192],
 'offic': [1840, 376],
 'of': [2216, 1680],
 'the': [3896, 1704],
 'overthrown': [5600, 8],
 'south': [5608, 184],
 'vietnames': [5792, 8],
 'govern': [5800, 488],
 'have': [6288, 1080],
 'been': [7368, 968],
 'releas': [8336, 344],
 'from': [8680, 1304],
 'a': [9984, 1664],
 're-educ': [11648, 8],
 'camp': [11656, 56],
 'after': [11712, 992],
 '13': [12704, 136],
 'year': [12840, 792],
 'detent': [13632, 8],
 'offici': [13640, 520],
 'vietnam': [14160, 16],
 'new': [14176, 1176],
 'agenc': [15352, 240],
 'report': [15592, 800],
 'saturdai': [16392, 128],
 'hanoi': [16520, 8],
 'monitor': [16528, 80],
 'in': [16608, 1696],
 'bangkok': [18304, 8],
 'did': [18312, 384],
 'not': [18696, 1040],
 'give': [19736, 288],
 'specif': [20024, 64],
 'figur': [20088, 416],
 'but': [20504, 1200],
 'said': [21704, 1520],
 'those': [23224, 400],
 'freed': [23624, 24],
 'fridai': [23648, 1048],
 'includ': [24696, 656],


In [99]:
def chunkstring(string, length):
    """Cut ``string`` into consecutive pieces of ``length`` characters.

    The final piece is shorter when the string length is not a multiple of
    ``length``; an empty string gives an empty list.
    """
    pieces = []
    for start in range(0, len(string), length):
        pieces.append(string[start:start + length])
    return pieces

In [168]:
def getPostings(tokens, dictionary, filename):
    """Read and decode the postings list of each token from the index file.

    Parameters
    ----------
    tokens : iterable of str
        Query terms, already in the form used as keys of ``dictionary``.
    dictionary : dict
        Maps term -> ``[offset, length]`` locating that term's encoded
        postings inside the file.
    filename : str
        Path of the index file.

    Returns
    -------
    list of lists
        One decoded list of document IDs per token, in input order.

    Raises
    ------
    KeyError
        If a token is not present in ``dictionary``.
    """
    res = []
    # Read-only access is enough here, and the context managers release both
    # the file handle and the mapping (the original leaked both).
    with open(filename, "rb") as f, \
            mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm:
        for token in tokens:
            entry = dictionary[token]
            start = entry[0]
            end = start + entry[1]
            # The slice is bytes made of ASCII '0'/'1'; decode it directly
            # instead of stripping characters from its repr.
            bits = mm[start:end].decode("ascii")
            chunks = chunkstring(bits, 8)
            res.append(vbdecode(chunks))
    return res

In [180]:
# Example conjunctive query: fetch the postings of each stemmed query term
# from disk, intersect them, and print the original document numbers.
postings = getPostings(getTokensFromText("Bush secretary"),dictionary,'indexfile.idx')
docids = getIntersection(postings)
for d in docids:
    print(int_to_docID[d])

AP880212-0005
AP880212-0011
AP880212-0080


In [163]:
# Scratch cell for checking the binary helpers by hand.
#vbencode([170])
#vbdecode([b'10000001', b'00101010'])
# NOTE(review): `ll` is not used anywhere else in the visible notebook — presumably leftover; confirm before removing.
ll = b'10000001'
# Only this last expression is displayed as the cell's output.
bin_to_dec('10101010')

170