In [2]:
from langchain_community.document_loaders import PDFMinerPDFasHTMLLoader
from bs4 import BeautifulSoup
import re
from langchain_community.docstore.document import Document
from multiprocessing.dummy import Pool as ThreadPool
import pickle
import gc

In [3]:
def get_data(pat):
    """Load the PDF at ``pat`` and return the first document the loader yields.

    ``pat`` is handed unchanged to ``PDFMinerPDFasHTMLLoader``; the first
    element of its ``load()`` result is returned.
    """
    documents = PDFMinerPDFasHTMLLoader(pat).load()
    return documents[0]

In [4]:
def get_content(data):
    """Return every ``<span>`` element found in ``data.page_content``.

    The markup is parsed with BeautifulSoup's ``'html.parser'`` backend and
    the result of ``find_all('span')`` is returned as-is.
    """
    markup = data.page_content
    return BeautifulSoup(markup, 'html.parser').find_all('span')

In [5]:
def get_snippets(content):
    """Merge consecutive styled elements that share font family, size and marker flag.

    Parameters
    ----------
    content : iterable
        Elements exposing ``get('style')`` and ``.text`` (for example the tags
        returned by ``get_content``). Elements without a ``style`` attribute,
        or whose style has no ``font-family:...;`` / ``font-size:<n>px``
        entry, are skipped.

    Returns
    -------
    list of tuple
        ``(text, font_family, font_size, marker)`` in document order.
        ``marker`` is ``'t'`` when the first five characters of the element
        text contain a word followed by a dot (e.g. ``1.``) or a ``§`` sign,
        otherwise ``'f'``. A new tuple starts whenever any of the three style
        values changes. An empty list is returned when no element carries a
        usable style.
    """
    # Raw strings: '\d' in a plain literal is an invalid escape sequence.
    font_family_re = re.compile(r'font-family:(.*?);')
    font_size_re = re.compile(r'font-size:(\d+)px')
    marker_re = re.compile(r'\w+\.\w*|§')

    snippets = []
    # (font_family, font_size, marker) of the run being built. ``None`` is the
    # "nothing seen yet" sentinel, so legitimate falsy values such as a 0px
    # font size are not mistaken for "unset".
    run_key = None
    run_parts = []
    for element in content:
        style = element.get('style')
        if not style:
            continue
        family = font_family_re.search(style)
        size = font_size_re.search(style)
        if family is None or size is None:
            continue
        text = element.text
        marker = 't' if marker_re.search(text[:5]) else 'f'
        key = (family.group(1).strip(), int(size.group(1)), marker)
        if key != run_key:
            # Style changed: flush the finished run before starting a new one.
            if run_key is not None:
                snippets.append((''.join(run_parts),) + run_key)
            run_key = key
            run_parts = []
        run_parts.append(text)
    # Flush the trailing run; skipped entirely when nothing was collected so
    # callers never receive a placeholder tuple full of ``None`` values.
    if run_key is not None:
        snippets.append((''.join(run_parts),) + run_key)
    return snippets

In [None]:
# Exploratory run on one PDF: load it, extract its <span> elements and group
# them into same-style snippets. `data`, `content` and `snippets` stay in the
# notebook namespace for the cells below.
path='D:/school/ALL_PDFs/print/W20061110525.pdf'
data=get_data(path)
content=get_content(data)
snippets = get_snippets(content)


In [111]:
def get_semantic_snippets(data, snippets):
    """Group style snippets into heading/content sections.

    Parameters
    ----------
    data : object
        Loaded document; only ``data.metadata`` (a mapping) is read. It is
        merged into every section's metadata and wins on key collisions.
    snippets : iterable of tuple
        ``(text, font_family, font_size, marker)`` tuples as produced by
        ``get_snippets``; ``marker`` is ``'t'`` or ``'f'``.

    Returns
    -------
    list of Document
        One ``Document`` per section. ``page_content`` holds the accumulated
        body text; ``metadata`` holds ``heading``, ``heading_fontM``,
        ``heading_fontS``, ``heading_has_h_elem``, ``content_fontM``,
        ``content_fontS``, ``content_has_h_elem`` plus ``data.metadata``.

    Notes
    -----
    Working assumption: headings use a larger font size than their content.
    """
    emphasis_tags = ('Bold', 'Italic')
    all_style_tags = ('Bold', 'Italic', 'BoldItalic', 'BoldOblique')
    max_section_chars = 30000

    def open_section(snippet, heading_marker):
        # A new section starts with an empty body; the snippet is its heading.
        metadata = {
            'heading': snippet[0],
            'content_fontM': '',
            'heading_fontM': snippet[1],
            'content_fontS': 0,
            'heading_fontS': snippet[2],
            'content_has_h_elem': '',
            'heading_has_h_elem': heading_marker,
        }
        metadata.update(data.metadata)
        return Document(page_content='', metadata=metadata)

    semantic_snippets = []
    for s in snippets:
        text, font, size, marker = s[0], s[1], s[2], s[3]

        # First snippet, or font larger than the current heading => new heading.
        if not semantic_snippets or size > semantic_snippets[-1].metadata['heading_fontS']:
            semantic_snippets.append(open_section(s, marker))
            continue

        current = semantic_snippets[-1]
        meta = current.metadata

        # An emphasised font that differs from the body font, after the
        # section already has body text, starts a new section.
        # ('BoldItalic' / 'BoldOblique' contain 'Bold', so two tags suffice.)
        if (font != meta['content_fontM']
                and current.page_content != ''
                and any(tag in font for tag in emphasis_tags)):
            semantic_snippets.append(open_section(s, marker))
            continue

        # A marked snippet splits a section whose body has grown past the
        # size cap (a body longer than the cap is necessarily non-empty).
        if marker == 't' and len(current.page_content) > max_section_chars:
            semantic_snippets.append(open_section(s, marker))
            continue

        # NOTE(review): the snippet is absorbed into the current section
        # unless every clause below holds, which requires the font name to
        # contain all four style tags at once. That makes the final
        # section-opening step almost unreachable. This is exactly equivalent
        # to the earlier `not A or not B or ...` chain; the intent may have
        # been `not (A or B or ...)` - confirm before changing behaviour.
        starts_subsection = (
            meta['content_fontS']
            and meta['content_fontM']
            and meta['content_has_h_elem']
            and all(tag in font for tag in all_style_tags)
            and size > meta['content_fontS']
        )
        if not starts_subsection:
            current.page_content += text
            if not meta['content_fontM']:
                meta['content_fontM'] = font
            meta['content_fontS'] = max(size, meta['content_fontS'])
            meta['content_has_h_elem'] = marker
            continue

        # Larger than the section's content font (but not larger than its
        # heading): open a sub-heading section, always flagged 't'.
        semantic_snippets.append(open_section(s, 't'))
    return semantic_snippets

In [None]:
# Same exploratory pipeline on a second PDF, this time carried through to the
# heading/content sections. Overwrites `path`, `data`, `content` and
# `snippets` from the earlier cell.
path='D:/school/ALL_PDFs/print/L20060000264.pdf'
data=get_data(path)
content=get_content(data)
snippets = get_snippets(content)
semantic_snippets = get_semantic_snippets(data,snippets)

In [None]:
# Display the full list of (text, font_family, font_size, marker) tuples.
snippets 

In [112]:
path='D:/school/ALL_PDFs/well_structured/B20190036605.pdf'
# Load the PDF once and reuse the loaded document for both the metadata and
# the span extraction, so the file is not converted twice.
pdf_data = get_data(path)
semantic_snippets = get_semantic_snippets(pdf_data, get_snippets(get_content(pdf_data)))

In [103]:
# Number of sections produced for the PDF loaded above.
len(semantic_snippets)

160

In [None]:
# Per section: heading, body length, a rough token estimate (characters / 4)
# and the first 500 characters of the body.
for snip in semantic_snippets:
    print('HEADING:  ', snip.metadata['heading'])
    print('-----------------', end='\n\n')
    print('CONTENT_LENGT:  ', len(snip.page_content), end='\n\n')
    print('CONTENT_ESTIMATET_TOKEN:  ', len(snip.page_content)/4, end='\n\n')
    print('-----------------', end='\n\n')
    print('CONTENT_FIRST_500C:', end='\n\n')
    print(snip.page_content[:500])
    print('---------------------------------------------------------------------------------', end='\n\n')

In [None]:


def save_opject(obj, file_name):
    """Pickle ``obj`` to ``file_name`` using the highest pickle protocol.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten. The ``with`` block closes the file on exit, so no
    explicit ``close()`` is needed.
    """
    with open(file_name, 'wb') as outp:
        pickle.dump(obj, outp, pickle.HIGHEST_PROTOCOL)



def snipmaker(file_pathT):
    """Convert one PDF into a list of heading/content section Documents.

    Parameters
    ----------
    file_pathT : str
        Path handed to ``PDFMinerPDFasHTMLLoader``; the first document it
        loads is parsed.

    Returns
    -------
    list of Document
        One ``Document`` per section. ``page_content`` is the accumulated
        body text; ``metadata`` holds ``heading``, ``heading_fontM``,
        ``heading_fontS``, ``content_fontM``, ``content_fontS`` plus the
        loader's own metadata (which wins on key collisions). When no element
        carries a usable style, a single section with an empty heading and
        ``None`` fonts is returned.

    Notes
    -----
    Working assumption: headings use a larger font size than their content.
    """
    loader = PDFMinerPDFasHTMLLoader(file_pathT)
    data = loader.load()[0]
    soup = BeautifulSoup(data.page_content, 'html.parser')
    # NOTE(review): get_content() in this notebook selects 'span' elements
    # instead. Confirm that the 'div' elements of these PDFs carry
    # font-family / font-size in their own style attribute; otherwise every
    # element is skipped below and only the empty placeholder section remains.
    content = soup.find_all('div')

    # Raw strings: '\d' in a plain literal is an invalid escape sequence.
    font_family_re = re.compile(r'font-family:(.*?);')
    font_size_re = re.compile(r'font-size:(\d+)px')

    # ---- Pass 1: merge consecutive elements sharing font family and size ----
    snippets = []
    # ``None`` marks "no styled element seen yet", so a legitimate 0px size
    # is not mistaken for "unset" and merged into the following run.
    run_font = None
    run_text = ''
    for c in content:
        st = c.get('style')
        if not st:
            continue
        family = font_family_re.search(st)
        size_match = font_size_re.search(st)
        if family is None or size_match is None:
            continue
        font_key = (family.group(1).strip(), int(size_match.group(1)))
        if run_font is None:
            run_font = font_key
        if font_key == run_font:
            run_text += c.text
        else:
            snippets.append((run_text,) + run_font)
            run_font = font_key
            run_text = c.text
    # The trailing run is always flushed; with no styled element at all this
    # yields the ('', None, None) placeholder callers have always received.
    if run_font is None:
        snippets.append((run_text, None, None))
    else:
        snippets.append((run_text,) + run_font)

    # ---- Pass 2: group snippets into heading/content sections ----
    emphasis_tags = ('Bold', 'Italic')
    all_style_tags = ('Bold', 'Italic', 'BoldItalic', 'BoldOblique')

    def open_section(snippet):
        # A new section starts with an empty body; the snippet is its heading.
        metadata = {
            'heading': snippet[0],
            'content_fontM': '',
            'heading_fontM': snippet[1],
            'content_fontS': 0,
            'heading_fontS': snippet[2],
        }
        metadata.update(data.metadata)
        return Document(page_content='', metadata=metadata)

    semantic_snippets = []
    for s in snippets:
        text, font, size = s[0], s[1], s[2]

        # First snippet, or font larger than the current heading => new heading.
        if not semantic_snippets or size > semantic_snippets[-1].metadata['heading_fontS']:
            semantic_snippets.append(open_section(s))
            continue

        current = semantic_snippets[-1]
        meta = current.metadata

        # An emphasised font that differs from the body font, after the
        # section already has body text, starts a new section.
        # ('BoldItalic' / 'BoldOblique' contain 'Bold', so two tags suffice.)
        if (font != meta['content_fontM']
                and current.page_content != ''
                and any(tag in font for tag in emphasis_tags)):
            semantic_snippets.append(open_section(s))
            continue

        # NOTE(review): the snippet is absorbed unless every clause below
        # holds, which requires the font name to contain all four style tags
        # at once, so the final section-opening step is almost unreachable.
        # This is exactly equivalent to the earlier `not A or not B or ...`
        # chain; the intent may have been `not (A or B or ...)` - confirm
        # before changing behaviour.
        starts_subsection = (
            meta['content_fontS']
            and meta['content_fontM']
            and all(tag in font for tag in all_style_tags)
            and size > meta['content_fontS']
        )
        if not starts_subsection:
            current.page_content += text
            if not meta['content_fontM']:
                meta['content_fontM'] = font
            meta['content_fontS'] = max(size, meta['content_fontS'])
            continue

        # Larger than the section's content font (but not larger than its
        # heading): open a sub-heading section.
        semantic_snippets.append(open_section(s))
    return semantic_snippets

def run(file_paths, file_names, out_dir='D:/school/ALL_PDFs/snips2/'):
    """Build the sections for each PDF and pickle them, one file per PDF.

    Parameters
    ----------
    file_paths : sequence of str
        PDF paths passed to ``snipmaker``.
    file_names : sequence of str
        Output base names, index-aligned with ``file_paths``.
    out_dir : str, optional
        Prefix prepended verbatim to ``<name>.pkl`` (so it must end with a
        path separator). Defaults to the original hard-coded folder.

    Each PDF is parsed exactly once; the result is written with
    ``save_opject`` to ``out_dir + name + '.pkl'``.
    """
    for i, file_path in enumerate(file_paths):
        save_path = out_dir + file_names[i] + '.pkl'
        gc.collect()
        sections = snipmaker(file_path)
        save_opject(sections, save_path)
        # Drop the reference before collecting so the parsed PDF can be
        # reclaimed ahead of the next iteration.
        del sections
        gc.collect()
    

In [None]:
import os 

# Placeholders; both names are reassigned from get_file_names() further down.
file_names1, file_paths = [], []
def get_file_names(directory):
    """Recursively list the files below one or more directories.

    Parameters
    ----------
    directory : str, os.PathLike, or iterable of those
        Root folder(s) to walk with ``os.walk``. A single path is accepted
        as well as a list, so a bare string is not iterated character by
        character.

    Returns
    -------
    tuple of (list of str, list of str)
        ``(file_names, file_paths)``, index-aligned. Each name is the part of
        the file name before its first ``'.'``; each path is
        ``root + '/' + file``. Every file is included regardless of extension.
    """
    if isinstance(directory, (str, os.PathLike)):
        directory = [directory]
    file_namesT = []
    file_paths = []
    for top in directory:
        for root, _subdirs, files in os.walk(top):
            for file in files:
                file_paths.append(root + '/' + file)
                file_namesT.append(file.split('.')[0])
    return file_namesT, file_paths
# Walk both source folders and keep the aligned name / path lists.
directory_path = [
    'D:/school/ALL_PDFs/well_structured',
    'D:/school/ALL_PDFs/print',
]
file_names1, file_paths = get_file_names(directory_path)
print(len(file_names1), len(file_paths), sep='\n')
# Spot check of one entry; raises IndexError when fewer than 49 files exist.
print(file_paths[48])

In [None]:
# Placeholders; both names are reassigned from make_urls_and_file_names() below.
urls_pdf, file_names = [], []
def make_urls_and_file_names(files):
    """Build one URL per file name and return it with a copy of the names.

    Each URL is ``"https://www.retsinformation.dk/eli/accn/"`` followed by
    the file name. The number of processed names is printed before returning
    ``(urls, names)``.
    """
    names = []
    urls = []
    for name in files:
        names.append(name)
        urls.append("https://www.retsinformation.dk/eli/accn/" + name)
    print(len(names))
    return urls, names

# Derive the URL list and a copy of the names, then confirm both lengths.
urls_pdf, file_names = make_urls_and_file_names(file_names1)

print(len(urls_pdf), len(file_names), sep='\n')

In [None]:
def chunks(lst, n):
    """Yield consecutive slices of ``lst`` holding at most ``n`` items each."""
    starts = range(0, len(lst), n)
    yield from (lst[start:start + n] for start in starts)

In [None]:
# Placeholder; immediately replaced by the chunked list below.
chunksed_filenamses = []

# Split names, URLs and paths into chunks of size 1 (one PDF per chunk), so
# the three lists stay index-aligned chunk by chunk.
chunksed_filenamses = list(chunks(file_names, 1))  
chunksed_urls_pdf = list(chunks(urls_pdf, 1))
chunksed_file_paths = list(chunks(file_paths, 1))  

In [None]:
# Show the size and contents of chunk `part` in each of the three chunk lists.
part = 0

for chunk_list in (chunksed_urls_pdf, chunksed_filenamses, chunksed_file_paths):
    print(len(chunk_list[part]))
    print(chunk_list[part])

In [None]:
def snipMandSave(file_pathsT2, fT2):
    """Forward the given paths and output names to ``run``; returns ``None``."""
    run(file_pathsT2, fT2)
    

In [None]:

# Process the chunks in order; the loop length comes from the URL chunk list,
# while the paths and names of the same index are what gets processed.
for chunk_idx in range(len(chunksed_urls_pdf)):
    snipMandSave(chunksed_file_paths[chunk_idx], chunksed_filenamses[chunk_idx])
     