# Import Packages & Functions

In [None]:
# read csv
import pandas as pd
import os
# PdfMiner
import glob
import numpy as np
from io import StringIO
from io import BytesIO
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
# BibTex
import bibtexparser
# Regex
import re
import nltk
from nltk.tokenize import RegexpTokenizer
# Copy Files
from shutil import copyfile

In [None]:
def removePassage(my_str):
    """Normalise whitespace-like artefacts in a string.

    Three substitutions are applied in order, each replacing its match
    with one space:

    1. a literal backslash followed by the letters "ud",
    2. a newline character,
    3. a pair of consecutive spaces.

    The last step is a single pass, so a run of more than two spaces is
    shortened but not necessarily reduced to one space.

    Returns the cleaned string.
    """
    substitutions = ("\\\\ud", "\n", "  ")
    for regex in substitutions:
        my_str = re.sub(regex, " ", my_str)
    return my_str

In [None]:
def range_subset(range1, range2):
    """Return True when every value of ``range1`` also occurs in ``range2``.

    An empty ``range1`` is a subset of anything; a non-empty ``range1`` is
    never a subset of an empty ``range2``.  When ``range1`` holds more than
    one value its step must be an integer multiple of the step of
    ``range2``; after that it is enough to test the two end points.
    """
    if len(range1) == 0:
        return True
    if len(range2) == 0:
        return False

    has_several_values = len(range1) > 1
    if has_several_values and range1.step % range2.step != 0:
        return False

    first_value = range1.start
    last_value = range1[-1]
    return first_value in range2 and last_value in range2

In [None]:
### Funktion zum parsen von PDF zu String Format [Von selbst erstellten PDFs!]

def extractor(path):
    """Run pdfminer over the PDF at ``path`` and collect its text.

    Every page of the document is fed through a ``TextConverter`` that
    writes into an in-memory buffer.

    Returns the ``StringIO`` buffer itself (not a string); callers obtain
    the text with ``.getvalue()``.
    """
    text_buffer = StringIO()
    resource_manager = PDFResourceManager()
    converter = TextConverter(resource_manager, text_buffer, laparams=LAParams())
    page_interpreter = PDFPageInterpreter(resource_manager, converter)

    with open(path, 'rb') as pdf_file:
        document = PDFDocument(PDFParser(pdf_file))
        for pdf_page in PDFPage.create_pages(document):
            page_interpreter.process_page(pdf_page)

    return text_buffer

In [None]:
# def nltk_token_text(string , preserve_line):
#     tokens = nltk.word_tokenize(string.lower(),language = 'russian',preserve_line=preserve_line)
#     text = str(' '.join(tokens))
#     return(text)

In [None]:
# Token shapes whose punctuation has to be split off after NLTK tokenisation.
# Raw strings keep the regex backslashes from being read as (invalid) string
# escapes; compiling once at module level avoids doing it on every call.
_NUMBER_THEN_DOT = re.compile(r"[0-9]+\.")                 # starts with digits + "."
_COLON_THEN_NUMBER = re.compile(r":[0-9]+")                # starts with ":" + digits
_SINGLE_CHAR_THEN_DOT = re.compile(r"([A-Z]\.)|([\wа-я]\.)")  # starts with one char + "."
_CONTAINS_DOT = re.compile(r".*\.")                        # has a "." somewhere


def nltk_token_text(string , preserve_line):
    """Tokenise ``string`` and return the tokens joined by single spaces.

    The input is lower-cased and tokenised with ``nltk.word_tokenize``
    (language ``'russian'``).  Tokens are then post-processed:

    * a token starting with ":" followed by digits is emitted as ":",
      the token without any ":" / "." characters, and a trailing ".";
    * a token (other than a lone ".") that starts with digits followed by
      ".", or that contains a "." without starting with a single character
      followed by ".", is emitted without its dots, followed by ".";
    * every other token is emitted unchanged.

    Parameters
    ----------
    string : str
        Text to tokenise.
    preserve_line : bool
        Forwarded to ``nltk.word_tokenize``.

    Returns
    -------
    str
        The processed tokens joined with a single space.
    """
    tokens = nltk.word_tokenize(string.lower(), language='russian', preserve_line=preserve_line)
    nltk_tokens = []

    for t in tokens:
        if _COLON_THEN_NUMBER.match(t) is not None:
            nltk_tokens.append(":")
            # Removing "." is a no-op when the token has none, so a single
            # expression covers both of the original branches.
            nltk_tokens.append(t.replace(":", "").replace(".", ""))
            nltk_tokens.append(".")
            continue

        starts_with_number_dot = _NUMBER_THEN_DOT.match(t) is not None
        dotted_but_not_initial = (
            _SINGLE_CHAR_THEN_DOT.match(t) is None
            and _CONTAINS_DOT.match(t) is not None
        )
        if t != "." and (starts_with_number_dot or dotted_but_not_initial):
            nltk_tokens.append(t.replace(".", ""))
            nltk_tokens.append(".")
        else:
            nltk_tokens.append(t)

    return ' '.join(nltk_tokens)

In [None]:
import numpy as np
def levenshtein_ratio_and_distance(s, t, ratio_calc = False):
    """Compute the Levenshtein distance, or similarity ratio, of two strings.

    A matrix ``distance`` is filled so that ``distance[i, j]`` holds the
    edit distance between the first ``i`` characters of ``s`` and the first
    ``j`` characters of ``t``.

    Parameters
    ----------
    s, t : str
        The strings to compare.  Either may be empty.
    ratio_calc : bool, default False
        When true, a substitution costs 2 (instead of 1) and the function
        returns the ratio ``(len(s) + len(t) - distance) / (len(s) + len(t))``.
        The cost of 2 is used to align the ratio with the Python
        Levenshtein package.

    Returns
    -------
    float or str
        With ``ratio_calc`` the similarity ratio (1.0 for two empty
        strings); otherwise the message
        ``"The strings are <n> edits away"``.
    """
    rows = len(s) + 1
    cols = len(t) + 1
    distance = np.zeros((rows, cols), dtype = int)

    # Border cells: turning a prefix into the empty string costs its length.
    # Filled independently so they are correct even when one string is empty.
    distance[:, 0] = np.arange(rows)
    distance[0, :] = np.arange(cols)

    substitution_cost = 2 if ratio_calc else 1

    for row in range(1, rows):
        for col in range(1, cols):
            cost = 0 if s[row - 1] == t[col - 1] else substitution_cost
            distance[row, col] = min(distance[row - 1, col] + 1,         # deletion
                                     distance[row, col - 1] + 1,         # insertion
                                     distance[row - 1, col - 1] + cost)  # substitution

    # Read the result from the last cell rather than from the loop variables,
    # which are undefined when either string is empty.
    edits = distance[-1, -1]

    if ratio_calc:
        total_length = len(s) + len(t)
        if total_length == 0:
            # Two empty strings are identical; avoid dividing by zero.
            return 1.0
        return (total_length - edits) / total_length

    # Minimum number of edits needed to convert s into t.
    return "The strings are {} edits away".format(edits)

# Process the PDFs

## Match Bib entries with PDF Text

### GOST2006 style
[1] Author. Title [Text] / Author i Author // Journal. -- year. -- No.volume(number). -- C. pages.

In dieser Reihenfolge gehe ich die Labels durch und prüfe, ob es zu Überschneidungen kommt. Wenn ja, suche ich den nächsten Match.

In [None]:
# path_input = "C:\\Masterarbeit\\venvPDF\\pdf_bib\\gost2006"
# path_output = "C:\\Masterarbeit\\venvPDF\\labelled_text\\gost2006"

In [None]:
# Directory holding the BibTeX files and their rendered PDFs.
# Raw string: the backslashes are path separators, not escape sequences.
path_input = r".\Sequence-Labeling-for-Reference-Parsing-of-Cyrillic-Script-Scholarly-Data\Synthetic_data\pdf_bibtex_data\clean_gost2006"

In [None]:
from collections import Counter
from os import listdir
from os.path import isfile, join

# Strip ".pdf" from every regular file name in path_input, so that a file
# and its "<name>.pdf" companion collapse onto the same name.
onlyfiles = []
for f in listdir(path_input):
    if isfile(join(path_input, f)):
        onlyfiles.append(f.replace(".pdf", ""))

counterfiles = Counter(onlyfiles)

# Keep only the names that occurred more than once.
pdf_list = [name for name, occurrences in counterfiles.items() if occurrences > 1]
len(pdf_list)

In [None]:
# Test readability of papers
# Pass 1: try to extract the text of every PDF; remember the ones that fail.
not_readable_pdf = []
for bib in pdf_list:
    try:
        pdf = extractor(f"{path_input}\\{bib}.pdf")
        text = pdf.getvalue()
    except Exception as e:
        print(e)
        not_readable_pdf.append(bib)

# Pass 2: try to read, parse and tokenise every BibTeX file; remember the
# ones that fail.  As in the original, each field of the first entry is
# stored as a module-level variable named after the field.
not_readable_bib = []
for bib in pdf_list:
    try:
        with open(f"{path_input}\\{bib}", encoding='utf-8') as bibtex_file:
            bibtex_str = bibtex_file.read()

        bib_database = bibtexparser.loads(bibtex_str)

        for key in bib_database.entries[0]:
            if key == 'author':
                # Author strings are only cleaned, not tokenised.
                globals()[key] = removePassage(bib_database.entries[0][key])
            else:
                globals()[key] = nltk_token_text(removePassage(bib_database.entries[0][key]), False)
    except Exception as e:
        print(e)
        not_readable_bib.append(bib)

In [None]:
# Remove non-readable PDFs & Bibtex
# Drop every entry whose PDF or BibTeX file failed the readability checks.
# The filtered list is built first and then written back: calling
# pdf_list.remove() while looping over pdf_list makes the iterator skip the
# element after each removal, so adjacent unreadable entries would survive.
unreadable = set(not_readable_pdf) | set(not_readable_bib)
readable = [p for p in pdf_list if p not in unreadable]
i = len(pdf_list) - len(readable)  # number of removed entries
pdf_list[:] = readable

print(f"{i} PDF´s Removed! \n Current size of dataset {len(pdf_list)}")

## Copy clean files to new directory

In [None]:
# Dieser Schritt kann ausgelassen werden, da ich schon "saubere" Daten liefere

In [None]:
# Copy "clean" files to new directory    copyfile(path_input+"\\"+p, "C:\\Masterarbeit\\venvPDF\\clean_plain_data\\"+p)
# Destination directory for the files that passed the readability checks.
path = "C:\\Masterarbeit\\venvPDF\\pdf_bib\\clean_gost2006\\"
for p in pdf_list:
    # Copy the BibTeX file first, then its "<name>.pdf" companion.
    for suffix in ("", '.pdf'):
        copyfile(path_input + "\\" + p + suffix, path + p + suffix)

# Create labelled text

In [None]:
# path_input = "C:\\Masterarbeit\\venvPDF\\pdf_bib\\clean_gost2006"
# path_output = "C:\\Masterarbeit\\venvPDF\\labelled_text\\gost2006_fine_grained"

In [None]:
# Input directory (BibTeX files plus rendered PDFs) and output directory for
# the labelled text.  Raw strings: the backslashes are path separators, not
# escape sequences.
path_input = r".\Sequence-Labeling-for-Reference-Parsing-of-Cyrillic-Script-Scholarly-Data\Synthetic_data\pdf_bibtex_data\clean_gost2006"
path_output = r".\Sequence-Labeling-for-Reference-Parsing-of-Cyrillic-Script-Scholarly-Data\Synthetic_data\Labeled_text_data\gost2006_fine_grained_clean"

In [None]:
from collections import Counter
from os import listdir
from os.path import isfile, join

# Regular files in path_input, with ".pdf" removed from their names so that
# a file and its "<name>.pdf" companion share one name.
onlyfiles = []
for f in listdir(path_input):
    if isfile(join(path_input, f)):
        onlyfiles.append(f.replace(".pdf", ""))

counterfiles = Counter(onlyfiles)

# Only names seen more than once are kept for processing.
pdf_list = [name for name, occurrences in counterfiles.items() if occurrences > 1]
len(pdf_list)

In [None]:
from datetime import datetime

# Main labelling loop.  For every name in pdf_list it
#   1. extracts and cleans the text of "<name>.pdf",
#   2. tokenises that text,
#   3. reads the BibTeX file "<name>" and stores each field of its first entry
#      as a module-level variable,
#   4. locates the field values inside the tokenised PDF text (exact search
#      first, Levenshtein-ratio window search for the fields not found),
#   5. writes the tokens, wrapped in <label>...</label> where a label was
#      assigned, to an ".xml" file in path_output.
start_time = datetime.now()
iter_count = 0
for bib in pdf_list:
    iter_count += 1
    pdf = extractor(path_input+"\\"+bib+".pdf")

    ### Read PDF Text
    pdf_text = pdf.getvalue().replace("(cid:22)","")#.encode('latin').decode('windows-1251')
    # Clean-up applied below:
    # 1.) "-" followed by a line break is removed, so a word that was split
    #     across two lines is joined again
    # 2.) "\x15" is replaced by "-"
    # 3.) "\x10" / "\x11" are replaced by the typographic quotes “ / ”
    # 4.) the straight apostrophe is replaced by ’, and the line break after
    #     an en dash "–" is removed
#     pdf_text2 = pdf_text.replace("-\n","").replace("\x15","-").replace("\x10","“").replace("\x11","”").replace("'","’")
    pdf_text2 = pdf_text.replace("-\n","").replace("\x15","-").replace("\x10","“").replace("\x11","”").replace("'","’").replace("–\n","–")

    pdf_text2 = removePassage(pdf_text2)
    # Remove the bibliography heading together with the "[1]" entry marker.
    pdf_text2 = pdf_text2.replace("Список литературы [1]","")
    
    ### Transform & Clean with nltk
    # Same token post-processing as in nltk_token_text, but without
    # lower-casing and without the language argument.
    nltk_tokens = []
    pattern = re.compile("[0-9]+\.")
    pattern2= re.compile(":[0-9]+")
    pattern3= re.compile("([A-Z]\.)|([\wа-я]\.)")
    pattern4 = re.compile(".*\.")

    for t in nltk.word_tokenize(pdf_text2,preserve_line=True):
        # Token starting with ":" + digits -> ":", the bare token, "."
        if pattern2.match(t)!=None:
            nltk_tokens.append(":")
            if "." in t:
                nltk_tokens.append(t.replace(":","").replace(".",""))
            else:
                nltk_tokens.append(t.replace(":",""))
            nltk_tokens.append(".")
        # Token with a dot that is not "<single char>." -> token without dots, "."
        elif ((pattern.match(t)!= None) or (pattern3.match(t)== None and pattern4.match(t)!=None)) and t !=".":
            nltk_tokens.append(t.replace(".",""))
            nltk_tokens.append(".")

        else:
            nltk_tokens.append(t)

    # All character offsets computed below refer to this space-joined string.
    nltk_text = str(' '.join(nltk_tokens))

    ### Get BibTeX data
    with open(path_input + "\\" +bib , encoding = 'utf-8') as bibtex_file:
        bibtex_str = bibtex_file.read()

    bib_database = bibtexparser.loads(bibtex_str)


    # Every field of the first entry becomes a module-level variable named
    # after the field (e.g. `author`, `number`), which later code reads.
    # NOTE(review): any key is written to globals(); a key such as "type"
    # (read further down) would shadow the builtin of the same name, and
    # variables of fields missing from this entry keep the value of an
    # earlier entry.
    for key in bib_database.entries[0].keys():
        if key!='author':
            if key == "pages":
                # Page ranges: " -- " becomes an en dash without spaces.
                globals()[key] = nltk_token_text(removePassage(bib_database.entries[0][key]),False).replace(" -- ","–")
            elif (key == "address") and (re.match(".\.",bib_database.entries[0][key])):
                # Address starting with "<char>." is cleaned but not tokenised.
                globals()[key] = removePassage(bib_database.entries[0][key])
            else:
                # Other fields: tokenise, then strip the listed LaTeX markup remnants.
                globals()[key] = nltk_token_text(removePassage(bib_database.entries[0][key]),False).replace("\\textit { ","").replace("\\textsubscript { ","").replace("\\textsuperscript { ","").replace(""" } ""","")
        else:
            globals()[key] = removePassage(bib_database.entries[0][key])

    ### Split Author Entries
    # Splitting at " and" and re-joining with one space leaves a double space
    # between authors, which is then used as the separator.  Each author is
    # split further at ", " into its parts.
    # NOTE(review): `author` is the module-level variable set by the loop
    # above; it is only fresh if this entry has an "author" field.
    author2 = " ".join(author.split(" and")).split("  ")
    list_authors = []
    for a in author2:
        a_list = a.split(", ")
        list_authors.append(a_list)

    ##########################################################
    ### Match Bib Entries with PDF Text
    ##########################################################
    # Original note: WITHOUT JOURNAL & AUTHOR!
    # NOTE(review): "journal" is in the first list below, so that note looks stale.
    if bib_database.entries[0]["type"] == "Journal Article":
        LABELS_TXT = ["year","title","journal","volume","number","pages","pagetotal"] # Without "author", because that label consists of a list!
    else:
        LABELS_TXT = ["year","title","booktitle","volume","number","pages","address","publisher","pagetotal"]
        
    # Keep only the labels present in this entry; LABELS_VAR holds the matching
    # field values in the same order.
    LABELS_VAR = [] # Without author, because that label consists of a list!
    LABELS_TXT2 = []
    for l in LABELS_TXT:
        if l in bib_database.entries[0].keys():
            LABELS_TXT2.append(l)
            LABELS_VAR.append(globals()[l])
    LABELS_TXT = LABELS_TXT2
    # Open question: how to handle values that occur several times??? e.g. V.V. for authors
    # One row per token with its character span [start, end) inside nltk_text;
    # the +1 steps account for the joining space.
    # NOTE(review): DataFrame.append is used here and below — it was removed
    # in pandas 2.0, so this presumably requires an older pandas; confirm.
    token_spans = pd.DataFrame(columns=['token','start',"end","label"])
    start = 0
    end = 0
    for token in nltk_tokens:
    #     print(token)
        length = len(token)
        end += length
        d = {'token':[token], 'start':[start],'end':[end], 'label':None}
        token_spans = token_spans.append(pd.DataFrame(d),ignore_index = True)
        start+=length +1
        end+=1

    label_spans = pd.DataFrame(columns=['text','start',"end","label"])
    # Original note: "22 in the find function, because the reference only
    # starts at character 22" — NOTE(review): no such offset is used below.
    # Search for the author entries in the text and collect their spans
    for full_author in list_authors:
        for text in full_author:
            try:
                # The author part is used as a regular expression here.
                start_found = [m.start() for m in re.finditer(text.lower(), nltk_text.lower())]
                for s in start_found:
                    span_label =[s ,s+len(text)]
                    i=0
                    # While the candidate lies inside an already recorded span,
                    # move on to the next occurrence (at most 10 attempts).
                    while any([range_subset(range(span_label[0],span_label[1]),range(start,end)) for start,end in zip(label_spans.start,label_spans.end)])==True:
                        span_label =[nltk_text.lower().find(text.lower(),span_label[0]+1) ,nltk_text.lower().find(text.lower(),span_label[0]+1)+len(text)]
                        i+=1
                        if i==10:
                            break

                    d = {'text':[text], 'start':[span_label[0]], 'end':[span_label[1]], 'label':['author']}
                    label_spans = label_spans.append(pd.DataFrame(d),ignore_index=True)
            except:
                # NOTE(review): bare except silently hides every error raised above.
                pass

    # Search for the label entries in the text and collect their spans [except author]
    missing_labels = []
    for text,label in zip(LABELS_VAR,LABELS_TXT):

        # Surrounding spaces restrict the search to whole-token matches; the
        # +1 below skips the leading space again.
        text2 = " "+text.lower()+" "
        try:
            span_label = [nltk_text.lower().find(text2)+1 ,nltk_text.lower().find(text2)+len(text)+1]
            i=0
            # Same overlap rule as for authors: skip occurrences that lie
            # inside an already recorded span (at most 10 attempts).
            while any([range_subset(range(span_label[0],span_label[1]),range(start,end)) for start,end in zip(label_spans.start,label_spans.end)])==True:
                span_label =[nltk_text.lower().find(text2,span_label[0])+1 ,nltk_text.lower().find(text2,span_label[0])+len(text)+1]
                i+=1
                if i==10:
                    break
            if span_label[0] <1: # If e.g. the title does not match the title in the text (one character misinterpreted), a strange span such as [-1,107] is produced. So we rather do not label the title here
                missing_labels.append(label)
                pass
            else:
                d = {'text':[text], 'start':[span_label[0]], 'end':[span_label[1]], 'label':[label]}
                label_spans = label_spans.append(pd.DataFrame(d),ignore_index=True)
        except:
            # NOTE(review): bare except silently hides every error raised above.
            pass

    for token_start,token_end in zip(token_spans.start, token_spans.end): # For every entry in token_spans ...
        for label_start, label_end, label in zip(label_spans.start,label_spans.end,label_spans.label): # ... look for a match in label_spans
            if range_subset(range(token_start,token_end),range(label_start,label_end)):
                # NOTE(review): chained assignment through the column Series.
                token_spans.label[(token_spans.start == token_start)&(token_spans.end == token_end)] = label
    
    #Levenshtein distance
    # Fallback for labels without an exact match: slide a window of as many
    # tokens as the field value has words over the still unlabelled tokens and
    # take the window with the highest Levenshtein ratio.
    for text,label in zip(LABELS_VAR,LABELS_TXT):
        if label in missing_labels:
            ### Window function
            none_tokens = token_spans.token[token_spans.label.isin([None])]
            window_size = len(text.split())
            indexes = list(none_tokens.index)
            all_dist = []
            all_windows = []
            for i in none_tokens.rolling(window=window_size):
    #             print(' '.join(i))
                distance = levenshtein_ratio_and_distance(' '.join(i).lower() , text,True)
                all_dist.append(distance)
                all_windows.append(i)
            max_match = all_dist.index(max(all_dist))
            indizes = list(all_windows[max_match].index)
            # If the best window's token indices are not consecutive, only a
            # tail of the window is kept.
            if indizes != list(range(indizes[0], indizes[-1]+1)):
                i_ = 0
                for idz in list(range(indizes[0], indizes[-1]+1)):
                    if idz not in indizes:
                        indizes2 = indizes[i_:]
                    # NOTE(review): "=+" assigns +1, it does not increment —
                    # presumably "+=" was intended; confirm.
                    i_ =+1
            else:
                indizes2 = indizes
            # Do not label a leading "." token.
            if token_spans.token[indizes2[0]] == ".":
                indizes2 = indizes2[1:]
            for i in indizes2:
                token_spans['label'][i:i+1] = str(label)
                
                
    # If the token two positions after the "volume" token equals the BibTeX
    # `number` value, label it as "number".
    # NOTE(review): `in` on a Series tests its index, not its values, so this
    # condition does not check whether a "number" label exists; confirm intent.
    if "number" not in token_spans.label:
        try:
            if token_spans.token[list(token_spans[token_spans.label.isin(['volume'])].index + 2)].iloc[0] == number:
                token_spans.label[list(token_spans[token_spans.label.isin(['volume'])].index + 2)]="number"
        except:
            pass
### Create Labelled Text
    LABELS_TXT.append("author")

    # Wrap every labelled token in <label>...</label>; unlabelled tokens are
    # written as they are.
    output_text_token = []
    for token, label in zip(token_spans.token,token_spans.label):
        if label != None:
#             if label == "booktitle":
#                 output_text_token.append("<journal>" + token + "</journal>")
#             elif label == "pagetotal":
#                 output_text_token.append("<pages>" + token + "</pages>")
#             else:
            output_text_token.append(f"<{label}>" + token + f"</{label}>")
        else:
            output_text_token.append(token)

    output_text = ' '.join(output_text_token)  
    # Output file: same name as the BibTeX file, with ".bib" replaced by ".xml".
    myfile = open(path_output + "\\"+bib.replace(".bib",".xml"), "w",encoding="utf-8")
    myfile.write(output_text)
    myfile.close()
    # Progress indicator every 500 processed files.
    if iter_count % 500 == 0:
        print(iter_count)
        
end_time = datetime.now()
# Elapsed wall-clock time of the labelling run.  print() accepts no
# "difference" keyword, so the value is computed first and printed after;
# end - start gives a positive duration.
difference = end_time - start_time
print(difference)