# Import Packages & Define functions

In [1]:
# read csv
import pandas as pd
import os
# PdfMiner
import glob
import numpy as np
from io import StringIO
from io import BytesIO
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
# Token Vectorization
from langdetect import detect 
import fasttext.util
import fasttext
########################################
from random import random
from numpy import array
from numpy import cumsum
########################################
import regex as re  
import string
import re
########################################
from datetime import datetime
import collections
########################################
# Keras imports for ML Model
import tensorflow as tf
from keras.models import Sequential
from keras.models import Model
from keras.layers import Dense, Conv2D, MaxPooling2D, LSTM, Bidirectional
from keras.layers import Dropout, Flatten, GlobalAveragePooling2D, Embedding
from keras.utils import np_utils
from keras import optimizers
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score
from sklearn.metrics import jaccard_score
from sklearn.metrics import f1_score

from dataclasses import dataclass
from bs4 import BeautifulSoup

Using TensorFlow backend.


In [2]:
def read_tei(tei_file):
    with open(tei_file, 'r',encoding="utf-8") as tei:
        soup = BeautifulSoup(tei, 'lxml')
        return soup
    raise RuntimeError('Cannot generate a soup from the input')

In [5]:
def elem_to_text(elem, default=''):
    if elem:
        return elem.getText()
    else:
        return default

In [4]:


@dataclass
class Person:
    firstname: str
    middlename: str
    surname: str

turing_author = Person(firstname='Alan', middlename='M', surname='Turing')

f"{turing_author.firstname} {turing_author.surname} authored many influential publications in computer science."

'Alan Turing authored many influential publications in computer science.'

In [6]:
class TEIFile(object):
    def __init__(self, filename):
        self.filename = filename
        self.soup = read_tei(filename)
        self._text = None
        self._title = ''
        self._abstract = ''

    @property
    def doi(self):
        idno_elem = self.soup.find('idno', type='DOI')
        if not idno_elem:
            return ''
        else:
            return idno_elem.getText()

    @property
    def title(self):
        if not self._title:
            self._title = self.soup.title.getText()
        return self._title

    @property
    def abstract(self):
        if not self._abstract:
            abstract = self.soup.abstract.getText(separator=' ', strip=True)
            self._abstract = abstract
        return self._abstract

    @property
    def authors(self):
        authors_in_header = self.soup.analytic.find_all('author')

        result = []
        for author in authors_in_header:
            persname = author.persname
            if not persname:
                continue
            firstname = elem_to_text(persname.find("forename", type="first"))
            middlename = elem_to_text(persname.find("forename", type="middle"))
            surname = elem_to_text(persname.surname)
            person = Person(firstname, middlename, surname)
            result.append(person)
        return result
    
    @property
    def text(self):
        if not self._text:
            divs_text = []
            for div in self.soup.body.find_all("div"):
                # div is neither an appendix nor references, just plain text.
                if not div.get("type"):
                    div_text = div.get_text(separator=' ', strip=True)
                    divs_text.append(div_text)

            plain_text = " ".join(divs_text)
            self._text = plain_text
        return self._text

In [7]:
# Function that get the fullpath of files in a directory
def listdir_fullpath(d):
    return [os.path.join(d, f) for f in os.listdir(d)]

In [8]:
### Funktion zum entfernen von Zeilenumbrüchen
def removePassage(my_str):
    my_str1 = re.sub("\\\\ud", " ", my_str)
    my_str2 = re.sub("\\\\n", " ", my_str1)
    return(my_str2)

### Funktion zum parsen von PDF[nur erste Seite] zu String Format
def extract_page_one(path):
    output_string = StringIO()
    
    with open(path, 'rb') as in_file:
        parser = PDFParser(in_file)
        doc = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        interpreter.process_page(list(PDFPage.create_pages(doc))[0])
        return(output_string)
    


### Funktion zum Vergrößern der Input & Output Vektoren um NEWLINES
def add_newlines(Tokens,Real_Tokens,y_final):
    y_final_REAL = []
    k = 0
    m = 0
    for i in range(len(Tokens)):
        if k == 0:
            j=i
        else:
            if m == 0:
                j = k+1
            else:
                j = m+1
        if Tokens[i] == Real_Tokens[j] : # Wenn Tokens gleich sind, dann übernehme y_final_REAL
            y_final_REAL.append(y_final[i])
            m = j
        else:
            for k in range(j,len(Real_Tokens)): # Sonst gehe die Real_Tokens durch bis match vorhanden

                if Real_Tokens[k] == 'NEWLINE':
                    y_final_REAL.append('Sonstiges')
                
#                 elif Real_Tokens[k] not in Tokens:
#                     y_final_REAL.append('Sonstiges')

                else:
                    y_final_REAL.append(y_final[i])
                    m=k
                    break

    RealTokens_final = Real_Tokens[:len(y_final_REAL)]
    
    index_title = [i for i, e in enumerate(y_final_REAL) if e == 'I-title']
    if index_title==[]:
        return(RealTokens_final,y_final_REAL)
    else:
        end_title = max(index_title)

        ### lable NEWLINES im Titel als "I-title"        
        for i in range(len(RealTokens_final)):
            if RealTokens_final[i]=='NEWLINE':
                if (y_final_REAL[i+1] =='I-title' or end_title>i) and y_final_REAL[i-1] in ('B-title','I-title'):
                    y_final_REAL[i] = 'I-title'
        ## Kann passieren, dass im Titel mehrere NEWLINES aufeinander folgen. Daher end_title>i um festzustellen, ob iwann später noch ein Titel Label kommt        

        return(RealTokens_final,y_final_REAL)

#Create Word Vector representation
def detect_and_vectorize(tokens_sequence): #### input is the tokens list of ONE PAPER e.g. ergebnis_tokens[2]
    
    tokens_vectorized = []
    lang = detect(' '.join(tokens_sequence))
    
    if (lang == 'ru'):
        for i in range(len(tokens_sequence)):
            tokens_vectorized.append(np.float16(ft_ru.get_word_vector(tokens_sequence[i])))
                
    elif (lang == 'bg'):
        for i in range(len(tokens_sequence)):
            tokens_vectorized.append(np.float16(ft_bg.get_word_vector(tokens_sequence[i])))
                
    else:  ## assume language == uk
        for i in range(len(tokens_sequence)):
            tokens_vectorized.append(np.float16(ft_uk.get_word_vector(tokens_sequence[i])))
    
    while len(tokens_vectorized)<1000:
        tokens_vectorized.append(np.zeros(60))
      
    if len(tokens_vectorized)>1000:
        del tokens_vectorized[1000:] 

    return np.array(tokens_vectorized)

### Additional Features
punctuations = '''!()[]{};:'"\<>/?@#$%^&*«»_–~.,-'''

def compute_additional_features(tokens_sequence): #### input is the tokens list of ONE PAPER e.g. ergebnis_tokens[2]
    
    tokens = tokens_sequence
    feature_upper = []
    feature_capitalized = []
    feature_autor_format = []
    feature_punctation = []
    feature_newline = []
    feature_array = []
    
    while len(tokens)<1000:
        tokens.append(str(0))
    if len(tokens)>1000:
        del tokens[1000:] 
    #print(tokens)    
    for i in range(len(tokens)):
        if tokens[i] !='NEWLINE':
            if str(tokens[i]).isupper():
                feature_upper.append(1)

            else:
                feature_upper.append(0)
        else: 
            feature_upper.append(0)

        if tokens[i] !='NEWLINE':
            if str(tokens[i][0]).isupper():
                feature_capitalized.append(1)

            else:
                feature_capitalized.append(0)
        else: 
            feature_capitalized.append(0)

        if tokens[i] !='NEWLINE':
            if re.match('.\.',str(tokens[i])) != None and str(tokens[i]).isupper():
                feature_autor_format.append(1)

            else:
                feature_autor_format.append(0)
        else: 
            feature_autor_format.append(0)

        if tokens[i] !='NEWLINE':
            if any((c in punctuations) for c in str(tokens[i])):
                feature_punctation.append(1)
            else:
                feature_punctation.append(0)
        else: 
            feature_punctation.append(0)
                
        if tokens[i] =='NEWLINE':
            feature_newline.append(1)
        else: 
            feature_newline.append(0)
    df = pd.DataFrame(list(zip(feature_upper, feature_capitalized,feature_autor_format ,feature_punctation,feature_newline)))  
    feature_array = df.to_numpy(copy=True)
    
    return np.array(feature_array)

In [9]:
### Grobid: Entferne [[...]]
### Funktion zum entfernen von 'Arrays' aus Autoren
def removeAutor_grobid(my_str):
    my_str1 = re.sub("\[\['", "", my_str) 
    my_str2 = re.sub("'\]\]", "", my_str1)
    my_str3 = re.sub("'", "", my_str2) 
    return(my_str3)

In [10]:
### Funktion zum entfernen von 'Arrays' aus Autoren
def removeAutor(my_str):
    my_str1 = re.sub("\['", "", my_str)
    my_str2 = re.sub("'\]", "", my_str1)
    my_str3 = re.sub("'", "", my_str2)
    return(my_str3)


# Read TEI Files

In [10]:
### DO NOT RUN! Predictions of GROBID are already saved in 'grobid_16467.csv'
### Transform the tei.xml files into a dataframe


all_files = listdir_fullpath("D:\_final_selection_16478\GROBID_ALL")
# all_files = listdir_fullpath("D:\_final_selection_16478\out2")

Frame = pd.DataFrame({"core_id": [] , "title": [] , "authors": [] })
i = 0
for tei_doc in all_files:
    tei = TEIFile(tei_doc)
    core_id = re.sub(".tei.xml","",re.sub("D:\\\\_final_selection_16478\\\\GROBID_ALL\\\\Core_ID_","",tei_doc))
    authors = []
    for i in range(len(tei.authors)):
        if len(tei.authors[i].firstname)==1:
            forename = tei.authors[i].firstname + ". " + tei.authors[i].middlename + "."
            surname = tei.authors[i].surname
            name = [forename , surname]
        elif len(tei.authors[i].middlename)==1:
            forename = tei.authors[i].firstname +" " +tei.authors[i].middlename + "."
            surname = tei.authors[i].surname
            name = [forename , surname]
        else:
            forename = tei.authors[i].firstname + " " +tei.authors[i].middlename
            surname = tei.authors[i].surname
            name = [forename , surname]
        authors.append(name)
    
    
    Frame = Frame.append(pd.DataFrame(data = {"core_id": core_id , "title":tei.title , "authors":str(authors)},index = [i]), ignore_index=True)
    i =+ 1

# Frame.to_excel("grobid_0_8000.xlsx")
# Frame.to_excel("grobid_8000_16478.xlsx")
Frame.to_excel("grobid_16467.xlsx")
Frame.to_csv('grobid_16467.csv')
Frame

Unnamed: 0,core_id,title,authors
0,11083759,Середовище проведення освітніх вебінарів «WIP ...,"[['Юрій ', 'Богачков']]"
1,11083794,Комплексна підготовка дистанційних матеріалів ...,[]
2,11083797,,[]
3,11083801,,[]
4,11083807,Інструментальні засоби профорієнтації,[]
...,...,...,...
16462,95313001,,[]
16463,95313006,,[]
16464,95313010,,[]
16465,95313012,ТЕХНОЛОГІЯ ПОКВАРТИРНОГО ОБЛІКУ ТЕПЛОВОЇ ЕНЕРГІЇ,"[['Горбачьова А О', '']]"


# Evaluate GROBID prediction

In [11]:
df_meta = pd.read_csv('grobid_16467.csv', sep = ',')#  , encoding= 'utf-16')
df_meta.drop('Unnamed: 0' , axis = 1 , inplace=True)
df_meta

Unnamed: 0,core_id,title,authors
0,11083759,Середовище проведення освітніх вебінарів «WIP ...,"[['Юрій ', 'Богачков']]"
1,11083794,Комплексна підготовка дистанційних матеріалів ...,[]
2,11083797,,[]
3,11083801,,[]
4,11083807,Інструментальні засоби профорієнтації,[]
...,...,...,...
16462,95313001,,[]
16463,95313006,,[]
16464,95313010,,[]
16465,95313012,ТЕХНОЛОГІЯ ПОКВАРТИРНОГО ОБЛІКУ ТЕПЛОВОЇ ЕНЕРГІЇ,"[['Горбачьова А О', '']]"


In [16]:
path_15553 = "D:/_final_selection_16478/PDFs_15553/"


pdf = os.listdir(path_15553)


files_core_id = []
files_paths = []
for elem in pdf:
    core = int(re.sub(".pdf","",re.sub("Core_ID_","",elem)))
    if core in list(df_meta.core_id):
        files_core_id.append(core)
#         files_paths.append("D:/_final_selection_16478/all_pdf/" + elem)
        files_paths.append(path_15553 + elem)
print(len(files_paths))
print(files_paths[0])



15552
D:/_final_selection_16478/PDFs_15553/Core_ID_11083759.pdf


In [13]:
df_meta = df_meta[df_meta.core_id.isin(files_core_id)].reset_index()
df_meta.drop('index' , axis = 1 , inplace=True)
print(df_meta.shape)
print("{} Titel und {} Autoren sind NA".format(sum(df_meta.title.isna()),sum(df_meta.authors == "[]")))

(15552, 3)
12787 Titel und 14910 Autoren sind NA


In [15]:
### extract text 

all_pdf_text = [] 
start_time = datetime.now()
for i in range(len(files_paths)):
    try:
        all_pdf_text.append(extract_page_one(files_paths[i]).getvalue())
        if i % 500 == 0:
            print(str((i/len(files_paths))*100)+'%')
    except:
        all_pdf_text.append("Einlesen nicht moeglich")
        print("FEHLER")
    
end_time = datetime.now()

0.0%
3.2150205761316872%
6.4300411522633745%
9.645061728395062%
12.860082304526749%
16.075102880658438%
19.290123456790123%
22.505144032921812%
25.720164609053498%
28.935185185185187%
32.150205761316876%
35.365226337448554%
38.58024691358025%
41.79526748971193%
45.010288065843625%
48.2253086419753%
51.440329218106996%
54.65534979423868%
57.870370370370374%
61.08539094650206%
64.30041152263375%
67.51543209876543%
70.73045267489711%
73.9454732510288%
77.1604938271605%
80.37551440329219%
83.59053497942386%
86.80555555555556%
90.02057613168725%
93.23559670781893%
96.4506172839506%
99.6656378600823%


In [18]:
### Import REAL Meta Data
path_meta = 'D:/_final_selection_16478/'

df_meta_real = pd.read_csv(path_meta + 'final_items_15553.csv', sep = ',')#  , encoding= 'utf-16')
# df_meta.drop('Unnamed: 0' , axis = 1 , inplace=True)
# df_meta_real

### get all titles from meta data with core_ids of fulltext
titles_real = []
for i in range(len(files_core_id)):
    index = df_meta_real.index[df_meta_real['coreId'] == int(files_core_id[i])].tolist()
    if index == []:
        titles_real.append('Keine Meta Daten gefunden')
    else: 
        index = index[0]
        title_pdf  = df_meta_real.loc[index,'title']
        titles_real.append(title_pdf)
len(titles_real)

### Get autor for the PDF´s
######## PROBLEM: verschiedene Schreibweisen Meta <-> PDF
autors_real = []
for i in range(len(files_core_id)):
    index = df_meta_real.index[df_meta_real['coreId'] == int(files_core_id[i])].tolist()
    index = index[0]
    autor_pdf  = df_meta_real.loc[index,'authors']

    autor_pdf = removeAutor(autor_pdf).split(",")
    for j in range(len(autor_pdf)):
        autor_pdf[j] = ' '.join(autor_pdf[j].split()) ## Entferne überflüssige Whitespaces (auch am Anfang)
        
    autors_real.append(autor_pdf)
len(autors_real)

15552

# Get Label vectors for the GROBID Prediction

In [19]:
### get all titles from meta data with core_ids of fulltext
titles = []
for i in range(len(df_meta.core_id)):
    index = df_meta.index[df_meta['core_id'] == int(df_meta.core_id[i])].tolist()
    if df_meta.title.isna()[i]:
        titles.append('Keine Meta Daten gefunden')
    else: 
        index = index[0]
        title_pdf  = df_meta.loc[index,'title']
        titles.append(title_pdf)

len(titles)

15552

In [20]:
### Get autor for the PDF´s
######## PROBLEM: verschiedene Schreibweisen Meta <-> PDF
autors = []
for i in range(len(df_meta.core_id)):
    index = df_meta.index[df_meta['core_id'] == int(df_meta.core_id[i])].tolist()
    index = index[0]
    autor_pdf  = df_meta.loc[index,'authors']

    autor_pdf = removeAutor_grobid(autor_pdf).split(",")
    for j in range(len(autor_pdf)):
        autor_pdf[j] = ' '.join(autor_pdf[j].split()) ## Entferne überflüssige Whitespaces (auch am Anfang)
        
    autors.append(autor_pdf)
len(autors)

15552

In [21]:
### Funktion, die die Daten labelt 

kein_autor = []
kein_titel = []
error_papers = []
ergebnis_tokens = []
ergebnis_label = []
### Real Meta
ergebnis_label_real = []
###

anzahl_papers = len(df_meta.core_id)

for paper in range(anzahl_papers):

    title = ' '.join(removePassage(titles[paper]).split()).lower() # Remove excces Whitespace & to lowercase
    title = re.sub("\(","\(",title) # () as non-regex string
    title = re.sub("\)","\)",title)
    title = re.sub("\*","\*",title) # * as non-regex string

    title_index = re.search(title, ' '.join(all_pdf_text[paper].split()).lower()) # search for the title
    
    #### Real Meta
    title_real = ' '.join(removePassage(titles_real[paper]).split()).lower() # Remove excces Whitespace & to lowercase
    title_real = re.sub("\(","\(",title_real) # () as non-regex string
    title_real = re.sub("\)","\)",title_real)
    title_real = re.sub("\*","\*",title_real) # * as non-regex string

    title_index_real = re.search(title_real, ' '.join(all_pdf_text[paper].split()).lower()) # search for the title
    ######
    
    print('CoreID:  ' + str(files_core_id[paper]))

    if title_index==None:
        Text_pdf_0 = ' '.join(all_pdf_text[paper].split())
        
        kein_titel.append(df_meta.core_id[paper])
        y_final= np.repeat('Sonstiges',len(Text_pdf_0.split()))
    else:
        
        Text_pdf_0 = ' '.join(all_pdf_text[paper].split())

        ##### TITLE ################################################
        if title_index.start()==0:
            teil_B = ""
        else:
            teil_B = Text_pdf_0[0:title_index.start()-1]
        teil_T = Text_pdf_0[title_index.start():title_index.end()]
        teil_E = Text_pdf_0[title_index.end()+1:len(Text_pdf_0)]

        y_teil1 = np.repeat('Sonstiges',len(teil_B.split()))
        y_teil2 = np.append(['B-title'],np.repeat('I-title',len(teil_T.split())-1))
        y_teil3 = np.repeat('Sonstiges',len(teil_E.split()))

        y_final = np.concatenate((y_teil1, y_teil2 , y_teil3), axis=None)
        
    ### Real Meta    
    if title_index_real!=None:
        Text_pdf_0 = ' '.join(all_pdf_text[paper].split())

        ##### TITLE ################################################
        if title_index_real.start()==0:
            teil_B_real = ""
        else:
            teil_B_real = Text_pdf_0[0:title_index_real.start()-1]
        teil_T_real = Text_pdf_0[title_index_real.start():title_index_real.end()]
        teil_E_real = Text_pdf_0[title_index_real.end()+1:len(Text_pdf_0)]

        y_teil1_r = np.repeat('Sonstiges',len(teil_B_real.split()))
        y_teil2_r = np.append(['B-title'],np.repeat('I-title',len(teil_T_real.split())-1))
        y_teil3_r = np.repeat('Sonstiges',len(teil_E_real.split()))

        y_final_real = np.concatenate((y_teil1_r, y_teil2_r , y_teil3_r), axis=None)
    ###
    
    ##### Get Text
    all_pdf_text1 = re.sub("\\n"," NEWLINE ",all_pdf_text[paper])
    Text_pdf_0_NL = ' '.join(all_pdf_text1.split())

    Tokens = Text_pdf_0.split()
    Labels = y_final
    Real_Tokens = Text_pdf_0_NL.split()

    Tokens = all_pdf_text[paper].split()

    Tokens_final_lower = []
    for i in range(len(Tokens)):
        Tokens_final_lower.append(Tokens[i].lower())
    try:
        if autors[paper]!= ['[]']:

            autors_surname = []
            for i in range(len(autors[paper])):
                if i % 2 == 0:
                    autors_surname.append(autors[paper][i])

            autors_surname_lower = []
            for i in range(len(autors_surname)):
                autors_surname_lower.append(autors_surname[i].lower())

            if re.match('.\.',autors[paper][1]) == None:
                autors_forename = []
                for i in range(len(autors[paper])):
                    if i % 2 == 1:
                        autors_forename.append(autors[paper][i].split())

                autors_forename = list(np.concatenate((autors_forename), axis=None))
                autors_forename_lower = []
                for i in range(len(autors_forename)):
                    autors_forename_lower.append(autors_forename[i].lower())

                autors_surname_lower = list(np.concatenate((autors_forename_lower,autors_surname_lower), axis=None))


            vec_autor = []
            for token in Tokens_final_lower:
                line = any(word in token for word in autors_surname_lower)
                vec_autor.append(line)

            index_autor = [i for i, e in enumerate(vec_autor) if e == True]

            if title_index!=None:
                if len(index_autor)>(len(autors_surname_lower)):
                    diff = len(index_autor) - len(autors_surname_lower)
                    dist = []
                    for j in range(len(index_autor)):
                        dist.append(abs(index_autor[j]-np.where(y_final=="B-title")[0][0]))

                    dict1 = dict(zip(dist , index_autor))

                    dist.sort(reverse = True)

                    for k in range(len(dist[0:diff])):
                        vec_autor[dict1[dist[0:diff][k]]] = False

            for i in range(len(y_final)):
                if vec_autor[i] == True:
                    y_final[i] = 'autor'

            if True not in vec_autor:
                kein_autor.append(files_core_id[paper])

            if re.match('.\.',autors[paper][1]) != None:

                index_autor_true = [i for i, e in enumerate(vec_autor) if e == True]

                for w in range(len(index_autor_true)):
                    index = index_autor_true[w]
                    for t in range(index - 4,index + 4):
                        if re.match('.\.',Tokens_final_lower[t]) != None and Tokens[t].isupper():
                            y_final[t] = 'autor'
        ### Real Meta
        autors_surname_real = []
        for i in range(len(autors_real[paper])):
            if i % 2 == 0:
                autors_surname_real.append(autors_real[paper][i])

        autors_surname_lower_real = []
        for i in range(len(autors_surname_real)):
            autors_surname_lower_real.append(autors_surname_real[i].lower())

        if re.match('.\.',autors_real[paper][1]) == None:
            autors_forename_real = []
            for i in range(len(autors_real[paper])):
                if i % 2 == 1:
                    autors_forename_real.append(autors_real[paper][i].split())

            autors_forename_real = list(np.concatenate((autors_forename_real), axis=None))
            autors_forename_lower_real = []
            for i in range(len(autors_forename_real)):
                autors_forename_lower_real.append(autors_forename_real[i].lower())

            autors_surname_lower_real = list(np.concatenate((autors_forename_lower_real,autors_surname_lower_real), axis=None))


        vec_autor_real = []
        for token in Tokens_final_lower:
            line_real = any(word in token for word in autors_surname_lower_real)
            vec_autor_real.append(line_real)

        index_autor_real = [i for i, e in enumerate(vec_autor_real) if e == True]

        if title_index_real!=None:
            if len(index_autor_real)>(len(autors_surname_lower_real)):
                diff = len(index_autor_real) - len(autors_surname_lower_real)
                dist = []
                for j in range(len(index_autor_real)):
                    dist.append(abs(index_autor_real[j]-np.where(y_final_real=="B-title")[0][0]))

                dict1 = dict(zip(dist , index_autor_real))

                dist.sort(reverse = True)

                for k in range(len(dist[0:diff])):
                    vec_autor_real[dict1[dist[0:diff][k]]] = False

        for i in range(len(y_final_real)):
            if vec_autor_real[i] == True:
                y_final_real[i] = 'autor'

#         if True not in vec_autor_real:
#             kein_autor.append(files_core_id[paper])

        if re.match('.\.',autors_real[paper][1]) != None:

            index_autor_true_real = [i for i, e in enumerate(vec_autor_real) if e == True]

            for w in range(len(index_autor_true_real)):
                index = index_autor_true_real[w]
                for t in range(index - 4,index + 4):
                    if re.match('.\.',Tokens_final_lower[t]) != None and Tokens[t].isupper():
                        y_final_real[t] = 'autor'
        ###

        RealTokens_final = add_newlines(Tokens,Real_Tokens,y_final)[0]
        y_final_REAL = add_newlines(Tokens,Real_Tokens,y_final)[1]
        ### Real Meta
        y_final_REAL2 = add_newlines(Tokens,Real_Tokens,y_final_real)[1]
        ergebnis_label_real.append(y_final_REAL2)
        ###
        ergebnis_label.append(y_final_REAL)
        ergebnis_tokens.append(RealTokens_final)
    except:
        error_papers.append(files_core_id[paper])

CoreID:  11083759
CoreID:  11083794
CoreID:  11083801
CoreID:  11083807
CoreID:  11083814
CoreID:  11083816
CoreID:  11083818
CoreID:  11083820
CoreID:  11083841
CoreID:  11083843
CoreID:  11084183
CoreID:  11084185
CoreID:  11084196
CoreID:  11084226
CoreID:  11084228
CoreID:  11084232
CoreID:  11084286
CoreID:  11084289
CoreID:  11084295
CoreID:  11084334
CoreID:  11084432
CoreID:  11084498
CoreID:  11084525
CoreID:  11084527
CoreID:  11084531
CoreID:  11084535
CoreID:  11084639
CoreID:  11084670
CoreID:  11084672
CoreID:  11084674
CoreID:  11084676
CoreID:  11084682
CoreID:  11084739
CoreID:  11084741
CoreID:  11311954
CoreID:  11311956
CoreID:  11311957
CoreID:  11312082
CoreID:  11312083
CoreID:  11312155
CoreID:  11312159
CoreID:  11312161
CoreID:  11312162
CoreID:  11312164
CoreID:  11312165
CoreID:  11312167
CoreID:  11312168
CoreID:  11312169
CoreID:  11312171
CoreID:  11312173
CoreID:  11312174
CoreID:  11312176
CoreID:  11312178
CoreID:  11312180
CoreID:  11312182
CoreID:  1

CoreID:  11313487
CoreID:  11313488
CoreID:  11313489
CoreID:  11313490
CoreID:  11313491
CoreID:  11313492
CoreID:  11313494
CoreID:  11313495
CoreID:  11313496
CoreID:  11313498
CoreID:  11313499
CoreID:  11313501
CoreID:  11313503
CoreID:  11313504
CoreID:  11313517
CoreID:  11313526
CoreID:  11313533
CoreID:  11313534
CoreID:  11313537
CoreID:  11313540
CoreID:  11313541
CoreID:  11313550
CoreID:  11313559
CoreID:  11313562
CoreID:  11313563
CoreID:  11313582
CoreID:  11313589
CoreID:  11313592
CoreID:  11313595
CoreID:  11313599
CoreID:  11313601
CoreID:  11313602
CoreID:  11313605
CoreID:  11313627
CoreID:  11313631
CoreID:  11313640
CoreID:  11313641
CoreID:  11313658
CoreID:  11313668
CoreID:  11313669
CoreID:  11313671
CoreID:  11313672
CoreID:  11313673
CoreID:  11313674
CoreID:  11313675
CoreID:  11313676
CoreID:  11313677
CoreID:  11313678
CoreID:  11313679
CoreID:  11313680
CoreID:  11313681
CoreID:  11313682
CoreID:  11313683
CoreID:  11313684
CoreID:  11313685
CoreID:  1

CoreID:  11314362
CoreID:  11314363
CoreID:  11314364
CoreID:  11314365
CoreID:  11314366
CoreID:  11314367
CoreID:  11314368
CoreID:  11314369
CoreID:  11314370
CoreID:  11314371
CoreID:  11314372
CoreID:  11314374
CoreID:  11314375
CoreID:  11314377
CoreID:  11314378
CoreID:  11314379
CoreID:  11314380
CoreID:  11314381
CoreID:  11314382
CoreID:  11314383
CoreID:  11314389
CoreID:  11314390
CoreID:  11314391
CoreID:  11314392
CoreID:  11314393
CoreID:  11314394
CoreID:  11314395
CoreID:  11314396
CoreID:  11314397
CoreID:  11314398
CoreID:  11314399
CoreID:  11314400
CoreID:  11314401
CoreID:  11314402
CoreID:  11314403
CoreID:  11314404
CoreID:  11314405
CoreID:  11314406
CoreID:  11314407
CoreID:  11314408
CoreID:  11314409
CoreID:  11314410
CoreID:  11314411
CoreID:  11314412
CoreID:  11314413
CoreID:  11314414
CoreID:  11314415
CoreID:  11314416
CoreID:  11314417
CoreID:  11314418
CoreID:  11314419
CoreID:  11314420
CoreID:  11314421
CoreID:  11314422
CoreID:  11314423
CoreID:  1

CoreID:  11315520
CoreID:  11315521
CoreID:  11315522
CoreID:  11315523
CoreID:  11315524
CoreID:  11315525
CoreID:  11315526
CoreID:  11315527
CoreID:  11315528
CoreID:  11315529
CoreID:  11315530
CoreID:  11315532
CoreID:  11315533
CoreID:  11315535
CoreID:  11315536
CoreID:  11315537
CoreID:  11315538
CoreID:  11315539
CoreID:  11315540
CoreID:  11315541
CoreID:  11315542
CoreID:  11315544
CoreID:  11315545
CoreID:  11315546
CoreID:  11315547
CoreID:  11315548
CoreID:  11315549
CoreID:  11315550
CoreID:  11315551
CoreID:  11315552
CoreID:  11315554
CoreID:  11315555
CoreID:  11315556
CoreID:  11315557
CoreID:  11315558
CoreID:  11315559
CoreID:  11315560
CoreID:  11315561
CoreID:  11315562
CoreID:  11315563
CoreID:  11315564
CoreID:  11315565
CoreID:  11315566
CoreID:  11315567
CoreID:  11315568
CoreID:  11315569
CoreID:  11315570
CoreID:  11315571
CoreID:  11315572
CoreID:  11315576
CoreID:  11315577
CoreID:  11315578
CoreID:  11315579
CoreID:  11315580
CoreID:  11315581
CoreID:  1

CoreID:  11320268
CoreID:  11320291
CoreID:  11320302
CoreID:  11320339
CoreID:  11320350
CoreID:  11320356
CoreID:  11320381
CoreID:  11320382
CoreID:  11320384
CoreID:  11320392
CoreID:  11320399
CoreID:  11320400
CoreID:  11320401
CoreID:  11320403
CoreID:  11320404
CoreID:  11320409
CoreID:  11320411
CoreID:  11320417
CoreID:  11320428
CoreID:  11320456
CoreID:  11320462
CoreID:  11320463
CoreID:  11320464
CoreID:  11320470
CoreID:  11320471
CoreID:  11320473
CoreID:  11320478
CoreID:  11320489
CoreID:  11320490
CoreID:  11320493
CoreID:  11320501
CoreID:  11320515
CoreID:  11320522
CoreID:  11320524
CoreID:  11320525
CoreID:  11320526
CoreID:  11320527
CoreID:  11320528
CoreID:  11320529
CoreID:  11320530
CoreID:  11320534
CoreID:  11320537
CoreID:  11320539
CoreID:  11320541
CoreID:  11320542
CoreID:  11320543
CoreID:  11320556
CoreID:  11320557
CoreID:  11320558
CoreID:  11320560
CoreID:  11320561
CoreID:  11320574
CoreID:  11320576
CoreID:  11320577
CoreID:  11320583
CoreID:  1

CoreID:  11324886
CoreID:  11324887
CoreID:  11324888
CoreID:  11324889
CoreID:  11324892
CoreID:  11324893
CoreID:  11324894
CoreID:  11324895
CoreID:  11324896
CoreID:  11324897
CoreID:  11324898
CoreID:  11324899
CoreID:  11324902
CoreID:  11324966
CoreID:  11324967
CoreID:  11324968
CoreID:  11324969
CoreID:  11324970
CoreID:  11324971
CoreID:  11324972
CoreID:  11324973
CoreID:  11324974
CoreID:  11324975
CoreID:  11324976
CoreID:  11324977
CoreID:  11324978
CoreID:  11324979
CoreID:  11324980
CoreID:  11324981
CoreID:  11324982
CoreID:  11324990
CoreID:  11324991
CoreID:  11324992
CoreID:  11324993
CoreID:  11324994
CoreID:  11324996
CoreID:  11324997
CoreID:  11324998
CoreID:  11325001
CoreID:  11325002
CoreID:  11325004
CoreID:  11325005
CoreID:  11325006
CoreID:  11325007
CoreID:  11325008
CoreID:  11325009
CoreID:  11325010
CoreID:  11325011
CoreID:  11325013
CoreID:  11325016
CoreID:  11325019
CoreID:  11325020
CoreID:  11325021
CoreID:  11325022
CoreID:  11325023
CoreID:  1

CoreID:  11328939
CoreID:  11328946
CoreID:  11328955
CoreID:  11328956
CoreID:  11328963
CoreID:  11328964
CoreID:  11329007
CoreID:  11329018
CoreID:  11329019
CoreID:  11329037
CoreID:  11329038
CoreID:  11329051
CoreID:  11329053
CoreID:  11329054
CoreID:  11329061
CoreID:  11329071
CoreID:  11329079
CoreID:  11329080
CoreID:  11329081
CoreID:  11329082
CoreID:  11329113
CoreID:  11329145
CoreID:  11329146
CoreID:  11329149
CoreID:  11329159
CoreID:  11329195
CoreID:  11329255
CoreID:  11329258
CoreID:  11329259
CoreID:  11329261
CoreID:  11329282
CoreID:  11329283
CoreID:  11329284
CoreID:  11329285
CoreID:  11329286
CoreID:  11329287
CoreID:  11329297
CoreID:  11329298
CoreID:  11329309
CoreID:  11329310
CoreID:  11329333
CoreID:  11329334
CoreID:  11329336
CoreID:  11329338
CoreID:  11329339
CoreID:  11329340
CoreID:  11329361
CoreID:  11329362
CoreID:  11329368
CoreID:  11329371
CoreID:  11329376
CoreID:  11329382
CoreID:  11329383
CoreID:  11329384
CoreID:  11329385
CoreID:  1

CoreID:  11334667
CoreID:  11334668
CoreID:  11334677
CoreID:  11334678
CoreID:  11334695
CoreID:  11334697
CoreID:  11334706
CoreID:  11334707
CoreID:  11334709
CoreID:  11334716
CoreID:  11334717
CoreID:  11334718
CoreID:  11334719
CoreID:  11334720
CoreID:  11334721
CoreID:  11334723
CoreID:  11334724
CoreID:  11334725
CoreID:  11334727
CoreID:  11334728
CoreID:  11334729
CoreID:  11334730
CoreID:  11334735
CoreID:  11334736
CoreID:  11334737
CoreID:  11334738
CoreID:  11334744
CoreID:  11334747
CoreID:  11334749
CoreID:  11334753
CoreID:  11334755
CoreID:  11334756
CoreID:  11334759
CoreID:  11334761
CoreID:  11334762
CoreID:  11334763
CoreID:  11334764
CoreID:  11334765
CoreID:  11334766
CoreID:  11334768
CoreID:  11334770
CoreID:  11334772
CoreID:  11334773
CoreID:  11334782
CoreID:  11334805
CoreID:  11334806
CoreID:  11334840
CoreID:  11334841
CoreID:  11334842
CoreID:  11334843
CoreID:  11334844
CoreID:  11334845
CoreID:  11334846
CoreID:  11334847
CoreID:  11334849
CoreID:  1

CoreID:  11335417
CoreID:  11335418
CoreID:  11335419
CoreID:  11335420
CoreID:  11335421
CoreID:  11335423
CoreID:  11335424
CoreID:  11335425
CoreID:  11335426
CoreID:  11335427
CoreID:  11335428
CoreID:  11335429
CoreID:  11335430
CoreID:  11335431
CoreID:  11335433
CoreID:  11335434
CoreID:  11335436
CoreID:  11335437
CoreID:  11335438
CoreID:  11335439
CoreID:  11335440
CoreID:  11335443
CoreID:  11335444
CoreID:  11335446
CoreID:  11335447
CoreID:  11335448
CoreID:  11335451
CoreID:  11335452
CoreID:  11335454
CoreID:  11335455
CoreID:  11335456
CoreID:  11335457
CoreID:  11335458
CoreID:  11335461
CoreID:  11335462
CoreID:  11335464
CoreID:  11335465
CoreID:  11335466
CoreID:  11335467
CoreID:  11335468
CoreID:  11335469
CoreID:  11335470
CoreID:  11335476
CoreID:  11335477
CoreID:  11335478
CoreID:  11335480
CoreID:  11335481
CoreID:  11335483
CoreID:  11335488
CoreID:  11335490
CoreID:  11335491
CoreID:  11335492
CoreID:  11335493
CoreID:  11335495
CoreID:  11335496
CoreID:  1

CoreID:  11336320
CoreID:  11336321
CoreID:  11336322
CoreID:  11336323
CoreID:  11336324
CoreID:  11336325
CoreID:  11336328
CoreID:  11336329
CoreID:  11336330
CoreID:  11336331
CoreID:  11336332
CoreID:  11336333
CoreID:  11336334
CoreID:  11336335
CoreID:  11336336
CoreID:  11336337
CoreID:  11336338
CoreID:  11336339
CoreID:  11336340
CoreID:  11336342
CoreID:  11336343
CoreID:  11336344
CoreID:  11336345
CoreID:  11336346
CoreID:  11336347
CoreID:  11336348
CoreID:  11336349
CoreID:  11336350
CoreID:  11336351
CoreID:  11336355
CoreID:  11336356
CoreID:  11336357
CoreID:  11336359
CoreID:  11336360
CoreID:  11336361
CoreID:  11336362
CoreID:  11336363
CoreID:  11336364
CoreID:  11336365
CoreID:  11336366
CoreID:  11336367
CoreID:  11336368
CoreID:  11336369
CoreID:  11336370
CoreID:  11336373
CoreID:  11336375
CoreID:  11336378
CoreID:  11336380
CoreID:  11336381
CoreID:  11336382
CoreID:  11336383
CoreID:  11336384
CoreID:  11336385
CoreID:  11336387
CoreID:  11336388
CoreID:  1

CoreID:  11337188
CoreID:  11337189
CoreID:  11337191
CoreID:  11337192
CoreID:  11337193
CoreID:  11337194
CoreID:  11337195
CoreID:  11337196
CoreID:  11337197
CoreID:  11337198
CoreID:  11337199
CoreID:  11337200
CoreID:  11337201
CoreID:  11337202
CoreID:  11337203
CoreID:  11337204
CoreID:  11337205
CoreID:  11337206
CoreID:  11337207
CoreID:  11337208
CoreID:  11337209
CoreID:  11337210
CoreID:  11337211
CoreID:  11337212
CoreID:  11337213
CoreID:  11337214
CoreID:  11337215
CoreID:  11337216
CoreID:  11337217
CoreID:  11337218
CoreID:  11337219
CoreID:  11337220
CoreID:  11337222
CoreID:  11337224
CoreID:  11337226
CoreID:  11337227
CoreID:  11337228
CoreID:  11337229
CoreID:  11337231
CoreID:  11337232
CoreID:  11337233
CoreID:  11337234
CoreID:  11337235
CoreID:  11337236
CoreID:  11337237
CoreID:  11337239
CoreID:  11337241
CoreID:  11337242
CoreID:  11337243
CoreID:  11337244
CoreID:  11337251
CoreID:  11337252
CoreID:  11337253
CoreID:  11337254
CoreID:  11337255
CoreID:  1

CoreID:  12080925
CoreID:  12080926
CoreID:  12080928
CoreID:  12080929
CoreID:  12080930
CoreID:  12080931
CoreID:  12080932
CoreID:  12080933
CoreID:  12080934
CoreID:  12080935
CoreID:  12080936
CoreID:  12080937
CoreID:  12080938
CoreID:  12080939
CoreID:  12080940
CoreID:  12080941
CoreID:  12080942
CoreID:  12080943
CoreID:  12080944
CoreID:  12080945
CoreID:  12080946
CoreID:  12080948
CoreID:  12080949
CoreID:  12080950
CoreID:  12080954
CoreID:  12080956
CoreID:  12080958
CoreID:  12080959
CoreID:  12080960
CoreID:  12080962
CoreID:  12080964
CoreID:  12080965
CoreID:  12080966
CoreID:  12080967
CoreID:  12080968
CoreID:  12080969
CoreID:  12080971
CoreID:  12080972
CoreID:  12080973
CoreID:  12080974
CoreID:  12080977
CoreID:  12080978
CoreID:  12080979
CoreID:  12080980
CoreID:  12080982
CoreID:  12080983
CoreID:  12080984
CoreID:  12080985
CoreID:  12080986
CoreID:  12080987
CoreID:  12080988
CoreID:  12080989
CoreID:  12080991
CoreID:  12080992
CoreID:  12080993
CoreID:  1

CoreID:  12081542
CoreID:  12081543
CoreID:  12081546
CoreID:  12081547
CoreID:  12081548
CoreID:  12081549
CoreID:  12081550
CoreID:  12081552
CoreID:  12081553
CoreID:  12081554
CoreID:  12081555
CoreID:  12081556
CoreID:  12081557
CoreID:  12081558
CoreID:  12081559
CoreID:  12081560
CoreID:  12081561
CoreID:  12081562
CoreID:  12081563
CoreID:  12081564
CoreID:  12081565
CoreID:  12081566
CoreID:  12081568
CoreID:  12081569
CoreID:  12081570
CoreID:  12081571
CoreID:  12081572
CoreID:  12081574
CoreID:  12081575
CoreID:  12081576
CoreID:  12081577
CoreID:  12081578
CoreID:  12081579
CoreID:  12081580
CoreID:  12081581
CoreID:  12081582
CoreID:  12081583
CoreID:  12081584
CoreID:  12081585
CoreID:  12081586
CoreID:  12081587
CoreID:  12081588
CoreID:  12081590
CoreID:  12081591
CoreID:  12081592
CoreID:  12081593
CoreID:  12081594
CoreID:  12081595
CoreID:  12081596
CoreID:  12081597
CoreID:  12081600
CoreID:  12081601
CoreID:  12081602
CoreID:  12081604
CoreID:  12081605
CoreID:  1

CoreID:  12082116
CoreID:  12082117
CoreID:  12082118
CoreID:  12082119
CoreID:  12082120
CoreID:  12082121
CoreID:  12082122
CoreID:  12082123
CoreID:  12082124
CoreID:  12082125
CoreID:  12082127
CoreID:  12082128
CoreID:  12082129
CoreID:  12082130
CoreID:  12082131
CoreID:  12082132
CoreID:  12082133
CoreID:  12082134
CoreID:  12082135
CoreID:  12082136
CoreID:  12082137
CoreID:  12082138
CoreID:  12082139
CoreID:  12082140
CoreID:  12082142
CoreID:  12082143
CoreID:  12082144
CoreID:  12082145
CoreID:  12082146
CoreID:  12082147
CoreID:  12082148
CoreID:  12082149
CoreID:  12082150
CoreID:  12082151
CoreID:  12082153
CoreID:  12082155
CoreID:  12082156
CoreID:  12082157
CoreID:  12082158
CoreID:  12082160
CoreID:  12082161
CoreID:  12082162
CoreID:  12082163
CoreID:  12082165
CoreID:  12082166
CoreID:  12082167
CoreID:  12082168
CoreID:  12082169
CoreID:  12082170
CoreID:  12082171
CoreID:  12082172
CoreID:  12082173
CoreID:  12082174
CoreID:  12082175
CoreID:  12082176
CoreID:  1

CoreID:  12082671
CoreID:  12082672
CoreID:  12082673
CoreID:  12082674
CoreID:  12082676
CoreID:  12082677
CoreID:  12082678
CoreID:  12082679
CoreID:  12082682
CoreID:  12082683
CoreID:  12082684
CoreID:  12082685
CoreID:  12082687
CoreID:  12082689
CoreID:  12082690
CoreID:  12082691
CoreID:  12082692
CoreID:  12082693
CoreID:  12082695
CoreID:  12082696
CoreID:  12082699
CoreID:  12082701
CoreID:  12082702
CoreID:  12082703
CoreID:  12082704
CoreID:  12082705
CoreID:  12082706
CoreID:  12082707
CoreID:  12082708
CoreID:  12082709
CoreID:  12082710
CoreID:  12082714
CoreID:  12082717
CoreID:  12082718
CoreID:  12082719
CoreID:  12082720
CoreID:  12082722
CoreID:  12082723
CoreID:  12082725
CoreID:  12082726
CoreID:  12082727
CoreID:  12082728
CoreID:  12082729
CoreID:  12082732
CoreID:  12082733
CoreID:  12082735
CoreID:  12082736
CoreID:  12082738
CoreID:  12082740
CoreID:  12082741
CoreID:  12082742
CoreID:  12082743
CoreID:  12082744
CoreID:  12082745
CoreID:  12082746
CoreID:  1

CoreID:  12083390
CoreID:  12083391
CoreID:  12083392
CoreID:  12083393
CoreID:  12083394
CoreID:  12083397
CoreID:  12083400
CoreID:  12083401
CoreID:  12083402
CoreID:  12083403
CoreID:  12083404
CoreID:  12083405
CoreID:  12083406
CoreID:  12083407
CoreID:  12083408
CoreID:  12083409
CoreID:  12083410
CoreID:  12083411
CoreID:  12083412
CoreID:  12083413
CoreID:  12083414
CoreID:  12083415
CoreID:  12083416
CoreID:  12083418
CoreID:  12083419
CoreID:  12083421
CoreID:  12083422
CoreID:  12083423
CoreID:  12083424
CoreID:  12083425
CoreID:  12083426
CoreID:  12083427
CoreID:  12083428
CoreID:  12083429
CoreID:  12083430
CoreID:  12083431
CoreID:  12083432
CoreID:  12083433
CoreID:  12083434
CoreID:  12083435
CoreID:  12083436
CoreID:  12083437
CoreID:  12083438
CoreID:  12083439
CoreID:  12083441
CoreID:  12083442
CoreID:  12083450
CoreID:  12083469
CoreID:  12083474
CoreID:  12083483
CoreID:  12083484
CoreID:  12083486
CoreID:  12083495
CoreID:  12083515
CoreID:  12083516
CoreID:  1

CoreID:  12084114
CoreID:  12084115
CoreID:  12084116
CoreID:  12084118
CoreID:  12084119
CoreID:  12084120
CoreID:  12084121
CoreID:  12084122
CoreID:  12084124
CoreID:  12084125
CoreID:  12084128
CoreID:  12084130
CoreID:  12084131
CoreID:  12084132
CoreID:  12084133
CoreID:  12084134
CoreID:  12084135
CoreID:  12084137
CoreID:  12084138
CoreID:  12084139
CoreID:  12084140
CoreID:  12084141
CoreID:  12084142
CoreID:  12084162
CoreID:  12084168
CoreID:  12084169
CoreID:  12084170
CoreID:  12084171
CoreID:  12084172
CoreID:  12084173
CoreID:  12084174
CoreID:  12084175
CoreID:  12084177
CoreID:  12084178
CoreID:  12084179
CoreID:  12084180
CoreID:  12084182
CoreID:  12084183
CoreID:  12084184
CoreID:  12084185
CoreID:  12084186
CoreID:  12084187
CoreID:  12084188
CoreID:  12084190
CoreID:  12084191
CoreID:  12084192
CoreID:  12084193
CoreID:  12084194
CoreID:  12084195
CoreID:  12084197
CoreID:  12084198
CoreID:  12084199
CoreID:  12084200
CoreID:  12084201
CoreID:  12084202
CoreID:  1

CoreID:  12084899
CoreID:  12084906
CoreID:  12084907
CoreID:  12084908
CoreID:  12084909
CoreID:  12084910
CoreID:  12084912
CoreID:  12084913
CoreID:  12084914
CoreID:  12084915
CoreID:  12084917
CoreID:  12084919
CoreID:  12084920
CoreID:  12084931
CoreID:  12084932
CoreID:  12084934
CoreID:  12084935
CoreID:  12084936
CoreID:  12084937
CoreID:  12084938
CoreID:  12084939
CoreID:  12084941
CoreID:  12084943
CoreID:  12084945
CoreID:  12084946
CoreID:  12084947
CoreID:  12084948
CoreID:  12084949
CoreID:  12084951
CoreID:  12084953
CoreID:  12084954
CoreID:  12084955
CoreID:  12084956
CoreID:  12084957
CoreID:  12084958
CoreID:  12084959
CoreID:  12084960
CoreID:  12084961
CoreID:  12084962
CoreID:  12084963
CoreID:  12084964
CoreID:  12084965
CoreID:  12084967
CoreID:  12084968
CoreID:  12084969
CoreID:  12084971
CoreID:  12084972
CoreID:  12084973
CoreID:  12084974
CoreID:  12084975
CoreID:  12084976
CoreID:  12084977
CoreID:  12084978
CoreID:  12084980
CoreID:  12084981
CoreID:  1

CoreID:  12085694
CoreID:  12085695
CoreID:  12085696
CoreID:  12085698
CoreID:  12085700
CoreID:  12085702
CoreID:  12085703
CoreID:  12085704
CoreID:  12085706
CoreID:  12085709
CoreID:  12085713
CoreID:  12085714
CoreID:  12085715
CoreID:  12085716
CoreID:  12085717
CoreID:  12085718
CoreID:  12085720
CoreID:  12085721
CoreID:  12085723
CoreID:  12085724
CoreID:  12085730
CoreID:  12085735
CoreID:  12085736
CoreID:  12085738
CoreID:  12085744
CoreID:  12085745
CoreID:  12085747
CoreID:  12085753
CoreID:  12085754
CoreID:  12085755
CoreID:  12085756
CoreID:  12085757
CoreID:  12085758
CoreID:  12085760
CoreID:  12085761
CoreID:  12085762
CoreID:  12085763
CoreID:  12085764
CoreID:  12085765
CoreID:  12085766
CoreID:  12085767
CoreID:  12085768
CoreID:  12085769
CoreID:  12085770
CoreID:  12085771
CoreID:  12085772
CoreID:  12085774
CoreID:  12085775
CoreID:  12085776
CoreID:  12085779
CoreID:  12085780
CoreID:  12085781
CoreID:  12085783
CoreID:  12085784
CoreID:  12085785
CoreID:  1

CoreID:  12086536
CoreID:  12086537
CoreID:  12086538
CoreID:  12086539
CoreID:  12086540
CoreID:  12086543
CoreID:  12086544
CoreID:  12086545
CoreID:  12086546
CoreID:  12086547
CoreID:  12086548
CoreID:  12086550
CoreID:  12086551
CoreID:  12086552
CoreID:  12086553
CoreID:  12086554
CoreID:  12086555
CoreID:  12086556
CoreID:  12086557
CoreID:  12086558
CoreID:  12086559
CoreID:  12086560
CoreID:  12086561
CoreID:  12086562
CoreID:  12086563
CoreID:  12086564
CoreID:  12086565
CoreID:  12086566
CoreID:  12086567
CoreID:  12086569
CoreID:  12086571
CoreID:  12086572
CoreID:  12086575
CoreID:  12086576
CoreID:  12086577
CoreID:  12086578
CoreID:  12086579
CoreID:  12086581
CoreID:  12086582
CoreID:  12086585
CoreID:  12086586
CoreID:  12086591
CoreID:  12086592
CoreID:  12086593
CoreID:  12086595
CoreID:  12086596
CoreID:  12086597
CoreID:  12086598
CoreID:  12086599
CoreID:  12086600
CoreID:  12086601
CoreID:  12086602
CoreID:  12086603
CoreID:  12086604
CoreID:  12086605
CoreID:  1

CoreID:  12087296
CoreID:  12087297
CoreID:  12087298
CoreID:  12087299
CoreID:  12087300
CoreID:  12087301
CoreID:  12087302
CoreID:  12087303
CoreID:  12087304
CoreID:  12087305
CoreID:  12087307
CoreID:  12087308
CoreID:  12087310
CoreID:  12087316
CoreID:  12087319
CoreID:  12087320
CoreID:  12087321
CoreID:  12087322
CoreID:  12087323
CoreID:  12087325
CoreID:  12087326
CoreID:  12087329
CoreID:  12087331
CoreID:  12087333
CoreID:  12087334
CoreID:  12087335
CoreID:  12087337
CoreID:  12087339
CoreID:  12087340
CoreID:  12087341
CoreID:  12087347
CoreID:  12087350
CoreID:  12087351
CoreID:  12087352
CoreID:  12087353
CoreID:  12087354
CoreID:  12087355
CoreID:  12087356
CoreID:  12087357
CoreID:  12087360
CoreID:  12087361
CoreID:  12087362
CoreID:  12087363
CoreID:  12087364
CoreID:  12087368
CoreID:  12087369
CoreID:  12087370
CoreID:  12087372
CoreID:  12087373
CoreID:  12087374
CoreID:  12087375
CoreID:  12087377
CoreID:  12087378
CoreID:  12087379
CoreID:  12087380
CoreID:  1

CoreID:  12241666
CoreID:  12241668
CoreID:  12241669
CoreID:  12241743
CoreID:  12241746
CoreID:  12241747
CoreID:  12241749
CoreID:  12241751
CoreID:  12241752
CoreID:  12241753
CoreID:  12241754
CoreID:  12241755
CoreID:  12241756
CoreID:  12241758
CoreID:  12241759
CoreID:  12241760
CoreID:  12241761
CoreID:  12241762
CoreID:  12241763
CoreID:  12241764
CoreID:  12241765
CoreID:  12241766
CoreID:  12241767
CoreID:  12241768
CoreID:  12241769
CoreID:  12241901
CoreID:  12241902
CoreID:  12241903
CoreID:  12241973
CoreID:  12241975
CoreID:  12241976
CoreID:  12241977
CoreID:  12241978
CoreID:  12241980
CoreID:  12241981
CoreID:  12241983
CoreID:  12241984
CoreID:  12241985
CoreID:  12242032
CoreID:  12242099
CoreID:  12242118
CoreID:  12242348
CoreID:  12242666
CoreID:  129692477
CoreID:  129692482
CoreID:  129692484
CoreID:  129692506
CoreID:  129692508
CoreID:  129692510
CoreID:  129692514
CoreID:  129692515
CoreID:  129692516
CoreID:  129692519
CoreID:  129692520
CoreID:  12969252

CoreID:  13358648
CoreID:  13358649
CoreID:  13358650
CoreID:  13358651
CoreID:  13358652
CoreID:  13358653
CoreID:  13358654
CoreID:  13358655
CoreID:  13358656
CoreID:  13358657
CoreID:  13358658
CoreID:  13358659
CoreID:  13358660
CoreID:  13358661
CoreID:  13358662
CoreID:  13358663
CoreID:  13358664
CoreID:  13358665
CoreID:  13358666
CoreID:  13358668
CoreID:  13358669
CoreID:  13358670
CoreID:  13358671
CoreID:  13358672
CoreID:  13358673
CoreID:  13358674
CoreID:  13358675
CoreID:  13358676
CoreID:  13358677
CoreID:  13358678
CoreID:  13358679
CoreID:  13358680
CoreID:  13358681
CoreID:  13358682
CoreID:  13358683
CoreID:  13358684
CoreID:  13358685
CoreID:  13358686
CoreID:  13358687
CoreID:  13358688
CoreID:  13358689
CoreID:  13358691
CoreID:  13358692
CoreID:  13358693
CoreID:  13358694
CoreID:  13358695
CoreID:  13358697
CoreID:  13358698
CoreID:  13358699
CoreID:  13358700
CoreID:  13358701
CoreID:  13358703
CoreID:  13358704
CoreID:  13358705
CoreID:  13358706
CoreID:  1

CoreID:  14057232
CoreID:  14057245
CoreID:  14057247
CoreID:  14057248
CoreID:  14057249
CoreID:  14057252
CoreID:  14057478
CoreID:  14057487
CoreID:  14057488
CoreID:  14057489
CoreID:  14057492
CoreID:  14057494
CoreID:  14057495
CoreID:  14057496
CoreID:  14057500
CoreID:  14057502
CoreID:  14057503
CoreID:  14057504
CoreID:  14057735
CoreID:  14057736
CoreID:  14057737
CoreID:  14057740
CoreID:  14057742
CoreID:  14057743
CoreID:  14057750
CoreID:  14057751
CoreID:  14057759
CoreID:  14057764
CoreID:  14057991
CoreID:  14057997
CoreID:  14057999
CoreID:  14058000
CoreID:  14058017
CoreID:  14058018
CoreID:  14058019
CoreID:  14058240
CoreID:  14058248
CoreID:  14058271
CoreID:  14058497
CoreID:  14058498
CoreID:  14058503
CoreID:  14058507
CoreID:  14058510
CoreID:  14058512
CoreID:  14058513
CoreID:  14058514
CoreID:  14058521
CoreID:  14058527
CoreID:  14058529
CoreID:  14058752
CoreID:  14058753
CoreID:  14058754
CoreID:  14058760
CoreID:  14058761
CoreID:  14058769
CoreID:  1

CoreID:  16279163
CoreID:  16279171
CoreID:  16279179
CoreID:  16397217
CoreID:  16397218
CoreID:  16397219
CoreID:  16423738
CoreID:  16423739
CoreID:  16423741
CoreID:  16423871
CoreID:  16423874
CoreID:  16423877
CoreID:  16423884
CoreID:  16423888
CoreID:  16423890
CoreID:  16423891
CoreID:  16423892
CoreID:  16423893
CoreID:  16423894
CoreID:  16423895
CoreID:  16423896
CoreID:  16423897
CoreID:  16423898
CoreID:  16423899
CoreID:  16423904
CoreID:  16423905
CoreID:  16423906
CoreID:  16423907
CoreID:  16423908
CoreID:  16423909
CoreID:  16423910
CoreID:  16423911
CoreID:  16423912
CoreID:  16423913
CoreID:  16423914
CoreID:  16423915
CoreID:  16423916
CoreID:  16423917
CoreID:  16423918
CoreID:  16423944
CoreID:  16423945
CoreID:  16423946
CoreID:  16424074
CoreID:  16436745
CoreID:  16436746
CoreID:  16436753
CoreID:  16436760
CoreID:  16436761
CoreID:  16436763
CoreID:  16436770
CoreID:  16509857
CoreID:  16509858
CoreID:  16509861
CoreID:  16673513
CoreID:  16673514
CoreID:  1

CoreID:  19452943
CoreID:  19452944
CoreID:  19452946
CoreID:  19452951
CoreID:  19452953
CoreID:  19452960
CoreID:  19456169
CoreID:  19456170
CoreID:  19456171
CoreID:  19456172
CoreID:  19456173
CoreID:  19456176
CoreID:  19456177
CoreID:  19456180
CoreID:  19456181
CoreID:  19456183
CoreID:  19456184
CoreID:  19456187
CoreID:  19456189
CoreID:  19456190
CoreID:  19456192
CoreID:  19456194
CoreID:  19456195
CoreID:  19456196
CoreID:  19456197
CoreID:  19456198
CoreID:  19456203
CoreID:  19456204
CoreID:  19456205
CoreID:  19456206
CoreID:  19456208
CoreID:  19456210
CoreID:  19456219
CoreID:  19456223
CoreID:  19456224
CoreID:  19456225
CoreID:  19456226
CoreID:  19456230
CoreID:  19456233
CoreID:  19456235
CoreID:  19456236
CoreID:  19456237
CoreID:  19456238
CoreID:  19456239
CoreID:  19456240
CoreID:  19456241
CoreID:  19456243
CoreID:  19456244
CoreID:  19456245
CoreID:  19456246
CoreID:  19456247
CoreID:  19456265
CoreID:  19456267
CoreID:  19610544
CoreID:  19610569
CoreID:  1

CoreID:  33756325
CoreID:  33756326
CoreID:  33756327
CoreID:  33756328
CoreID:  33756329
CoreID:  33756330
CoreID:  33756331
CoreID:  33756332
CoreID:  33756333
CoreID:  33756364
CoreID:  33756365
CoreID:  33756369
CoreID:  33756371
CoreID:  33756374
CoreID:  33756377
CoreID:  33756382
CoreID:  33756384
CoreID:  33756389
CoreID:  33756393
CoreID:  33756396
CoreID:  33756398
CoreID:  33756399
CoreID:  33756401
CoreID:  33756402
CoreID:  33756404
CoreID:  33756406
CoreID:  33756408
CoreID:  33756409
CoreID:  33756410
CoreID:  33756411
CoreID:  33756413
CoreID:  33756414
CoreID:  33756415
CoreID:  33756416
CoreID:  33756417
CoreID:  33756418
CoreID:  33756419
CoreID:  33756421
CoreID:  33756422
CoreID:  33756533
CoreID:  33756534
CoreID:  33756536
CoreID:  33756537
CoreID:  33756538
CoreID:  33756540
CoreID:  33756541
CoreID:  33756543
CoreID:  33756544
CoreID:  33756545
CoreID:  33756547
CoreID:  33756548
CoreID:  33756549
CoreID:  33756551
CoreID:  33756552
CoreID:  33756553
CoreID:  3

CoreID:  33757762
CoreID:  33757763
CoreID:  33757764
CoreID:  33757765
CoreID:  33757766
CoreID:  33757767
CoreID:  33757768
CoreID:  33757769
CoreID:  33757770
CoreID:  33757771
CoreID:  33757773
CoreID:  33757774
CoreID:  33757775
CoreID:  33757776
CoreID:  33757778
CoreID:  33757779
CoreID:  33757780
CoreID:  33757781
CoreID:  33757785
CoreID:  33757787
CoreID:  33757788
CoreID:  33757789
CoreID:  33757790
CoreID:  33757791
CoreID:  33757792
CoreID:  33757794
CoreID:  33757795
CoreID:  33757818
CoreID:  33757821
CoreID:  33757823
CoreID:  33757825
CoreID:  33757826
CoreID:  33757827
CoreID:  33757829
CoreID:  33757830
CoreID:  33757831
CoreID:  33757832
CoreID:  33757833
CoreID:  33757834
CoreID:  33757836
CoreID:  33757837
CoreID:  33757838
CoreID:  33757841
CoreID:  33757844
CoreID:  33757845
CoreID:  33757847
CoreID:  33757848
CoreID:  33757849
CoreID:  33757850
CoreID:  33757851
CoreID:  33757853
CoreID:  33757854
CoreID:  33757855
CoreID:  33757859
CoreID:  33757862
CoreID:  3

CoreID:  42032353
CoreID:  42032362
CoreID:  42032366
CoreID:  42032371
CoreID:  42032384
CoreID:  42032388
CoreID:  42032391
CoreID:  42032420
CoreID:  42032421
CoreID:  42032429
CoreID:  42032432
CoreID:  42032436
CoreID:  42032452
CoreID:  42032454
CoreID:  42032457
CoreID:  42032461
CoreID:  42032472
CoreID:  42032481
CoreID:  42032485
CoreID:  42032486
CoreID:  42032487
CoreID:  42032493
CoreID:  42032506
CoreID:  42032508
CoreID:  42032512
CoreID:  42032516
CoreID:  42032526
CoreID:  42032534
CoreID:  42032539
CoreID:  42032545
CoreID:  42032550
CoreID:  42032554
CoreID:  42032555
CoreID:  42032558
CoreID:  42032564
CoreID:  42032570
CoreID:  42032585
CoreID:  42032587
CoreID:  42032589
CoreID:  42032591
CoreID:  42032592
CoreID:  42032596
CoreID:  42032601
CoreID:  42032604
CoreID:  42032624
CoreID:  42032631
CoreID:  42032641
CoreID:  42032647
CoreID:  42032648
CoreID:  42032649
CoreID:  42032652
CoreID:  42032653
CoreID:  42032655
CoreID:  42032700
CoreID:  42032701
CoreID:  4

CoreID:  78068291
CoreID:  78068292
CoreID:  78068294
CoreID:  78068296
CoreID:  78068298
CoreID:  78068300
CoreID:  78068302
CoreID:  78068304
CoreID:  78068305
CoreID:  78068306
CoreID:  78068309
CoreID:  78068310
CoreID:  78068311
CoreID:  78068312
CoreID:  78068313
CoreID:  78068314
CoreID:  78068315
CoreID:  78068316
CoreID:  78068317
CoreID:  78068318
CoreID:  78068319
CoreID:  78068320
CoreID:  78068321
CoreID:  78068323
CoreID:  78068324
CoreID:  78068325
CoreID:  78068326
CoreID:  78068327
CoreID:  78068328
CoreID:  78068330
CoreID:  78068331
CoreID:  78068332
CoreID:  78068333
CoreID:  78068335
CoreID:  78068336
CoreID:  78068338
CoreID:  78068339
CoreID:  78068340
CoreID:  78068342
CoreID:  78068345
CoreID:  78068346
CoreID:  78068347
CoreID:  78068348
CoreID:  78068349
CoreID:  78068352
CoreID:  78068353
CoreID:  78068354
CoreID:  78068355
CoreID:  78068357
CoreID:  78068358
CoreID:  78068360
CoreID:  78068361
CoreID:  78068362
CoreID:  78068364
CoreID:  78068365
CoreID:  7

CoreID:  83144351
CoreID:  83144352
CoreID:  83144353
CoreID:  83144357
CoreID:  83144358
CoreID:  83144360
CoreID:  83144361
CoreID:  83144363
CoreID:  83144364
CoreID:  83144365
CoreID:  83144366
CoreID:  83144367
CoreID:  83144368
CoreID:  83144369
CoreID:  83144370
CoreID:  83144371
CoreID:  83144372
CoreID:  83144373
CoreID:  83144374
CoreID:  83144375
CoreID:  83144376
CoreID:  83144382
CoreID:  83144383
CoreID:  83144384
CoreID:  83144385
CoreID:  83144386
CoreID:  83144388
CoreID:  83144389
CoreID:  83144390
CoreID:  85123921
CoreID:  85123922
CoreID:  85123923
CoreID:  85123924
CoreID:  85123925
CoreID:  85123926
CoreID:  85123927
CoreID:  85123928
CoreID:  85123930
CoreID:  85123932
CoreID:  85123934
CoreID:  85123935
CoreID:  85123936
CoreID:  85123937
CoreID:  85123941
CoreID:  85123945
CoreID:  85123946
CoreID:  85123947
CoreID:  85123948
CoreID:  85123949
CoreID:  85123950
CoreID:  85123951
CoreID:  85123952
CoreID:  85123953
CoreID:  85123954
CoreID:  85123955
CoreID:  8

In [22]:
print(len(kein_autor))
print(len(kein_titel))
print(len(error_papers))

print(len(ergebnis_label))
print(len(ergebnis_tokens))
print(len(ergebnis_label_real))

106
13321
2
15550
15550
15550


# Evaluation

## Accuracy

In [25]:
acc = []
acc_100 = []
for i in range(len(ergebnis_tokens)):
    acc.append(accuracy_score(ergebnis_label[i], ergebnis_label_real[i]))
    if ergebnis_label[i]== ergebnis_label_real[i]:
        acc_100.append(1)
        
print("Accuracy: " + str(np.mean(acc)*100) + " %")
print("Completly correct classifications: " + str(len(acc_100)/len(acc)*100) + " %")

Accuracy: 96.55426828645383 %
Completly correct classifications: 0.13504823151125403 %


## Macro Jaccard Score

In [26]:
Jaccard_score = []
for p in range(len(ergebnis_label_real)):
    jac = jaccard_score(ergebnis_label_real[p], ergebnis_label[p] , average="macro")
    Jaccard_score.append(jac)
    
print("Macro Jaccard Score: " + str(np.mean(Jaccard_score)*100) + " %")


Macro Jaccard Score: 27.60339278754594 %


## Macro F1-Score

In [27]:
F1_score = []
for p in range(len(ergebnis_label_real)):
    f1 = f1_score(ergebnis_label_real[p], ergebnis_label[p] , average="macro")
    F1_score.append(f1)
    
print("Macro F1 Score: " + str(np.mean(F1_score)*100) + " %")


Macro F1 Score: 28.127084463729908 %
