# Import Packages & Define functions

In [1]:
# read csv
import pandas as pd
import os
# PdfMiner
import glob
import numpy as np
from io import StringIO
from io import BytesIO
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
# Token Vectorization
from langdetect import detect 
import fasttext.util
import fasttext
########################################
from random import random
from numpy import array
from numpy import cumsum
########################################
import regex as re  
import string
import re
########################################
from datetime import datetime
import collections
########################################
# Keras imports for ML Model
import tensorflow as tf
from keras.models import Sequential
from keras.models import Model
from keras.layers import Dense, Conv2D, MaxPooling2D, LSTM, Bidirectional
from keras.layers import Dropout, Flatten, GlobalAveragePooling2D, Embedding
from keras.utils import np_utils
from keras import optimizers
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score
from sklearn.metrics import jaccard_score
from sklearn.metrics import f1_score

from dataclasses import dataclass
from bs4 import BeautifulSoup

Using TensorFlow backend.


In [2]:
def read_tei(tei_file):
    """Parse a TEI XML file into a BeautifulSoup tree.

    Args:
        tei_file: Path of the TEI file; it is opened as UTF-8 text.

    Returns:
        The ``BeautifulSoup`` object built with the ``lxml`` parser.

    Raises:
        OSError: If the file cannot be opened (propagated from ``open``).
    """
    # The previous trailing ``raise RuntimeError`` could never execute: the
    # ``with`` body either returns or lets the original exception propagate.
    with open(tei_file, 'r', encoding="utf-8") as tei:
        return BeautifulSoup(tei, 'lxml')

In [3]:
def elem_to_text(elem, default=''):
    """Return ``elem.getText()`` when ``elem`` is truthy, otherwise ``default``."""
    return elem.getText() if elem else default

In [4]:


# Plain record holding the three name parts of one person.
@dataclass
class Person:
    """Name of a single person, split into first name, middle name and surname."""
    firstname: str   # first forename
    middlename: str  # middle forename
    surname: str     # family name

# Example instance, used only to demonstrate the dataclass.
turing_author = Person(firstname='Alan', middlename='M', surname='Turing')

# Bare f-string expression: its value is what the cell displays as output.
f"{turing_author.firstname} {turing_author.surname} authored many influential publications in computer science."

'Alan Turing authored many influential publications in computer science.'

In [5]:
class TEIFile(object):
    """Accessor for the metadata and body text of one parsed TEI document.

    The file is parsed once in ``__init__`` via ``read_tei``; every property
    reads from the resulting soup. Elements that are missing from the document
    yield an empty value (``''`` or ``[]``) instead of raising
    ``AttributeError``.
    """

    def __init__(self, filename):
        """Parse ``filename`` and initialise the per-property caches."""
        self.filename = filename
        self.soup = read_tei(filename)
        self._text = None
        self._title = ''
        self._abstract = ''

    @property
    def doi(self):
        """Text of the ``<idno type="DOI">`` element, or ``''`` if absent."""
        idno_elem = self.soup.find('idno', type='DOI')
        return idno_elem.getText() if idno_elem else ''

    @property
    def title(self):
        """Text of the document's ``<title>`` element, or ``''`` if absent."""
        if not self._title:
            title_elem = self.soup.title
            # A document without <title> gives None here; report it as empty.
            self._title = title_elem.getText() if title_elem is not None else ''
        return self._title

    @property
    def abstract(self):
        """Whitespace-normalised text of ``<abstract>``, or ``''`` if absent."""
        if not self._abstract:
            abstract_elem = self.soup.abstract
            if abstract_elem is not None:
                self._abstract = abstract_elem.getText(separator=' ', strip=True)
        return self._abstract

    @property
    def authors(self):
        """List of ``Person`` objects for the authors under ``<analytic>``.

        Authors without a ``<persname>`` element are skipped. Returns ``[]``
        when the document has no ``<analytic>`` element.
        """
        analytic = self.soup.analytic
        if analytic is None:
            return []

        result = []
        for author in analytic.find_all('author'):
            persname = author.persname
            if not persname:
                continue
            result.append(Person(
                elem_to_text(persname.find("forename", type="first")),
                elem_to_text(persname.find("forename", type="middle")),
                elem_to_text(persname.surname),
            ))
        return result

    @property
    def text(self):
        """Body text: all ``<div>`` elements without a ``type`` attribute, joined by spaces.

        Returns ``''`` when the document has no ``<body>`` element.
        """
        if not self._text:
            body = self.soup.body
            divs = body.find_all("div") if body is not None else []
            # A div carrying a "type" attribute is skipped (e.g. appendix or
            # references); only untyped divs count as plain text.
            self._text = " ".join(
                div.get_text(separator=' ', strip=True)
                for div in divs
                if not div.get("type")
            )
        return self._text

In [6]:
# Return the full path of every entry in a directory
def listdir_fullpath(d):
    """Join ``d`` with each name returned by ``os.listdir(d)``, in that order."""
    full_paths = []
    for entry_name in os.listdir(d):
        full_paths.append(os.path.join(d, entry_name))
    return full_paths

In [7]:
### Replace escaped line-break markers with spaces
def removePassage(my_str):
    """Replace every literal backslash-``ud`` and backslash-``n`` sequence with a space.

    Only the two-/three-character escape sequences as they appear in the raw
    text are affected; real newline characters are left untouched.
    """
    cleaned = my_str
    for marker in ("\\\\ud", "\\\\n"):
        cleaned = re.sub(marker, " ", cleaned)
    return cleaned

### Parse a PDF (first page only) into text
def extract_page_one(path):
    """Extract the text of the first page of a PDF.

    Args:
        path: Path of the PDF file.

    Returns:
        A ``StringIO`` holding the text of page one; call ``.getvalue()`` on
        it to obtain the string.

    Raises:
        IndexError: If the document contains no pages (same exception type
            the previous ``list(...)[0]`` lookup produced).
    """
    output_string = StringIO()

    with open(path, 'rb') as in_file:
        parser = PDFParser(in_file)
        doc = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Only the first page is needed, so pull a single item from the page
        # iterator instead of building the list of every page in the document.
        first_page = next(iter(PDFPage.create_pages(doc)), None)
        if first_page is None:
            raise IndexError("PDF has no pages: {}".format(path))
        interpreter.process_page(first_page)
    return output_string
    


### Expand the token and label sequences so that they include the NEWLINE markers
def add_newlines(Tokens,Real_Tokens,y_final):
    """Align per-token labels with a token list that also contains 'NEWLINE' markers.

    Args:
        Tokens: Token list without 'NEWLINE' entries.
        Real_Tokens: Token list that additionally contains 'NEWLINE' entries
            (presumably the same text as ``Tokens`` - verify against caller).
        y_final: One label per entry of ``Tokens``.

    Returns:
        Tuple ``(RealTokens_final, y_final_REAL)``: ``Real_Tokens`` truncated to
        the number of labels produced, and the label list in which every
        'NEWLINE' got 'Sonstiges', or 'I-title' when it lies inside the title.
    """
    y_final_REAL = []
    # k: last position visited by the catch-up scan below (0 until it first runs).
    # m: position in Real_Tokens of the most recently matched token.
    k = 0
    m = 0
    for i in range(len(Tokens)):
        # j is the position in Real_Tokens to compare with Tokens[i].
        if k == 0:
            j=i
        else:
            if m == 0:
                j = k+1
            else:
                j = m+1
        if Tokens[i] == Real_Tokens[j] : # tokens match: take over the label unchanged
            y_final_REAL.append(y_final[i])
            m = j
        else:
            for k in range(j,len(Real_Tokens)): # otherwise walk Real_Tokens until a non-NEWLINE token is reached

                if Real_Tokens[k] == 'NEWLINE':
                    y_final_REAL.append('Sonstiges')

#                 elif Real_Tokens[k] not in Tokens:
#                     y_final_REAL.append('Sonstiges')

                else:
                    # First non-NEWLINE token is taken as the counterpart of
                    # Tokens[i]; note that equality is not checked here.
                    y_final_REAL.append(y_final[i])
                    m=k
                    break

    # Keep only as many real tokens as labels were produced.
    RealTokens_final = Real_Tokens[:len(y_final_REAL)]

    index_title = [i for i, e in enumerate(y_final_REAL) if e == 'I-title']
    if index_title==[]:
        return(RealTokens_final,y_final_REAL)
    else:
        end_title = max(index_title)

        ### label NEWLINEs inside the title as "I-title"
        # NOTE(review): y_final_REAL[i+1] raises IndexError if a NEWLINE is the
        # last element, and y_final_REAL[i-1] wraps to the end for i == 0.
        for i in range(len(RealTokens_final)):
            if RealTokens_final[i]=='NEWLINE':
                if (y_final_REAL[i+1] =='I-title' or end_title>i) and y_final_REAL[i-1] in ('B-title','I-title'):
                    y_final_REAL[i] = 'I-title'
        ## Several NEWLINEs may follow each other inside a title; end_title>i checks whether a title label still occurs later on.

        return(RealTokens_final,y_final_REAL)

# Build the word-vector representation of one paper
def detect_and_vectorize(tokens_sequence): #### input is the tokens list of ONE PAPER e.g. ergebnis_tokens[2]
    """Embed the tokens of one paper with the fastText model of its detected language.

    The language is detected on the space-joined tokens; 'ru' and 'bg' select
    ``ft_ru`` and ``ft_bg``, anything else falls back to ``ft_uk``. Each word
    vector is cast to float16. The sequence is then zero-padded with
    ``np.zeros(60)`` rows or truncated so that it has exactly 1000 entries.

    Returns:
        ``np.array`` built from the 1000 vectors.
    """
    lang = detect(' '.join(tokens_sequence))

    # Resolve the model lazily so only the one that is needed gets referenced.
    if lang == 'ru':
        model = ft_ru
    elif lang == 'bg':
        model = ft_bg
    else:  ## assume language == uk
        model = ft_uk

    tokens_vectorized = [np.float16(model.get_word_vector(token)) for token in tokens_sequence]

    # Pad short sequences with zero rows, then cut long ones to 1000 entries.
    missing = 1000 - len(tokens_vectorized)
    tokens_vectorized.extend(np.zeros(60) for _ in range(missing))
    del tokens_vectorized[1000:]

    return np.array(tokens_vectorized)

### Additional Features
# Characters that count as punctuation for the punctuation feature.
punctuations = r'''!()[]{};:'"\<>/?@#$%^&*«»_–~.,-'''

def compute_additional_features(tokens_sequence): #### input is the tokens list of ONE PAPER e.g. ergebnis_tokens[2]
    """Compute five binary surface features for the tokens of one paper.

    The token sequence is padded with ``'0'`` or truncated to exactly 1000
    entries. This happens on a copy: the caller's list is no longer modified
    (the previous version appended to / deleted from the passed-in list).

    Feature columns, in order:
        0. token is all upper case
        1. first character is upper case
        2. token is upper case and starts with "<char>." (author initial)
        3. token contains a character from ``punctuations``
        4. token is the 'NEWLINE' marker (all other columns are 0 for it)

    Args:
        tokens_sequence: Sequence of tokens of one paper.

    Returns:
        Integer ``np.ndarray`` of shape ``(1000, 5)``.
    """
    tokens = [str(tok) for tok in tokens_sequence[:1000]]
    tokens.extend(str(0) for _ in range(1000 - len(tokens)))

    rows = []
    for tok in tokens:
        if tok == 'NEWLINE':
            rows.append((0, 0, 0, 0, 1))
            continue
        is_upper = tok.isupper()
        rows.append((
            int(is_upper),
            # Slice instead of tok[0] so an empty token yields 0 rather than IndexError.
            int(tok[:1].isupper()),
            int(is_upper and re.match(r'.\.', tok) is not None),
            int(any(c in punctuations for c in tok)),
            0,
        ))

    return np.array(rows, dtype=np.int64)

In [8]:
### Grobid: strip the [[...]] wrapper
### Removes the nested-list markup from a stringified author list
def removeAutor_grobid(my_str):
    """Delete a ``[['`` opener, a ``']]`` closer and all remaining single quotes from ``my_str``."""
    stripped = my_str
    for pattern in ("\[\['", "'\]\]", "'"):
        stripped = re.sub(pattern, "", stripped)
    return stripped

In [9]:
### Removes the list markup from a stringified author list
def removeAutor(my_str):
    """Delete a ``['`` opener, a ``']`` closer and all remaining single quotes from ``my_str``."""
    stripped = my_str
    for pattern in ("\['", "'\]", "'"):
        stripped = re.sub(pattern, "", stripped)
    return stripped


# Read TEI Files

In [None]:
### DO NOT RUN! Predictions of GROBID are already saved in 'grobid_16467.csv'
### Transform the tei.xml files into a dataframe
# Rows are collected in a list and turned into a DataFrame once: the former
# per-file `Frame.append` was quadratic and no longer exists in pandas >= 2.0.
# The unused counter `i` (shadowed by the inner loop, "incremented" with the
# typo `i =+ 1`) is gone.

all_files = listdir_fullpath(r"D:\_final_selection_16478\GROBID_ALL")
# all_files = listdir_fullpath("D:\_final_selection_16478\out2")

records = []
for tei_doc in all_files:
    tei = TEIFile(tei_doc)
    # ".../Core_ID_<id>.tei.xml" -> "<id>"
    core_id = re.sub(r"\.tei\.xml$", "", re.sub(r"^Core_ID_", "", os.path.basename(tei_doc)))
    authors = []
    # tei.authors re-parses the document on every access, so read it only once.
    for person in tei.authors:
        if len(person.firstname) == 1:
            # First name is a single initial: write both parts as initials.
            forename = person.firstname + ". " + person.middlename + "."
        elif len(person.middlename) == 1:
            # Full first name followed by a middle initial.
            forename = person.firstname + " " + person.middlename + "."
        else:
            forename = person.firstname + " " + person.middlename
        authors.append([forename, person.surname])

    records.append({"core_id": core_id, "title": tei.title, "authors": str(authors)})

Frame = pd.DataFrame(records, columns=["core_id", "title", "authors"])

# Frame.to_excel("grobid_0_8000.xlsx")
# Frame.to_excel("grobid_8000_16478.xlsx")
Frame.to_excel("grobid_16467.xlsx")
Frame.to_csv('grobid_16467.csv')
Frame

# Evaluate GROBID prediction

In [11]:
# Load the saved GROBID predictions and discard the unnamed column that holds
# the index written by the earlier to_csv call.
df_meta = pd.read_csv('grobid_16467.csv', sep = ',').drop(columns='Unnamed: 0')
df_meta

Unnamed: 0,core_id,title,authors
0,11083759,Середовище проведення освітніх вебінарів «WIP ...,"[['Юрій ', 'Богачков']]"
1,11083794,Комплексна підготовка дистанційних матеріалів ...,[]
2,11083797,,[]
3,11083801,,[]
4,11083807,Інструментальні засоби профорієнтації,[]
...,...,...,...
16462,95313001,,[]
16463,95313006,,[]
16464,95313010,,[]
16465,95313012,ТЕХНОЛОГІЯ ПОКВАРТИРНОГО ОБЛІКУ ТЕПЛОВОЇ ЕНЕРГІЇ,"[['Горбачьова А О', '']]"


In [None]:
path_15553 = "D:/_final_selection_16478/PDFs_15553/"


pdf = os.listdir(path_15553)

# Build the id set once; the former `core in list(df_meta.core_id)` rebuilt and
# scanned the whole list for every single file.
known_core_ids = set(df_meta.core_id.tolist())

files_core_id = []
files_paths = []
for elem in pdf:
    # "Core_ID_<id>.pdf" -> <id>
    core = int(re.sub(".pdf","",re.sub("Core_ID_","",elem)))
    if core in known_core_ids:
        files_core_id.append(core)
#         files_paths.append("D:/_final_selection_16478/all_pdf/" + elem)
        files_paths.append(path_15553 + elem)
print(len(files_paths))
print(files_paths[0])



In [None]:
# Keep only the predictions for which a PDF exists and renumber the rows from 0.
has_pdf = df_meta.core_id.isin(files_core_id)
df_meta = df_meta[has_pdf].reset_index(drop=True)
print(df_meta.shape)
missing_titles = sum(df_meta.title.isna())
missing_authors = sum(df_meta.authors == "[]")
print("{} Titel und {} Autoren sind NA".format(missing_titles, missing_authors))

In [15]:
### extract text
# One entry per path in files_paths; unreadable PDFs get a placeholder string
# so that the list stays aligned with files_paths / files_core_id.

all_pdf_text = []
start_time = datetime.now()
for i, pdf_path in enumerate(files_paths):
    try:
        all_pdf_text.append(extract_page_one(pdf_path).getvalue())
    except Exception:
        # `except Exception` rather than a bare `except`, so that Ctrl-C
        # (KeyboardInterrupt) still stops this long-running loop.
        all_pdf_text.append("Einlesen nicht moeglich")
        print("FEHLER")
    else:
        # Progress output lives outside the try body so that it can never
        # trigger a second append for the same file.
        if i % 500 == 0:
            print(str((i/len(files_paths))*100)+'%')

end_time = datetime.now()

0.0%
3.2150205761316872%
6.4300411522633745%
9.645061728395062%
12.860082304526749%
16.075102880658438%
19.290123456790123%
22.505144032921812%
25.720164609053498%
28.935185185185187%
32.150205761316876%
35.365226337448554%
38.58024691358025%
41.79526748971193%
45.010288065843625%
48.2253086419753%
51.440329218106996%
54.65534979423868%
57.870370370370374%
61.08539094650206%
64.30041152263375%
67.51543209876543%
70.73045267489711%
73.9454732510288%
77.1604938271605%
80.37551440329219%
83.59053497942386%
86.80555555555556%
90.02057613168725%
93.23559670781893%
96.4506172839506%
99.6656378600823%


In [18]:
### Import REAL Meta Data
path_meta = 'D:/_final_selection_16478/'

df_meta_real = pd.read_csv(path_meta + 'final_items_15553.csv', sep = ',')#  , encoding= 'utf-16')
# df_meta.drop('Unnamed: 0' , axis = 1 , inplace=True)
# df_meta_real

# Lookup tables keyed by coreId, built once. The first row per coreId wins,
# which matches the former `index[0]` choice; previously the whole column was
# scanned for every id, once for the titles and once more for the authors.
meta_by_core_id = df_meta_real.drop_duplicates(subset='coreId').set_index('coreId')
title_by_core_id = meta_by_core_id['title'].to_dict()
authors_by_core_id = meta_by_core_id['authors'].to_dict()

### get all titles and authors from meta data with core_ids of fulltext
######## PROBLEM: spelling of author names differs between meta data and PDF
titles_real = []
autors_real = []
for core_id in files_core_id:
    key = int(core_id)
    if key not in title_by_core_id:
        titles_real.append('Keine Meta Daten gefunden')
        # Same value an empty "[]" author string produces, so both lists stay
        # aligned with files_core_id (the author loop used to raise IndexError).
        autors_real.append(['[]'])
        continue

    titles_real.append(title_by_core_id[key])

    autor_pdf = removeAutor(authors_by_core_id[key]).split(",")
    # Collapse superfluous whitespace (including leading whitespace).
    autors_real.append([' '.join(name.split()) for name in autor_pdf])
len(autors_real)

15552

# Get Label vectors for the GROBID Prediction

In [19]:
### get all titles from meta data with core_ids of fulltext
# Lookup of the title stored in the first row of each core_id, built once
# (previously the core_id column was scanned and isna() recomputed per row).
first_title_by_core_id = (
    df_meta.drop_duplicates(subset='core_id').set_index('core_id')['title'].to_dict()
)
title_is_missing = df_meta.title.isna().tolist()

titles = []
for core_id, is_missing in zip(df_meta.core_id.tolist(), title_is_missing):
    if is_missing:
        titles.append('Keine Meta Daten gefunden')
    else:
        titles.append(first_title_by_core_id[int(core_id)])

len(titles)

15552

In [20]:
### Get the authors for the PDFs
######## PROBLEM: spelling of author names differs between meta data and PDF
# Lookup of the author string stored in the first row of each core_id, built
# once instead of scanning the core_id column for every row.
first_authors_by_core_id = (
    df_meta.drop_duplicates(subset='core_id').set_index('core_id')['authors'].to_dict()
)

autors = []
for core_id in df_meta.core_id.tolist():
    autor_pdf = removeAutor_grobid(first_authors_by_core_id[int(core_id)]).split(",")
    # Collapse superfluous whitespace (including leading whitespace).
    autors.append([' '.join(name.split()) for name in autor_pdf])
len(autors)

15552

In [None]:
### Label the data
# For every paper two label sequences are built over the tokens of page one:
# one from the GROBID prediction (titles / autors) and one from the real meta
# data (titles_real / autors_real). Labels used: 'B-title', 'I-title', 'autor'
# and 'Sonstiges' (= other).

kein_autor = []            # core ids for which no token matched a GROBID author name
kein_titel = []            # core ids whose GROBID title was not found in the page text
error_papers = []          # core ids for which the try-block below raised
ergebnis_tokens = []       # per paper: tokens including NEWLINE markers
ergebnis_label = []        # per paper: labels derived from the GROBID prediction
### Real Meta
ergebnis_label_real = []   # per paper: labels derived from the real meta data
###

anzahl_papers = len(df_meta.core_id)

for paper in range(anzahl_papers):

    title = ' '.join(removePassage(titles[paper]).split()).lower() # Remove excces Whitespace & to lowercase
    # NOTE(review): only "(", ")" and "*" are escaped; any other regex
    # metacharacter in a title is still interpreted as a pattern by re.search.
    title = re.sub("\(","\(",title) # () as non-regex string
    title = re.sub("\)","\)",title)
    title = re.sub("\*","\*",title) # * as non-regex string

    title_index = re.search(title, ' '.join(all_pdf_text[paper].split()).lower()) # search for the title

    #### Real Meta
    title_real = ' '.join(removePassage(titles_real[paper]).split()).lower() # Remove excces Whitespace & to lowercase
    title_real = re.sub("\(","\(",title_real) # () as non-regex string
    title_real = re.sub("\)","\)",title_real)
    title_real = re.sub("\*","\*",title_real) # * as non-regex string

    title_index_real = re.search(title_real, ' '.join(all_pdf_text[paper].split()).lower()) # search for the title
    ######

#     print('CoreID:  ' + str(files_core_id[paper]))

    if title_index==None:
        # GROBID title not found on page one: every token is labelled 'Sonstiges'.
        Text_pdf_0 = ' '.join(all_pdf_text[paper].split())

        kein_titel.append(df_meta.core_id[paper])
        y_final= np.repeat('Sonstiges',len(Text_pdf_0.split()))
    else:

        Text_pdf_0 = ' '.join(all_pdf_text[paper].split())

        ##### TITLE ################################################
        # Split the text into the part before the title (B), the title (T) and
        # the part after it (E); the -1 / +1 skip the neighbouring character.
        if title_index.start()==0:
            teil_B = ""
        else:
            teil_B = Text_pdf_0[0:title_index.start()-1]
        teil_T = Text_pdf_0[title_index.start():title_index.end()]
        teil_E = Text_pdf_0[title_index.end()+1:len(Text_pdf_0)]

        y_teil1 = np.repeat('Sonstiges',len(teil_B.split()))
        y_teil2 = np.append(['B-title'],np.repeat('I-title',len(teil_T.split())-1))
        y_teil3 = np.repeat('Sonstiges',len(teil_E.split()))

        y_final = np.concatenate((y_teil1, y_teil2 , y_teil3), axis=None)

    ### Real Meta
    # NOTE(review): when the real title is not found, y_final_real is not
    # reassigned in this iteration, so the value left over from an earlier
    # paper is what the code below reads.
    if title_index_real!=None:
        Text_pdf_0 = ' '.join(all_pdf_text[paper].split())

        ##### TITLE ################################################
        if title_index_real.start()==0:
            teil_B_real = ""
        else:
            teil_B_real = Text_pdf_0[0:title_index_real.start()-1]
        teil_T_real = Text_pdf_0[title_index_real.start():title_index_real.end()]
        teil_E_real = Text_pdf_0[title_index_real.end()+1:len(Text_pdf_0)]

        y_teil1_r = np.repeat('Sonstiges',len(teil_B_real.split()))
        y_teil2_r = np.append(['B-title'],np.repeat('I-title',len(teil_T_real.split())-1))
        y_teil3_r = np.repeat('Sonstiges',len(teil_E_real.split()))

        y_final_real = np.concatenate((y_teil1_r, y_teil2_r , y_teil3_r), axis=None)
    ###

    ##### Get Text
    # Same text, but with every line break turned into a 'NEWLINE' token.
    all_pdf_text1 = re.sub("\\n"," NEWLINE ",all_pdf_text[paper])
    Text_pdf_0_NL = ' '.join(all_pdf_text1.split())

    Tokens = Text_pdf_0.split()
    Labels = y_final
    Real_Tokens = Text_pdf_0_NL.split()

    # Overwrites the assignment a few lines above.
    Tokens = all_pdf_text[paper].split()

    Tokens_final_lower = []
    for i in range(len(Tokens)):
        Tokens_final_lower.append(Tokens[i].lower())
    try:
        # ['[]'] is what an empty GROBID author list ("[]") turns into.
        if autors[paper]!= ['[]']:

            # Every second entry, starting at index 0.
            # NOTE(review): the TEI cell stores names as [forename, surname], so
            # the even positions may hold forenames despite the variable name - confirm.
            autors_surname = []
            for i in range(len(autors[paper])):
                if i % 2 == 0:
                    autors_surname.append(autors[paper][i])

            autors_surname_lower = []
            for i in range(len(autors_surname)):
                autors_surname_lower.append(autors_surname[i].lower())

            # Second entry does not start with "<char>.": also search for the
            # words found at the odd positions.
            if re.match('.\.',autors[paper][1]) == None:
                autors_forename = []
                for i in range(len(autors[paper])):
                    if i % 2 == 1:
                        autors_forename.append(autors[paper][i].split())

                autors_forename = list(np.concatenate((autors_forename), axis=None))
                autors_forename_lower = []
                for i in range(len(autors_forename)):
                    autors_forename_lower.append(autors_forename[i].lower())

                autors_surname_lower = list(np.concatenate((autors_forename_lower,autors_surname_lower), axis=None))


            # A token counts as an author token if any name is a substring of it.
            vec_autor = []
            for token in Tokens_final_lower:
                line = any(word in token for word in autors_surname_lower)
                vec_autor.append(line)

            index_autor = [i for i, e in enumerate(vec_autor) if e == True]

            if title_index!=None:
                # More matches than names: unmark the surplus matches that lie
                # furthest away from the 'B-title' position.
                if len(index_autor)>(len(autors_surname_lower)):
                    diff = len(index_autor) - len(autors_surname_lower)
                    dist = []
                    for j in range(len(index_autor)):
                        dist.append(abs(index_autor[j]-np.where(y_final=="B-title")[0][0]))

                    # Matches with equal distance share one key, so only the
                    # last of them is kept in this dict.
                    dict1 = dict(zip(dist , index_autor))

                    dist.sort(reverse = True)

                    for k in range(len(dist[0:diff])):
                        vec_autor[dict1[dist[0:diff][k]]] = False

            for i in range(len(y_final)):
                if vec_autor[i] == True:
                    y_final[i] = 'autor'

            if True not in vec_autor:
                kein_autor.append(files_core_id[paper])

            # Second entry starts with "<char>." (an initial): also label
            # upper-case "<char>." tokens close to a matched author token.
            if re.match('.\.',autors[paper][1]) != None:

                index_autor_true = [i for i, e in enumerate(vec_autor) if e == True]

                for w in range(len(index_autor_true)):
                    index = index_autor_true[w]
                    # Window covers index-4 .. index+3; a negative t indexes from
                    # the end, a t past the end raises and lands in the except below.
                    for t in range(index - 4,index + 4):
                        if re.match('.\.',Tokens_final_lower[t]) != None and Tokens[t].isupper():
                            y_final[t] = 'autor'
        ### Real Meta
        # Same procedure as above, applied to the real meta data. This part runs
        # for every paper (it is outside the `if autors[paper] != ['[]']` block).
        autors_surname_real = []
        for i in range(len(autors_real[paper])):
            if i % 2 == 0:
                autors_surname_real.append(autors_real[paper][i])

        autors_surname_lower_real = []
        for i in range(len(autors_surname_real)):
            autors_surname_lower_real.append(autors_surname_real[i].lower())

        if re.match('.\.',autors_real[paper][1]) == None:
            autors_forename_real = []
            for i in range(len(autors_real[paper])):
                if i % 2 == 1:
                    autors_forename_real.append(autors_real[paper][i].split())

            autors_forename_real = list(np.concatenate((autors_forename_real), axis=None))
            autors_forename_lower_real = []
            for i in range(len(autors_forename_real)):
                autors_forename_lower_real.append(autors_forename_real[i].lower())

            autors_surname_lower_real = list(np.concatenate((autors_forename_lower_real,autors_surname_lower_real), axis=None))


        vec_autor_real = []
        for token in Tokens_final_lower:
            line_real = any(word in token for word in autors_surname_lower_real)
            vec_autor_real.append(line_real)

        index_autor_real = [i for i, e in enumerate(vec_autor_real) if e == True]

        if title_index_real!=None:
            if len(index_autor_real)>(len(autors_surname_lower_real)):
                diff = len(index_autor_real) - len(autors_surname_lower_real)
                dist = []
                for j in range(len(index_autor_real)):
                    dist.append(abs(index_autor_real[j]-np.where(y_final_real=="B-title")[0][0]))

                dict1 = dict(zip(dist , index_autor_real))

                dist.sort(reverse = True)

                for k in range(len(dist[0:diff])):
                    vec_autor_real[dict1[dist[0:diff][k]]] = False

        for i in range(len(y_final_real)):
            if vec_autor_real[i] == True:
                y_final_real[i] = 'autor'

#         if True not in vec_autor_real:
#             kein_autor.append(files_core_id[paper])

        if re.match('.\.',autors_real[paper][1]) != None:

            index_autor_true_real = [i for i, e in enumerate(vec_autor_real) if e == True]

            for w in range(len(index_autor_true_real)):
                index = index_autor_true_real[w]
                for t in range(index - 4,index + 4):
                    if re.match('.\.',Tokens_final_lower[t]) != None and Tokens[t].isupper():
                        y_final_real[t] = 'autor'
        ###

        # Re-insert the NEWLINE tokens into tokens and labels. add_newlines is
        # called twice with identical arguments to obtain both return values.
        RealTokens_final = add_newlines(Tokens,Real_Tokens,y_final)[0]
        y_final_REAL = add_newlines(Tokens,Real_Tokens,y_final)[1]
        ### Real Meta
        y_final_REAL2 = add_newlines(Tokens,Real_Tokens,y_final_real)[1]
        ergebnis_label_real.append(y_final_REAL2)
        ###
        ergebnis_label.append(y_final_REAL)
        ergebnis_tokens.append(RealTokens_final)
    # NOTE(review): the bare except hides every error (including
    # KeyboardInterrupt); the paper is only recorded in error_papers.
    except:
        error_papers.append(files_core_id[paper])

In [22]:
# Sizes of the three bookkeeping lists, followed by the three result lists.
for collected in (kein_autor, kein_titel, error_papers):
    print(len(collected))

for collected in (ergebnis_label, ergebnis_tokens, ergebnis_label_real):
    print(len(collected))

106
13321
2
15550
15550
15550


# Evaluation

## Accuracy

In [25]:
# Token-level accuracy per paper (GROBID labels vs. real-meta labels), plus a
# marker for every paper whose two label sequences are identical.
paper_indices = range(len(ergebnis_tokens))
acc = [accuracy_score(ergebnis_label[idx], ergebnis_label_real[idx]) for idx in paper_indices]
acc_100 = [1 for idx in paper_indices if ergebnis_label[idx]== ergebnis_label_real[idx]]

mean_accuracy = np.mean(acc)*100
share_identical = len(acc_100)/len(acc)*100
print("Accuracy: " + str(mean_accuracy) + " %")
print("Completly correct classifications: " + str(share_identical) + " %")

Accuracy: 96.55426828645383 %
Completly correct classifications: 0.13504823151125403 %


## Macro Jaccard Score

In [26]:
# Macro-averaged Jaccard score per paper, with the real-meta labels as reference.
Jaccard_score = [
    jaccard_score(ergebnis_label_real[idx], ergebnis_label[idx] , average="macro")
    for idx in range(len(ergebnis_label_real))
]

mean_jaccard = np.mean(Jaccard_score)*100
print("Macro Jaccard Score: " + str(mean_jaccard) + " %")


Macro Jaccard Score: 27.60339278754594 %


## Macro F1-Score

In [27]:
# Macro-averaged F1 score per paper, with the real-meta labels as reference.
F1_score = [
    f1_score(ergebnis_label_real[idx], ergebnis_label[idx] , average="macro")
    for idx in range(len(ergebnis_label_real))
]

mean_f1 = np.mean(F1_score)*100
print("Macro F1 Score: " + str(mean_f1) + " %")


Macro F1 Score: 28.127084463729908 %
