In [1]:
from sklearn.metrics import confusion_matrix, f1_score, recall_score, precision_score
import re
import pandas as pd
import numpy as np

In [2]:
# Paths
pred_file_path = '/data/rsg/nlp/juanmoo1/projects/04_polymer/00_annotation/workdir/ner_evaluation/split/model_200/eval_predictions.txt'

In [3]:
def load_paragraphs_from_file(path):
    def clean_line(l):
        l = re.sub('\s+|\t', ' ', l.replace('\t', ' ')).strip()
        return l
    
    with open(path, 'r') as f:
        lines = f.readlines()

    paragraphs = []

    i = 0
    while i < len(lines):
        while i < len(lines) and len(clean_line(lines[i])) == 0:
            i += 1

        j = i + 1
        while j < len(lines) and len(clean_line(lines[j])) > 0:
            j += 1
        paragraphs.append([clean_line(line) for line in lines[i:j]])
        i = j
    return paragraphs

## Entity Level Evaluation

In [7]:
from seqeval.metrics import f1_score, recall_score, precision_score, classification_report

def get_report_from_pred_file(pred_file_path):
    tokens = []
    labels = []
    predictions = []
    paragraphs = load_paragraphs_from_file(pred_file_path)
    
    for p in paragraphs:
        if len(p) == 0:
            continue
        toks, labs, preds = zip(*[e.strip().split() for e in p if len(e.strip()) > 0])

        toks = list(toks)
        labs = list(labs)
        preds = list(preds)

        tokens.append(toks)
        labels.append(labs)
        predictions.append(preds)
    
    f1 = f1_score(labels, predictions)
    precision = precision_score(labels, predictions)
    recall = recall_score(labels, predictions)
    report = classification_report(labels, predictions)
    report = parse_report(report)
    
    return {
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'report': report
    }

In [10]:
import re
import pandas as pd

def parse_line(line):
    line = re.sub('\s+', ' ', line.strip())
    return line.split()

def parse_report(report):
    lines = report.split('\n')
    
    columns = parse_line(lines[0])
    
    
    amounts = [parse_line(l) for l in lines[2:11]]
    rows = []
    for l in amounts:
        label = l[0]
        values = [float(e) for e in l[1:]]
        
        rows.append({
            'label': label,
            'precision': values[0],
            'recall': values[1],
            'f1-score': values[2],
            'support': values[3]
        })
        
    report = pd.DataFrame(rows)
    return report.set_index('label')

In [11]:
res = get_report_from_pred_file(pred_file_path)

In [77]:
import os
from glob import glob

k_splits_dir = '/data/rsg/nlp/juanmoo1/projects/04_polymer/01_ner_bert/data/reagent_grouping/ner_evaluation/k_splits'

In [78]:
glob_exp = os.path.join(k_splits_dir, '**', 'eval_predictions.txt')
pred_files = glob(glob_exp, recursive=True)

In [79]:
results = []
for pred_file in pred_files:
    results.append(get_report_from_pred_file(pred_file))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [80]:
# Get all labels
label_set = set()
for e in results:
    report = e['report']
    report_labels = set(report.index)
    label_set |= report_labels
label_set = sorted(list(label_set))

In [81]:
# Set non-existent values to zero
for e in results:
    report = e['report']
    for l in label_set:
        if (l not in report.index):
            report = report.append(pd.Series(name=l, data={'precision': 0, 'recall':0, 'f1-score':0, 'support':0}))
    report.sort_index(axis=0)
    e['report'] = report

In [None]:
# Set non-existent values to zero
for e in results:
    report = e['report']
    print(report)

In [83]:
# Average from reports
reports = []
for e in results:
    report = e['report']
    reports.append(report)
report_sum = sum(reports)
report_avg = report_sum/len(reports)

In [84]:
# Metric Averages
f1s = [r['f1'] for r in results]
precisions = [r['recall'] for r in results]
recalls = [r['precision'] for r in results]

f1_avg = np.mean(f1s)
f1_std = np.var(f1s)**0.5
precision_avg = np.mean(precisions)
precision_std = np.var(precisions)**0.5
recall_avg = np.mean(recalls)
recall_std = np.var(recalls)**0.5

In [85]:
# Print report 
import csv
print('10-fold Cross Validation Results:')
print()
print('Overall Statistics:')
print()
print(f'F1-SCORE:\tMEAN: {f1_avg}\tSTD:{f1_std}')
print(f'PRECISION:\tMEAN: {precision_avg}\tSTD:{precision_std}')
print(f'RECALL:\t\tMEAN: {recall_avg}\tSTD:{recall_std}')
print()
print('Category Averages:')
print(report_avg)
report_avg.to_csv('report.csv', quoting=csv.QUOTE_ALL)

10-fold Cross Validation Results:

Overall Statistics:

F1-SCORE:	MEAN: 0.8833767136052298	STD:0.025023695037861036
PRECISION:	MEAN: 0.9014315714871618	STD:0.01948830253633368
RECALL:		MEAN: 0.866261167088268	STD:0.0322362564055037

Category Averages:
                      precision  recall  f1-score  support
label                                                     
Action                    0.870   0.900     0.885     87.2
ActionInput               0.790   0.904     0.841     37.1
Amount                    0.920   0.939     0.928     71.4
Atmosphere_Condition      0.808   0.851     0.825     11.0
Data                      0.598   0.655     0.620      2.5
Lighting                  0.150   0.058     0.083      1.1
Product                   0.367   0.291     0.303      1.5
Product_ref               0.000   0.000     0.000      0.1
Reagent                   0.891   0.878     0.883     48.8
RemovedChemical           0.590   0.638     0.600      6.0
RemovedChemical_ref       0.000   0.000 