# Evaluation of the NE application

The goal of this notebook to evaluate the existing application.

In [1]:
import json
import pandas as pd
import os 
from datetime import datetime


In [2]:
import spacy

# Load SpaCy's pre-trained English model
nlp = spacy.load("en_core_web_sm")

  hasattr(torch, "has_mps")
  and torch.has_mps  # type: ignore[attr-defined]
  return torch._C._cuda_getDeviceCount() > 0
2024-11-07 16:13:50.260819: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-07 16:13:50.287091: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-11-07 16:13:51.347068: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:268] failed call to cuInit: CUDA_ERROR_UNKNOWN: unknown error
2024-11-07 16:13:51.347088: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:168] retrieving

In [3]:
def load_data(file_path, usecols=None, nrows=None):
    df = pd.read_csv(file_path, usecols=usecols, nrows=nrows)
    return df

def save_results(output_file_path, metrics_file_path, df, metrics):
    df.to_csv(output_file_path, index=False)
    print(f"Results saved into {output_file_path}, rows: {df.shape[0]}")

    print(metrics)
    with open(metrics_file_path, "w", encoding='utf-8') as fp:
        json.dump(metrics, fp, check_circular=True)
    print(f"Metrics saved into {metrics_file_path}")
    
from collections import defaultdict

def extract_named_entities(text_list):
    # Initialize result dictionary
    result = {
        'persons': [],
        'organizations': [],
        'locations': []
    }

    # Iterate through each text with SpaCy
    for doc in nlp.pipe(text_list):
        # Extract entities for each category
        persons = [ent.text for ent in doc.ents if ent.label_ == 'PERSON']
        organizations = [ent.text for ent in doc.ents if ent.label_ == 'ORG']
        locations = [ent.text for ent in doc.ents if ent.label_ == 'GPE']
        
        # Join entities with ';' or set as an empty string if no entities are found
        result['persons'].append(";".join(persons) if persons else "")
        result['organizations'].append(";".join(organizations) if organizations else "")
        result['locations'].append(";".join(locations) if locations else "")
    
    return result

def predict_entities(df):
    nes = extract_named_entities(df["text"])
    df.loc[:, "persons_pred"] = nes['persons']
    df.loc[:, "organizations_pred"] = nes['organizations']
    df.loc[:, "locations_pred"] = nes['locations']
    return df

def evaluate_binary(df):
    def calculate_metrics(row):
        true_set = set(row[column].split(';'))
        pred_set = set(row[column+'_pred'].split(';'))
        
        tp = len(true_set & pred_set)  # True Positives
        fp = len(pred_set - true_set)  # False Positives
        fn = len(true_set - pred_set)  # False Negatives
        support = len(true_set)        # Support: the number of true values (in 'persons')        
        return pd.Series([tp, fp, fn, support])
    
    overall_metrics = {}    
    for column in 'persons organizations locations'.split():
        # Apply the function to each row
        df[['TP_'+column, 'FP_'+column, 'FN_'+column, 'support_'+column]] = df.apply(calculate_metrics, axis=1)
        
        # Calculate precision, recall, F1 for each row
        df['precision_'+column] = df['TP_'+column] / (df['TP_'+column] + df['FP_'+column])
        df['recall_'+column] = df['TP_'+column] / (df['TP_'+column] + df['FN_'+column])
        df['F1_'+column] = 2 * (df['precision_'+column] * df['recall_'+column]) / (df['precision_'+column] + df['recall_'+column])
        
        # Fill NaN values (where precision/recall is undefined) with 0
        df.fillna(0, inplace=True)
        
        # Calculate overall precision, recall, F1-score, and support
        overall_metrics[column] ={
            "precision": round(sum(df['precision_'+column] * df['support_'+column]) /sum(df['support_'+column]), 3),
            "recall": round(sum(df['recall_'+column] * df['support_'+column]) /sum(df['support_'+column]), 3),
            "F1": round(sum(df['F1_'+column] * df['support_'+column]) /sum(df['support_'+column]), 3),
            "support": int(df['support_'+column].sum())
        }
    return df, overall_metrics   

def evaluate_baseline(extractor_name, nrows=None):
    eval_file_path, eval_and_scores_file_path, metrics_file_path = get_files(extractor_name=extractor_name)

    # Load data
    used_columns = "text persons organizations locations".split()
    df = load_data(eval_file_path, usecols=used_columns, nrows=nrows)
    print(f"Loaded {df.shape}")

    df.fillna('', inplace=True)
    print("Extracting NE...")
    df_pred = predict_entities(df)
    print(f'Extracted. Res df: {df.shape}, {df.columns}')
    # print(df)

    # Evaluate predictions against true labels
    out_df, overall_metrics = evaluate_binary(df)

    # Save the predictions to a CSV file (required format for submission)
    save_results(eval_and_scores_file_path, metrics_file_path, df=out_df, metrics=overall_metrics)
    print("Finish")
    return out_df, overall_metrics

def get_files(extractor_name, data_dir="../data/external/hf", eval_dataset='conll2003_transformed'):
    eval_file_path = f"{data_dir}/{eval_dataset}.csv"
    output_dir = f"{data_dir}/{extractor_name}"
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        print(f"Created {output_dir} directory")
    file_prefix = datetime.now().strftime("%Y%m%d_%H%M%S")
    eval_and_scores_file_path = f"{output_dir}/{file_prefix}.with_scores.csv"
    metrics_file_path = f"{output_dir}/{file_prefix}.scores.json"

    return eval_file_path, eval_and_scores_file_path, metrics_file_path

In [5]:
# data_dir = 'external/hf'
# eval_dataset = "conll2003_transformed"

extractor_name = 'spacy'

out_df, overall_metrics = evaluate_baseline(extractor_name=extractor_name, nrows=100)
overall_metrics

Loaded (100, 4)
Extracting NE...
Extracted. Res df: (100, 7), Index(['text', 'persons', 'locations', 'organizations', 'persons_pred',
       'organizations_pred', 'locations_pred'],
      dtype='object')
Results saved into ../data/external/hf/spacy/20241107_161421.with_scores.csv, rows: 100
{'persons': {'precision': 0.836, 'recall': 0.708, 'F1': 0.743, 'support': 161}, 'organizations': {'precision': 0.82, 'recall': 0.82, 'F1': 0.82, 'support': 100}, 'locations': {'precision': 0.933, 'recall': 0.867, 'F1': 0.89, 'support': 120}}
Metrics saved into ../data/external/hf/spacy/20241107_161421.scores.json
Finish


{'persons': {'precision': 0.836, 'recall': 0.708, 'F1': 0.743, 'support': 161},
 'organizations': {'precision': 0.82,
  'recall': 0.82,
  'F1': 0.82,
  'support': 100},
 'locations': {'precision': 0.933,
  'recall': 0.867,
  'F1': 0.89,
  'support': 120}}