# Baseline with spaCy models

It is a baseline solution using [`spaCy` package](https://spacy.io/) for [`Named Entity Recognition` (`NER`)](https://spacy.io/usage/linguistic-features#named-entities). 
This code will load the dataset, perform NER, and evaluate it against the provided labels for persons, organizations, and locations.

The `baseline_hf` notebook is the first notebook for modeling. It holds all detailed comments about data preparation and modeling.

## Setting up

Install all necessary packages and initialize the library names.

In [None]:
!pip install spacy
!python3 -m spacy download en_core_web_sm

In [None]:
!pip install pydantic==1.10.12

In [5]:
from collections import defaultdict
from pathlib import Path

import pandas as pd
import spacy

# Load SpaCy's pre-trained English model
nlp = spacy.load("en_core_web_sm")

## Data loading

In [4]:
# Concatenate all df-s
def load_data(file_path, usecols=None, nrows=None):
    df = pd.read_csv(file_path, usecols=usecols, nrows=nrows)
    print(f"Loaded {df.shape} {file_path}, {list(df.columns) =}")
    return df
    
def load_all_data(data_path, used_columns, nrows=None):
    suffixes = "train test validation".split()
    
    all_df = pd.concat([load_data(
        data_path / f"conll2003_transformed.{suffix}.csv",
        usecols=used_columns, 
        nrows=nrows
    ) for suffix in suffixes])
    print(f"Summary df: {all_df.shape}, {list(all_df.columns) =}")
    return all_df


DATA_PATH = Path('../data/external/hf')
used_columns = "text persons organizations locations".split()
nrows = None
df = load_all_data(DATA_PATH, used_columns=used_columns, nrows=nrows)

Loaded (14041, 4) ../data/external/hf/conll2003_transformed.train.csv, list(df.columns) =['text', 'persons', 'locations', 'organizations']
Loaded (3453, 4) ../data/external/hf/conll2003_transformed.test.csv, list(df.columns) =['text', 'persons', 'locations', 'organizations']
Loaded (3250, 4) ../data/external/hf/conll2003_transformed.validation.csv, list(df.columns) =['text', 'persons', 'locations', 'organizations']
Summary df: (20744, 4), list(all_df.columns) =['text', 'persons', 'locations', 'organizations']


## NE extraction

In [6]:
def extract_named_entities(text_list):
    # Initialize result dictionary
    result = {
        'persons': [],
        'organizations': [],
        'locations': []
    }

    # Iterate through each text
    for text in text_list:
        # Process the text with SpaCy
        doc = nlp(text)
        
        # Extract entities for each category
        persons = [ent.text for ent in doc.ents if ent.label_ == 'PERSON']
        organizations = [ent.text for ent in doc.ents if ent.label_ == 'ORG']
        locations = [ent.text for ent in doc.ents if ent.label_ == 'GPE']
        
        # Join entities with ';' or set as empty string if no entities found
        result['persons'].append(";".join(persons) if persons else "")
        result['organizations'].append(";".join(organizations) if organizations else "")
        result['locations'].append(";".join(locations) if locations else "")
    
    return result


In [7]:
texts = [
    "Andrew Clyde, Kelly, and Louie Gohmert visited Florida and Orlando.",
    "Lloyd Smucker mentioned Trump while discussing San Francisco, California, and Napa County.",
    "Timothy Kelly ruled in favor of Trump in the Mexican court case."
]

named_entities = extract_named_entities(texts)
named_entities

{'persons': ['Andrew Clyde;Kelly;Louie Gohmert', '', 'Timothy Kelly'],
 'organizations': ['', 'Lloyd Smucker;Trump', 'Trump'],
 'locations': ['Florida;Orlando', 'San Francisco;California;Napa County', '']}

In [8]:
def predict_entities(df):
    nes = extract_named_entities(df["text"])
    df.loc[:, "persons_pred"] = nes['persons']
    df.loc[:, "organizations_pred"] = nes['organizations']
    df.loc[:, "locations_pred"] = nes['locations']
    return df

In [10]:
df_pred = predict_entities(df)
print(df_pred.shape)
df_pred.head(1)

(20744, 7)


Unnamed: 0,text,persons,locations,organizations,persons_pred,organizations_pred,locations_pred
0,EU rejects German call to boycott British lamb .,,,EU,,EU,


## Evaluation

In [11]:
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score

def evaluate_binary(df):
    def calculate_metrics(row):
        true_set = set(row[column].split(';'))
        pred_set = set(row[column+'_pred'].split(';'))
        
        tp = len(true_set & pred_set)  # True Positives
        fp = len(pred_set - true_set)  # False Positives
        fn = len(true_set - pred_set)  # False Negatives
        support = len(true_set)        # Support: the number of true values (in 'persons')        
        return pd.Series([tp, fp, fn, support])
    
    overall_metrics = {}    
    for column in 'persons organizations locations'.split():
        # Apply the function to each row
        df[['TP_'+column, 'FP_'+column, 'FN_'+column, 'support_'+column]] = df.apply(calculate_metrics, axis=1)
        
        # Calculate precision, recall, F1 for each row
        df['precision_'+column] = df['TP_'+column] / (df['TP_'+column] + df['FP_'+column])
        df['recall_'+column] = df['TP_'+column] / (df['TP_'+column] + df['FN_'+column])
        df['F1_'+column] = 2 * (df['precision_'+column] * df['recall_'+column]) / (df['precision_'+column] + df['recall_'+column])
        
        # Fill NaN values (where precision/recall is undefined) with 0
        df.fillna(0, inplace=True)
        
        # Calculate overall precision, recall, F1-score, and support
        overall_metrics[column] ={
            "precision": round(sum(df['precision_'+column] * df['support_'+column]) /sum(df['support_'+column]), 3),
            "recall": round(sum(df['recall_'+column] * df['support_'+column]) /sum(df['support_'+column]), 3),
            "F1": round(sum(df['F1_'+column] * df['support_'+column]) /sum(df['support_'+column]), 3),
            "support": int(df['support_'+column].sum())
        }
    return df, overall_metrics   


In [12]:
# Example DataFrame
data = {
    'persons': ["Andrew Clyde;Kelly;Louie Gohmert", "Lloyd Smucker;Trump", "Timothy Kelly;Trump"],
    'organizations': ["Andrew Clyde;Kelly;Louie Gohmert", "Lloyd Smucker;Trump", "Timothy Kelly;Trump"],
    'locations': ["Florida;Orlando", "San Francisco;California;Napa County", "Mexican"],
    'persons_pred': ["Andrew Clyde;Louie Gohmert;something", "Trump;Lloyd Smucker", "Timothy Kelly;some"],
    'organizations_pred': ["Andrew Clyde;Louie Gohmert;something", "Trump;Lloyd Smucker", "Timothy Kelly;some"],
    'locations_pred': ["Florida;Orlando;something", "San Francisco;California;Napa County;some","Mexican;New York"],
}
df_test = pd.DataFrame(data)

df_test, overall_metrics = evaluate_binary(df_test)

In [13]:
df_test

Unnamed: 0,persons,organizations,locations,persons_pred,organizations_pred,locations_pred,TP_persons,FP_persons,FN_persons,support_persons,...,precision_organizations,recall_organizations,F1_organizations,TP_locations,FP_locations,FN_locations,support_locations,precision_locations,recall_locations,F1_locations
0,Andrew Clyde;Kelly;Louie Gohmert,Andrew Clyde;Kelly;Louie Gohmert,Florida;Orlando,Andrew Clyde;Louie Gohmert;something,Andrew Clyde;Louie Gohmert;something,Florida;Orlando;something,2,1,1,3,...,0.666667,0.666667,0.666667,2,1,0,2,0.666667,1.0,0.8
1,Lloyd Smucker;Trump,Lloyd Smucker;Trump,San Francisco;California;Napa County,Trump;Lloyd Smucker,Trump;Lloyd Smucker,San Francisco;California;Napa County;some,2,0,0,2,...,1.0,1.0,1.0,3,1,0,3,0.75,1.0,0.857143
2,Timothy Kelly;Trump,Timothy Kelly;Trump,Mexican,Timothy Kelly;some,Timothy Kelly;some,Mexican;New York,1,1,1,2,...,0.5,0.5,0.5,1,1,0,1,0.5,1.0,0.666667


In [14]:
overall_metrics

{'persons': {'precision': 0.714, 'recall': 0.714, 'F1': 0.714, 'support': 7},
 'organizations': {'precision': 0.714,
  'recall': 0.714,
  'F1': 0.714,
  'support': 7},
 'locations': {'precision': 0.681, 'recall': 1.0, 'F1': 0.806, 'support': 6}}

## Whole pipeline

### Code

In [24]:
import json

def load_data(file_path, usecols=None, nrows=None):
    df = pd.read_csv(file_path, usecols=usecols, nrows=nrows)
    print(f"Loaded {df.shape} {file_path}, {list(df.columns) =}")
    return df
    

def save_results(output_file_path, metrics_file_path, df, metrics):
    df.to_csv(output_file_path, index=False)
    print(f"Results saved into {output_file_path}, rows: {df.shape[0]}")

    print(metrics)
    with open(metrics_file_path, "w", encoding='utf-8') as fp:
        json.dump(metrics, fp, check_circular=True)
    print(f"Metrics saved into {metrics_file_path}")
    

    
def evaluate_baseline(data_file_path, output_file_path, metrics_file_path, do_cleanup=True, nrows=None):
    # Load data
    used_columns = "text persons organizations locations".split()
    df = load_data(data_file_path, usecols=used_columns, nrows=nrows)
    print(f"Loaded {df.shape}")

    if do_cleanup:
        df = data_cleanup(df)
        print(f"Cleaned up to {df.shape}")

    df.fillna('', inplace=True)
    print("Extracting NE")
    df_pred = predict_entities(df)
    print(f'Extracted. Res df: {df.shape}, {df.columns}')
    # print(df)

    # Evaluate predictions against true labels
    out_df, overall_metrics = evaluate_binary(df)

    # Save the predictions to a CSV file (required format for submission)
    save_results(output_file_path, metrics_file_path, df=out_df, metrics=overall_metrics)
    print("Finish")
    return out_df, overall_metrics

### Evaluation on `conll2003`

In [25]:
from_source = 'external/hf'
dataset_name = "conll2003_transformed.all"
extractor_type = 'spacy'

data_file_path = f"../data/{from_source}/{dataset_name}.csv"
output_file_path = f"../data/{from_source}/{extractor_type}/with_scores.csv"
metrics_file_path = f"../data/{from_source}/{extractor_type}/scores.json"
nrows = None

out_df, overall_metrics = evaluate_baseline(data_file_path, output_file_path, metrics_file_path, do_cleanup=False, nrows=nrows)

Loaded (20744, 4) ../data/external/hf/conll2003_transformed.all.csv, list(df.columns) =['text', 'persons', 'locations', 'organizations']
Loaded (20744, 4)
Extracting NE
Extracted. Res df: (20744, 7), Index(['text', 'persons', 'locations', 'organizations', 'persons_pred',
       'organizations_pred', 'locations_pred'],
      dtype='object')
Results saved into ../data/external/hf/spacy/with_scores.csv, rows: 20744
{'persons': {'precision': 0.843, 'recall': 0.803, 'F1': 0.814, 'support': 24248}, 'organizations': {'precision': 0.651, 'recall': 0.635, 'F1': 0.638, 'support': 23191}, 'locations': {'precision': 0.848, 'recall': 0.825, 'F1': 0.83, 'support': 23498}}
Metrics saved into ../data/external/hf/spacy/scores.json
Finish


In [26]:
overall_metrics

{'persons': {'precision': 0.843,
  'recall': 0.803,
  'F1': 0.814,
  'support': 24248},
 'organizations': {'precision': 0.651,
  'recall': 0.635,
  'F1': 0.638,
  'support': 23191},
 'locations': {'precision': 0.848,
  'recall': 0.825,
  'F1': 0.83,
  'support': 23498}}