# Baseline with NLTK package

It is a baseline solution using [`NLTK` package](https://www.nltk.org/) for `Named Entity Recognition` (`NER`). 
This code will load the dataset, perform NER using NLTK, and evaluate it against the provided labels for persons, organizations, and locations.

The `baseline_hf` notebook is the first notebook for modeling. It holds all detailed comments about data preparation and modeling.

## Setting up

Install all necessary packages and initialize the library names.

In [None]:
!pip install nltk

In [6]:
from collections import defaultdict
from pathlib import Path

import nltk
from nltk import word_tokenize, pos_tag, ne_chunk

import pandas as pd

In [3]:
# Download required resources from NLTK
nltk.download('punkt')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package punkt to /home/leo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/leo/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /home/leo/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/leo/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

## Data loading

In [8]:
# Concatenate all df-s
def load_data(file_path, usecols=None, nrows=None):
    df = pd.read_csv(file_path, usecols=usecols, nrows=nrows)
    print(f"Loaded {df.shape} {file_path}, {list(df.columns) =}")
    return df
    
def load_all_data(data_path, used_columns, nrows=None):
    suffixes = "train test validation".split()
    
    all_df = pd.concat([load_data(
        data_path / f"conll2003_transformed.{suffix}.csv",
        usecols=used_columns, 
        nrows=nrows
    ) for suffix in suffixes])
    print(f"Summary df: {all_df.shape}, {list(all_df.columns) =}")
    return all_df


DATA_PATH = Path('../data/external/hf')
used_columns = "text persons organizations locations".split()
nrows = None
df = load_all_data(DATA_PATH, used_columns=used_columns, nrows=nrows)

Loaded (14041, 4) ../data/external/hf/conll2003_transformed.train.csv, list(df.columns) =['text', 'persons', 'locations', 'organizations']
Loaded (3453, 4) ../data/external/hf/conll2003_transformed.test.csv, list(df.columns) =['text', 'persons', 'locations', 'organizations']
Loaded (3250, 4) ../data/external/hf/conll2003_transformed.validation.csv, list(df.columns) =['text', 'persons', 'locations', 'organizations']
Summary df: (20744, 4), list(all_df.columns) =['text', 'persons', 'locations', 'organizations']


In [9]:
df.head(1)

Unnamed: 0,text,persons,locations,organizations
0,EU rejects German call to boycott British lamb .,,,EU


## NE extraction

In [21]:
def extract_ne(text):
    # Tokenize the input text
    tokens = word_tokenize(text)

    # Get part of speech tags for the tokens
    pos_tags = pos_tag(tokens)

    # Perform Named Entity Chunking (NE chunking)
    chunks = ne_chunk(pos_tags, binary=False)

    # Extract only Persons, Organizations, Locations
    named_entities = defaultdict(list)
    ne_code2name = {'PERSON': 'persons', 'ORGANIZATION': 'organizations', 'GPE': 'locations'}  # GPE = Geopolitical Entity (Locations)
    for chunk in chunks:
        if hasattr(chunk, 'label'):
            entity_name = ' '.join(c[0] for c in chunk)
            entity_type = chunk.label()
            if entity_type in ne_code2name:
                named_entities[ne_code2name[entity_type]].append(entity_name.strip())

    return dict(named_entities)

# Example usage
text = "Apple is looking at buying U.K. startup for $1 billion. Elon Musk is the CEO of SpaceX."

named_entities = extract_ne(text)
named_entities


{'locations': ['Apple'],
 'persons': ['Elon Musk'],
 'organizations': ['CEO of SpaceX']}

In [22]:
def add_nes(ness, nes):
    for k, v_ness in ness.items():
        ness[k].append(";".join(nes[k]) if k in nes else "")
    return ness
    
def extract_ne_batch(texts):
    ness = {'locations': [],
             'persons': [],
             'organizations': [],
           }
    for text in texts:
        nes = extract_ne(text)
        ness = add_nes(ness, nes)
    return ness

In [23]:
texts = ["Apple or General Electric is looking at buying U.K. startup for $1 billion. Elon Musk is the CEO of SpaceX.",
        "Tesla plant is build in Germany."]
extract_ne_batch(texts)

{'locations': ['', 'Tesla;Germany'],
 'persons': ['Elon Musk', ''],
 'organizations': ['General Electric;CEO of SpaceX', '']}

In [24]:
def predict_entities(df):
    nes = extract_ne_batch(df["text"])
    df["persons_pred"] = nes['persons']
    df["organizations_pred"] = nes['organizations']
    df["locations_pred"] = nes['locations']
    return df

In [26]:
df_pred = predict_entities(df)
print(df_pred.shape)
df_pred.head(1)

(20744, 7)


Unnamed: 0,text,persons,locations,organizations,persons_pred,organizations_pred,locations_pred
0,EU rejects German call to boycott British lamb .,,,EU,,,EU;German;British


## Evaluation

In [17]:
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score

def evaluate_binary(df):
    def calculate_metrics(row):
        true_set = set(row[column].split(';'))
        pred_set = set(row[column+'_pred'].split(';'))
        
        tp = len(true_set & pred_set)  # True Positives
        fp = len(pred_set - true_set)  # False Positives
        fn = len(true_set - pred_set)  # False Negatives
        support = len(true_set)        # Support: the number of true values (in 'persons')        
        return pd.Series([tp, fp, fn, support])
    
    overall_metrics = {}    
    for column in 'persons organizations locations'.split():
        # Apply the function to each row
        df[['TP_'+column, 'FP_'+column, 'FN_'+column, 'support_'+column]] = df.apply(calculate_metrics, axis=1)
        
        # Calculate precision, recall, F1 for each row
        df['precision_'+column] = df['TP_'+column] / (df['TP_'+column] + df['FP_'+column])
        df['recall_'+column] = df['TP_'+column] / (df['TP_'+column] + df['FN_'+column])
        df['F1_'+column] = 2 * (df['precision_'+column] * df['recall_'+column]) / (df['precision_'+column] + df['recall_'+column])
        
        # Fill NaN values (where precision/recall is undefined) with 0
        df.fillna(0, inplace=True)
        
        # Calculate overall precision, recall, F1-score, and support
        overall_metrics[column] ={
            "precision": round(sum(df['precision_'+column] * df['support_'+column]) /sum(df['support_'+column]), 3),
            "recall": round(sum(df['recall_'+column] * df['support_'+column]) /sum(df['support_'+column]), 3),
            "F1": round(sum(df['F1_'+column] * df['support_'+column]) /sum(df['support_'+column]), 3),
            "support": int(df['support_'+column].sum())
        }
    return df, overall_metrics   


In [18]:
# Example DataFrame
data = {
    'persons': ["Andrew Clyde;Kelly;Louie Gohmert", "Lloyd Smucker;Trump", "Timothy Kelly;Trump"],
    'organizations': ["Andrew Clyde;Kelly;Louie Gohmert", "Lloyd Smucker;Trump", "Timothy Kelly;Trump"],
    'locations': ["Florida;Orlando", "San Francisco;California;Napa County", "Mexican"],
    'persons_pred': ["Andrew Clyde;Louie Gohmert;something", "Trump;Lloyd Smucker", "Timothy Kelly;some"],
    'organizations_pred': ["Andrew Clyde;Louie Gohmert;something", "Trump;Lloyd Smucker", "Timothy Kelly;some"],
    'locations_pred': ["Florida;Orlando;something", "San Francisco;California;Napa County;some","Mexican;New York"],
}
df_test = pd.DataFrame(data)

df_test, overall_metrics = evaluate_binary(df_test)

In [38]:
df_test

Unnamed: 0,persons,organizations,locations,persons_pred,organizations_pred,locations_pred,TP_persons,FP_persons,FN_persons,support_persons,...,precision_organizations,recall_organizations,F1_organizations,TP_locations,FP_locations,FN_locations,support_locations,precision_locations,recall_locations,F1_locations
0,Andrew Clyde;Kelly;Louie Gohmert,Andrew Clyde;Kelly;Louie Gohmert,Florida;Orlando,Andrew Clyde;Louie Gohmert;something,Andrew Clyde;Louie Gohmert;something,Florida;Orlando;something,2,1,1,3,...,0.666667,0.666667,0.666667,2,1,0,2,0.666667,1.0,0.8
1,Lloyd Smucker;Trump,Lloyd Smucker;Trump,San Francisco;California;Napa County,Trump;Lloyd Smucker,Trump;Lloyd Smucker,San Francisco;California;Napa County;some,2,0,0,2,...,1.0,1.0,1.0,3,1,0,3,0.75,1.0,0.857143
2,Timothy Kelly;Trump,Timothy Kelly;Trump,Mexican,Timothy Kelly;some,Timothy Kelly;some,Mexican;New York,1,1,1,2,...,0.5,0.5,0.5,1,1,0,1,0.5,1.0,0.666667


In [19]:
overall_metrics

{'persons': {'precision': 0.714, 'recall': 0.714, 'F1': 0.714, 'support': 7},
 'organizations': {'precision': 0.714,
  'recall': 0.714,
  'F1': 0.714,
  'support': 7},
 'locations': {'precision': 0.681, 'recall': 1.0, 'F1': 0.806, 'support': 6}}

## Whole pipeline

### Code

In [27]:
import json

def load_data(file_path, usecols=None, nrows=None):
    df = pd.read_csv(file_path, usecols=usecols, nrows=nrows)
    print(f"Loaded {df.shape} {file_path}, {list(df.columns) =}")
    return df
    
def save_results(output_file_path, metrics_file_path, df, metrics):
    df.to_csv(output_file_path, index=False)
    print(f"Results saved into {output_file_path}, rows: {df.shape[0]}")

    print(metrics)
    with open(metrics_file_path, "w", encoding='utf-8') as fp:
        json.dump(metrics, fp, check_circular=True)
    print(f"Metrics saved into {metrics_file_path}")
    

    
def evaluate_baseline(data_file_path, output_file_path, metrics_file_path, do_cleanup=True, nrows=None):
    # Load data
    used_columns = "text persons organizations locations".split()
    df = load_data(data_file_path, usecols=used_columns, nrows=nrows)
    print(f"Loaded {df.shape}")

    if do_cleanup:
        df = data_cleanup(df)
        print(f"Cleaned up to {df.shape}")

    df.fillna('', inplace=True)
    print("Extracting NE")
    df_pred = predict_entities(df)
    print(f'Extracted. Res df: {df.shape}, {df.columns}')
    # print(df)

    # Evaluate predictions against true labels
    out_df, overall_metrics = evaluate_binary(df)

    # Save the predictions to a CSV file (required format for submission)
    save_results(output_file_path, metrics_file_path, df=out_df, metrics=overall_metrics)
    print("Finish")
    return out_df, overall_metrics

### Evaluation on `conll2003`

In [29]:
from_source = 'external/hf'
dataset_name = "conll2003_transformed.all"
extractor_type = 'nltk'

data_file_path = f"../data/{from_source}/{dataset_name}.csv"
output_file_path = f"../data/{from_source}/{extractor_type}/with_scores.csv"
metrics_file_path = f"../data/{from_source}/{extractor_type}/scores.json"
nrows = None

out_df, overall_metrics = evaluate_baseline(data_file_path, output_file_path, metrics_file_path, do_cleanup=False, nrows=nrows)

Loaded (20744, 4) ../data/external/hf/conll2003_transformed.all.csv, list(df.columns) =['text', 'persons', 'locations', 'organizations']
Loaded (20744, 4)
Extracting NE
Extracted. Res df: (20744, 7), Index(['text', 'persons', 'locations', 'organizations', 'persons_pred',
       'organizations_pred', 'locations_pred'],
      dtype='object')
Results saved into ../data/external/hf/nltk/with_scores.csv, rows: 20744
{'persons': {'precision': 0.791, 'recall': 0.779, 'F1': 0.778, 'support': 24248}, 'organizations': {'precision': 0.606, 'recall': 0.597, 'F1': 0.597, 'support': 23191}, 'locations': {'precision': 0.703, 'recall': 0.703, 'F1': 0.694, 'support': 23498}}
Metrics saved into ../data/external/hf/nltk/with_scores.csv
Finish


In [30]:
overall_metrics

{'persons': {'precision': 0.791,
  'recall': 0.779,
  'F1': 0.778,
  'support': 24248},
 'organizations': {'precision': 0.606,
  'recall': 0.597,
  'F1': 0.597,
  'support': 23191},
 'locations': {'precision': 0.703,
  'recall': 0.703,
  'F1': 0.694,
  'support': 23498}}