In [3]:
from copy import deepcopy
import pandas as pd
import spacy

## SpaCy baseline

In [1]:
target_entities = {"PERSON": "PER", "ORG": "ORG", "GPE": "LOC", "LOC": "LOC"}

In [2]:
def process_tag(token, target_entities):
    """
    Process a single token and return its IOB-tag
    
    Args:
        token (spacy.tokens.Token): spaCy token to process
        target_entities (dict): Dictionary mapping spaCy entity types to custom types
        
    Returns:
        str: IOB tag
    """
    # Get token index and text
    i = token.i
    token_text = token.text
    
    # Check if token is part of an entity
    if token.ent_type_ in target_entities:
        # Map entity type to custom type
        custom_type = target_entities[token.ent_type_]
        
        # Determine IOB tag
        if token.ent_iob_ == "B":
            tag = f"B-{custom_type}"
        elif token.ent_iob_ == "I":
            tag = f"I-{custom_type}"
        else:
            tag = "O"
    else:
        tag = "O"
    
    return tag

In [4]:
from difflib import SequenceMatcher
def align_tokens(original_tokens, predicted_tokens):
    """
    Align predicted tokens to original tokens using sequence matching.
    Returns a list of predicted labels aligned to original tokens.
    """
    aligned_labels = []
    i = 0  # index in predicted_tokens
    buffer = ''
    
    for orig in original_tokens:
        reconstructed = ''
        labels = []
        
        while i < len(predicted_tokens) and len(reconstructed.replace(" ", "")) < len(orig.replace(" ", "")):
            token = predicted_tokens[i]
            reconstructed += token.text
            # labels.append(token.ent_type_ if token.ent_type_ else 'O')
            labels.append(process_tag(token, target_entities))
            i += 1

        # Heuristic: use first entity tag if multiple (or 'O')
        aligned_labels.append(labels[0] if labels else 'O')

    return aligned_labels

def apply_spacy_model_to_test(model_path, tsv_path):
    """
    Apply a spaCy model to TSV tokenized test data (no gold labels), align predictions.

    Args:
        model_path (str): spaCy model path
        tsv_path (str):CSV/TSV file path
        
    Returns:
        pd.DataFrame: DataFrame with tokens and predicted labels
    """
    nlp = spacy.load(model_path)
    df = pd.read_csv(tsv_path, sep='\t')

    results = []
    current_tokens = []
    current_ids = []

    for _, row in df.iterrows():
        token = row['token'] if pd.notna(row['token']) else None
        tid = row['ID'] if pd.notna(row['ID']) else None

        if token:
            current_tokens.append(token)
            current_ids.append(tid)
        else:
            if current_tokens:
                doc = nlp(" ".join(current_tokens))
                aligned_labels = align_tokens(current_tokens, list(doc))

                for idx, tok, label in zip(current_ids, current_tokens, aligned_labels):
                    results.append({
                        'ID': idx,
                        'token': tok,
                        'tag': label
                    })

                current_tokens = []
                current_ids = []

    if current_tokens:
        doc = nlp(" ".join(current_tokens))
        aligned_labels = align_tokens(current_tokens, list(doc))

        for idx, tok, label in zip(current_ids, current_tokens, aligned_labels):
            results.append({
                'ID': idx,
                'token': tok,
                'tag': label
            })

    return pd.DataFrame(results)

# Example usage
model_path = "en_core_web_sm"
tsv_path = "../data/test.csv"
predicted_df = apply_spacy_model_to_test(model_path, tsv_path)

# Save or view results
print(predicted_df.head())

   ID token tag
0   0    So   O
1   1   had   O
2   2   his   O
3   3   way   O
4   4    of   O


In [5]:
predicted_df[['ID', 'tag']].to_csv('../data/predictions_spacy_baseline.csv', sep=',', index=0, header=['ID', 'tag'])

In [10]:
def apply_spacy_model_to_train(model_path, tsv_path):
    """
    Apply a spaCy model to TSV tokenized test data (no gold labels), align predictions.

    Args:
        model_path (str): spaCy model path
        tsv_path (str):CSV/TSV file path
        
    Returns:
        pd.DataFrame: DataFrame with tokens and predicted labels
    """
    nlp = spacy.load(model_path)
    df = pd.read_csv(tsv_path, sep='\t')

    results = []
    current_tokens = []
    current_ids = []
    # current_gold = []

    for _, row in df.iterrows():
        token = row['token'] if pd.notna(row['token']) else None
        tid = row['ID'] if pd.notna(row['ID']) else None
        # gold = row['tag'] if pd.notna(row['tag']) else None

        if token:
            current_tokens.append(token)
            current_ids.append(tid)
            # current_gold.append(gold)
        else:
            if current_tokens:
                doc = nlp(" ".join(current_tokens))
                aligned_labels = align_tokens(current_tokens, list(doc))

                for idx, tok, label,  in zip(current_ids, current_tokens, aligned_labels):
                    results.append({
                        'ID': idx,
                        'token': tok,
                        # 'gold': gtag,
                        'tag': label
                    })

                current_tokens = []
                current_ids = []
                # # current_gold = []

    if current_tokens:
        doc = nlp(" ".join(current_tokens))
        aligned_labels = align_tokens(current_tokens, list(doc))

        for idx, tok, label, in zip(current_ids, current_tokens, aligned_labels, ):
            results.append({
                'ID': idx,
                'token': tok,
                # 'gold': gtag,
                'tag': label
                })

    return pd.DataFrame(results)

# Use on Training data
model_path = "en_core_web_sm"
tsv_path = "../data/train.csv"
predicted_df_train = apply_spacy_model_to_train(model_path, tsv_path)

# print(predicted_df_train.head())

In [12]:
orig_df = pd.read_csv(tsv_path, sep='\t')

In [13]:
orig_df

Unnamed: 0,ID,token,tag
0,0,Book,O
1,1,I,O
2,2,I.,O
3,3,On,O
4,4,a,O
...,...,...,...
179058,179058,North,B-LOC
179059,179059,',I-LOC
179060,179060,ard,I-LOC
179061,179061,"""",O


In [15]:
predicted_df_train

Unnamed: 0,ID,token,tag
0,0,Book,O
1,1,I,O
2,2,I.,O
3,3,On,O
4,4,a,O
...,...,...,...
179055,179058,North,O
179056,179059,',O
179057,179060,ard,O
179058,179061,"""",O


In [14]:
predicted_df_train.tag.value_counts()

tag
O        174661
B-PER      1898
B-LOC       682
B-ORG       651
I-PER       591
I-ORG       390
I-LOC       187
Name: count, dtype: int64

In [17]:
orig_df.tag.value_counts()

tag
O        158321
I-PER      9924
B-PER      6875
I-LOC      2019
B-LOC      1613
I-ORG       213
B-ORG        98
Name: count, dtype: int64

In [18]:
orig_df['gold'] = orig_df['tag']

In [None]:
merged_df = pd.merge(predicted_df_train, orig_df[['ID', 'gold']], on='ID', how='left')


#### Error Analysis: spaCy

In [41]:
%pip install "setuptools-scm<8.0"
%pip install seqeval


Collecting setuptools-scm<8.0
  Downloading setuptools_scm-7.1.0-py3-none-any.whl.metadata (24 kB)
Collecting tomli>=1.0.0 (from setuptools-scm<8.0)
  Using cached tomli-2.2.1-py3-none-any.whl.metadata (10 kB)
Downloading setuptools_scm-7.1.0-py3-none-any.whl (43 kB)
Using cached tomli-2.2.1-py3-none-any.whl (14 kB)
Installing collected packages: tomli, setuptools-scm
Successfully installed setuptools-scm-7.1.0 tomli-2.2.1
Note: you may need to restart the kernel to use updated packages.
Collecting seqeval
  Using cached seqeval-1.2.2.tar.gz (43 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25ldone
[?25h  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16178 sha256=07380445f881bb16e8d394d9daa300efd0d9deaa55bdaa09daba984fddcb5868
  Stored in directory: /home/tacit/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqe

In [46]:
from seqeval.metrics import classification_report

print(classification_report([merged_df['gold'].to_list()], [merged_df['tag'].to_list()]))

              precision    recall  f1-score   support

         LOC       0.52      0.22      0.31      1613
         ORG       0.04      0.24      0.06        98
         PER       0.52      0.14      0.22      6875

   micro avg       0.42      0.16      0.23      8586
   macro avg       0.36      0.20      0.20      8586
weighted avg       0.51      0.16      0.24      8586



In [22]:
errors = merged_df[merged_df['tag'] != merged_df['gold']]
errors

Unnamed: 0,ID,token,tag,gold
17,17,Faust,B-LOC,O
19,19,the,B-ORG,O
20,20,Academy,I-ORG,O
21,21,of,I-ORG,O
22,22,Music,I-ORG,O
...,...,...,...,...
179030,179033,to,O,I-PER
179031,179034,settle,O,I-PER
179055,179058,North,O,B-LOC
179056,179059,',O,I-LOC


In [23]:
errors['mismatch'] = errors['tag'] + '|' + errors['gold']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors['mismatch'] = errors['tag'] + '|' + errors['gold']


In [25]:
errors.mismatch.value_counts()

mismatch
O|I-PER        9385
O|B-PER        5333
O|I-LOC        1778
O|B-LOC        1126
B-PER|O         591
I-PER|O         283
B-ORG|B-PER     251
I-ORG|O         216
B-ORG|O         215
O|I-ORG         135
B-LOC|B-PER     125
B-LOC|O         111
B-PER|I-PER     110
B-ORG|B-LOC      68
O|B-ORG          67
I-LOC|O          65
B-ORG|I-PER      62
B-LOC|I-LOC      56
B-PER|B-LOC      47
I-ORG|I-PER      45
I-ORG|I-LOC      42
B-LOC|I-PER      30
B-ORG|I-LOC      21
I-PER|B-PER      15
B-ORG|I-ORG       9
I-ORG|B-PER       9
I-LOC|I-PER       8
B-PER|I-LOC       8
I-ORG|B-LOC       7
I-PER|I-LOC       4
I-PER|B-LOC       4
I-ORG|B-ORG       4
I-LOC|B-LOC       2
I-PER|B-ORG       1
I-LOC|B-PER       1
B-PER|I-ORG       1
B-LOC|B-ORG       1
I-LOC|I-ORG       1
Name: count, dtype: int64

In [29]:
errors[errors['mismatch'] == 'O|I-PER'].token.value_counts().head(30)

token
,           317
of          308
man         261
the         185
and         161
mother      147
who         132
father      125
a           119
young       108
boy          94
woman        88
men          82
with         79
in           77
old          76
to           76
one          68
people       68
child        67
little       63
wife         59
girl         53
lady         53
family       46
children     42
his          41
dear         41
doctor       40
sister       38
Name: count, dtype: int64

In [33]:
errors[(errors['mismatch'] == 'B-LOC|O')| (errors['mismatch'] == 'I-LOC|O')].token.value_counts().head(30)

token
veranda          12
Aubert           11
the               6
St.               5
's                5
Paul              4
Victoria          4
thou              4
P.S.              4
Gables            4
Green             4
Puritan           3
Washington        2
Barricade         2
Duval             2
North             2
Street            2
Gate              2
West              2
Jewry             2
Grange            2
Old               2
gardenia          2
Elizabethan       2
Thrushcross       2
Battery           2
Lake              2
Dinah             2
Esq.              2
veranda-study     1
Name: count, dtype: int64

 Baseline spaCy model missed PER NE pecultiar to LitBank: generational terms and coreference mentions(pronouns, common nouns such as 'man')