# Named Entity Recognition and Classification (NERC)

In [44]:
import csv
import os
from collections import Counter
from typing import List, Dict, Set, Tuple, Optional

import pandas as pd
import sklearn_crfsuite
import spacy
from datasets import load_dataset, Dataset
from sklearn_crfsuite.metrics import flat_classification_report

import utils

TODO: add the following setup commands to the README.

In [45]:
# One-time environment setup (kept commented out so "Run All" does not reinstall).
# To run from a notebook cell, uncomment the lines below; the leading `!` runs them as shell commands.
# !pip install spacy[transformers]
# !python -m spacy download en_core_web_sm
# !python -m spacy download en_core_web_trf

## Data Collection and Preprocessing

Link to dataset: https://huggingface.co/datasets/eriktks/conll2003

In [46]:
# Load the training split of the CoNLL-2003 dataset.
# NOTE(review): trust_remote_code=True presumably lets the dataset's own loading
# script run locally — confirm the dataset source is trusted.
ner_train_data_raw: Dataset = load_dataset(
    path="conll2003",
    split="train",
    trust_remote_code=True,
)
ner_train_data_raw

Dataset({
    features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
    num_rows: 14041
})

In [47]:
# Look up the label-name lists attached to the NER and POS tag columns.
dataset_features = ner_train_data_raw.features
ner_labels = dataset_features["ner_tags"].feature.names
pos_labels = dataset_features["pos_tags"].feature.names

Please adjust the following file paths to match your local setup.

In [48]:
# Location of the NER train/test TSV files.
# The data directory can be changed without editing the notebook by setting the
# NERC_DATA_DIR environment variable. When it is unset, the original
# machine-specific directory is used, so existing behaviour is unchanged.
NERC_DATA_DIR: str = os.environ.get(
    "NERC_DATA_DIR",
    r"C:\Users\jayde\OneDrive\School\Text Mining for AI\final_project_tm\data",
)

# Both files are derived from the single data directory above so they cannot drift apart.
train_data_ner_file: str = os.path.join(NERC_DATA_DIR, "train_data", "NER-train.tsv")
test_data_ner_file: str = os.path.join(NERC_DATA_DIR, "test_data", "NER-test.tsv")

By analysing the test set, we discovered that it was labelled using spaCy's NER.

Run the following in a new cell to see the results:

```python
test_bio_ner_tags = utils.gather_test_bio_ner_tags(file_name)
test_bio_ner_tags
```

Call the following to convert the data to a CSV file:

```python
utils.nerc_data_to_file(raw_data, file_name)
```

In [49]:
# Read the tab-separated training file into a DataFrame and preview its first rows.
df: pd.DataFrame = pd.read_csv(
    filepath_or_buffer=train_data_ner_file,
    sep="\t",
)
df.head()

Unnamed: 0,sentence_id,token_id,token,bio_ner_tag
0,0,0,EU,B-ORG
1,0,1,rejects,O
2,0,2,German,B-NORP
3,0,3,call,O
4,0,4,to,O


## Setting up the Model - CRF

Note to ourselves: we could still split the data into train and test sets. Alternatively, we could use the test data provided by the dataset and load it separately.

### Data setup

First, we need to ensure the data is organised properly for a model to use it as training data.

In [50]:
# Split the token table into per-sentence inputs (X) and their BIO tag sequences (y).
# NOTE(review): the exact return structure is defined in utils.gather_tokens_and_tags — confirm there.
X, y = utils.gather_tokens_and_tags(df=df)

In [51]:
# Convert every training sentence into its feature representation for the CRF.
# NOTE(review): in the displayed output the '+1:pos' feature holds a word-shape value
# (e.g. 'xxxxxxx') instead of a POS tag, and no '+1:word.shape' key appears —
# this looks like a mix-up inside utils.sentence_to_features; verify.
X_features = [utils.sentence_to_features(sentence=sent) for sent in X]

The following two code cells show the features of the first training sentence in `X_features` and the number of occurrences of each BIO NER tag in the training data, respectively.

In [52]:
# Display the features of the first training sentence as the cell output.
f"X_features[0]: {X_features[0]}"

"X_features[0]: [{'bias': 1.0, 'word.lower()': 'eu', 'word[-3:]': 'EU', 'word[-2:]': 'EU', 'word.isupper()': True, 'word.istitle()': False, 'word.isdigit()': False, 'pos': 'NNP', 'word.shape': 'XX', 'BOS': True, '+1:word.lower()': 'rejects', '+1:word.istitle()': False, '+1:word.isupper()': False, '+1:pos': 'xxxxxxx'}, {'bias': 1.0, 'word.lower()': 'rejects', 'word[-3:]': 'cts', 'word[-2:]': 'ts', 'word.isupper()': False, 'word.istitle()': False, 'word.isdigit()': False, 'pos': 'VBZ', 'word.shape': 'xxxxxxx', '-1:word.lower()': 'eu', '-1:word.istitle()': False, '-1:word.isupper()': True, '-1:pos': 'NNP', '-1:word.shape': 'XX', '+1:word.lower()': 'german', '+1:word.istitle()': True, '+1:word.isupper()': False, '+1:pos': 'Xxxxxx'}, {'bias': 1.0, 'word.lower()': 'german', 'word[-3:]': 'man', 'word[-2:]': 'an', 'word.isupper()': False, 'word.istitle()': True, 'word.isdigit()': False, 'pos': 'JJ', 'word.shape': 'Xxxxxx', '-1:word.lower()': 'rejects', '-1:word.istitle()': False, '-1:word.isup

In [61]:
# Tally how often each BIO tag occurs across all training sentences.
train_tag_counts = Counter(
    bio_tag
    for sentence_tags in y
    for bio_tag in sentence_tags
)
f"Train tag counts: {train_tag_counts}"

"Train tag counts: Counter({'O': 156806, 'I-DATE': 8082, 'B-CARDINAL': 7663, 'B-GPE': 6308, 'B-PERSON': 5780, 'B-DATE': 5682, 'I-PERSON': 5228, 'B-ORG': 4699, 'I-ORG': 4377, 'B-NORP': 2064, 'I-CARDINAL': 1513, 'I-GPE': 1058, 'B-ORDINAL': 964, 'I-MONEY': 735, 'B-MONEY': 629, 'I-TIME': 534, 'I-QUANTITY': 530, 'I-PERCENT': 426, 'B-TIME': 416, 'B-QUANTITY': 384, 'B-PERCENT': 331, 'I-EVENT': 271, 'B-LOC': 201, 'B-EVENT': 195, 'I-FAC': 181, 'I-WORK_OF_ART': 177, 'B-PRODUCT': 155, 'I-LOC': 149, 'B-FAC': 126, 'I-NORP': 122, 'I-LAW': 104, 'B-WORK_OF_ART': 83, 'I-PRODUCT': 81, 'B-LAW': 42, 'B-LANGUAGE': 26})"

This is what a complete sentence as a pair of X_features and y looks like:

In [None]:
# Print the first sentence's features followed by its tag sequence, one per line.
for _label, _value in (("X_features[0]:", X_features[0]), ("y[0]:", y[0])):
    print(_label, _value)

X_features[0]: [{'bias': 1.0, 'word.lower()': 'eu', 'word[-3:]': 'EU', 'word[-2:]': 'EU', 'word.isupper()': True, 'word.istitle()': False, 'word.isdigit()': False, 'pos': 'NNP', 'word.shape': 'XX', 'BOS': True, '+1:word.lower()': 'rejects', '+1:word.istitle()': False, '+1:word.isupper()': False, '+1:pos': 'xxxxxxx'}, {'bias': 1.0, 'word.lower()': 'rejects', 'word[-3:]': 'cts', 'word[-2:]': 'ts', 'word.isupper()': False, 'word.istitle()': False, 'word.isdigit()': False, 'pos': 'VBZ', 'word.shape': 'xxxxxxx', '-1:word.lower()': 'eu', '-1:word.istitle()': False, '-1:word.isupper()': True, '-1:pos': 'NNP', '-1:word.shape': 'XX', '+1:word.lower()': 'german', '+1:word.istitle()': True, '+1:word.isupper()': False, '+1:pos': 'Xxxxxx'}, {'bias': 1.0, 'word.lower()': 'german', 'word[-3:]': 'man', 'word[-2:]': 'an', 'word.isupper()': False, 'word.istitle()': True, 'word.isdigit()': False, 'pos': 'JJ', 'word.shape': 'Xxxxxx', '-1:word.lower()': 'rejects', '-1:word.istitle()': False, '-1:word.isupp

### Training of the CRF

Next, we set up the model and train it on the data using `.fit()`. The parameters `c1` and `c2` control L1 and L2 regularisation, respectively.

L1 Regularisation - encourages sparsity in the model by setting some feature weights to 0. This helps with feature selection.

L2 Regularisation - encourages smaller weights overall, but not 0. This helps smooth out the model and reduces the effect of any single feature.

As seen earlier in the Counter object, the data is not well balanced. Therefore, we set these parameters relatively high, so that features are penalised more strongly if they aren't helpful for the task or if they result in large model weights.

In [62]:
# CRF hyperparameters: L-BFGS training with L1 (c1) and L2 (c2) regularisation.
crf_params = {
    "algorithm": "lbfgs",
    "c1": 0.6,
    "c2": 0.6,
    "max_iterations": 100,
    "all_possible_transitions": True,
}
crf = sklearn_crfsuite.CRF(**crf_params)

# Train on the per-sentence features and their BIO tag sequences.
crf.fit(X_features, y)

## Evaluation of the CRF

First, we need to gather the test data properly.

In [63]:
# Read the tab-separated test file into a DataFrame and preview its first rows.
df_test: pd.DataFrame = pd.read_csv(
    filepath_or_buffer=test_data_ner_file,
    sep="\t",
)
df_test.head()


Unnamed: 0,sentence_id,token_id,token,bio_ner_tag
0,0,0,If,O
1,0,1,you're,O
2,0,2,visiting,O
3,0,3,Paris,B-LOCATION
4,0,4,",",O


In [64]:
# Apply the same grouping and feature extraction to the test data as was used for training.
X_test, y_test = utils.gather_tokens_and_tags(df=df_test)
X_test_features = [
    utils.sentence_to_features(sentence=test_sentence)
    for test_sentence in X_test
]

In [65]:
# Predict a tag sequence for every test sentence and build a per-label classification report.
# NOTE(review): the test tags include B-LOCATION / I-LOCATION, which never occur in the
# training tag counts (those contain LOC and GPE instead). The model therefore cannot
# predict them and these rows score 0 — consider aligning the label sets before evaluating.
y_pred = crf.predict(X_test_features)
crf_report = flat_classification_report(y_test, y_pred, zero_division=0)

In [67]:
# Use print() so the report's line breaks render as a table rather than as a raw string repr.
print(crf_report)

               precision    recall  f1-score   support

   B-CARDINAL       0.00      0.00      0.00         0
       B-DATE       0.00      0.00      0.00         0
      B-EVENT       0.00      0.00      0.00         0
        B-GPE       0.00      0.00      0.00         0
   B-LOCATION       0.00      0.00      0.00         3
        B-ORG       0.33      0.25      0.29         8
     B-PERSON       0.64      0.75      0.69        12
B-WORK_OF_ART       0.00      0.00      0.00         6
       I-DATE       0.00      0.00      0.00         0
      I-EVENT       0.00      0.00      0.00         0
        I-GPE       0.00      0.00      0.00         0
   I-LOCATION       0.00      0.00      0.00         2
        I-ORG       0.57      0.80      0.67         5
     I-PERSON       0.80      0.62      0.70        13
I-WORK_OF_ART       0.00      0.00      0.00         8
            O       0.91      0.94      0.93       159

     accuracy                           0.80       216
    macr