In [None]:
!pip install spacy
!pip install matplotlib
!pip install scikit-learn

In [None]:
import json
import random
from pathlib import Path

import matplotlib.pyplot as plt
import spacy
from sklearn.metrics import classification_report, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from spacy.cli.train import train
from spacy.tokens import DocBin
from spacy.util import minibatch, compounding


In [None]:
# Read the annotated corpus (a JSON document) into memory as TRAIN_DATA.
# NOTE(review): the path is absolute and machine-specific; the notebook only
# runs unchanged where this exact file exists.
training_file = Path('C:\\Users\\josed\\codes\\spacy_NER_trainig\\maped_data.json')
TRAIN_DATA = json.loads(training_file.read_text(encoding='utf-8'))


In [None]:
# Hold out 20% of the records as a development set; the fixed random_state
# makes the shuffle (and therefore the split) repeatable across runs.
split_parts = train_test_split(TRAIN_DATA, random_state=42, test_size=0.2)
train_data, dev_data = split_parts


In [None]:
# Save the split data in spaCy's binary DocBin format.
# The training config points [paths] train/dev at these files and spaCy's
# corpus reader deserializes them as DocBin data, so a JSON dump stored under
# a ".spacy" name cannot be read by `spacy train`.
train_data_path = Path('path/to/output/train_data.spacy')
dev_data_path = Path('path/to/output/dev_data.spacy')


def _write_docbin(records, destination, tokenizer_nlp):
    """Serialize annotated records to a DocBin file on disk.

    Args:
        records: iterable of ``(text, annotations)`` pairs, where
            ``annotations['entities']`` is a list of
            ``(start_char, end_char, label)`` triples (the same layout the
            evaluation cell of this notebook unpacks).
        destination: ``Path`` of the ``.spacy`` file to create; missing parent
            directories are created.
        tokenizer_nlp: spaCy pipeline used only to tokenize each text.

    Returns:
        int: number of entity annotations that were dropped, either because
        their character offsets do not line up with token boundaries or
        because they overlap another (preferred) span.
    """
    doc_bin = DocBin()
    skipped = 0
    for text, annotations in records:
        doc = tokenizer_nlp.make_doc(text)
        spans = []
        for start, end, label in annotations['entities']:
            # char_span returns None when the offsets cut through a token.
            span = doc.char_span(start, end, label=label)
            if span is None:
                skipped += 1
            else:
                spans.append(span)
        # doc.ents rejects overlapping spans, so resolve overlaps first.
        kept = spacy.util.filter_spans(spans)
        skipped += len(spans) - len(kept)
        doc.ents = kept
        doc_bin.add(doc)
    destination.parent.mkdir(parents=True, exist_ok=True)
    doc_bin.to_disk(destination)
    return skipped


# A blank Portuguese pipeline is enough here: only its tokenizer is needed.
_tokenizer_nlp = spacy.blank('pt')
for _records, _destination in ((train_data, train_data_path), (dev_data, dev_data_path)):
    _skipped = _write_docbin(_records, _destination, _tokenizer_nlp)
    print(f"Wrote {len(_records)} docs to {_destination} ({_skipped} entity spans skipped)")


In [None]:
# Prefer the installed "pt_core_news_lg" pipeline; if spaCy cannot load it
# (OSError), continue with an empty Portuguese pipeline and tell the user.
base_model_name = "pt_core_news_lg"
try:
    nlp = spacy.load(base_model_name)
except OSError:
    nlp = spacy.blank('pt')
    print("Loading blank Portuguese model. Make sure to have a pre-trained model for better performance.")


In [None]:
# Locations used by the training steps below:
#   cfg_path        - partial (base) config, relative to the working directory
#   filled_cfg_path - complete config produced from the base config
#   output_dir      - directory that receives the trained pipeline
cfg_path, filled_cfg_path = (Path(name) for name in ('base_config.cfg', 'config.cfg'))
output_dir = Path('C:\\Users\\josed\\codes\\spacy_NER_trainig\\model')


In [None]:
# Fill the configuration with default settings:
# `spacy init fill-config` reads the partial config at `cfg_path` and writes a
# complete config to `filled_cfg_path`; both names are interpolated into the
# shell command through IPython's {var} syntax.
# NOTE(review): `python` is resolved via the shell's PATH — presumably the same
# environment as the kernel; confirm.
!python -m spacy init fill-config {cfg_path} {filled_cfg_path}


In [None]:
# Point the filled configuration at the train and dev corpora.
# The file is read and written explicitly as UTF-8 so the result does not
# depend on the platform's default encoding (e.g. cp1252 on Windows).
config = filled_cfg_path.read_text(encoding='utf-8')

for _key, _corpus_path in (("train", train_data_path), ("dev", dev_data_path)):
    # json.dumps yields a quoted, correctly escaped string literal, so paths
    # containing backslashes or spaces survive config parsing unchanged.
    config = config.replace(f"{_key} = null", f"{_key} = {json.dumps(str(_corpus_path))}")

filled_cfg_path.write_text(config, encoding='utf-8')


In [None]:
# Run spaCy training from the filled config; results are written under output_dir.
config_file, model_dir = str(filled_cfg_path), str(output_dir)
train(config_file, output_path=model_dir)


In [None]:
# Training losses are not plotted here: the training call above does not
# return them to the notebook, so the training log output is the place to
# monitor them.

# Load the trained pipeline.
# `spacy train` stores its results in "model-best" and "model-last"
# sub-directories of the output path, so the output directory itself is not a
# loadable pipeline; use the best-scoring checkpoint.
nlp = spacy.load(output_dir / "model-best")


In [None]:
# Evaluate the model on a small sample of held-out examples.
#
# Gold and predicted entities are aligned by their (start_char, end_char) span
# so that `true_labels[i]` and `predictions[i]` always describe the same span.
# A span present on only one side is paired with NO_ENTITY on the other side
# (missed entity -> predicted NO_ENTITY, spurious entity -> true NO_ENTITY),
# which keeps both lists the same length as the metric functions require.
NO_ENTITY = "NONE"

# Never request more examples than exist; a local seeded RNG keeps the sample
# reproducible without touching the global `random` state.
sample_size = min(10, len(dev_data))
examples = random.Random(42).sample(dev_data, sample_size)
predictions, true_labels = [], []

for text, annotations in examples:
    doc = nlp(text)
    predicted_by_span = {(ent.start_char, ent.end_char): ent.label_ for ent in doc.ents}
    true_by_span = {(start, end): label for start, end, label in annotations['entities']}

    # Union of spans, in document order, so every entity on either side is scored once.
    for span in sorted(predicted_by_span.keys() | true_by_span.keys()):
        true_labels.append(true_by_span.get(span, NO_ENTITY))
        predictions.append(predicted_by_span.get(span, NO_ENTITY))


In [None]:
# Support-weighted precision / recall / F1 over all labels; the fourth element
# returned by scikit-learn (support) is not used.
weighted_scores = precision_recall_fscore_support(
    y_true=true_labels,
    y_pred=predictions,
    average='weighted',
)
precision, recall, f1_score, _ = weighted_scores
print(
    f"Precision: {precision:.3f}",
    f"Recall: {recall:.3f}",
    f"F1 Score: {f1_score:.3f}",
    sep="\n",
)


In [None]:
# Per-label breakdown of precision, recall, F1 and support.
print("\nClassification Report:\n")
report_text = classification_report(y_true=true_labels, y_pred=predictions)
print(report_text)
