In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive. Later cells
# write the trained model and the test split under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# NOTE(review): nltk is not imported by any cell visible in this notebook,
# while spacy (used for training below) is not installed here. Confirm which
# install is actually required, and pin the version if it is kept.
!pip install nltk



In [None]:
import json
import numpy as np
import pandas as pd
import requests

In [None]:
# Path of a Drive copy of the annotations. It is only referenced by the
# commented-out loader below; the remote copy is the active data source.
PATH = '/content/drive/MyDrive/NLP_dataset/annotations.json'
url = "https://raw.githubusercontent.com/jejejery/IF5153-Tugas-Besar-Analisis-Promo/main/data/data_ner_annotations.json"

# Alternative: read the annotations from the Drive copy instead of the network.
# with open(PATH) as f:
#   data = json.load(f)

# Download the annotations from the URL. The timeout keeps a stalled connection
# from hanging the cell, and raise_for_status() fails loudly on an HTTP error
# instead of trying to parse an error page as JSON.
response = requests.get(url, timeout=30)
response.raise_for_status()
data = response.json()

# Preview a few records only; displaying the whole dataset bloats the notebook.
data['annotations'][:3]

In [None]:
import random
# Shuffle the data to ensure a random distribution.
# The shuffle happens in place on data['annotations'], so re-running this cell
# without reloading `data` permutes the already shuffled list again.
random.seed(42)  # For reproducibility
random.shuffle(data['annotations'])

In [None]:

# Split the (already shuffled) annotations: the head of the list becomes the
# training set and the remaining tail is held out for testing.
split_ratio = 0.85  # fraction used for training; the other 15% is the test set
data_annotations = data['annotations']
train_size = int(split_ratio * len(data_annotations))
train_data, test_data = (
    data_annotations[:train_size],
    data_annotations[train_size:],
)

In [None]:
import spacy
from spacy.training.example import Example
from spacy.util import minibatch, compounding


# Training hyperparameters, kept in one place so they are easy to tune.
N_EPOCHS = 30
DROPOUT = 0.3

# Start from a blank Indonesian pipeline whose only component is the NER model.
nlp = spacy.blank("id")
ner = nlp.add_pipe("ner")

# Components that take part in training; anything else in the pipeline is
# disabled while the NER model is updated.
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

# Register every entity label found in the training data. Each entity is
# indexed as (start, end, label), so ent[2] is the label.
for _, annotations in train_data:
    for ent in annotations["entities"]:
        ner.add_label(ent[2])

# Initialize the weights once, after all labels are known. initialize() is the
# spaCy v3 replacement for the deprecated begin_training() and returns the
# optimizer used for the updates below.
optimizer = nlp.initialize()

with nlp.select_pipes(disable=unaffected_pipes):
    for epoch in range(N_EPOCHS):
        random.shuffle(train_data)
        losses = {}
        # The compounding schedule is rebuilt every epoch, so the batch size
        # starts again at 4 and grows towards 32.
        batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            examples = [Example.from_dict(nlp.make_doc(text), ann) for text, ann in batch]
            # Pass the optimizer explicitly so the one created above is used.
            nlp.update(examples, sgd=optimizer, drop=DROPOUT, losses=losses)
        print(f"Losses at epoch {epoch}: {losses}")



Losses at epoch 0: {'ner': 3447.8441379614205}
Losses at epoch 1: {'ner': 2217.024534631687}
Losses at epoch 2: {'ner': 1920.492260533483}
Losses at epoch 3: {'ner': 1708.3351862458146}
Losses at epoch 4: {'ner': 1486.6851791677757}
Losses at epoch 5: {'ner': 1395.1171281634752}
Losses at epoch 6: {'ner': 1324.8743170393188}
Losses at epoch 7: {'ner': 1184.943350036738}
Losses at epoch 8: {'ner': 1062.1979066168237}
Losses at epoch 9: {'ner': 1014.2302659002289}
Losses at epoch 10: {'ner': 1084.603600123131}
Losses at epoch 11: {'ner': 943.9337054617233}
Losses at epoch 12: {'ner': 799.8786049815302}
Losses at epoch 13: {'ner': 799.3880852931787}
Losses at epoch 14: {'ner': 745.3660398243195}
Losses at epoch 15: {'ner': 759.1729677018767}
Losses at epoch 16: {'ner': 684.3305327237655}
Losses at epoch 17: {'ner': 642.8751814068104}
Losses at epoch 18: {'ner': 640.5230058419003}
Losses at epoch 19: {'ner': 603.3670813211212}
Losses at epoch 20: {'ner': 538.1850413071985}
Losses at epoch 

In [None]:
# Export the trained pipeline to a folder on the mounted Google Drive.
# NOTE(review): assumes the parent folders already exist on Drive; confirm.
output_dir = "/content/drive/MyDrive/TUGAS BESAR NLP/MODEL/Promo-NER/"
nlp.to_disk(output_dir)

In [None]:
from collections import Counter

def _entities_match(actual, predicted):
    """Return True when two (text, label) pairs share a label and one text contains the other."""
    actual_text, actual_label = actual
    predicted_text, predicted_label = predicted
    if actual_label != predicted_label:
        return False
    return predicted_text in actual_text or actual_text in predicted_text


def evaluate_ner(actual_entities, predicted_entities):
    """Score predicted entities against gold entities using lenient matching.

    A gold entity and a predicted entity match when their labels are equal and
    the text of one is a substring of the other. Matching is one-to-one: each
    predicted entity can be paired with at most one gold entity (greedy, in
    input order). This keeps tp, fp and fn consistent with each other and all
    metrics inside [0, 1].

    Args:
        actual_entities: iterable of (text, label) pairs from the annotations.
        predicted_entities: iterable of (text, label) pairs from the model.

    Returns:
        dict with keys "precision", "recall", "f1" and "accuracy". Any metric
        whose denominator is zero is reported as 0.
    """
    actual_entities = list(actual_entities)
    # Predictions not yet paired with a gold entity; whatever is left at the
    # end is a false positive.
    unmatched_predictions = list(predicted_entities)

    tp = 0  # True Positives: matched (gold, predicted) pairs
    for actual in actual_entities:
        for index, predicted in enumerate(unmatched_predictions):
            if _entities_match(actual, predicted):
                # Consume the prediction so it cannot match another gold entity.
                del unmatched_predictions[index]
                tp += 1
                break

    fn = len(actual_entities) - tp  # False Negatives: gold entities with no partner
    fp = len(unmatched_predictions)  # False Positives: predictions with no partner

    # Distinct entities seen on either side (matched pairs counted once).
    total_entities = tp + fp + fn

    # Compute the metrics, guarding every division against a zero denominator.
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    accuracy = tp / total_entities if total_entities > 0 else 0

    return {"precision": precision, "recall": recall, "f1": f1, "accuracy": accuracy}

# Evaluate the trained pipeline on the held-out test split.
overall_results = Counter()  # running per-metric sums over all test texts
for text, annotations in test_data:
    # Run the trained model on the raw text.
    doc = nlp(text)

    # Gold entities: slice each annotated character span out of the text.
    actual_entities = [
        (text[ent[0]:ent[1]], ent[2])
        for ent in annotations["entities"]
    ]

    # Predicted entities in the same (text, label) form.
    predicted_entities = [(ent.text, ent.label_) for ent in doc.ents]

    # Score this text and add its metrics to the running totals.
    results = evaluate_ner(actual_entities, predicted_entities)
    overall_results.update(results)

# Average each metric over the number of test texts.
num_test_texts = len(test_data)
average_precision, average_recall, average_f1 = (
    overall_results[metric] / num_test_texts
    for metric in ("precision", "recall", "f1")
)

print("Overall Performance:")
print(f"Precision: {average_precision:.2f}, Recall: {average_recall:.2f}, F1: {average_f1:.2f}")

Overall Performance:
Precision: 0.74, Recall: 0.61, F1: 0.64


In [None]:
# Save the held-out test split (a JSON array) to Google Drive so the same
# examples can be reused outside this notebook.
test_data_path = "/content/drive/MyDrive/TUGAS BESAR NLP/COMBINED/test_dataset_ner.json"

with open(test_data_path, mode="w", encoding="utf-8") as json_file:
    # ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes.
    json.dump(obj=test_data, fp=json_file, indent=4, ensure_ascii=False)

print(f"Data berhasil diekspor ke {test_data_path}")

Data berhasil diekspor ke /content/drive/MyDrive/TUGAS BESAR NLP/COMBINED/test_dataset_ner.json
