In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [None]:
!pip install datasets transformers torch scikit-learn




In [None]:
import json
import torch
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from datasets import load_dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer,
)
from torch.nn import CrossEntropyLoss


# Clean JSONL File
def clean_jsonl(input_file, output_file):
    """Copy the parseable JSON lines of ``input_file`` into ``output_file``.

    Every line is stripped and parsed independently. Lines that parse are
    re-serialised with ``json.dumps``; lines that do not are reported on
    stdout with their 1-based line number and dropped. The surviving records
    are written newline-separated (no trailing newline) once the input has
    been read completely.
    """
    kept_records = []
    with open(input_file, "r", encoding="utf-8") as source:
        line_number = 0
        for raw_line in source:
            line_number += 1
            try:
                record = json.loads(raw_line.strip())
            except json.JSONDecodeError as e:
                print(f"Skipping invalid line {line_number}: {e}")
                continue
            kept_records.append(json.dumps(record))

    payload = "\n".join(kept_records)
    with open(output_file, "w", encoding="utf-8") as sink:
        sink.write(payload)
    print(f"Cleaned file saved to: {output_file}")


# Extract All Unique Labels
def extract_labels(dataset):
    """Return the sorted list of distinct tags found in the training split.

    Only ``dataset["train"]["ner_tags"]`` is scanned; other splits are not
    consulted.
    """
    tag_sequences = dataset["train"]["ner_tags"]
    distinct_tags = {tag for sequence in tag_sequences for tag in sequence}
    return sorted(distinct_tags)


# Convert Labels to IDs
def convert_labels_to_ids(dataset, label2id):
    """Replace every tag in ``ner_tags`` with its id from ``label2id``.

    The conversion is applied through ``dataset.map``; a tag that is missing
    from ``label2id`` raises ``KeyError``.
    """
    def _encode_tags(record):
        encoded = []
        for tag in record["ner_tags"]:
            encoded.append(label2id[tag])
        record["ner_tags"] = encoded
        return record

    return dataset.map(_encode_tags)


# Tokenize and Align Labels
def tokenize_and_align_labels(examples, tokenizer, label2id):
    """Tokenize pre-split words and project word-level tags onto the tokens.

    The batch is tokenized with truncation and padding to 128 positions.
    For each sequence, the first token of every word receives that word's
    tag; positions with no word (``word_ids`` entry ``None``) and the
    remaining tokens of a word receive ``-100``. The aligned tags are stored
    under ``"labels"`` on the tokenizer output, which is returned.

    ``label2id`` is accepted for interface compatibility and is not used.
    """
    encoding = tokenizer(
        examples["tokens"],
        truncation=True,
        padding="max_length",
        is_split_into_words=True,
        max_length=128,
    )

    aligned_batch = []
    for row_idx, word_tags in enumerate(examples["ner_tags"]):
        aligned_row = []
        last_word = None
        for current_word in encoding.word_ids(batch_index=row_idx):
            starts_new_word = current_word is not None and current_word != last_word
            aligned_row.append(word_tags[current_word] if starts_new_word else -100)
            last_word = current_word
        aligned_batch.append(aligned_row)

    encoding["labels"] = aligned_batch
    return encoding


# Compute Label Weights
def compute_label_weights(dataset, label_list):
    """Build a per-label weight tensor from training-split label frequencies.

    Label ids are counted over ``dataset["train"]["labels"]``, skipping the
    ``-100`` ignore marker. A label seen ``count`` times gets the weight
    ``total / (len(label_list) * count)``; a label never seen gets ``0.0``.
    Counts and weights are printed, and the weights are returned as a tensor
    ordered by label id, placed on CUDA when available and on CPU otherwise.
    """
    num_labels = len(label_list)
    label_counts = dict.fromkeys(range(num_labels), 0)
    for sequence in dataset["train"]["labels"]:
        for label_id in sequence:
            if label_id == -100:
                continue
            label_counts[label_id] += 1

    total_count = sum(label_counts.values())
    weights = {}
    for label_id, count in label_counts.items():
        if count > 0:
            weights[label_id] = total_count / (num_labels * count)
        else:
            weights[label_id] = 0.0

    print(f"Label Counts: {label_counts}")
    print(f"Label Weights: {weights}")

    target_device = "cuda" if torch.cuda.is_available() else "cpu"
    ordered_weights = [weights[label_id] for label_id in range(num_labels)]
    return torch.tensor(ordered_weights).to(target_device)


# Custom Trainer with Weighted Loss
class WeightedLossTrainer(Trainer):
    """``Trainer`` that replaces the model's loss with weighted cross-entropy.

    Args:
        label_weights: Keyword-only 1-D tensor holding one weight per label
            id. All other positional and keyword arguments are forwarded to
            ``Trainer.__init__`` unchanged.
    """

    def __init__(self, *args, label_weights, **kwargs):
        super().__init__(*args, **kwargs)
        self.label_weights = label_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Return the class-weighted cross-entropy for one batch.

        ``labels`` is removed from ``inputs`` before the forward pass so the
        model does not compute its own unweighted loss. Positions labelled
        ``-100`` are ignored. Extra keyword arguments supplied by the base
        trainer are accepted and not used.

        Returns:
            ``loss`` alone, or ``(loss, outputs)`` when ``return_outputs`` is
            true.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        # The weights were created on a device picked independently of the
        # trainer; move them next to the logits so the loss also works when
        # the model runs on a different device (CPU-only runs, MPS, ...).
        class_weights = self.label_weights.to(logits.device)
        loss_fct = CrossEntropyLoss(weight=class_weights, ignore_index=-100)
        # Flatten using the logits' own class dimension so a wrongly sized
        # weight vector raises an error instead of silently mis-reshaping.
        loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
        return (loss, outputs) if return_outputs else loss


# Metrics for Evaluation
def compute_metrics(predictions, label_list):
    """Compute macro-averaged precision, recall and F1 over labelled tokens.

    ``predictions`` is unpacked as ``(scores, labels)``; the predicted id is
    the argmax of the scores along axis 2. Positions whose gold label is
    ``-100`` are excluded before the report is built. ``label_list`` is
    accepted for interface compatibility and is not used.
    """
    scores, gold = predictions
    predicted_ids = np.argmax(scores, axis=2)

    flat_preds = []
    flat_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold):
        for predicted_id, gold_id in zip(predicted_row, gold_row):
            if gold_id != -100:
                flat_preds.append(predicted_id)
                flat_labels.append(gold_id)

    report = classification_report(flat_labels, flat_preds, output_dict=True, zero_division=0)
    macro = report["macro avg"]
    return {
        "precision": macro["precision"],
        "recall": macro["recall"],
        "f1": macro["f1-score"],
    }


# Main Pipeline
def main_pipeline(train_file, val_file, model_checkpoint, submission_file, output_dir="./results"):
    """Fine-tune a token classifier and write predictions to ``submission.csv``.

    Steps, in order: clean both JSONL files, load them as the ``train`` and
    ``validation`` splits, derive the label vocabulary from the training
    split, tokenize and align labels, train with a class-weighted loss,
    evaluate, save model and tokenizer to ``./ner_model``, then predict on the
    validation split and write the predictions into the ``ne`` column of the
    sample submission.

    Args:
        train_file: Path to the training JSONL file (records with ``tokens``
            and ``ner_tags`` fields).
        val_file: Path to the validation JSONL file, same schema.
        model_checkpoint: Name or path handed to ``from_pretrained``.
        submission_file: CSV containing at least an ``id`` column; its ``ne``
            column is overwritten with the predictions.
        output_dir: Checkpoint directory handed to ``TrainingArguments``.

    Side effects:
        Writes ``train_cleaned.jsonl``, ``validation_cleaned.jsonl``,
        ``./ner_model`` and ``submission.csv`` relative to the working
        directory, plus trainer checkpoints under ``output_dir``.
    """
    # Step 1: Clean JSONL files
    train_cleaned = "train_cleaned.jsonl"
    val_cleaned = "validation_cleaned.jsonl"
    clean_jsonl(train_file, train_cleaned)
    clean_jsonl(val_file, val_cleaned)

    # Load datasets (a single JSON file is always exposed as the "train" split)
    dataset_dict = DatasetDict({
        "train": load_dataset("json", data_files=train_cleaned, split="train"),
        "validation": load_dataset("json", data_files=val_cleaned, split="train"),
    })

    # Extract and map labels
    unique_labels = extract_labels(dataset_dict)
    label2id = {label: idx for idx, label in enumerate(unique_labels)}
    id2label = {idx: label for idx, label in enumerate(unique_labels)}
    dataset_dict = convert_labels_to_ids(dataset_dict, label2id)

    # Tokenize dataset
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    tokenized_datasets = dataset_dict.map(
        lambda examples: tokenize_and_align_labels(examples, tokenizer, label2id),
        batched=True,
        remove_columns=["tokens", "ner_tags"],
    )

    # Compute label weights
    label_weights = compute_label_weights(tokenized_datasets, unique_labels)

    # Load model and training arguments. The label names are stored in the
    # model config so the saved checkpoint can translate ids back to tags.
    model = AutoModelForTokenClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(unique_labels),
        id2label={idx: str(label) for idx, label in id2label.items()},
        label2id={str(label): idx for label, idx in label2id.items()},
    )
    training_args = TrainingArguments(
        output_dir=output_dir,
        eval_strategy="steps",
        eval_steps=500,
        save_steps=1000,
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        weight_decay=0.01,
        logging_dir="./logs",
        save_total_limit=2,
        report_to="none",
    )

    # Train the model
    trainer = WeightedLossTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        data_collator=DataCollatorForTokenClassification(tokenizer),
        compute_metrics=lambda p: compute_metrics(p, unique_labels),
        label_weights=label_weights,
    )
    trainer.train()
    trainer.evaluate()

    # Save the model
    trainer.save_model("./ner_model")
    tokenizer.save_pretrained("./ner_model")
    print("Model and tokenizer saved!")

    # Load sample submission and predict
    submission_df = pd.read_csv(submission_file)
    predictions = trainer.predict(tokenized_datasets["validation"])
    preds = np.argmax(predictions.predictions, axis=2)

    # Key every non-padding token prediction by "<row index, 5 digits>_<token position>".
    # NOTE(review): the position counts tokenizer tokens (special tokens and
    # sub-word pieces included) and the row index is the position in the
    # validation split -- confirm both match how the submission ids are built.
    predicted_by_id = {}
    for i, example in enumerate(tokenized_datasets["validation"]):
        for j, token in enumerate(example["input_ids"]):
            if token != tokenizer.pad_token_id:  # Exclude padding
                predicted_by_id[f"{i:05d}_{j}"] = int(preds[i][j])

    # Overwrite "ne" in place rather than merging: a merge on "id" keeps the
    # sample's own "ne" as a stray "ne_x" column in the written file.
    # NOTE(review): ids without a prediction fall back to label id 0, which is
    # just the first tag in sorted order -- confirm that is the intended default.
    submission_df["ne"] = (
        submission_df["id"].map(lambda key: predicted_by_id.get(key, 0)).astype(int)
    )
    submission_df.to_csv("submission.csv", index=False)
    print("Submission file saved successfully!")


# Define inputs and run pipeline
# The data paths are absolute and point at /content; change them when the
# files live elsewhere. The call below trains the model and writes
# submission.csv into the current working directory.
train_file = "/content/train.jsonl"
val_file = "/content/validation.jsonl"
submission_file = "/content/sample_submission.csv"
model_checkpoint = "airesearch/wangchanberta-base-att-spm-uncased"
main_pipeline(train_file, val_file, model_checkpoint, submission_file)

Cleaned file saved to: train_cleaned.jsonl
Cleaned file saved to: validation_cleaned.jsonl


Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/63310 [00:00<?, ? examples/s]

Map:   0%|          | 0/12662 [00:00<?, ? examples/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/282 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/546 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/905k [00:00<?, ?B/s]

Map:   0%|          | 0/63310 [00:00<?, ? examples/s]

Map:   0%|          | 0/12662 [00:00<?, ? examples/s]

Label Counts: {0: 416, 1: 16916, 2: 14132, 3: 31253, 4: 30653, 5: 14, 6: 12940, 7: 46417, 8: 44764, 9: 824, 10: 20076, 11: 92, 12: 8392, 13: 11649, 14: 27713, 15: 8992, 16: 964, 17: 28215, 18: 18280, 19: 281, 20: 44, 21: 109, 22: 8297, 23: 30685, 24: 38910, 25: 11185, 26: 1619, 27: 42709, 28: 19997, 29: 326, 30: 217, 31: 1990261}
Label Weights: {0: 185.34720552884616, 1: 4.558077411917711, 2: 5.456017371921879, 3: 2.4671051579048413, 4: 2.5153961276220924, 5: 5507.459821428572, 6: 5.95861186244204, 7: 1.6611249649912747, 8: 1.7224653181127692, 9: 93.57334648058253, 10: 3.8406274905359634, 11: 838.0917119565217, 12: 9.187850035748331, 13: 6.6189748047042665, 14: 2.7822479522245875, 15: 8.57478175044484, 16: 79.98385632780084, 17: 2.732746322877902, 18: 4.2179670404814, 19: 274.3930160142349, 20: 1752.3735795454545, 21: 707.3801605504588, 22: 9.29305019886706, 23: 2.512772934658628, 24: 1.9816098046774608, 25: 6.893557219490389, 26: 47.62472977146387, 27: 1.8053440141422183, 28: 3.855800

model.safetensors:   0%|          | 0.00/423M [00:00<?, ?B/s]

Some weights of CamembertForTokenClassification were not initialized from the model checkpoint at airesearch/wangchanberta-base-att-spm-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Precision,Recall,F1
500,1.3825,0.633428,0.420189,0.675748,0.470046
1000,0.6211,0.441493,0.444611,0.743531,0.526504
1500,0.528,0.369715,0.465699,0.807925,0.563297
2000,0.4238,0.310586,0.537295,0.822475,0.612765
2500,0.3797,0.305706,0.498068,0.830167,0.594923
3000,0.4185,0.29035,0.487455,0.83561,0.587157
3500,0.3313,0.262665,0.535259,0.846307,0.622533
4000,0.3526,0.26089,0.543329,0.830608,0.635156
4500,0.3676,0.237038,0.52618,0.847881,0.620122
5000,0.3509,0.238695,0.549981,0.859711,0.638891


Model and tokenizer saved!
Submission file saved successfully!


In [None]:
import pandas as pd

# Inspect the generated submission: first rows, then the column names, to
# check that the file has the expected layout.
submission_df = pd.read_csv("submission.csv")
print(submission_df.head())
print(submission_df.columns)

        id  ne_x  ne
0  03795_0   0.0  31
1  03795_1   0.0  31
2  03795_2   1.0   1
3  03795_3   6.0   1
4  03795_4   6.0  30
Index(['id', 'ne_x', 'ne'], dtype='object')


In [None]:
import pandas as pd

# Inspect the reference layout for comparison with the generated file.
# NOTE(review): read relative to the working directory here, while the
# pipeline was given /content/sample_submission.csv -- presumably the same file.
# Reuses the name `submission_df`, replacing the frame loaded in the cell above.
submission_df = pd.read_csv("sample_submission.csv")
print(submission_df.head())
print(submission_df.columns)

        id   ne
0  03795_0  0.0
1  03795_1  0.0
2  03795_2  1.0
3  03795_3  6.0
4  03795_4  6.0
Index(['id', 'ne'], dtype='object')


In [None]:
import pandas as pd

# Manual clean-up of submission.csv so it has the same columns as the sample
# file; the result is written to submission_corrected.csv.
submission_df = pd.read_csv("submission.csv")

# Drop the 'ne_x' column if it exists
# (pandas suffixes overlapping non-key columns of a merge with _x/_y, so a
# leftover 'ne_x' would hold the sample file's original 'ne' values.)
if 'ne_x' in submission_df.columns:
    submission_df = submission_df.drop(columns=['ne_x'])

# Debug: Check the structure of submission_df
print(submission_df.head())
print(submission_df.columns)
sample_submission_df = pd.read_csv("sample_submission.csv")

# Check column names
print("Sample Submission Columns:", sample_submission_df.columns)
print("Submission Columns:", submission_df.columns)

# Ensure columns match
# NOTE(review): this renames columns by position, not by name. It raises if
# the column counts differ and would silently relabel columns if the order
# differed -- check the two printouts above before trusting the result.
submission_df.columns = sample_submission_df.columns

# Debug: Final check
print(submission_df.head())
print("Final Submission Columns:", submission_df.columns)

# Save the corrected submission file
submission_df.to_csv("submission_corrected.csv", index=False)
print("Corrected submission file saved as 'submission_corrected.csv'")


        id  ne
0  03795_0  31
1  03795_1  31
2  03795_2   1
3  03795_3   1
4  03795_4  30
Index(['id', 'ne'], dtype='object')
Sample Submission Columns: Index(['id', 'ne'], dtype='object')
Submission Columns: Index(['id', 'ne'], dtype='object')
        id  ne
0  03795_0  31
1  03795_1  31
2  03795_2   1
3  03795_3   1
4  03795_4  30
Final Submission Columns: Index(['id', 'ne'], dtype='object')
Corrected submission file saved as 'submission_corrected.csv'


In [None]:
# Sanity check: the corrected submission is expected to have exactly as many
# rows as the sample submission. Relies on both frames still being in the
# kernel from the correction cell.
print(len(sample_submission_df))
print(len(submission_df))

213091
213091


In [None]:
# Reload the corrected file and show how often each label id occurs in "ne".
submission_df = pd.read_csv("submission_corrected.csv")
print(submission_df["ne"].value_counts())


ne
0     178051
31     24645
8       1001
28       886
23       773
27       753
7        717
17       650
18       623
1        512
24       491
12       450
4        434
2        426
3        405
14       381
13       364
22       335
10       295
25       260
6        250
15       142
9         64
30        44
26        40
29        37
19        21
11        18
16        15
21         5
20         3
Name: count, dtype: int64


In [None]:
# Extract unique labels from the dataset
# NOTE(review): `dataset` is not defined in any cell shown in this notebook;
# it must already exist in the kernel -- presumably the training split, with
# one "ner_tags" list per example. Confirm before a Restart & Run All.
unique_labels = set()
for example in dataset:
    unique_labels.update(example["ner_tags"])

# Create a mapping of labels to integers
# (ids follow the sorted order of the distinct tag values)
label_mapping = {label: idx for idx, label in enumerate(sorted(unique_labels))}
print(f"Generated Label Mapping: {label_mapping}")


Generated Label Mapping: {'B_BRN': 0, 'B_DES': 1, 'B_DTM': 2, 'B_LOC': 3, 'B_MEA': 4, 'B_NAME': 5, 'B_NUM': 6, 'B_ORG': 7, 'B_PER': 8, 'B_TRM': 9, 'B_TTL': 10, 'E_BRN': 11, 'E_DES': 12, 'E_DTM': 13, 'E_LOC': 14, 'E_MEA': 15, 'E_NUM': 16, 'E_ORG': 17, 'E_PER': 18, 'E_TRM': 19, 'E_TTL': 20, 'I_BRN': 21, 'I_DES': 22, 'I_DTM': 23, 'I_LOC': 24, 'I_MEA': 25, 'I_NUM': 26, 'I_ORG': 27, 'I_PER': 28, 'I_TRM': 29, 'I_TTL': 30, 'O': 31}


In [None]:
# Spot-check the tags of the first example.
# NOTE(review): integers are only expected once the tags have been converted
# to ids; depends on `dataset` already being in the kernel.
print(dataset[0]["ner_tags"])  # Should show integers, e.g., [0, 1, 2, ...]

[7, 17, 31, 31, 31, 31, 31, 31, 31, 31, 31]


In [None]:
def map_labels(example):
    """Replace each tag in ``example["ner_tags"]`` with its ``label_mapping`` id.

    Reads the module-level ``label_mapping``; a tag that is not one of its
    keys raises ``KeyError``.
    """
    # Map `ner_tags` using the dynamically generated label_mapping
    example["ner_tags"] = [label_mapping[label] for label in example["ner_tags"]]
    return example

# Apply the mapping
# NOTE(review): this rebinds `dataset` to the converted version, so running
# the cell a second time feeds already-converted ids back through
# label_mapping -- presumably a KeyError when the mapping is keyed by tag
# strings. Re-create `dataset` before re-running.
dataset = dataset.map(map_labels)


Map:   0%|          | 0/63310 [00:00<?, ? examples/s]