https://huggingface.co/monsoon-nlp/bert-base-thai

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict, Counter
import numpy as np
import math

# Data

In [3]:
df = pd.read_csv('../datasets/tscc_v0.1-judgement.csv')
df.head()

Unnamed: 0,issueid,dekaid,year,category,issueno,lawids,fact,decision,isact,isexternalelements,isinternalelement,isintent,isneglect,iscause,isjustify,isexcuse,isguilty,isattempt,isattemptimpossible
0,1,1478/2528,2528,LB,1,"CC-288-00,CC-083-00,CC-063-00",จำเลยกับพวกร่วมกันใช้อาวุธปืนยิงผู้ตายถูกที่ด้...,จำเลยจึงมีความผิดฐานฆ่าผู้ตายโดยเจตนา,1,1,1,1,-1,1,0,0,1,0,-1
1,2,1548/2531,2531,LB,1,CC-288-00,จำเลยที่ 1 ซึ่งเคยมีเรื่องทะเลาะกับผู้ตายมาก่อ...,จำเลยที่ 1 จึงมีความผิดฐานฆ่าผู้ตายโดยเจตนา,1,1,1,1,-1,1,0,0,1,0,-1
2,3,1548/2531,2531,LB,2,"CC-290-00,CC-083-00",ส่วนจำเลยที่ 2 ที่ 3 และที่ 4 นั้น ได้ความว่าก...,การที่จำเลยที่ 1 ใช้เหล็กแหลมแทงผู้ตายโดยเจตนา...,1,1,1,1,-1,1,0,0,1,0,-1
3,4,1548/2531,2531,LB,3,"CC-288-00,CC-083-00",ส่วนจำเลยที่ 2 ที่ 3 และที่ 4 นั้น ได้ความว่าก...,การที่จำเลยที่ 1 ใช้เหล็กแหลมแทงผู้ตายโดยเจตนา...,0,-1,-1,-1,-1,-1,-1,-1,0,-1,-1
4,5,1697/2522,2522,LB,1,"CC-288-00,CC-083-00",โจทก์บรรยายฟ้องว่า จำเลยกับพวกที่ยังไม่ได้ตัวม...,จึงเป็นการกระทำโดยมีเจตนาฆ่าผู้ตาย แม้ข้อเท็จจ...,1,1,1,1,-1,1,0,0,1,0,-1


In [20]:
''' Original Split '''
year1, year2 = 2539, 2553

''' My Split '''
# year1, year2 = 2553, 2555

train_df = df[df['year'] < year1]
valid_df = df[df['year'].between(year1, year2)]
test_df = df[df['year'] > year2]
assert len(df) == len(train_df) + len(valid_df) + len(test_df)
print(len(train_df), len(valid_df), len(test_df))

694 267 246


In [21]:
train_x = train_df['fact'].tolist()
train_y = train_df['isguilty'].tolist()
valid_x = valid_df['fact'].tolist()
valid_y = valid_df['isguilty'].tolist()
test_x = test_df['fact'].tolist()
test_y = test_df['isguilty'].tolist()

## Model

In [22]:
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments

import pandas as pd
import numpy as np
import torch
import json

In [23]:
model_name = "airesearch/wangchanberta-base-att-spm-uncased"
#revision = "finetuned@wisesight_sentiment"
revision = None
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=2, revision=revision
)

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at airesearch/wangchanberta-base-att-spm-uncased and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
tokenizer = AutoTokenizer.from_pretrained(
                model_name,
                revision='main',
                model_max_length=416,)

# Dataset

In [25]:
class TSCCDataset(torch.utils.data.Dataset):
    
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [26]:
train_encodings = tokenizer(train_x, truncation=True, padding=True)
valid_encodings = tokenizer(valid_x, truncation=True, padding=True)
test_encodings = tokenizer(test_x, truncation=True, padding=True)

train_dataset = TSCCDataset(train_encodings, train_y)
valid_dataset = TSCCDataset(valid_encodings, valid_y)
test_dataset = TSCCDataset(test_encodings, test_y)

# Train

In [27]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

In [28]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="macro")
    acc = accuracy_score(labels, preds)
    
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [29]:
training_args = TrainingArguments(
    output_dir='./orig_task',          # output directory
    num_train_epochs=10,              # total number of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=8,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./orig_task_logs',            # directory for storing logs
    logging_steps=10,
    #evaluation_strategy='epoch'
)

In [30]:
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=valid_dataset,        # evaluation dataset
    compute_metrics=compute_metrics,
)

In [31]:
trainer.train()

  0%|          | 0/870 [00:00<?, ?it/s]

{'loss': 0.6684, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.11}
{'loss': 0.6521, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.23}
{'loss': 0.6766, 'learning_rate': 3e-06, 'epoch': 0.34}
{'loss': 0.6774, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.46}
{'loss': 0.6977, 'learning_rate': 5e-06, 'epoch': 0.57}
{'loss': 0.6511, 'learning_rate': 6e-06, 'epoch': 0.69}
{'loss': 0.6628, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.8}
{'loss': 0.6453, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.92}
{'loss': 0.6745, 'learning_rate': 9e-06, 'epoch': 1.03}
{'loss': 0.5948, 'learning_rate': 1e-05, 'epoch': 1.15}
{'loss': 0.6936, 'learning_rate': 1.1000000000000001e-05, 'epoch': 1.26}
{'loss': 0.7199, 'learning_rate': 1.2e-05, 'epoch': 1.38}
{'loss': 0.5818, 'learning_rate': 1.3000000000000001e-05, 'epoch': 1.49}
{'loss': 0.6167, 'learning_rate': 1.4000000000000001e-05, 'epoch': 1.61}
{'loss': 0.6545, 'learning_rate': 1.5e-05, 'epoch': 1.72}
{'loss': 0.6181, 'learni

TrainOutput(global_step=870, training_loss=0.43427459988100775, metrics={'train_runtime': 2429.6438, 'train_samples_per_second': 2.856, 'train_steps_per_second': 0.358, 'train_loss': 0.43427459988100775, 'epoch': 10.0})

In [32]:
result_eval = trainer.evaluate()
result_eval

  0%|          | 0/34 [00:00<?, ?it/s]

{'eval_loss': 0.7310730218887329,
 'eval_accuracy': 0.7865168539325843,
 'eval_f1': 0.7859282911116425,
 'eval_precision': 0.7955623306233062,
 'eval_recall': 0.8039716641505052,
 'eval_runtime': 34.041,
 'eval_samples_per_second': 7.843,
 'eval_steps_per_second': 0.999,
 'epoch': 10.0}

In [33]:
save_path = "./models/2/"
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

('./models/2/tokenizer_config.json',
 './models/2/special_tokens_map.json',
 './models/2/sentencepiece.bpe.model',
 './models/2/added_tokens.json',
 './models/2/tokenizer.json')