In [None]:
# Install the Hugging Face packages that the import cell below relies on.
# NOTE(review): no versions are pinned, so the installed releases depend on
# when this cell is run — consider pinning for reproducibility.
%pip install datasets
%pip install transformers



In [None]:
import re
import os
import random
import pandas as pd
import matplotlib.pyplot as plt

import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torch.nn import CrossEntropyLoss

from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from transformers import AutoTokenizer

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from datasets import ClassLabel, Sequence
from IPython.display import display, HTML
from tqdm import tqdm

In [None]:
%cd drive/MyDrive/平行語料
%ls

/content/drive/MyDrive/平行語料
iCorpus_train_p.csv    moedict_p.csv  聖經平行語料_p_final.csv
moedict_merge_eng.csv  TAT_p.csv


In [None]:
# Read the five parallel-corpus files, keeping only the 中文 and 台文 columns of each.
text_columns = ["中文", "台文"]
df1, df2, df3, df4, df5 = [
    pd.read_csv(csv_name)[text_columns]
    for csv_name in (
        "iCorpus_train_p.csv",
        "moedict_p.csv",
        "聖經平行語料_p_final.csv",
        "moedict_merge_eng.csv",
        "TAT_p.csv",
    )
]

# Stack all corpora row-wise; every frame keeps its own row labels.
df = pd.concat([df1, df2, df3, df4, df5], axis=0)
df.shape[0]

68588

In [None]:
# Build one labelled sentence table: 中文 sentences get label 0, 台文 sentences label 1.
# rename() and assign() both return new frames, so no column is written into a
# selection taken from `df`; assigning onto `df[['中文']]` directly is the pattern
# that makes pandas emit SettingWithCopyWarning.
data1 = df[['中文']].rename(columns={'中文': 'sentence'}).assign(label=0)
data2 = df[['台文']].rename(columns={'台文': 'sentence'}).assign(label=1)

# Stack both languages row-wise. Row labels are carried over from `df`, so the
# index of `data` contains repeated values.
data = pd.concat([data1, data2], axis=0)
data

Unnamed: 0,sentence,label
0,Obama大勝美國首位黑人總統。,0
1,駐美特派員曹郁芬華府五日報導。,0
2,歐巴瑪Obama大勝美國首位黑人總統，壓倒性勝利創造新歷史。,0
3,民主黨總統候選人歐巴瑪四日以壓倒性勝利，當選美國史上首位黑人總統，他在芝加哥的勝選演說中，對...,0
4,在已開出的選票中，歐巴瑪不僅以五十二％，（約六千兩百九十六萬票）的普選票，贏過共和黨候選人麥...,0
...,...,...
2659,伊偷畫本國地圖，,1
2660,講也奇怪，雙人同齊踏入店門，,1
2661,阿生的教室雄雄來一个頭家欲揣小工。,1
2662,欲創啥？,1




In [None]:
class TextClassificationDataset(Dataset):
    """Dataset that tokenizes one text per lookup and pairs it with its label.

    Args:
        texts: Iterable of texts; each item is converted with ``str``.
        labels: Iterable of class ids; each item is converted with ``int``.
        tokenizer: Callable invoked with the text plus ``return_tensors='pt'``,
            ``max_length``, ``padding='max_length'`` and ``truncation=True``;
            its result must expose ``'input_ids'`` and ``'attention_mask'``.
        max_length: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = list(map(str, texts))
        self.labels = list(map(int, labels))
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of stored texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at ``idx`` and return its tensors with the label."""
        sentence = self.texts[idx]
        target = self.labels[idx]
        encoded = self.tokenizer(
            sentence,
            return_tensors='pt',
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
        )
        # flatten() turns each tokenizer tensor into a 1-D tensor for this sample.
        item = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        item['label'] = torch.tensor(target)
        return item

In [None]:
class BERTClassifier(nn.Module):
    """BERT encoder followed by dropout and a single linear classification layer.

    Args:
        bert_model_name: Identifier handed to ``BertModel.from_pretrained``.
        num_classes: Number of logits produced per input.
    """

    def __init__(self, bert_model_name, num_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.1)
        # The linear head maps the encoder's hidden size to one score per class.
        hidden_size = self.bert.config.hidden_size
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return class logits computed from the encoder's ``pooler_output``."""
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        regularised = self.dropout(encoder_out.pooler_output)
        return self.fc(regularised)

In [None]:
def train(model, data_loader, optimizer, scheduler, device):
    """Run one training epoch and return the mean batch loss.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            that returns class logits.
        data_loader: Iterable of batches; each batch is a mapping with the keys
            ``'input_ids'``, ``'attention_mask'`` and ``'label'``.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch, after the
            optimizer.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        float: Mean cross-entropy loss over the batches of this epoch, or
        ``nan`` when ``data_loader`` yields no batch.
    """
    model.train()
    # Build the loss module once; it is identical for every batch.
    criterion = nn.CrossEntropyLoss()
    total_loss = 0.0
    num_batches = 0
    for batch in tqdm(data_loader):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
        # Keep a running sum so the caller can monitor how training progresses.
        total_loss += loss.item()
        num_batches += 1
    if num_batches == 0:
        return float("nan")
    return total_loss / num_batches

def evaluate(model, data_loader, device):
    """Score the model on ``data_loader`` without updating its weights.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            that returns class logits.
        data_loader: Iterable of batches; each batch is a mapping with the keys
            ``'input_ids'``, ``'attention_mask'`` and ``'label'``.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        tuple: ``(accuracy_score(...), classification_report(...))`` computed
        from the collected true labels and predictions.
    """
    model.eval()
    predicted, expected = [], []
    with torch.no_grad():
        for batch in tqdm(data_loader):
            token_ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['label'].to(device)
            logits = model(input_ids=token_ids, attention_mask=mask)
            # The index of the largest logit in each row is the predicted class.
            batch_preds = torch.max(logits, dim=1).indices
            predicted += batch_preds.cpu().tolist()
            expected += targets.cpu().tolist()
    accuracy = accuracy_score(expected, predicted)
    report = classification_report(expected, predicted)
    return accuracy, report

In [None]:
def predict_language(text, model, tokenizer, device, max_length=128):
    """Classify one text and return the name of the predicted language.

    Args:
        text: Text handed to ``tokenizer``.
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            that returns class logits.
        tokenizer: Callable invoked with the text plus ``return_tensors='pt'``,
            ``max_length``, ``padding='max_length'`` and ``truncation=True``.
        device: Device the encoded tensors are moved to.
        max_length: Value forwarded to the tokenizer as ``max_length``.

    Returns:
        str: ``"台文"`` when the predicted class is 1, otherwise ``"中文"``.
    """
    model.eval()
    encoded = tokenizer(
        text,
        return_tensors='pt',
        max_length=max_length,
        padding='max_length',
        truncation=True,
    )
    token_ids = encoded['input_ids'].to(device)
    mask = encoded['attention_mask'].to(device)
    with torch.no_grad():
        logits = model(input_ids=token_ids, attention_mask=mask)
        predicted_class = torch.max(logits, dim=1).indices
    if predicted_class.item() == 1:
        return "台文"
    return "中文"

In [None]:
# Set up parameters
bert_model_name = 'ckiplab/bert-base-chinese'  # checkpoint name for the tokenizer and the encoder
num_classes = 2          # label 0 = 中文, label 1 = 台文
max_length = 128         # sequence length passed to the tokenizer
batch_size = 16
num_epochs = 5
learning_rate = 2e-5

# Seed Python's and PyTorch's global random number generators so that the
# randomly initialised layers, dropout and training-set shuffling start from the
# same state on every Restart & Run All.
# NOTE(review): GPU kernels may still be non-deterministic — seeding narrows
# run-to-run variation, it does not guarantee bit-identical results.
seed = 42
random.seed(seed)
torch.manual_seed(seed)

In [None]:
# Hold out 20% of the labelled sentences for validation; the fixed random_state
# makes the split itself repeatable.
all_sentences = data['sentence'].tolist()
all_labels = data['label'].tolist()
train_texts, val_texts, train_labels, val_labels = train_test_split(
    all_sentences,
    all_labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Tokenizer loaded from the same checkpoint name as the encoder.
tokenizer = BertTokenizer.from_pretrained(bert_model_name)

# Wrap each split in a dataset that tokenizes lazily, one sample per lookup.
train_dataset = TextClassificationDataset(
    texts=train_texts, labels=train_labels, tokenizer=tokenizer, max_length=max_length
)
val_dataset = TextClassificationDataset(
    texts=val_texts, labels=val_labels, tokenizer=tokenizer, max_length=max_length
)

# Only the training loader shuffles; validation keeps the dataset order.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=batch_size)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Build the classifier and move its parameters to the chosen device.
model = BERTClassifier(bert_model_name=bert_model_name, num_classes=num_classes)
model = model.to(device)

Some weights of BertModel were not initialized from the model checkpoint at ckiplab/bert-base-chinese and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Use PyTorch's AdamW: the `AdamW` class exported by transformers is deprecated
# in favour of `torch.optim.AdamW`.
# NOTE(review): eps and weight_decay are set explicitly to mirror what I believe
# were the defaults of the transformers implementation (1e-6 and 0.0), so the
# update rule stays the same — confirm against the installed version.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=learning_rate,
    eps=1e-6,
    weight_decay=0.0,
)

# The scheduler is stepped once per batch, so the schedule spans every batch of
# every epoch; with no warmup the learning rate decays linearly from the start.
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)



In [None]:
# Alternate one training pass with one validation pass per epoch and print the
# validation metrics after each.
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train(model, train_dataloader, optimizer, scheduler, device)
    accuracy, report = evaluate(model, val_dataloader, device)
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(report)

Epoch 1/5


100%|██████████| 6859/6859 [35:39<00:00,  3.21it/s]
100%|██████████| 1715/1715 [03:17<00:00,  8.68it/s]


Validation Accuracy: 0.9352
              precision    recall  f1-score   support

           0       0.92      0.96      0.94     13806
           1       0.96      0.91      0.93     13630

    accuracy                           0.94     27436
   macro avg       0.94      0.94      0.94     27436
weighted avg       0.94      0.94      0.94     27436

Epoch 2/5


100%|██████████| 6859/6859 [35:36<00:00,  3.21it/s]
100%|██████████| 1715/1715 [03:17<00:00,  8.69it/s]


Validation Accuracy: 0.9349
              precision    recall  f1-score   support

           0       0.97      0.90      0.93     13806
           1       0.90      0.97      0.94     13630

    accuracy                           0.93     27436
   macro avg       0.94      0.94      0.93     27436
weighted avg       0.94      0.93      0.93     27436

Epoch 3/5


100%|██████████| 6859/6859 [35:35<00:00,  3.21it/s]
100%|██████████| 1715/1715 [03:17<00:00,  8.67it/s]


Validation Accuracy: 0.9360
              precision    recall  f1-score   support

           0       0.96      0.91      0.93     13806
           1       0.91      0.96      0.94     13630

    accuracy                           0.94     27436
   macro avg       0.94      0.94      0.94     27436
weighted avg       0.94      0.94      0.94     27436

Epoch 4/5


100%|██████████| 6859/6859 [35:37<00:00,  3.21it/s]
100%|██████████| 1715/1715 [03:17<00:00,  8.67it/s]


Validation Accuracy: 0.9373
              precision    recall  f1-score   support

           0       0.97      0.90      0.94     13806
           1       0.91      0.97      0.94     13630

    accuracy                           0.94     27436
   macro avg       0.94      0.94      0.94     27436
weighted avg       0.94      0.94      0.94     27436

Epoch 5/5


100%|██████████| 6859/6859 [35:36<00:00,  3.21it/s]
100%|██████████| 1715/1715 [03:17<00:00,  8.69it/s]

Validation Accuracy: 0.9337
              precision    recall  f1-score   support

           0       0.91      0.96      0.94     13806
           1       0.96      0.91      0.93     13630

    accuracy                           0.93     27436
   macro avg       0.93      0.93      0.93     27436
weighted avg       0.93      0.93      0.93     27436






In [None]:
# Persist only the learned parameters; loading them back requires building a
# BERTClassifier first and then calling load_state_dict on it.
trained_weights = model.state_dict()
torch.save(trained_weights, "bert_classifier.pth")

In [None]:
# Smoke-test language identification on a single sentence.
# NOTE(review): this exact sentence is part of the corpus loaded above, so the
# result shows that the pipeline runs, not how well the model generalises.
test_text = "歐巴瑪Obama大勝美國首位黑人總統，壓倒性勝利創造新歷史。"
predicted_language = predict_language(test_text, model, tokenizer, device)
print(test_text)
print(f"Predicted language: {predicted_language}")

歐巴瑪Obama大勝美國首位黑人總統，壓倒性勝利創造新歷史。
Predicted language: 中文
