# Transformer Fine-Tuning for Chinese NLI

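Fine-tune encoder models on the JCLCv2 corpus for Native Language Identification (NLI here — not Natural Language Inference): each document is classified by its author's native language.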

**Instructions:**
1. Set runtime to **GPU** (Runtime → Change runtime type → T4 GPU)
2. Upload `JCLCv2.zip` to your Google Drive under `NNP/JCLCv2.zip`
   - Create the zip locally: `cd NNP && zip -r JCLCv2.zip JCLCv2/` (the archive must contain a top-level `JCLCv2/` folder with `index.csv` inside it)
3. Run all cells

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# The corpus is unpacked onto the runtime's local disk because reading the
# individual files from the Drive mount is much slower.
import os
import zipfile

ZIP_PATH = '/content/drive/MyDrive/NNP/JCLCv2.zip'
LOCAL_DIR = '/content/data'

# index.csv doubles as the "already extracted" marker, so re-running this
# cell does not unpack the archive a second time.
if os.path.exists(f'{LOCAL_DIR}/JCLCv2/index.csv'):
    print('Data already extracted.')
else:
    print('Extracting data to local disk...')
    os.makedirs(LOCAL_DIR, exist_ok=True)
    with zipfile.ZipFile(ZIP_PATH, 'r') as archive:
        archive.extractall(LOCAL_DIR)
    print('Done.')

# Despite the name, this is the local extraction directory, not a Drive path.
DRIVE_DATA_DIR = f'{LOCAL_DIR}/JCLCv2'

In [None]:
!pip install -q transformers jieba scikit-learn tqdm accelerate

In [None]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset
from tqdm.auto import tqdm, trange
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

# Report whether a CUDA device is visible and, if so, which one.
has_cuda = torch.cuda.is_available()
print(f'CUDA available: {has_cuda}')
if has_cuda:
    print(f'GPU: {torch.cuda.get_device_name(0)}')

## Configuration

In [None]:
# ── Choose your model ──────────────────────────────────────────────────
# Hub identifier handed to AutoTokenizer / AutoModelForSequenceClassification.
MODEL_NAME = 'google-bert/bert-base-chinese'

# Available models:
# 'google-bert/bert-base-chinese'
# 'google-bert/bert-base-uncased'
# 'google-bert/bert-large-uncased'
# 'google-bert/bert-base-multilingual-cased'
# 'hfl/chinese-roberta-wwm-ext'
# 'voidful/albert_chinese_base'

# Every document is truncated and padded to exactly this many tokens.
MAX_LENGTH = 512
# Per-device training batch size; evaluation uses twice this value.
BATCH_SIZE = 16
# Number of training epochs.
EPOCHS = 5
# Learning rate passed to the Trainer.
LR = 2e-5
# Passed to TrainingArguments as `warmup_ratio`.
WARMUP_RATIO = 0.1
# Shared by the train/val/test split and by the Trainer's own seed.
RANDOM_SEED = 42

# Directory holding index.csv and the per-document <doc_id>.txt files.
DATA_DIR = Path(DRIVE_DATA_DIR)
INDEX_CSV = DATA_DIR / 'index.csv'
# Parent directory for the Trainer's per-model output_dir.
OUTPUT_DIR = Path('/content/output')
# Where the final metrics JSON is written.
RESULTS_DIR = Path('/content/results')
# No parents=True: this relies on /content already existing.
RESULTS_DIR.mkdir(exist_ok=True)

## Load & Split Data

In [None]:
def load_corpus(data_dir, index_csv):
    """Read the corpus index and attach each document's text.

    Args:
        data_dir: ``Path`` of the directory containing ``<doc_id>.txt`` files.
        index_csv: Path of the header-less index CSV whose four columns are
            read as ``doc_id``, ``context``, ``native_language``, ``gender``.

    Returns:
        A DataFrame with the four index columns plus a ``text`` column
        holding the stripped UTF-8 contents of each document.
    """
    columns = ['doc_id', 'context', 'native_language', 'gender']
    frame = pd.read_csv(index_csv, header=None, names=columns)
    frame['text'] = [
        (data_dir / f'{doc_id}.txt').read_text(encoding='utf-8').strip()
        for doc_id in tqdm(frame['doc_id'], desc='Loading texts')
    ]
    return frame


def stratified_split(df, seed=42):
    """Split ``df`` into stratified train/val/test frames (roughly 80/10/10).

    Rows without a ``native_language`` are discarded. Languages with fewer
    than 3 documents cannot be stratified and are placed in train only.
    The remaining rows are split 80/20, and the 20% pool is split in half
    into val and test.

    The second stratified split needs at least two rows per class. Rows
    whose class has a single sample in the 20% pool are therefore moved
    back into train instead of being dropped from every split.

    Args:
        df: DataFrame with a ``native_language`` column.
        seed: Random state used for both splits and the final train shuffle.

    Returns:
        ``(df_train, df_val, df_test)``, each with a fresh 0..n-1 index.
    """
    df = df.dropna(subset=['native_language'])
    counts = df['native_language'].value_counts()
    rare_langs = counts[counts < 3].index
    is_rare = df['native_language'].isin(rare_langs)
    df_rare = df[is_rare]
    # Every class left in df_main has >= 3 rows, so no further filtering is
    # needed before the first stratified split.
    df_main = df[~is_rare]

    df_train, df_valtest = train_test_split(
        df_main, test_size=0.2, random_state=seed,
        stratify=df_main['native_language'],
    )

    # Classes with one row in the pool cannot be stratified across val/test;
    # keep those rows for training rather than losing them.
    pool_counts = df_valtest['native_language'].map(
        df_valtest['native_language'].value_counts()
    )
    df_leftover = df_valtest[pool_counts <= 1]
    df_valtest = df_valtest[pool_counts > 1]

    df_val, df_test = train_test_split(
        df_valtest, test_size=0.5, random_state=seed,
        stratify=df_valtest['native_language'],
    )
    df_train = pd.concat([df_train, df_rare, df_leftover], ignore_index=True)
    # Shuffle so the appended rare/leftover rows are not clustered at the end.
    df_train = df_train.sample(frac=1, random_state=seed).reset_index(drop=True)
    return df_train, df_val.reset_index(drop=True), df_test.reset_index(drop=True)


df = load_corpus(DATA_DIR, INDEX_CSV)
# Drop unlabeled rows before fitting the encoder. stratified_split discards
# them anyway, and fitting first could let a missing value end up as an extra
# class in the label space (inflating num_classes and the model's output layer).
df = df.dropna(subset=['native_language']).reset_index(drop=True)
le = LabelEncoder()
df['label'] = le.fit_transform(df['native_language'])
train_df, val_df, test_df = stratified_split(df, RANDOM_SEED)
# Index i of label_names is the language encoded as integer label i.
label_names = list(le.classes_)
num_classes = len(label_names)

print(f'Train: {len(train_df)}  Val: {len(val_df)}  Test: {len(test_df)}  Classes: {num_classes}')

## Dataset & Model

In [None]:
class NLIDataset(Dataset):
    """Pre-tokenized text classification dataset held entirely in memory.

    All texts are tokenized up front in chunks of 256, truncated and padded
    to ``max_length``, and stored as stacked tensors. Each item is a dict of
    the stored encoding tensors for one example plus a ``labels`` entry.
    """

    def __init__(self, texts, labels, tokenizer, max_length, desc='Tokenizing'):
        """Tokenize ``texts`` and store the encodings alongside ``labels``.

        Args:
            texts: List of input strings.
            labels: Integer class ids, one per text.
            tokenizer: Callable tokenizer returning ``input_ids`` and
                ``attention_mask`` (and optionally ``token_type_ids``).
            max_length: Length every sequence is truncated/padded to.
            desc: Progress-bar caption.
        """
        chunk_size = 256
        collected = {'input_ids': [], 'attention_mask': [], 'token_type_ids': []}
        for start in trange(0, len(texts), chunk_size, desc=desc):
            encoded = tokenizer(
                texts[start:start + chunk_size],
                truncation=True,
                padding='max_length',
                max_length=max_length,
                return_tensors='pt',
            )
            collected['input_ids'].append(encoded['input_ids'])
            collected['attention_mask'].append(encoded['attention_mask'])
            # Not every tokenizer emits segment ids; keep them only if present.
            if 'token_type_ids' in encoded:
                collected['token_type_ids'].append(encoded['token_type_ids'])

        self.encodings = {
            name: torch.cat(collected[name])
            for name in ('input_ids', 'attention_mask')
        }
        if collected['token_type_ids']:
            self.encodings['token_type_ids'] = torch.cat(collected['token_type_ids'])
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the encoding tensors and ``labels`` entry for example ``idx``."""
        return {
            **{name: tensor[idx] for name, tensor in self.encodings.items()},
            'labels': self.labels[idx],
        }


print(f'Loading {MODEL_NAME}...')
# NOTE(review): trust_remote_code=True allows a Hub repository to run its own
# Python code while loading. Keep MODEL_NAME to repositories you trust.
hub_kwargs = {'trust_remote_code': True}
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, **hub_kwargs)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=num_classes, **hub_kwargs,
)


def _encode_split(frame, desc):
    """Tokenize one split's `text`/`label` columns into an NLIDataset."""
    return NLIDataset(
        frame['text'].tolist(),
        frame['label'].tolist(),
        tokenizer,
        MAX_LENGTH,
        desc,
    )


train_dataset = _encode_split(train_df, 'Tokenizing train')
val_dataset = _encode_split(val_df, 'Tokenizing val')
test_dataset = _encode_split(test_df, 'Tokenizing test')
print('Datasets ready.')

## Trainer Setup & Training

In [None]:
# Inverse-frequency class weights for the imbalanced training set:
# weight[c] = n_train / (num_classes * count[c]).
label_counts = np.bincount(train_df['label'].values, minlength=num_classes).astype(float)
# A class with no training rows would divide by zero; treat it as count 1.
label_counts[label_counts == 0] = 1.0
balanced_weights = len(train_df) / (num_classes * label_counts)
class_weights = torch.tensor(balanced_weights, dtype=torch.float32)


class WeightedTrainer(Trainer):
    """Trainer that uses class-weighted cross-entropy as its loss.

    The per-class weights come from the module-level ``class_weights``
    tensor and are moved to the logits' device on every call.
    """

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the weighted cross-entropy loss for one batch.

        Args:
            model: Model to run the forward pass with.
            inputs: Batch dict; its ``labels`` entry is popped before the
                remaining entries are passed to ``model``.
            return_outputs: When true, also return the model outputs.
            **kwargs: Accepted and ignored.

        Returns:
            ``loss``, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        targets = inputs.pop('labels')
        outputs = model(**inputs)
        logits = outputs.logits
        weights = class_weights.to(logits.device)
        loss = torch.nn.functional.cross_entropy(logits, targets, weight=weights)
        if return_outputs:
            return (loss, outputs)
        return loss


def compute_metrics(eval_pred):
    """Compute accuracy, macro-F1 and weighted-F1 from logits and gold labels.

    Args:
        eval_pred: A ``(logits, labels)`` pair; predictions are the argmax
            of ``logits`` over the last axis.

    Returns:
        Dict with keys ``accuracy``, ``macro_f1`` and ``weighted_f1``.
    """
    logits, gold = eval_pred
    predicted = np.argmax(logits, axis=-1)
    metrics = {'accuracy': accuracy_score(gold, predicted)}
    for key, averaging in (('macro_f1', 'macro'), ('weighted_f1', 'weighted')):
        # zero_division=0 scores a class with no predictions as 0 rather
        # than emitting a warning.
        metrics[key] = f1_score(gold, predicted, average=averaging, zero_division=0)
    return metrics


# Hub ids contain '/', which would create nested directories; flatten it so
# the id can be used as a single directory / file name component.
safe_name = MODEL_NAME.replace('/', '_')

# Training configuration: evaluate and checkpoint once per epoch, then reload
# the checkpoint with the best validation macro-F1 when training finishes.
training_args = TrainingArguments(
    output_dir=str(OUTPUT_DIR / safe_name),
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    # Evaluation runs with twice the training batch size.
    per_device_eval_batch_size=BATCH_SIZE * 2,
    learning_rate=LR,
    warmup_ratio=WARMUP_RATIO,
    weight_decay=0.01,
    max_grad_norm=1.0,
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    # 'macro_f1' is one of the keys returned by compute_metrics.
    metric_for_best_model='macro_f1',
    greater_is_better=True,
    save_total_limit=1,
    logging_steps=50,
    # fp16 is requested only when a CUDA device is available.
    fp16=torch.cuda.is_available(),
    seed=RANDOM_SEED,
    report_to='none',
)

# WeightedTrainer swaps in the class-weighted cross-entropy loss; validation
# data drives the per-epoch evaluation and best-checkpoint selection.
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

## Evaluation

In [None]:
def _print_metrics(title, metrics):
    """Print a title line followed by one indented `name: value` line per metric."""
    print(title)
    for name, value in metrics.items():
        if isinstance(value, float):
            print(f'  {name}: {value:.4f}')
        else:
            print(f'  {name}: {value}')


# Validation
val_results = trainer.evaluate(val_dataset)
_print_metrics('=== Validation ===', val_results)

# Test
test_results = trainer.evaluate(test_dataset)
_print_metrics('\n=== Test ===', test_results)

In [None]:
# Per-class precision/recall/F1 on the test set.
test_output = trainer.predict(test_dataset)
y_true = test_output.label_ids
y_pred = np.argmax(test_output.predictions, axis=-1)

# Report only classes that occur in the gold labels or the predictions, so
# target_names stays aligned with the label ids passed to the report.
seen_ids = sorted({*y_true, *y_pred})
seen_names = [label_names[class_id] for class_id in seen_ids]
print(classification_report(
    y_true,
    y_pred,
    labels=seen_ids,
    target_names=seen_names,
    zero_division=0,
))

In [None]:
# Save results & checkpoint to Drive
# DRIVE_DATA_DIR points at the local extraction directory (/content/data/...),
# which is lost when the Colab runtime is recycled. Anchor persistent outputs
# to the Drive folder that holds the uploaded zip instead.
drive_root = Path(ZIP_PATH).parent

results = {
    'model': MODEL_NAME,
    'val': val_results,
    'test': test_results,
}

# Keep the local copy under RESULTS_DIR and mirror it to Drive.
out_path = RESULTS_DIR / f'finetune_{safe_name}.json'
drive_results_dir = drive_root / 'results'
drive_results_dir.mkdir(parents=True, exist_ok=True)
for path in (out_path, drive_results_dir / out_path.name):
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2)
    print(f'Saved {path}')

# Model weights and tokenizer files go to Drive so they outlive the runtime.
ckpt_dir = drive_root / 'checkpoints' / safe_name
ckpt_dir.mkdir(parents=True, exist_ok=True)
trainer.save_model(str(ckpt_dir))
tokenizer.save_pretrained(str(ckpt_dir))
print(f'Saved checkpoint to {ckpt_dir}')