# Transformer Fine-Tuning for Chinese NLI

Fine-tune encoder models on the JCLCv2 corpus for Native Language Identification.

**Instructions:**
1. Set runtime to **GPU** (Runtime → Change runtime type → T4 GPU)
2. Upload `JCLCv2.zip` to your Google Drive under `NNP/JCLCv2.zip`
   - Create the zip locally: `cd NNP && zip -r JCLCv2.zip JCLCv2/`
3. Run all cells

In [53]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Copy zip to local disk and extract (much faster than reading files from Drive)
import zipfile, os
ZIP_PATH = '/content/drive/MyDrive/NNP/JCLCv2.zip'
LOCAL_DIR = '/content/data'

if not os.path.exists(LOCAL_DIR + '/JCLCv2/index.csv'):
    print('Extracting data to local disk...')
    os.makedirs(LOCAL_DIR, exist_ok=True)
    with zipfile.ZipFile(ZIP_PATH, 'r') as zf:
        zf.extractall(LOCAL_DIR)
    print('Done.')
else:
    print('Data already extracted.')

DRIVE_DATA_DIR = LOCAL_DIR + '/JCLCv2'

Mounted at /content/drive
Data already extracted.


In [2]:
!pip install -q transformers jieba scikit-learn tqdm accelerate

In [54]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset
from tqdm.auto import tqdm, trange
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

print(f'CUDA available: {torch.cuda.is_available()}')
if torch.cuda.is_available():
    print(f'GPU: {torch.cuda.get_device_name(0)}')

CUDA available: True
GPU: NVIDIA A100-SXM4-40GB


## Configuration

In [77]:
# ── Choose your model ──────────────────────────────────────────────────
MODEL_NAME = 'bert-large-uncased'

# Available models:
# 'google-bert/bert-base-chinese'
# 'google-bert/bert-base-uncased'
# 'google-bert/bert-large-uncased'
# 'google-bert/bert-base-multilingual-cased'
# 'hfl/chinese-roberta-wwm-ext'
# 'voidful/albert_chinese_base'

MAX_LENGTH = 512
BATCH_SIZE = 32
EPOCHS = 20
LR = 2e-5
WARMUP_RATIO = 0.1
RANDOM_SEED = 42

DATA_DIR = Path(DRIVE_DATA_DIR)
INDEX_CSV = DATA_DIR / 'index.csv'
OUTPUT_DIR = Path('/content/output')
RESULTS_DIR = Path('/content/results')
RESULTS_DIR.mkdir(exist_ok=True)

## Load & Split Data

In [56]:
def load_corpus(data_dir, index_csv):
    df = pd.read_csv(
        index_csv, header=None,
        names=['doc_id', 'context', 'native_language', 'gender'],
    )
    texts = []
    for doc_id in tqdm(df['doc_id'], desc='Loading texts'):
        path = data_dir / f'{doc_id}.txt'
        texts.append(path.read_text(encoding='utf-8').strip())
    df['text'] = texts
    return df


def stratified_split(df, seed=42):
    df = df.dropna(subset=['native_language'])
    counts = df['native_language'].value_counts()
    rare_langs = counts[counts < 3].index
    df_rare = df[df['native_language'].isin(rare_langs)]
    df_main = df[~df['native_language'].isin(rare_langs)]
    df_main = df_main[df_main['native_language'].map(df_main['native_language'].value_counts()) > 1]

    df_train, df_valtest = train_test_split(
        df_main, test_size=0.2, random_state=seed,
        stratify=df_main['native_language'],
    )
    df_valtest = df_valtest[df_valtest['native_language'].map(df_valtest['native_language'].value_counts()) > 1]
    df_val, df_test = train_test_split(
        df_valtest, test_size=0.5, random_state=seed,
        stratify=df_valtest['native_language'],
    )
    df_train = pd.concat([df_train, df_rare], ignore_index=True)
    df_train = df_train.sample(frac=1, random_state=seed).reset_index(drop=True)
    return df_train, df_val.reset_index(drop=True), df_test.reset_index(drop=True)


df = load_corpus(DATA_DIR, INDEX_CSV)
le = LabelEncoder()
df['label'] = le.fit_transform(df['native_language'])
train_df, val_df, test_df = stratified_split(df, RANDOM_SEED)
label_names = list(le.classes_)
num_classes = len(label_names)

print(f'Train: {len(train_df)}  Val: {len(val_df)}  Test: {len(test_df)}  Classes: {num_classes}')

Loading texts:   0%|          | 0/8739 [00:00<?, ?it/s]

Train: 6778  Val: 841  Test: 841  Classes: 53


## Dataset & Model

In [78]:
class NLIDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length, desc='Tokenizing'):
        batch_size = 256
        all_input_ids, all_attn, all_ttype = [], [], []
        for i in trange(0, len(texts), batch_size, desc=desc):
            enc = tokenizer(
                texts[i:i+batch_size], truncation=True,
                padding='max_length', max_length=max_length,
                return_tensors='pt',
            )
            all_input_ids.append(enc['input_ids'])
            all_attn.append(enc['attention_mask'])
            if 'token_type_ids' in enc:
                all_ttype.append(enc['token_type_ids'])
        self.encodings = {
            'input_ids': torch.cat(all_input_ids),
            'attention_mask': torch.cat(all_attn),
        }
        if all_ttype:
            self.encodings['token_type_ids'] = torch.cat(all_ttype)
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {k: v[idx] for k, v in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item


print(f'Loading {MODEL_NAME}...')
from transformers import BertTokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=num_classes, trust_remote_code=True,
)

train_dataset = NLIDataset(train_df['text'].tolist(), train_df['label'].tolist(), tokenizer, MAX_LENGTH, 'Tokenizing train')
val_dataset = NLIDataset(val_df['text'].tolist(), val_df['label'].tolist(), tokenizer, MAX_LENGTH, 'Tokenizing val')
test_dataset = NLIDataset(test_df['text'].tolist(), test_df['label'].tolist(), tokenizer, MAX_LENGTH, 'Tokenizing test')
print('Datasets ready.')

Loading bert-large-uncased...


Loading weights:   0%|          | 0/391 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: bert-large-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
classifier.bias                            | MISSING    | 
classifier.weight                          | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


Tokenizing train:   0%|          | 0/27 [00:00<?, ?it/s]

Tokenizing val:   0%|          | 0/4 [00:00<?, ?it/s]

Tokenizing test:   0%|          | 0/4 [00:00<?, ?it/s]

Datasets ready.


## Trainer Setup & Training

In [None]:
# Class weights for imbalanced data
counts = np.bincount(train_df['label'].values, minlength=num_classes).astype(float)
counts[counts == 0] = 1.0
class_weights = torch.tensor(len(train_df) / (num_classes * counts), dtype=torch.float32)


class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop('labels')
        outputs = model(**inputs)
        loss = torch.nn.functional.cross_entropy(
            outputs.logits, labels,
            weight=class_weights.to(outputs.logits.device),
        )
        return (loss, outputs) if return_outputs else loss


def compute_metrics(eval_pred):
    predictions, labels = eval_pred.predictions, eval_pred.label_ids

    # Handle cases where predictions might be a tuple/list (e.g., logits + hidden states)
    if isinstance(predictions, (tuple, list)):
        predictions = predictions[0] # Assume logits are the first element

    # Convert predictions to a NumPy array if it's a PyTorch tensor
    if isinstance(predictions, torch.Tensor):
        predictions = predictions.detach().cpu().numpy()

    # If 'predictions' is an array of objects (e.g., list of arrays/tensors),
    # attempt to stack them into a homogeneous 2D NumPy array.
    if isinstance(predictions, np.ndarray) and predictions.dtype == object and len(predictions) > 0:
        try:
            processed_predictions = []
            for item in predictions:
                if isinstance(item, torch.Tensor):
                    processed_predictions.append(item.detach().cpu().numpy())
                elif isinstance(item, np.ndarray):
                    processed_predictions.append(item)
                else:
                    # Attempt to convert any other type to numpy array
                    processed_predictions.append(np.asarray(item))
            predictions = np.vstack(processed_predictions)
        except ValueError as e:
            print(f"Warning: Failed to stack predictions due to inconsistent inner shapes. Error: {e}")
            if len(predictions) > 0:
                # Print example shapes for debugging if it still fails
                print(f"Example inner shapes: {[p.shape for p in processed_predictions[:min(5, len(processed_predictions))]]}")
            pass # Proceed, but np.argmax might still fail if shapes are truly inconsistent

    # Ensure the predictions array is 2D if possible (e.g., remove singleton dimensions)
    if predictions.ndim > 2:
        predictions = predictions.squeeze()

    # Finally, apply argmax. This is where the original error occurred.
    preds = np.argmax(predictions, axis=-1)

    return {
        'accuracy': accuracy_score(labels, preds),
        'macro_f1': f1_score(labels, preds, average='macro', zero_division=0),
        'weighted_f1': f1_score(labels, preds, average='weighted', zero_division=0),
    }


safe_name = MODEL_NAME.replace('/', '_')

training_args = TrainingArguments(
    output_dir=str(OUTPUT_DIR / safe_name),
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE * 2,
    learning_rate=LR,
    warmup_ratio=WARMUP_RATIO,
    weight_decay=0.01,
    max_grad_norm=1.0,
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='macro_f1',
    greater_is_better=True,
    save_total_limit=1,
    logging_steps=50,
    fp16=torch.cuda.is_available(),
    seed=RANDOM_SEED,
    report_to='none',
)

trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.


Epoch,Training Loss,Validation Loss


## Evaluation

In [75]:
# Validation
val_results = trainer.evaluate(val_dataset)
print('=== Validation ===')
for k, v in val_results.items():
    print(f'  {k}: {v:.4f}' if isinstance(v, float) else f'  {k}: {v}')

# Test
test_results = trainer.evaluate(test_dataset)
print('\n=== Test ===')
for k, v in test_results.items():
    print(f'  {k}: {v:.4f}' if isinstance(v, float) else f'  {k}: {v}')

=== Validation ===
  eval_loss: 2.1333
  eval_accuracy: 0.6385
  eval_macro_f1: 0.4827
  eval_weighted_f1: 0.6460
  eval_runtime: 0.8250
  eval_samples_per_second: 1019.4450
  eval_steps_per_second: 16.9710
  epoch: 20.0000

=== Test ===
  eval_loss: 2.6113
  eval_accuracy: 0.6361
  eval_macro_f1: 0.3908
  eval_weighted_f1: 0.6433
  eval_runtime: 0.8121
  eval_samples_per_second: 1035.5890
  eval_steps_per_second: 17.2390
  epoch: 20.0000


In [76]:
# Full classification report on test set
test_pred = trainer.predict(test_dataset)

predictions = test_pred.predictions
labels = test_pred.label_ids

# Apply the same robust handling for predictions as in compute_metrics
# Handle cases where predictions might be a tuple/list (e.g., logits + hidden states)
if isinstance(predictions, (tuple, list)):
    predictions = predictions[0] # Assume logits are the first element

# Convert predictions to a NumPy array if it's a PyTorch tensor
if isinstance(predictions, torch.Tensor):
    predictions = predictions.detach().cpu().numpy()

# If 'predictions' is an array of objects (e.g., list of arrays/tensors),
# attempt to stack them into a homogeneous 2D NumPy array.
if isinstance(predictions, np.ndarray) and predictions.dtype == object and len(predictions) > 0:
    try:
        processed_predictions = []
        for item in predictions:
            if isinstance(item, torch.Tensor):
                processed_predictions.append(item.detach().cpu().numpy())
            elif isinstance(item, np.ndarray):
                processed_predictions.append(item)
            else:
                processed_predictions.append(np.asarray(item))
        predictions = np.vstack(processed_predictions)
    except ValueError as e:
        print(f"Warning: Failed to stack predictions due to inconsistent inner shapes. Error: {e}")
        if len(predictions) > 0:
            print(f"Example inner shapes: {[p.shape for p in processed_predictions[:min(5, len(processed_predictions))]]}")
        # If stacking still fails, proceed with the original inhomogeneous array,
        # hoping np.argmax might handle it in some cases or fail gracefully later.
        pass

# Ensure the predictions array is 2D if possible (e.g., remove singleton dimensions)
if predictions.ndim > 2:
    predictions = predictions.squeeze()

preds = np.argmax(predictions, axis=-1)

present = sorted(set(labels) | set(preds))
names = [label_names[i] for i in present]
print(classification_report(labels, preds, labels=present, target_names=names, zero_division=0))

              precision    recall  f1-score   support

    Cambodia       0.36      0.61      0.45        33
      Canada       0.50      1.00      0.67         1
    Colombia       0.50      1.00      0.67         1
  Costa Rica       0.50      1.00      0.67         1
      France       0.50      0.50      0.50         2
     Germany       0.00      0.00      0.00         1
   Indonesia       0.73      0.66      0.69       338
       Italy       0.00      0.00      0.00         0
       Japan       0.95      0.70      0.81        27
  Kyrgyzstan       0.00      0.00      0.00         2
        Laos       0.48      0.72      0.57        40
  Madagascar       0.60      0.38      0.46         8
    Malaysia       1.00      0.60      0.75         5
    Mongolia       0.45      0.42      0.43        12
     Myanmar       0.43      0.54      0.48        41
 New Zealand       0.00      0.00      0.00         2
 North Korea       0.00      0.00      0.00         1
      Panama       0.50    

In [34]:
# Save results & checkpoint to Drive
results = {
    'model': MODEL_NAME,
    'val': val_results,
    'test': test_results,
}
out_path = RESULTS_DIR / f'finetune_{safe_name}.json'
with open(out_path, 'w') as f:
    json.dump(results, f, indent=2)
print(f'Saved {out_path}')

ckpt_dir = Path(DRIVE_DATA_DIR).parent / 'checkpoints' / safe_name
ckpt_dir.mkdir(parents=True, exist_ok=True)
trainer.save_model(str(ckpt_dir))
tokenizer.save_pretrained(str(ckpt_dir))
print(f'Saved checkpoint to {ckpt_dir}')

Saved /content/results/finetune_hfl_chinese-lert-large.json


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Saved checkpoint to /content/data/checkpoints/hfl_chinese-lert-large
