# Immigration Case Outcome Predictor - Training

In [None]:
# Step 1: Install dependencies (then restart the kernel so the newly installed packages are picked up)
!pip install datasets transformers accelerate scikit-learn

In [None]:
# Step 2: RESTART KERNEL (Kernel -> Restart Kernel), then run from Step 3

In [None]:
# Step 3: Check GPU
import torch

# Report the installed PyTorch version and whether CUDA is usable.
print(f"PyTorch: {torch.__version__}")
gpu_available = torch.cuda.is_available()
print(f"GPU: {gpu_available}")
if gpu_available:
    # Name of the CUDA device at index 0.
    print(f"Device: {torch.cuda.get_device_name(0)}")

In [None]:
# Step 4: Load dataset
from datasets import load_dataset
import pandas as pd

# Load the "FC" configuration (train split) of the Canadian legal data set.
print("Loading Federal Court cases...")
dataset = load_dataset(
    path="refugee-law-lab/canadian-legal-data",
    name="FC",
    split="train",
)
num_cases = len(dataset)
print(f"Loaded {num_cases} cases")
print(f"Columns: {dataset.column_names}")

In [None]:
# Step 5: Extract outcomes
def extract_outcome(text):
    """Derive a binary outcome label from judgment text by keyword matching.

    The match is case-insensitive and looks for a fixed set of disposition
    phrases anywhere in the text. Dismissal phrases are checked first, so a
    text containing both kinds of phrase is labelled as dismissed.

    Args:
        text: Full text of a decision. Missing values (``None``, float NaN)
            and any other non-string value are treated as "no outcome".

    Returns:
        0 if a dismissal phrase is found, 1 if an allowance phrase is found,
        otherwise ``None``.
    """
    # NaN is truthy and has no .lower(), so a plain truthiness check is not
    # enough to filter out missing values coming from a pandas column.
    if not isinstance(text, str) or not text:
        return None

    dismissed_phrases = (
        'application is dismissed',
        'appeal is dismissed',
        'application dismissed',
        'hereby dismissed',
    )
    allowed_phrases = (
        'application is allowed',
        'appeal is allowed',
        'application allowed',
        'hereby allowed',
    )

    text_lower = text.lower()
    if any(phrase in text_lower for phrase in dismissed_phrases):
        return 0
    if any(phrase in text_lower for phrase in allowed_phrases):
        return 1
    return None

# Find text column
# Take the first column whose name contains "text"; fall back to the first
# column of the dataset when no such column exists.
text_col = next(
    (name for name in dataset.column_names if 'text' in name.lower()),
    None,
)
if not text_col:
    text_col = dataset.column_names[0]
print(f"Text column: {text_col}")

# Convert and extract
df = dataset.to_pandas()
df['label'] = df[text_col].apply(extract_outcome)

# Keep only the rows where an outcome was detected, with integer labels.
has_label = df['label'].notna()
df_labeled = df.loc[has_label].copy()
df_labeled = df_labeled.astype({'label': int})

label_names = {0: 'Dismissed', 1: 'Allowed'}
print(f"\nCases with outcomes: {len(df_labeled)}")
print(df_labeled['label'].value_counts().rename(label_names))

In [None]:
# Step 6: Prepare data
from sklearn.model_selection import train_test_split
from datasets import Dataset

# Sample if too large
max_cases = 10000
if len(df_labeled) > max_cases:
    df_labeled = df_labeled.sample(max_cases, random_state=42)
    print(f"Sampled to {len(df_labeled)} cases")

# Split: 80/20, stratified on the label so both parts keep the class ratio.
model_frame = df_labeled[[text_col, 'label']]
train_df, test_df = train_test_split(
    model_frame,
    test_size=0.2,
    stratify=model_frame['label'],
    random_state=42,
)
print(f"Train: {len(train_df)} | Test: {len(test_df)}")

# To HuggingFace format (index reset so the old row index is discarded).
train_ds, test_ds = (
    Dataset.from_pandas(part.reset_index(drop=True))
    for part in (train_df, test_df)
)

In [None]:
# Step 7: Tokenize
from transformers import AutoTokenizer

# Checkpoint name; the same constant is reused in Step 8 to load the classifier.
MODEL_NAME = "distilbert-base-uncased"
# Tokenizer loaded for the checkpoint named by MODEL_NAME.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize(batch):
    """Tokenize one batch of cases and attach the labels.

    Each text in the ``text_col`` column is converted to ``str`` and cut to
    its first 5000 characters (falsy values become an empty string) before
    being tokenized with truncation at 512 tokens. The batch's ``label``
    column is copied into the result under the key ``labels``.
    """
    max_chars = 5000
    max_tokens = 512

    clipped_texts = [
        str(raw)[:max_chars] if raw else ""
        for raw in batch[text_col]
    ]
    encoded = tokenizer(clipped_texts, truncation=True, max_length=max_tokens)
    encoded['labels'] = batch['label']
    return encoded

print("Tokenizing...")
# The raw text and label columns are dropped; tokenize() re-emits the label as 'labels'.
drop_columns = [text_col, 'label']
train_tok = train_ds.map(tokenize, batched=True, remove_columns=drop_columns)
test_tok = test_ds.map(tokenize, batched=True, remove_columns=drop_columns)
print("Done!")

In [None]:
# Step 8: Setup trainer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

# Class index -> display name; the reverse mapping is derived from it.
id2label = {0: "Dismissed", 1: "Allowed"}
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(id2label),
    id2label=id2label,
    label2id={name: idx for idx, name in id2label.items()},
)

def compute_metrics(pred):
    """Return accuracy and F1 for one evaluation pass.

    ``pred.predictions`` is reduced to class indices with an argmax over the
    last axis and compared against ``pred.label_ids``.
    """
    y_true = pred.label_ids
    y_pred = np.argmax(pred.predictions, axis=-1)
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred),
    }

import inspect

# Training configuration: evaluate and checkpoint once per epoch, then reload
# the checkpoint with the best "f1" (the key returned by compute_metrics).
args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
    logging_steps=50,
    report_to="none"
)

# Trainer's `tokenizer` keyword is deprecated in favour of `processing_class`
# in newer transformers releases. Pick whichever name the installed version
# accepts so the notebook runs on both sides of that rename.
trainer_params = inspect.signature(Trainer.__init__).parameters
tokenizer_kwarg = "processing_class" if "processing_class" in trainer_params else "tokenizer"

trainer = Trainer(
    model=model, args=args,
    train_dataset=train_tok, eval_dataset=test_tok,
    data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics,
    **{tokenizer_kwarg: tokenizer},
)
print("Ready!")

In [None]:
# Step 9: Train
# Run training with the Trainer configured in Step 8.
trainer.train()

In [None]:
# Step 10: Evaluate
# Metrics come back from the Trainer under "eval_"-prefixed keys.
results = trainer.evaluate()
eval_accuracy, eval_f1 = results['eval_accuracy'], results['eval_f1']
print(f"Accuracy: {eval_accuracy:.3f}")
print(f"F1: {eval_f1:.3f}")

In [None]:
# Step 11: Save
# Write the model and the tokenizer into the same directory; Step 12 loads
# its pipeline from that directory.
trainer.save_model("./immigration-predictor-model")
tokenizer.save_pretrained("./immigration-predictor-model")
# Notebook shell command: bundle the saved directory into a single zip archive.
!zip -r immigration-predictor-model.zip immigration-predictor-model/
print("\n✅ Download 'immigration-predictor-model.zip' from file browser")
print("⚠️  STOP NOTEBOOK when done!")

In [None]:
# Step 12: Test
from transformers import pipeline

# Device 0 when CUDA is available, otherwise -1.
device_index = 0 if torch.cuda.is_available() else -1
clf = pipeline(
    "text-classification",
    model="./immigration-predictor-model",
    device=device_index,
)

# One dismissal-style and one allowance-style sentence as a quick sanity check.
sample_sentences = (
    "The application for judicial review is dismissed.",
    "The application is allowed and returned for redetermination.",
)
for sentence in sample_sentences:
    print(clf(sentence))