# SMS Claim Extraction - Training on Colab

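This notebook runs every approach in the claim extraction research: it trains Approaches 1, 2 and 4, and runs the hybrid LLM Approaches 3a and 3b in inference-only mode on top of the trained NER models.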

In [None]:
# Clone repository
!git clone https://github.com/iamdiluxedbutcooler/sms-claim-check.git
%cd sms-claim-check

In [None]:
# Install dependencies
!pip install -q transformers datasets torch scikit-learn pandas numpy seaborn matplotlib openai python-dotenv evaluate accelerate sentencepiece seqeval

## Update Code (if needed)

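Run this cell ONLY if you need to pull the latest code updates. It backs up the `experiments` folder to Google Drive first, so Drive must already be mounted (see the mount cell below) for the backup to succeed.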

In [None]:
# Backup experiments before updating code
!cp -r experiments /content/drive/MyDrive/sms-claim-check/backup_experiments_$(date +%Y%m%d_%H%M%S) 2>/dev/null || echo "No experiments to backup yet"

# Pull latest code
!git pull origin main

# IMPORTANT: Restart runtime after pulling to reload modules
print("\n[WARNING] After pulling, go to Runtime > Restart runtime to reload updated code!")
print("Then continue from where you left off.")

In [None]:
# QUICK FIX: Reload modules without restarting runtime
# NOTE(review): the training/inference cells visible in this notebook start
# `python ...` through `!`, i.e. in a separate process, so this reload only
# affects code imported into the notebook kernel itself.
import sys
import importlib

# Remove cached modules: the `src` package itself as well as its submodules,
# so that changes to src/__init__.py are picked up too
stale_modules = [name for name in sys.modules if name == 'src' or name.startswith('src.')]
for name in stale_modules:
    del sys.modules[name]

# Files were changed on disk while the kernel was running; make sure the
# import system's finders notice new or replaced modules
importlib.invalidate_caches()

# Reload
import src.models
import src.data

print("[OK] Modules reloaded! Continue training.")

In [None]:
# Mount Google Drive for checkpoints and results
from google.colab import drive
drive.mount('/content/drive')

# Create project folder in Drive
!mkdir -p '/content/drive/MyDrive/sms-claim-check'
!mkdir -p '/content/drive/MyDrive/sms-claim-check/checkpoints'
!mkdir -p '/content/drive/MyDrive/sms-claim-check/results'

In [None]:
# Report whether CUDA is usable and, if so, which device is at index 0
import torch

gpu_ready = torch.cuda.is_available()
print(f"GPU Available: {gpu_ready}")
if gpu_ready:
    device_name = torch.cuda.get_device_name(0)
    print(f"GPU Name: {device_name}")

In [None]:
# Auto-save helpers: copy experiment outputs from the working directory to Google Drive
import shutil
from pathlib import Path

# Project folder on the mounted Drive; save_checkpoint() writes under its
# checkpoints/ subfolder and save_all_results() under results/experiments
DRIVE_PATH = '/content/drive/MyDrive/sms-claim-check'

def save_checkpoint(approach_name, drive_path=None):
    """Copy one approach's experiment folder to Google Drive.

    Copies ``experiments/<approach_name>`` (relative to the current working
    directory) to ``<drive_path>/checkpoints/<approach_name>``, merging into
    the destination when it already exists.

    Args:
        approach_name: Name of the sub-folder under ``experiments/``.
        drive_path: Root folder to save under. Defaults to the notebook-level
            ``DRIVE_PATH``.

    Returns:
        None. When the source folder does not exist nothing is copied and a
        ``[SKIPPED]`` message is printed, so a mistyped name or a failed run
        is not mistaken for a successful save.
    """
    source = Path('experiments') / approach_name
    if not source.is_dir():
        print(f'[SKIPPED] {approach_name}: {source} not found - nothing saved to Google Drive')
        return

    # Resolve the default at call time so a later change to DRIVE_PATH is honoured
    root = DRIVE_PATH if drive_path is None else drive_path
    dest = f'{root}/checkpoints/{approach_name}'
    shutil.copytree(source, dest, dirs_exist_ok=True)
    print(f'[SAVED] {approach_name} -> Google Drive')

def save_all_results(drive_path=None):
    """Copy the whole ``experiments`` folder to Google Drive.

    Copies ``experiments`` (relative to the current working directory) to
    ``<drive_path>/results/experiments``, merging into the destination when it
    already exists.

    Args:
        drive_path: Root folder to save under. Defaults to the notebook-level
            ``DRIVE_PATH``.

    Returns:
        None. When ``experiments`` does not exist nothing is copied and a
        ``[SKIPPED]`` message is printed, matching how ``save_checkpoint``
        treats a missing source folder.
    """
    source = Path('experiments')
    if not source.is_dir():
        print('[SKIPPED] experiments folder not found - nothing saved to Google Drive')
        return

    # Resolve the default at call time so a later change to DRIVE_PATH is honoured
    root = DRIVE_PATH if drive_path is None else drive_path
    shutil.copytree(source, f'{root}/results/experiments', dirs_exist_ok=True)
    print('[SAVED] All results -> Google Drive')

print('Auto-save setup complete!')

## Approach 1: Entity-based NER

In [None]:
# Approach 1: run train.py as a shell subprocess with the entity-NER config
!python train.py --config configs/entity_ner.yaml

# Copy experiments/approach1_entity_ner to Drive. save_checkpoint only copies
# when that folder exists, so the name must match the training output folder
# (presumably set by the config - TODO confirm)
save_checkpoint('approach1_entity_ner')

## Approach 2: Claim-based NER

In [None]:
# Approach 2: run train.py as a shell subprocess with the claim-NER config
!python train.py --config configs/claim_ner.yaml

# Copy experiments/approach2_claim_ner to Drive. save_checkpoint only copies
# when that folder exists, so the name must match the training output folder
# (presumably set by the config - TODO confirm)
save_checkpoint('approach2_claim_ner')

## Approach 4: Contrastive Learning

In [None]:
# Approach 4: run train.py as a shell subprocess with the contrastive config
!python train.py --config configs/contrastive.yaml

# Copy experiments/approach4_contrastive to Drive. save_checkpoint only copies
# when that folder exists, so the name must match the training output folder
# (presumably set by the config - TODO confirm)
save_checkpoint('approach4_contrastive')

## Approach 3a: Hybrid Entity + LLM (Inference Only)

In [None]:
# Set OpenAI API key
import os
os.environ['OPENAI_API_KEY'] = 'YOUR_API_KEY_HERE'  # Replace with your key

!python inference.py --config configs/hybrid_llm.yaml --model experiments/approach1_entity_ner/best_model

# Save results to Drive
save_checkpoint('approach3_hybrid_llm')

## Approach 3b: Hybrid Claim + LLM (Inference Only)

In [None]:
# Approach 3b: inference with the hybrid-claim config on top of the Approach 2 model.
# NOTE(review): presumably needs OPENAI_API_KEY, which is set in the Approach 3a
# cell - run that cell first; confirm against inference.py.
# NOTE(review): unlike the 3a cell, no save_checkpoint call follows here; these
# outputs reach Drive only through save_all_results() later - confirm intended.
!python inference.py --config configs/hybrid_claim_llm.yaml --model experiments/approach2_claim_ner/best_model

## Compare Results

In [None]:
# Run the model comparison script as a shell subprocess
!python scripts/compare_models.py

# Copy the whole experiments/ folder to Drive under results/experiments
# (presumably this is where the comparison output lands - TODO confirm)
save_all_results()

## Download Results

In [None]:
# Final save to Drive (backup): copies experiments/ to Drive again, as the
# comparison cell already did
save_all_results()

# Also zip the experiments/ folder and download the archive through the browser
!zip -r results.zip experiments/
from google.colab import files
files.download('results.zip')

print('[COMPLETE] All results saved to Google Drive: /MyDrive/sms-claim-check/results/')