In [None]:
import sagemaker
from sagemaker.huggingface import HuggingFace
from datetime import datetime
import boto3

# Status line confirming the import cell ran. The check mark was stored as
# mis-decoded UTF-8 ("‚úì"); restored to the intended character.
print("✓ Imports loaded")

In [None]:
# --- Storage bucket and base-model identifiers ---
BUCKET_NAME = 'learn-igbo-ekpes-useast1'
MODEL_NAME = 'mistralai/Mistral-7B-v0.1'  # switched from v0.2 to v0.1

# Every S3 location used below lives under the same bucket.
_bucket_uri = 's3://' + BUCKET_NAME
train_data_s3 = _bucket_uri + '/datasets/nllb/nllb_train.jsonl'
val_data_s3 = _bucket_uri + '/datasets/nllb/nllb_val.jsonl'
output_path = _bucket_uri + '/models/igbo-llm'

# Second-resolution timestamp (naive datetime.now()) used to build a job name.
timestamp = f'{datetime.now():%Y-%m-%d-%H-%M-%S}'
job_name = 'igbo-nllb-' + timestamp

# Configuration banner, one line per entry.
_rule = "=" * 60
for _line in (
    _rule,
    "CONFIGURATION - MISTRAL 7B v0.1",
    _rule,
    "Region: us-east-1",
    "Model: " + MODEL_NAME,
    "Training: " + train_data_s3,
):
    print(_line)

In [None]:
# Hyperparameters handed to the HuggingFace estimator.
hyperparameters = {
    # Reuse the MODEL_NAME constant from the configuration cell so the model
    # that is printed and the model that is trained cannot drift apart.
    'model_name': MODEL_NAME,
    'epochs': 3,
    'learning_rate': 2e-4,
    'per_device_train_batch_size': 1,    # lowered from 2 to 1
    'gradient_accumulation_steps': 16,   # raised from 8 to 16; effective batch stays 1 * 16 = 16
    'max_length': 256,                   # lowered from 512 to 256
    'lora_r': 16,
    'lora_alpha': 32,
    'lora_dropout': 0.05,
}

# Echo the settings so they are visible in the notebook output.
print("Hyperparameters for Mistral 7B:")
for name, value in hyperparameters.items():
    print(f"  {name}: {value}")

In [None]:
# Create the boto3 / SageMaker sessions pinned to us-east-1.
boto_session = boto3.Session(region_name='us-east-1')
session = sagemaker.Session(boto_session=boto_session)
# Resolve the execution role through the same regional session. Called with no
# argument, get_execution_role() creates its own default session instead of
# using the us-east-1 one built above.
role = sagemaker.get_execution_role(sagemaker_session=session)

# Framework versions are named once so the estimator arguments and the
# confirmation message below cannot disagree.
TRANSFORMERS_VERSION = '4.46'
PYTORCH_VERSION = '2.3'  # raised from 2.1 to 2.3

# Time limits in seconds, spelled out as days for readability.
MAX_RUN_SECONDS = 5 * 24 * 60 * 60   # 432000 s = 5 days
MAX_WAIT_SECONDS = 6 * 24 * 60 * 60  # 518400 s = 6 days
# NOTE(review): the cost cell estimates ~150-180 hours of training, which is
# longer than the 120-hour max_run above. Confirm the job can finish, or be
# resumed from its checkpoints, within this limit.

estimator = HuggingFace(
    entry_point='train_igbo_model.py',
    instance_type='ml.g5.xlarge',
    instance_count=1,
    role=role,
    sagemaker_session=session,
    transformers_version=TRANSFORMERS_VERSION,
    pytorch_version=PYTORCH_VERSION,
    py_version='py311',
    hyperparameters=hyperparameters,
    output_path=output_path,
    base_job_name='igbo-nllb-mistral',
    max_run=MAX_RUN_SECONDS,
    use_spot_instances=True,
    max_wait=MAX_WAIT_SECONDS,
    # Checkpoints go under the timestamped job_name from the configuration
    # cell, not under the job name SageMaker derives from base_job_name.
    checkpoint_s3_uri=f'{output_path}/checkpoints/{job_name}/',
)

print(f"✓ Estimator created with transformers {TRANSFORMERS_VERSION} + pytorch {PYTORCH_VERSION}!")

In [None]:
# Rough spot-pricing estimate displayed before the launch prompt.
EST_TRAINING_HOURS = 180        # upper end of the ~150-180 hour estimate
SPOT_RATE_USD_PER_HOUR = 0.42   # NOTE(review): confirm against current spot pricing
expected_cost = EST_TRAINING_HOURS * SPOT_RATE_USD_PER_HOUR

print("=" * 60)
print("ESTIMATED COSTS - MISTRAL 7B")
print("=" * 60)
# Report the instance type the estimator actually requests; the previous
# hard-coded text said ml.g5.2xlarge while the estimator uses ml.g5.xlarge.
print(f"Instance: {estimator.instance_type} (NVIDIA A10G GPU)")
print("Model: Mistral 7B (larger, better quality)")
print("Training time: ~150-180 hours (6-7 days)")
print()
print(f"Spot rate: ${SPOT_RATE_USD_PER_HOUR:.2f}/hour")
# The total is computed rather than typed by hand so it tracks the constants.
print(f"Expected cost: {EST_TRAINING_HOURS} hrs × ${SPOT_RATE_USD_PER_HOUR:.2f} = ${expected_cost:.2f}")
print()
print("💰 COST: ~$70-80 (vs $50-63 for Llama)")
print("🎯 QUALITY: Better (7B vs 1B parameters)")
print("=" * 60)

In [None]:
# Require an explicit, case-sensitive "YES" before submitting the job.
# The figures in the prompt match the Mistral 7B estimate from the cost cell;
# the earlier text still quoted the Llama numbers ($50-63, 5 days).
confirm = input("Type 'YES' to start training (~$70-80, 6-7 days): ")

# Surrounding whitespace is ignored so a stray space does not cancel the launch.
if confirm.strip() == 'YES':
    print("\n🚀 Launching training with SPOT instances...")

    # wait=False: fit() is asked not to block on the training job.
    estimator.fit(
        inputs={'train': train_data_s3, 'validation': val_data_s3},
        wait=False,
    )

    # NOTE: this rebinds job_name from the timestamped value set in the
    # configuration cell to the name of the submitted job. Re-running the
    # estimator cell afterwards would build checkpoint_s3_uri from this name.
    job_name = estimator.latest_training_job.name
    region = session.boto_region_name

    print("=" * 60)
    print("✓ TRAINING JOB STARTED!")
    print("=" * 60)
    print(f"\nJob name: {job_name}")
    print(f"Region: {region}")
    # Taken from the estimator so the message cannot disagree with the request.
    print(f"Instance: {estimator.instance_type} (spot)")
    print("\nMonitor:")
    print(f"https://console.aws.amazon.com/sagemaker/home?region={region}#/jobs/{job_name}")
    print("\nCloudWatch Logs:")
    print(f"https://console.aws.amazon.com/cloudwatch/home?region={region}#logStream:group=/aws/sagemaker/TrainingJobs;prefix={job_name}")
    print("\n✓ You can close this notebook - training runs independently!")
    print("✓ Check back in ~6-7 days for your trained model!")
    print("✓ Estimated cost: $70-80")
else:
    print("Cancelled")