# Igbo Language Model Training - ON-DEMAND

**Updated:** Switched to on-demand instances for guaranteed completion

**Configuration:**
- Model: Mistral-7B-v0.1
- Instance: ml.g5.xlarge (on-demand)
- Duration: ~7 days (~175 hours estimated; `max_run` caps the job at 168 hours)
- Cost: ~$247 (~175 hours × $1.41/hour)

In [None]:
import sagemaker
from sagemaker.huggingface import HuggingFace
from datetime import datetime
import boto3

# Signal that the import cell ran to completion (check mark repaired from
# mis-decoded UTF-8).
print("✓ Imports loaded")

In [None]:
# Configuration: S3 bucket, base model, dataset/output locations and a
# timestamped job label, followed by a printed summary.
BUCKET_NAME = 'learn-igbo-ekpes-useast1'
MODEL_NAME = 'mistralai/Mistral-7B-v0.1'

# Dataset inputs and model output all live in the same bucket.
train_data_s3 = 's3://{}/datasets/nllb/nllb_train.jsonl'.format(BUCKET_NAME)
val_data_s3 = 's3://{}/datasets/nllb/nllb_val.jsonl'.format(BUCKET_NAME)
output_path = 's3://{}/models/igbo-llm'.format(BUCKET_NAME)

# Local wall-clock time at second resolution, e.g. 2024-01-31-09-15-00.
timestamp = f'{datetime.now():%Y-%m-%d-%H-%M-%S}'
job_name = 'igbo-nllb-ondemand-' + timestamp

# Emit the whole banner in one call; the text is line-for-line what
# individual print() calls would produce.
print("\n".join([
    "=" * 60,
    "CONFIGURATION - ON-DEMAND TRAINING",
    "=" * 60,
    "Region: us-east-1",
    "Model: " + MODEL_NAME,
    "Training: " + train_data_s3,
    "Validation: " + val_data_s3,
    "Output: " + output_path,
]))

In [None]:
# Hyperparameters - VERIFIED WORKING CONFIGURATION
# Base model, schedule, batching, sequence length and LoRA settings; this
# dict is what gets handed to the estimator as `hyperparameters`.
hyperparameters = dict(
    model_name='mistralai/Mistral-7B-v0.1',
    epochs=3,
    learning_rate=2e-4,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    max_length=256,
    lora_r=16,
    lora_alpha=32,
    lora_dropout=0.05,
)

# One indented "name: value" line per setting, in declaration order.
print("Hyperparameters:")
print("\n".join(
    f"  {name}: {value}" for name, value in hyperparameters.items()
))

In [None]:
# Create SageMaker session and estimator
#
# Every job setting is named once below and reused both for the estimator and
# for the printed summary, so the summary cannot drift from what is actually
# submitted.
REGION = 'us-east-1'
INSTANCE_TYPE = 'ml.g5.xlarge'
TRANSFORMERS_VERSION = '4.46'
PYTORCH_VERSION = '2.3'
MAX_RUN_SECONDS = 7 * 24 * 60 * 60  # 604800 s = 168 hours (7 days)
HOURLY_RATE_USD = 1.41              # on-demand rate used for the estimate
ESTIMATED_HOURS = 175               # expected training time behind the ~$247 figure

boto_session = boto3.Session(region_name=REGION)
session = sagemaker.Session(boto_session=boto_session)
role = sagemaker.get_execution_role()

estimator = HuggingFace(
    entry_point='train_igbo_model.py',
    instance_type=INSTANCE_TYPE,
    instance_count=1,
    role=role,
    sagemaker_session=session,
    transformers_version=TRANSFORMERS_VERSION,
    pytorch_version=PYTORCH_VERSION,
    py_version='py311',
    hyperparameters=hyperparameters,
    output_path=output_path,
    base_job_name='igbo-nllb-ondemand',
    max_run=MAX_RUN_SECONDS,
    use_spot_instances=False,    # ON-DEMAND - NO MORE SPOT!
)

max_run_hours = MAX_RUN_SECONDS / 3600
estimated_cost = ESTIMATED_HOURS * HOURLY_RATE_USD

print("\n" + "=" * 60)
print("🚀 ESTIMATOR CONFIGURED FOR ON-DEMAND TRAINING")
print("=" * 60)
print(f"Instance type: {estimator.instance_type}")
print(f"Spot instances: {estimator.use_spot_instances}")
print(f"Max runtime: {estimator.max_run / 3600:.0f} hours "
      f"({estimator.max_run / 86400:.0f} days)")
print(f"Region: {session.boto_region_name}")
print(f"Transformers: {TRANSFORMERS_VERSION}")
print(f"PyTorch: {PYTORCH_VERSION}")
print("\n💰 COST ESTIMATE:")
print(f"  Rate: ${HOURLY_RATE_USD:.2f}/hour (on-demand)")
# 175 hours is ~7.3 days, not 7; derive the day count instead of hard-coding it.
print(f"  Duration: ~{ESTIMATED_HOURS} hours (~{ESTIMATED_HOURS / 24:.1f} days)")
print(f"  Total: ~${estimated_cost:.0f}")

# max_run is a hard timeout: SageMaker terminates the job once it is reached,
# so an estimate above it means the run may be cut off before it finishes.
# TODO(review): either raise max_run (subject to the account's training
# runtime quota) or revise ESTIMATED_HOURS.
if ESTIMATED_HOURS > max_run_hours:
    print(f"\nWARNING: estimated duration (~{ESTIMATED_HOURS} h) exceeds "
          f"max_run ({max_run_hours:.0f} h); SageMaker stops the job when "
          f"max_run is reached.")

print("\n✅ BENEFITS:")
print("  • Starts immediately (no waiting)")
print("  • No interruptions")
print("  • Guaranteed completion")
print("  • Peace of mind")
print("=" * 60)

In [None]:
# Launch training
#
# Submitting the job incurs real cost, so it is guarded by a typed
# confirmation. fit(wait=False) does not block until the job finishes; the
# links printed afterwards are for following progress in the AWS console.
confirm = input("Type 'START' to launch on-demand training (~$247, 7 days): ")

# Ignore stray surrounding whitespace, but still require the exact word.
if confirm.strip() == 'START':
    print("\n🚀 Launching ON-DEMAND training...")

    estimator.fit(
        inputs={'train': train_data_s3, 'validation': val_data_s3},
        wait=False,
    )

    job_name = estimator.latest_training_job.name
    region = session.boto_region_name
    console_url = "https://console.aws.amazon.com"

    print("\n" + "=" * 60)
    print("✅ TRAINING JOB LAUNCHED!")
    print("=" * 60)
    print(f"\nJob name: {job_name}")
    print(f"Region: {region}")
    # Report the instance type the estimator was actually configured with.
    print(f"Instance: {estimator.instance_type} (on-demand)")
    print("\n📊 Monitor at:")
    print(f"{console_url}/sagemaker/home?region={region}#/jobs/{job_name}")
    print("\n📝 CloudWatch Logs:")
    print(f"{console_url}/cloudwatch/home?region={region}"
          f"#logStream:group=/aws/sagemaker/TrainingJobs;prefix={job_name}")
    print("\n✅ Training will start within 5-10 minutes")
    print("✅ Expected completion: ~7 days from now")
    print("✅ No interruptions - guaranteed completion")
    print("\n💰 Total cost: ~$247")
    print("\n🎉 You can close this notebook - training runs independently!")
    print("=" * 60)
else:
    print("❌ Cancelled - nothing was launched")

In [None]:
# Optional: Check job status
#
# Looks up the most recently launched training job and prints its status plus
# a running cost estimate based on billable time.
import boto3

# On-demand hourly rate used for the cost-so-far estimate.
hourly_rate_usd = 1.41

sm_client = boto3.client('sagemaker', region_name='us-east-1')

try:
    # Get the job name from the launch cell's estimator.
    job_name = estimator.latest_training_job.name
    response = sm_client.describe_training_job(TrainingJobName=job_name)
except (NameError, AttributeError):
    # NameError: `estimator` was never created in this kernel.
    # AttributeError: the estimator exists but has no launched job to name.
    print("No training job found. Run Cell 5 first to launch training.")
except sm_client.exceptions.ClientError as err:
    # AWS rejected the request (permissions, unknown job, throttling, ...).
    # Report it as such rather than claiming that no job exists.
    print(f"Could not describe training job {job_name}: {err}")
else:
    print(f"Job: {job_name}")
    print(f"Status: {response['TrainingJobStatus']}")
    print(f"Secondary Status: {response['SecondaryStatus']}")

    # Only report cost when the response carries a billable-time figure.
    if 'BillableTimeInSeconds' in response:
        billable_hours = response['BillableTimeInSeconds'] / 3600
        cost = billable_hours * hourly_rate_usd
        print(f"Billable time: {billable_hours:.1f} hours")
        print(f"Cost so far: ${cost:.2f}")