In [None]:
# Cell 1: Install dependencies
!pip install "transformers==4.34.0" "datasets[s3]==2.13.0" "sagemaker>=2.190.0" --upgrade --quiet

In [None]:
# Cell 2: Setup SageMaker session
import sagemaker
import boto3

# Create the SageMaker session and resolve the execution role and default
# bucket once; later cells read `sess`, `role` and `sagemaker_session_bucket`.
sess = sagemaker.Session()
role = sagemaker.get_execution_role()
sagemaker_session_bucket = sess.default_bucket()

# Echo the resolved environment so a wrong account/region is caught early.
print(
    f"sagemaker role arn: {role}",
    f"sagemaker bucket: {sagemaker_session_bucket}",
    f"sagemaker session region: {sess.boto_region_name}",
    sep="\n",
)

In [None]:
# Cell 3: Load and explore dataset
from datasets import load_dataset
import json

# Source JSON-lines file holding the raw NL/LTL pairs.
# NOTE(review): the object key reads "lifed_data" — possibly meant to be
# "lifted_data"; confirm against the bucket contents before changing it.
raw_data_uri = 's3://uav-ltl-data/lifed_data.jsonl'
dataset = load_dataset(path='json', data_files=raw_data_uri, split='train')

# Quick look at size, one row and the schema before any transformation.
num_rows = len(dataset)
print(f"Dataset size: {num_rows}")
first_row = dataset[0]
print(f"Sample: {first_row}")
schema = dataset.features
print(f"Features: {schema}")

In [None]:
# Cell 4: Format and split dataset
def format_ltl_sample(sample):
    """Render one raw record as an instruction-style training prompt.

    Args:
        sample: Mapping with 'logic_sentence' and 'logic_ltl' entries; each is
            joined with single spaces (assumes both are token sequences —
            TODO confirm against the dataset schema).

    Returns:
        A dict with a single "text" key holding the Instruction / Input /
        Output prompt, terminated by the literal "</s>" marker.
    """
    instruction = "Translate the following natural language description to Linear Temporal Logic (LTL):"

    # Same lookup order as the fields appear in the prompt: sentence, then LTL.
    nl_text, ltl_text = (
        ' '.join(sample[field]) for field in ('logic_sentence', 'logic_ltl')
    )

    prompt = f"""### Instruction
{instruction}

### Input
{nl_text}

### Output
{ltl_text}</s>"""

    return dict(text=prompt)

# Bind results to new names instead of overwriting `dataset`, so this cell can
# be re-run on the same kernel without re-executing the load cell first.
formatted_dataset = dataset.map(format_ltl_sample, remove_columns=dataset.column_names)

# 80/10/10 split: hold out 20%, then halve the hold-out into eval and test.
# Both splits use seed=42.
split_dataset = formatted_dataset.train_test_split(test_size=0.2, seed=42)
test_valid = split_dataset['test'].train_test_split(test_size=0.5, seed=42)

train_dataset = split_dataset['train']
eval_dataset = test_valid['train']
test_dataset = test_valid['test']

# Invariant: the three splits partition the formatted data exactly.
assert len(train_dataset) + len(eval_dataset) + len(test_dataset) == len(formatted_dataset), \
    "train/eval/test sizes do not add up to the full dataset"

print(f"Train: {len(train_dataset)}, Eval: {len(eval_dataset)}, Test: {len(test_dataset)}")

In [None]:
# Cell 5: Tokenize all splits
from transformers import AutoTokenizer
import sys
sys.path.append("../scripts/utils")
from pack_dataset import pack_dataset

# Base checkpoint; the same id is forwarded to the training job as a
# hyperparameter in a later cell.
model_id = "mistralai/Mistral-7B-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_id)


def tokenize_and_pack(dataset_split):
    """Tokenize the "text" column of a split and pack it into 2048-token chunks.

    Args:
        dataset_split: A split that still has a "text" column; the column is
            removed during tokenization.

    Returns:
        Whatever `pack_dataset` (imported from ../scripts/utils) returns for
        the tokenized split with chunk_length=2048.

    NOTE(review): `pack_dataset` is not visible here — confirm it keeps no
    state between calls, since it is invoked once per split.
    """
    def _tokenize_batch(batch):
        # Called with batches of rows because map() runs with batched=True.
        return tokenizer(batch["text"])

    tokenized_split = dataset_split.map(
        _tokenize_batch,
        batched=True,
        remove_columns=["text"],
    )
    return pack_dataset(tokenized_split, chunk_length=2048)

# Process all three splits before rebinding any name: if a later split fails,
# the earlier names still point at the untokenized data and the cell can be
# re-run after the fix (a tokenized split no longer has the "text" column that
# tokenize_and_pack removes).
train_dataset, eval_dataset, test_dataset = [
    tokenize_and_pack(text_split)
    for text_split in (train_dataset, eval_dataset, test_dataset)
]

# Lengths printed here are for the packed splits, not the original rows.
print(f"Packed - Train: {len(train_dataset)}, Eval: {len(eval_dataset)}, Test: {len(test_dataset)}")

In [None]:
# Cell 6: Upload all splits to S3
# Destination prefixes live under the session's default bucket, whose name was
# already resolved into `sagemaker_session_bucket` in the setup cell.
train_path = f's3://{sagemaker_session_bucket}/processed/mistral/ltl-translation/train'
eval_path = f's3://{sagemaker_session_bucket}/processed/mistral/ltl-translation/eval'
test_path = f's3://{sagemaker_session_bucket}/processed/mistral/ltl-translation/test'

# Write each split to its own prefix, in train / eval / test order.
for packed_split, s3_uri in (
    (train_dataset, train_path),
    (eval_dataset, eval_path),
    (test_dataset, test_path),
):
    packed_split.save_to_disk(s3_uri)

print(f"Train: {train_path}")
print(f"Eval: {eval_path}")
print(f"Test: {test_path}")

In [None]:
# Cell 7: Configure training with early stopping
# NOTE(review): no early-stopping key (e.g. a patience value) is set below —
# only best-checkpoint selection. Whether run_qlora.py adds early stopping
# itself is not visible from this notebook; confirm.
# The groups are merged in the order listed, so key order matches the sections.

# Model & Data
model_and_data_args = {
    'model_id': model_id,
    # Path matches the 'training' channel name used when calling fit().
    'dataset_path': '/opt/ml/input/data/training',
    'output_dir': '/tmp/run',
    'merge_adapters': True,
}

# Core Training
# 6 samples x 2 accumulation steps = 12 samples per optimizer step per device
# (assuming the script forwards these to the trainer unchanged — TODO confirm).
core_training_args = {
    'num_train_epochs': 5,
    'per_device_train_batch_size': 6,
    'gradient_accumulation_steps': 2,
}

# Learning Rate
learning_rate_args = {
    'learning_rate': 2e-4,
    'lr_scheduler_type': 'cosine',
    'warmup_ratio': 0.03,
    'max_grad_norm': 0.3,
}

# Memory/Speed
memory_speed_args = {
    'gradient_checkpointing': True,
    'bf16': True,
    'tf32': True,
    'use_flash_attn': True,
}

# Logging/Saving
logging_saving_args = {
    'logging_steps': 10,
    'save_strategy': 'steps',
    'save_steps': 200,
}

# Evaluation & Early Stopping
# eval_steps equals save_steps, so every checkpoint has an eval_loss to compare.
evaluation_args = {
    'evaluation_strategy': 'steps',
    'eval_steps': 200,
    'load_best_model_at_end': True,
    'metric_for_best_model': 'eval_loss',
}

hyperparameters = {
    **model_and_data_args,
    **core_training_args,
    **learning_rate_args,
    **memory_speed_args,
    **logging_saving_args,
    **evaluation_args,
}

# Used as the base name for the training job.
job_name = 'mistral-ltl-translation'

In [None]:
# Cell 8: Create training estimator
# No earlier cell imports the estimator class, so import it here; without this
# the cell raises NameError on a fresh kernel.
from sagemaker.huggingface import HuggingFace

# NOTE(review): confirm that transformers_version='4.37' together with
# pytorch_version='2.0' / py_version='py310' is an available Hugging Face
# training container combination in the target region.
huggingface_estimator = HuggingFace(
    entry_point='run_qlora.py',        # training script, resolved inside source_dir
    source_dir='../scripts',           # directory uploaded alongside the entry point
    instance_type='ml.g5.4xlarge',
    instance_count=1,
    max_run=2*24*60*60,                # 2 days, expressed in seconds
    base_job_name=job_name,
    role=role,
    volume_size=300,
    transformers_version='4.37',
    pytorch_version='2.0',
    py_version='py310',
    hyperparameters=hyperparameters,
    environment={"HUGGINGFACE_HUB_CACHE": "/tmp/.cache"},  # hub cache under /tmp
    disable_output_compression=True    # the deploy cell reads the uncompressed S3 prefix
)

In [None]:
# Cell 9: Start training with eval data
# Map input channel names to the S3 prefixes holding the packed splits.
# NOTE(review): the hyperparameters only name the 'training' channel path;
# confirm run_qlora.py actually reads the 'evaluation' channel.
data = {}
data['training'] = train_path
data['evaluation'] = eval_path

# wait=True keeps this cell blocked until the training job ends.
huggingface_estimator.fit(inputs=data, wait=True)

In [None]:
# Cell 10: Deploy model (optional - after training completes)
from sagemaker.huggingface import HuggingFaceModel, get_huggingface_llm_image_uri

# Serving container image, pinned to version 1.1.0 of the "huggingface" backend.
llm_image = get_huggingface_llm_image_uri("huggingface", version="1.1.0", session=sess)

# model_data is indexed as a dict here; the estimator was created with
# disable_output_compression=True, so the artifact is an S3 prefix.
model_s3_path = huggingface_estimator.model_data["S3DataSource"]["S3Uri"]

# Container environment: load weights from the local model directory on one GPU.
config = {
    'HF_MODEL_ID': "/opt/ml/model",
    'SM_NUM_GPUS': '1',
    'MAX_INPUT_LENGTH': '1024',
    'MAX_TOTAL_TOKENS': '2048',
}

# Point the model at the uncompressed training output.
uncompressed_model_source = {
    'S3DataSource': {
        'S3Uri': model_s3_path,
        'S3DataType': 'S3Prefix',
        'CompressionType': 'None',
    }
}

llm_model = HuggingFaceModel(
    role=role,
    image_uri=llm_image,
    model_data=uncompressed_model_source,
    env=config,
)

In [None]:
# Cell 11: Test inference
# Creates a real-time endpoint, then sends one prompt through it.
# NOTE(review): none of the cells visible here delete the endpoint; remember to
# call llm.delete_model() / llm.delete_endpoint() when finished to stop billing.
llm = llm_model.deploy(
    initial_instance_count=1,
    instance_type='ml.g5.2xlarge',
    container_startup_health_check_timeout=300,
)

# Test translation
# The prompt repeats the Instruction / Input / Output layout used when the
# training text was formatted, stopping right after the "### Output" header.
test_input = "Globally, everytime when prop_2 and prop_1 then prop_3"
prompt = f"""### Instruction
Translate the following natural language description to Linear Temporal Logic (LTL):

### Input
{test_input}

### Output
"""

# NOTE(review): temperature/top_p are set, so output may vary between runs —
# confirm sampling is wanted for a formal-logic translation.
generation_parameters = {
    "max_new_tokens": 256,
    "temperature": 0.7,
    "top_p": 0.9
}
payload = {"inputs": prompt, "parameters": generation_parameters}

response = llm.predict(payload)
first_candidate = response[0]
print(first_candidate['generated_text'])