# In [None]:
import boto3
import pandas as pd
import numpy as np
import sagemaker
from sagemaker import get_execution_role
from sagemaker.huggingface import HuggingFace
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import os
import logging

# Logging: configure the root logger at INFO and grab a module-scoped logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Shared configuration read by the data-preparation and training steps below.
bucket_name = "dair-ai-emotion"  # S3 bucket for the raw parquet, the CSV splits and the output
file_name = "train-00000-of-00001.parquet"  # S3 object key; also reused as the local filename
prefix = "distilbert-emotion"  # key prefix under which splits and training output are stored
role = get_execution_role()  # role handed to the estimator

# Service handles: a low-level S3 client and a SageMaker session.
s3 = boto3.client("s3")
sagemaker_session = sagemaker.Session()

# Data preparation: download the parquet file from S3, split it 80/20 into
# train/validation sets, write both as CSV and upload them under
# s3://<bucket_name>/<prefix>/{train,validation}.
#
# Defines `train_s3_path` and `val_s3_path` for the training step. Any failure
# is logged with its traceback and re-raised; the local scratch files are
# removed on both the success and the failure path.
logger.info("Downloading and preparing data")
_local_scratch_files = ('train.csv', 'validation.csv', file_name)
try:
    # Download data (the object key doubles as the local filename)
    s3.download_file(bucket_name, file_name, file_name)
    df = pd.read_parquet(file_name)
    df['label'] = df['label'].astype(int)

    # Split data; fixed random_state keeps the split reproducible
    train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

    # Save to CSV
    train_df.to_csv('train.csv', index=False)
    val_df.to_csv('validation.csv', index=False)

    logger.info("Uploading data to S3")
    # S3 prefixes (not object keys) that the training job is pointed at
    train_s3_path = f's3://{bucket_name}/{prefix}/train'
    val_s3_path = f's3://{bucket_name}/{prefix}/validation'

    sagemaker_session.upload_data('train.csv', bucket=bucket_name, key_prefix=f'{prefix}/train')
    sagemaker_session.upload_data('validation.csv', bucket=bucket_name, key_prefix=f'{prefix}/validation')

except Exception as e:
    # logger.exception records the traceback alongside the message.
    logger.exception("Error in data preparation: %s", e)
    raise
finally:
    # Clean up in `finally` so a failed download/upload does not leave
    # scratch files behind. A file may not exist if the failure happened
    # before it was created, so a missing file is deliberately ignored.
    for _scratch_path in _local_scratch_files:
        try:
            os.remove(_scratch_path)
        except FileNotFoundError:
            pass

# Training and deployment: build a Hugging Face estimator for DistilBERT, fit
# it on the uploaded train/validation prefixes, then deploy the result to the
# 'emotion-classifier' endpoint. Any failure is logged with its traceback and
# re-raised.
logger.info("Creating DistilBERT estimator")
try:
    distilbert_estimator = HuggingFace(
        entry_point='train.py',  # Ensure this script is compatible with DistilBERT
        transformers_version='4.6.1',
        pytorch_version='1.7.1',
        py_version='py36',
        hyperparameters={
            'model_name': 'distilbert-base-uncased',
            'epochs': 3,
            'train_batch_size': 32,
            'eval_batch_size': 32,
            'learning_rate': 2e-5,
        },
        role=role,
        instance_count=1,
        instance_type='ml.p3.2xlarge',  # Adjust instance type as needed
        output_path=f's3://{bucket_name}/{prefix}/output',
        # Reuse the session that uploaded the data rather than letting the
        # estimator create a second, implicit one.
        sagemaker_session=sagemaker_session,
    )

    logger.info("Starting training job")
    # Channel names map to the S3 prefixes written by the data-preparation step.
    distilbert_estimator.fit({
        'train': train_s3_path,
        'validation': val_s3_path,
    })

    logger.info("Training completed, deploying model")
    predictor = distilbert_estimator.deploy(
        initial_instance_count=1,
        instance_type='ml.m5.xlarge',
        endpoint_name='emotion-classifier',
    )

except Exception as e:
    # logger.exception records the traceback alongside the message.
    logger.exception("Error in training/deployment: %s", e)
    raise

logger.info("Process completed successfully!")