### Environment Setup

In [1]:
import pandas as pd
import sagemaker
import boto3
from sagemaker.s3 import S3Uploader
from sagemaker.huggingface import HuggingFace
from sagemaker import get_execution_role



sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [2]:
# Session, caller account and IAM role shared by the cells below.
sagemaker_session = sagemaker.Session()

# AWS account ID of the current caller, looked up through STS.
account = (
    boto3.client("sts")
    .get_caller_identity()
    .get("Account")
)

# Execution role that is later handed to the estimator as `role=`.
role = get_execution_role()

### Step 1: Upload the dataset to an S3 bucket

### Step 2: Preprocess the dataset using AWS Glue

### Step 3: Train the machine learning model using SageMaker

In [None]:
# Step 3.1: S3 locations used by the training job.
# The bucket name is defined once so that pointing the notebook at another
# bucket is a one-line change; the resulting URIs are identical to the
# previously hard-coded ones.
S3_BUCKET = "toxic-meme-classification"

# Training CSV (presumably produced by the preprocessing step -- confirm).
train_data_path = f"s3://{S3_BUCKET}/feature_dataset/cleaned_train.csv"

# Prefix handed to the estimator as `output_path`.
output_path = f"s3://{S3_BUCKET}/model_artifacts/"

In [None]:
# Step 3.2: hyperparameters handed to the estimator via `hyperparameters=`.
model_name = "distilbert-base-cased"

hyperparameters = dict(
    epochs=3,
    train_batch_size=32,
    model_name=model_name,  # same checkpoint name as defined above
)

In [None]:
# Step 3.3: configure the Hugging Face estimator that runs train.py.
# `source_dir` and `metric_definitions` are left unset.

huggingface_estimator = HuggingFace(
    # Training script and the hyperparameters defined in step 3.2.
    entry_point="train.py",
    hyperparameters=hyperparameters,
    # IAM role and compute for the training job: a single ml.p3.2xlarge instance.
    role=role,
    instance_type="ml.p3.2xlarge",
    instance_count=1,
    max_run=20000,  # runtime cap; in seconds per the SageMaker SDK docs (~5.5 h)
    # Container versions, kept as a matching set. Per the original cell's notes
    # they were picked so the `evaluate` package is usable -- confirm against train.py.
    transformers_version="4.26",
    pytorch_version="1.13",
    py_version="py39",
    # S3 prefix from step 3.1 that receives the job output.
    output_path=output_path,
)


## Train model

In [8]:
# Launch the training job; the "train" channel points at the CSV in S3.
huggingface_estimator.fit(inputs={"train": train_data_path})

2025-04-16 15:32:19 Starting - Starting the training job......
2025-04-16 15:32:56 Pending - Training job waiting for capacity......
2025-04-16 15:34:24 Pending - Preparing the instances for training......
2025-04-16 15:35:17 Downloading - Downloading the training image.....................
2025-04-16 15:38:55 Training - Training image download completed. Training in progress....[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
  "cipher": algorithms.TripleDES,[0m
  "class": algorithms.TripleDES,[0m
[34m2025-04-16 15:39:12,360 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2025-04-16 15:39:12,382 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2025-04-16 15:39:12,398 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2025-04-16 15:39:12,401 sagemaker_pytorc

#### Training took about 1 hour to complete

## Deploy model

In [9]:
# Deploy the trained model to an endpoint backed by one ml.m5.xlarge instance
# and keep the returned predictor for the testing cell.
# NOTE(review): the endpoint name is fixed, so re-running this cell while the
# endpoint still exists presumably fails -- confirm. The endpoint also likely
# keeps incurring cost until it is deleted.
predictor = huggingface_estimator.deploy(
    endpoint_name="toxic-meme-classification",
    instance_type="ml.m5.xlarge",
    initial_instance_count=1,
)

---------!

## Testing

In [None]:
# Request payload: a mix of hostile and friendly sentences under the "inputs" key.
data = {
    "inputs": [
        "Hey, you are so stupid. I feel like beating you up",
        "The weather today is so good. I cannot wait to go outside",
        "You are such a lovely person",
        "You all are bunch of idiots!",
    ]
}

# Left as the last expression so the predictions are shown as the cell output.
predictor.predict(data)

[{'label': 'toxic', 'score': 0.9994388222694397},
 {'label': 'not-toxic', 'score': 0.9998989105224609},
 {'label': 'not-toxic', 'score': 0.999161958694458},
 {'label': 'toxic', 'score': 0.9997290968894958}]