In [1]:
!pip install s3fs



In [2]:
# Importing libraries & modules
import os
import torch
import json

from torch import nn
from transformers import DistilBertModel, DistilBertTokenizer
from sagemaker import get_execution_role

2025-01-21 04:11:01.288838: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [3]:
role = get_execution_role()
role

'arn:aws:iam::041434534908:role/service-role/AmazonSageMaker-ExecutionRole-20250111T113739'

In [4]:
# Re-create the model class - FT_DistilBERT() (FT stands for Fine-Tuned)
class FT_DistilBERT(nn.Module):
    def __init__(self, num_classes: int):
        super().__init__()
        self.block_1 = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.layer_2 = nn.Linear(in_features=768,
                                 out_features=768)
        self.activation = nn.ReLU()
        self.dropout = nn.Dropout(p=0.3)
        self.classifier_layer = nn.Linear(in_features=768,
                                          out_features=num_classes)


    def forward(self, input_ids, mask_ids):
        # 1. Send through the DistilBERT pre-trained model
        output = self.block_1(input_ids = input_ids,
                              attention_mask = mask_ids)
        hidden_state = output[0]
        pooler = hidden_state[:, 0]
        
        # 2. Send through the linear layer - this serves to increase the representational capacity of our model
        output = self.layer_2(pooler)
        # 3. Send through a non-linear activation function
        output = self.activation(output)
        # 4. Apply dropout to fight over-fitting
        output = self.dropout(output)
        # 5. Get the classification prediction (in logits)
        output = self.classifier_layer(output)

        return output

In [7]:
# Define a function to initialize a new model from the FT_DistilBERT class, and load the fine-tuned weights into it - this is specific for Notebooks where we need to define this extra code to get data from S3
import boto3
import os
import torch
from torch import nn
import tempfile

def model_fn(model_dir):
    """Load model from S3 or local path"""
    # 1. Initialize a new model
    model = FT_DistilBERT(num_classes=4)

    # 2. Check if the path is an S3 path
    if model_dir.startswith('s3://'):
        # Parse S3 path
        bucket_name = model_dir.split('/')[2]
        key = '/'.join(model_dir.split('/')[3:] + ['pytorch_distilbert_model_news.bin'])
        
        # Create S3 client
        s3_client = boto3.client('s3')
        
        # Create a temporary file to download the model
        with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
            try:
                # Download the file from S3
                s3_client.download_file(bucket_name, key, tmp_file.name)
                # Load the model weights
                model_state_dict = torch.load(tmp_file.name, map_location=torch.device('cpu'))
            finally:
                # Clean up the temporary file
                os.unlink(tmp_file.name)
    else:
        # Load from local path
        state_dict_location = os.path.join(model_dir, 'pytorch_distilbert_model_news.bin')
        model_state_dict = torch.load(state_dict_location, map_location=torch.device('cpu'))

    # 3. Apply the trained state_dict to our newly initialized model
    model.load_state_dict(model_state_dict)
    
    return model

In [8]:
# Usage example:
s3_path = 's3://tk5-huggingface-multiclass-textclassification-bucket/output/tk5-generated-output/huggingface-pytorch-training-2025-01-19-06-52-30-974/output'
model = model_fn(s3_path)

  model_state_dict = torch.load(tmp_file.name, map_location=torch.device('cpu'))


In [38]:
# Define a function to conduct inference
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

MAX_LEN = 512

def prediction_fn(model, input_data):

    # 0. Setting up some device-agnostic code
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    # 1. Tokenize the data
    data = tokenizer.encode_plus(text=input_data, 
                                 add_special_tokens=True, 
                                 max_length=MAX_LEN, 
                                 padding="max_length", 
                                 truncation=True, 
                                 return_attention_mask=True)

    input_ids = torch.tensor(data['input_ids']).to(device)
    attention_mask = torch.tensor(data['attention_mask']).to(device)

    # 2. Run the model with the data
    model.eval()

    with torch.inference_mode():
        logits = model(input_ids, attention_mask)
        probabilities = torch.softmax(logits, dim=1).cpu().numpy()
        class_names = ['BUSINESS', 'ENTERTAINMENT', 'HEALTH', 'SCIENCE'] # checked the correct order in 3.Script.ipynb
        pred_class = probabilities.argmax(axis=1)[0].item()
        pred_label = class_names[pred_class]

        probabilities_dict = {class_names[i]: float(probabilities[0, i]) for i in range(len(class_names))}

    return {'predicted_label': pred_label}, probabilities_dict



In [5]:
sample_input = {'inputs': 'Time travel is achievable - says top scientist from NASA'}
sample_input['inputs']

'Time travel is achievable - says top scientist from NASA'

In [40]:
prediction_fn(model=model, input_data=sample_input['inputs'])

({'predicted_label': 'SCIENCE'},
 {'BUSINESS': 0.0012946061324328184,
  'ENTERTAINMENT': 0.0004263822047505528,
  'HEALTH': 0.000539263419341296,
  'SCIENCE': 0.9977397918701172})

In [22]:
sample_tokenizer = tokenizer.encode_plus(sample_input['inputs'], 
                      add_special_tokens=True, 
                      max_length=MAX_LEN, 
                      padding="max_length", 
                      truncation=True, 
                      return_attention_mask=True)

In [27]:
sample_ids = torch.tensor(sample_tokenizer['input_ids'])

In [26]:
sample_mask = torch.tensor(sample_tokenizer['attention_mask'])

In [29]:
sample_logits = model(sample_ids, sample_mask)
sample_logits

tensor([[-1.4184, -2.5290, -2.2942,  5.2289]], grad_fn=<AddmmBackward0>)

In [32]:
sample_probs = torch.softmax(sample_logits, dim=1)
sample_probs, sample_probs.sum()

(tensor([[1.2946e-03, 4.2638e-04, 5.3926e-04, 9.9774e-01]],
        grad_fn=<SoftmaxBackward0>),
 tensor(1., grad_fn=<SumBackward0>))

In [33]:
class_names = ['BUSINESS', 'ENTERTAINMENT', 'HEALTH', 'SCIENCE']

In [34]:
sample_class = torch.argmax(sample_probs, dim=1)
sample_class, class_names[sample_class]

(tensor([3]), 'SCIENCE')

In [None]:
sample_prob_dict = {class_names[i]: sample_probs[i]

In [37]:
sample_probs[0, 0]

tensor(0.0013, grad_fn=<SelectBackward0>)

In [3]:
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")



In [9]:
tokenizer(sample_input['inputs'], return_tensors='pt', max_length=512, truncation=True, padding='max_length')

{'input_ids': tensor([[  101,  2051,  3604,  2003,  9353,  4048, 13331,  3468,  1011,  2758,
          2327,  7155,  2013,  9274,   102,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,  