In [1]:
import torch
from datasets import load_dataset
from collections import Counter
import numpy as np
import time
import gc

# Single seed shared by the dataset shuffle/split and the framework RNGs below.
RANDOM_SEED = 42

# Seed NumPy and PyTorch so that weight initialisation and dropout start from
# the same random state on every run (torch.manual_seed covers all devices).
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Prefer the GPU when one is visible; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

def load_and_prepare_data():
    """Download the support-ticket corpus and carve it into three splits.

    Only English tickets whose queue is one of five departments are kept.
    The rows are shuffled and split 80/10/10 into train/validation/test,
    every random step using RANDOM_SEED.

    Returns:
        A tuple (train_ds, val_ds, test_ds, label_list, label2id, id2label).
        label_list holds the sorted queue names present in the training
        split; the two dicts map queue name -> id and id -> queue name.
    """
    # Departments kept for classification: "Technical Support",
    # "Customer Service", "Billing and Payments", "Sales and Pre-Sales",
    # "General Inquiry".
    wanted_queues = [
        "Technical Support",
        "Customer Service",
        "Billing and Payments",
        "Sales and Pre-Sales",
        "General Inquiry",
    ]

    tickets = load_dataset("Tobi-Bueck/customer-support-tickets")["train"]

    # English only, then restrict to the wanted departments.
    tickets = tickets.filter(lambda row: row["language"] == "en")
    tickets = tickets.filter(lambda row: row["queue"] in wanted_queues)

    # Shuffle, hold out 20%, then halve the hold-out into validation and test.
    tickets = tickets.shuffle(seed=RANDOM_SEED)
    outer_split = tickets.train_test_split(test_size=0.2, seed=RANDOM_SEED)
    inner_split = outer_split["test"].train_test_split(test_size=0.5, seed=RANDOM_SEED)
    train_ds, val_ds, test_ds = (
        outer_split["train"],
        inner_split["train"],
        inner_split["test"],
    )

    # Label vocabulary comes from the training split only.
    train_queues = train_ds["queue"]
    label_list = sorted(set(train_queues))
    label2id = {name: idx for idx, name in enumerate(label_list)}
    id2label = dict(enumerate(label_list))

    print("Label distribution (train):")
    print(Counter(train_queues))

    return train_ds, val_ds, test_ds, label_list, label2id, id2label

import torch
import numpy as np
from datasets import Dataset
from collections import Counter
import string

# Load data: the three dataset splits plus the label list and the
# queue-name <-> integer-id mappings returned by load_and_prepare_data().
train_ds, val_ds, test_ds, label_list, label2id, id2label = load_and_prepare_data()


import torch
from torch.utils.data import DataLoader
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from torch.optim import AdamW
from sklearn.metrics import accuracy_score
from datasets import Dataset
from transformers import DataCollatorWithPadding
import numpy as np

## encode_dataset: padding=False leaves padding to the data collator (per batch), for more efficient training

# Tokenizer plus a collator that pads each batch to its own longest sequence.
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
data_collator = DataCollatorWithPadding(tokenizer)
num_labels = len(label_list)

# Passing the label maps stores the queue names in the model config, so a
# checkpoint written with save_pretrained() reports real queue names instead
# of generic LABEL_<n> placeholders.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

# Rebinds `device` as a torch.device and moves the model onto it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


# ---------------------------
# Encode dataset for BERT
# ---------------------------
def encode_dataset(ds):
    """Tokenize one ticket split and attach integer class labels.

    Subject and body of every row are joined with a single space, then
    tokenized with truncation (max_length=512) and no padding. Queue names
    are translated to ids through the module-level label2id mapping.

    Returns:
        A new Dataset holding the tokenizer output plus a "labels" column.
    """
    merged_texts = []
    for subject, body in zip(ds["subject"], ds["body"]):
        merged_texts.append(f"{subject} {body}")

    tokenized = tokenizer(
        merged_texts,
        truncation=True,
        padding=False,
        max_length=512,
    )
    tokenized["labels"] = [label2id[queue_name] for queue_name in ds["queue"]]
    return Dataset.from_dict(tokenized)

train_enc = encode_dataset(train_ds)
val_enc = encode_dataset(val_ds)
test_enc = encode_dataset(test_ds)

# ---------------------------
# DataLoaders
# ---------------------------
# The encoded rows are unpadded and of varying length, so every loader needs
# the padding collator; the default collate function cannot batch them.
# The training loader reshuffles each epoch, driven by a dedicated generator
# seeded with RANDOM_SEED so the batch order is repeatable.
train_loader = DataLoader(
    train_enc,
    batch_size=16,
    shuffle=True,
    generator=torch.Generator().manual_seed(RANDOM_SEED),
    collate_fn=data_collator,
)
val_loader = DataLoader(val_enc, batch_size=16, collate_fn=data_collator)
test_loader = DataLoader(test_enc, batch_size=16, collate_fn=data_collator)

# ---------------------------
# Optimizer
# ---------------------------
optimizer = AdamW(model.parameters(), lr=5e-5)

# ---------------------------
# Training loop
# ---------------------------
start_time = time.time()
epochs = 4
model.train()
for epoch in range(epochs):
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Supplying labels makes the model compute the classification loss itself.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Mean per-batch loss for the epoch.
    print(f"Epoch {epoch+1} | Loss: {total_loss/len(train_loader):.4f}")

time_cls_train = time.time() - start_time

# CUDA memory statistics are only meaningful (and only safe to reset) when a
# GPU is present; on the CPU fallback the peak is reported as 0 MB.
if torch.cuda.is_available():
    mem_cls_train = torch.cuda.max_memory_allocated(device) / (1024**2) # Convert to MB
else:
    mem_cls_train = 0.0
print(f"Peak GPU Memory during training: {mem_cls_train:.2f} MB")

# Release cached blocks and restart peak tracking so the inference
# measurement that follows is not polluted by the training peak.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()
# ---------------------------
# Prediction function
# ---------------------------
# The inference timer starts here; the model is put into evaluation mode.
start_time = time.time()
model.eval()


def predict_email(subject, body):
    """Predict the queue name for a single ticket.

    The subject and body are joined with one space (the same format used when
    encoding the training data), tokenized with truncation at max_length=512,
    and run through the model without gradient tracking.

    Returns:
        The queue name that id2label assigns to the highest-probability class.
    """
    encoded = tokenizer(
        f"{subject} {body}",
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512,
    )
    encoded = encoded.to(device)

    with torch.no_grad():
        logits = model(**encoded).logits
        class_probs = torch.softmax(logits, dim=-1)
        best_id = torch.argmax(class_probs, dim=-1).item()

    return id2label[best_id]

# ---------------------------
# Evaluation
# ---------------------------
# Walk the test split once, collecting the gold label and the prediction
# for each ticket in the same pass.
true_labels = []
pred_labels = []
for ex in test_ds:
    true_labels.append(ex["queue"])
    pred_labels.append(predict_email(ex["subject"], ex["body"]))

acc_cls = accuracy_score(true_labels, pred_labels)
time_cls_inf = time.time() - start_time

# CUDA memory statistics only exist when a GPU is present; report 0 MB on CPU.
if torch.cuda.is_available():
    mem_cls_inf = torch.cuda.max_memory_allocated(device) / (1024**2) # Convert to MB
else:
    mem_cls_inf = 0.0
print(f"memory used: {mem_cls_inf:.2f} MB")
print(f"Inference Time: {time_cls_inf:.2f}s")
print(f"Discriminative BERT classifier accuracy: {acc_cls:.4f}")

# Move the weights off the accelerator before releasing its memory.
model.to("cpu")


# Force Python's garbage collector to run
gc.collect()

# Clear the actual VRAM on the GPU (skipped on the CPU fallback, where the
# CUDA runtime is not available)
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()

README.md: 0.00B [00:00, ?B/s]

aa_dataset-tickets-multi-lang-5-2-50-ver(…):   0%|          | 0.00/26.0M [00:00<?, ?B/s]

(…)set-tickets-german_normalized_50_5_2.csv: 0.00B [00:00, ?B/s]

dataset-tickets-multi-lang-4-20k.csv:   0%|          | 0.00/18.8M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/61765 [00:00<?, ? examples/s]

Filter:   0%|          | 0/61765 [00:00<?, ? examples/s]

Filter:   0%|          | 0/28261 [00:00<?, ? examples/s]

Label distribution (train):
Counter({'Technical Support': 6476, 'Customer Service': 3471, 'Billing and Payments': 2307, 'Sales and Pre-Sales': 655, 'General Inquiry': 340})


2026-01-07 17:28:15.737798: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1767806895.932596      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1767806895.986891      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1767806896.461053      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767806896.461100      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767806896.461103      55 computation_placer.cc:177] computation placer alr

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 | Loss: 0.9229
Epoch 2 | Loss: 0.7529
Epoch 3 | Loss: 0.5283
Epoch 4 | Loss: 0.3476
Peak GPU Memory during training: 2944.83 MB
memory used: 1054.59 MB
Inference Time: 9.88s
Discriminative BERT classifier accuracy: 0.7960


In [20]:
# =====================================================
# KAGGLE-READY SAGEMAKER DEPLOYMENT SCRIPT
# =====================================================

# Install required packages (run in Kaggle notebook cell)
# !pip install -q boto3 sagemaker

import os
import json
import torch
import tarfile
from pathlib import Path

# =====================================================
# STEP 1: Load AWS Credentials from Kaggle Secrets
# =====================================================

def setup_aws_credentials(access_key_secret='AWS_ACCESS_KEY_ID',
                          secret_key_secret='AWS_SECRET_ACCESS_KEY',
                          region='eu-north-1'):
    """
    Load AWS credentials from Kaggle secrets and verify them with STS.

    The credentials are read from the notebook's Kaggle secrets (never from
    literals in the notebook) and exported as the standard AWS environment
    variables so that later boto3 clients pick them up.

    Args:
        access_key_secret: Label of the Kaggle secret holding the access key id.
        secret_key_secret: Label of the Kaggle secret holding the secret key.
            NOTE(review): both default labels are assumptions - they must
            match the labels configured for this notebook's secrets.
        region: Value exported as AWS_DEFAULT_REGION.

    Returns:
        The AWS account id of the authenticated identity, or None when the
        secrets cannot be read or authentication fails.
    """
    from kaggle_secrets import UserSecretsClient
    import boto3

    user_secrets = UserSecretsClient()

    # Get credentials from Kaggle secrets
    try:
        aws_access_key = user_secrets.get_secret(access_key_secret)
        aws_secret_key = user_secrets.get_secret(secret_key_secret)
    except Exception as e:
        print(f"✗ Could not read Kaggle secrets: {e}")
        return None

    # Empty credentials can only produce a confusing authentication error later.
    if not aws_access_key or not aws_secret_key:
        print("✗ Kaggle secrets for the AWS credentials are empty")
        return None

    # Set as environment variables
    os.environ['AWS_ACCESS_KEY_ID'] = aws_access_key
    os.environ['AWS_SECRET_ACCESS_KEY'] = aws_secret_key
    os.environ['AWS_DEFAULT_REGION'] = region

    print("✓ AWS credentials loaded from Kaggle secrets")

    # Test credentials
    try:
        sts = boto3.client('sts')
        identity = sts.get_caller_identity()
        print(f"✓ Authenticated as: {identity['Arn']}")
        print(f"✓ Account ID: {identity['Account']}")
        return identity['Account']
    except Exception as e:
        print(f"✗ Authentication failed: {e}")
        return None

# Run this first! Stores the authenticated AWS account id, or None when
# authentication failed.
account_id = setup_aws_credentials()


# =====================================================
# STEP 2: Save Model with All Required Files
# =====================================================

def save_model_for_kaggle(model, tokenizer, label2id, id2label, save_dir='./deployment_model'):
    """
    Persist the model, tokenizer and label maps into ``save_dir``.

    The directory is created when missing. The model is moved to the CPU
    before it is written (this affects the caller's model object). The label
    maps are stored in ``label_mapping.json`` with the ids of ``id2label``
    converted to strings.

    Returns:
        The ``save_dir`` that was written to.
    """
    print(f"Saving model to {save_dir}...")

    os.makedirs(save_dir, exist_ok=True)

    # Weights go to the CPU first (important in Kaggle!)
    model.to('cpu')

    # Model and tokenizer are both written into the same directory.
    for artefact in (model, tokenizer):
        artefact.save_pretrained(save_dir)

    # JSON object keys are strings, so the integer ids are stringified.
    mapping_payload = {
        'label2id': label2id,
        'id2label': {str(idx): name for idx, name in id2label.items()},
    }
    with open(f'{save_dir}/label_mapping.json', 'w') as handle:
        json.dump(mapping_payload, handle)

    print(f"✓ Model saved successfully")
    print(f"✓ Files: {os.listdir(save_dir)}")

    return save_dir

# Example usage (add this after your training code):
# save_dir = save_model_for_kaggle(model, tokenizer, label2id, id2label)


# =====================================================
# STEP 3: Create Inference Script
# =====================================================

def create_inference_files(save_dir='./deployment_model'):
    """
    Create inference.py and requirements.txt under ``<save_dir>/code``.

    The ``code`` directory is created when missing. ``inference.py`` defines
    the four serving hooks model_fn / input_fn / predict_fn / output_fn.
    Both files are written as UTF-8 because the script contains non-ASCII
    characters; relying on the platform default encoding could corrupt it or
    fail outright on a non-UTF-8 locale.

    Args:
        save_dir: Directory that holds (or will hold) the saved model.
    """
    code_dir = f'{save_dir}/code'
    os.makedirs(code_dir, exist_ok=True)
    
    # Inference script
    inference_code = '''import json
import torch
from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast
import os

def model_fn(model_dir):
    """Load the model for inference"""
    print(f"Loading model from {model_dir}")
    
    # Load model and tokenizer
    model = DistilBertForSequenceClassification.from_pretrained(model_dir)
    tokenizer = DistilBertTokenizerFast.from_pretrained(model_dir)
    
    # Load label mappings
    with open(f'{model_dir}/label_mapping.json', 'r') as f:
        label_mapping = json.load(f)
    id2label = {int(k): v for k, v in label_mapping['id2label'].items()}
    
    # Move to GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()
    
    print(f"✓ Model loaded on {device}")
    
    return {
        'model': model,
        'tokenizer': tokenizer,
        'id2label': id2label,
        'device': device
    }

def input_fn(request_body, request_content_type):
    """Parse input data"""
    if request_content_type == 'application/json':
        input_data = json.loads(request_body)
        if isinstance(input_data, dict):
            return [input_data]
        elif isinstance(input_data, list):
            return input_data
        else:
            raise ValueError("Input must be dict or list of dicts")
    else:
        raise ValueError(f"Unsupported content type: {request_content_type}")

def predict_fn(input_data, model_dict):
    """Make predictions"""
    model = model_dict['model']
    tokenizer = model_dict['tokenizer']
    id2label = model_dict['id2label']
    device = model_dict['device']
    
    # Prepare texts (subject + body format from your training)
    texts = [f"{item['subject']} {item['body']}" for item in input_data]
    
    # Tokenize
    inputs = tokenizer(
        texts,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512
    ).to(device)
    
    # Predict
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.softmax(outputs.logits, dim=-1)
        pred_ids = torch.argmax(probs, dim=-1).cpu().numpy()
        confidences = torch.max(probs, dim=-1).values.cpu().numpy()
    
    # Format results
    results = []
    for i, pred_id in enumerate(pred_ids):
        results.append({
            'predicted_queue': id2label[pred_id],
            'confidence': float(confidences[i]),
            'all_probabilities': {
                id2label[j]: float(probs[i][j].item())
                for j in range(len(id2label))
            }
        })
    
    return results

def output_fn(prediction, response_content_type):
    """Format output"""
    return json.dumps(prediction)
'''
    
    # The script contains "✓", so the encoding must be explicit.
    with open(f'{code_dir}/inference.py', 'w', encoding='utf-8') as f:
        f.write(inference_code)
    
    # Requirements file
    # NOTE(review): these pins may differ from the versions preinstalled in
    # the serving image chosen at deploy time - confirm they are intended.
    requirements = """transformers==4.26.0
torch==2.0.0
"""
    
    with open(f'{code_dir}/requirements.txt', 'w', encoding='utf-8') as f:
        f.write(requirements)
    
    print(f"✓ Inference files created in {code_dir}")

# create_inference_files()

# =====================================================
# STEP 4: Package and Upload to S3
# =====================================================

def package_and_upload_to_s3(save_dir='./deployment_model',bucket_name="distilbert-routing-emails-ok-2026",s3_prefix='distilbert-tickets'):
    """
    Package the model directory as model.tar.gz and upload it to S3.

    The contents of ``save_dir`` are archived at the root of the tarball,
    which is written to the current working directory and then uploaded to
    ``s3://<bucket_name>/<s3_prefix>/model.tar.gz``.

    Args:
        save_dir: Directory containing the saved model and code.
        bucket_name: Destination S3 bucket.
        s3_prefix: Key prefix inside the bucket.

    Returns:
        The ``s3://`` URL of the uploaded archive.
    """
    import threading
    import boto3
    
    # Create tar.gz
    tar_filename = 'model.tar.gz'
    print(f"Creating {tar_filename}...")
    
    with tarfile.open(tar_filename, 'w:gz') as tar:
        tar.add(save_dir, arcname='.')
    
    bytes_per_mb = 1024*1024
    file_size = os.path.getsize(tar_filename) / bytes_per_mb
    print(f"✓ Model packaged: {file_size:.2f} MB")
    
    # Upload to S3
    s3_client = boto3.client('s3')
    s3_key = f'{s3_prefix}/model.tar.gz'
    
    # boto3 calls the callback with the bytes sent since the previous call,
    # so a running total is kept to show real progress. The lock guards the
    # counter because the transfer may use several threads.
    uploaded_bytes = 0
    progress_lock = threading.Lock()

    def _report_progress(chunk_bytes):
        nonlocal uploaded_bytes
        with progress_lock:
            uploaded_bytes += chunk_bytes
            print(f"  Uploaded {uploaded_bytes / bytes_per_mb:.1f} / {file_size:.1f} MB", end='\r')
    
    print(f"Uploading to S3 (this may take a few minutes)...")
    s3_client.upload_file(
        tar_filename, 
        bucket_name, 
        s3_key,
        Callback=_report_progress
    )
    
    model_data_url = f's3://{bucket_name}/{s3_key}'
    print(f"\n✓ Upload complete!")
    print(f"  S3 URL: {model_data_url}")
    
    return model_data_url

# Example:
# model_url = package_and_upload_to_s3(bucket_name='my-sagemaker-bucket')


# =====================================================
# STEP 5: Deploy to SageMaker
# =====================================================

def deploy_model_from_kaggle(model_data_url,
                              role_arn,
                              endpoint_name='distilbert-tickets',
                              instance_type='ml.m5.xlarge'):
    """
    Create a SageMaker model, endpoint config and endpoint for the archive.

    Several serving image tags are tried in order until create_model
    succeeds. The call then blocks until the endpoint is in service.

    Args:
        model_data_url: ``s3://`` URL of the model.tar.gz archive.
        role_arn: IAM role ARN passed to SageMaker as the execution role.
        endpoint_name: Name of the endpoint to create.
        instance_type: Instance type for the single production variant.

    Returns:
        ``endpoint_name`` on success, or None when any step fails.
    """
    import boto3
    import time
    
    sagemaker_client = boto3.client('sagemaker')
    # Use the region the client is actually bound to, so the image URIs below
    # always point at the same region the model is created in.
    region = sagemaker_client.meta.region_name
    
    # Try different PyTorch + Transformers versions
    # NOTE(review): the registry account id is the one the original URIs used
    # for eu-north-1 - confirm it also serves other regions before relying on it.
    image_repo = f"763104351884.dkr.ecr.{region}.amazonaws.com/huggingface-pytorch-inference"
    image_tags = [
        "2.0.0-transformers4.28.1-cpu-py310-ubuntu20.04",
        "1.13.1-transformers4.26.0-cpu-py39-ubuntu20.04",
        "1.10.2-transformers4.17.0-cpu-py38-ubuntu20.04",
    ]
    images_to_try = [f"{image_repo}:{tag}" for tag in image_tags]
    
    model_name = f"{endpoint_name}-model-{int(time.time())}"
    
    print(f"Creating SageMaker model: {model_name}")
    print(f"Region: {region}")
    
    # Try each image
    for i, image_uri in enumerate(images_to_try, 1):
        print(f"Trying image {i}/{len(images_to_try)}...")
        try:
            # Create model
            sagemaker_client.create_model(
                ModelName=model_name,
                PrimaryContainer={
                    'Image': image_uri,
                    'ModelDataUrl': model_data_url,
                    'Environment': {
                        'SAGEMAKER_PROGRAM': 'inference.py',
                        'SAGEMAKER_SUBMIT_DIRECTORY': model_data_url
                    }
                },
                ExecutionRoleArn=role_arn
            )
            print(f"✓ Model created with image {i}")
            break
        except Exception as e:
            if i < len(images_to_try):
                # Show the real error: the failure may be unrelated to the
                # image (for example a permissions problem).
                print(f"  Image {i} not available, trying next... ({e})")
                continue
            else:
                print(f"✗ All images failed: {e}")
                return None
    
    try:
        # Create endpoint config
        endpoint_config_name = f"{endpoint_name}-config-{int(time.time())}"
        sagemaker_client.create_endpoint_config(
            EndpointConfigName=endpoint_config_name,
            ProductionVariants=[{
                'VariantName': 'AllTraffic',
                'ModelName': model_name,
                'InitialInstanceCount': 1,
                'InstanceType': instance_type
            }]
        )
        print(f"✓ Endpoint config created: {endpoint_config_name}")
        
        # Create endpoint
        print(f"Creating endpoint: {endpoint_name}")
        print(f"Instance: {instance_type}")
        print("⏳ This takes 5-10 minutes. Grab a coffee...")
        
        sagemaker_client.create_endpoint(
            EndpointName=endpoint_name,
            EndpointConfigName=endpoint_config_name
        )
        
        # Wait for endpoint
        print("Waiting for endpoint to be ready...")
        waiter = sagemaker_client.get_waiter('endpoint_in_service')
        waiter.wait(EndpointName=endpoint_name)
        
        print(f"✓ Deployment successful!")
        print(f"  Endpoint: {endpoint_name}")
        return endpoint_name
        
    except Exception as e:
        print(f"✗ Deployment failed: {e}")
        return None

# Example:
# role = f'arn:aws:iam::{account_id}:role/SageMakerExecutionRole'
# predictor = deploy_model_from_kaggle(model_url, role)


# =====================================================
# STEP 6: Test Endpoint from Kaggle
# =====================================================

def test_endpoint_from_kaggle(endpoint_name='distilbert-tickets'):
    """
    Send one sample ticket to the endpoint and print the response.

    Args:
        endpoint_name: Name of the SageMaker endpoint to invoke.

    Returns:
        The decoded JSON response, or None when the invocation fails.
    """
    import json
    import boto3

    client = boto3.client('sagemaker-runtime')

    # Sample ticket used as the request payload.
    sample = {
        "subject": "Cannot login to account",
        "body": "I tried resetting my password but the email never arrived. Need urgent help!"
    }

    print("Testing endpoint...")
    print(f"Subject: {sample['subject']}")
    print(f"Body: {sample['body'][:100]}...")
    print()

    try:
        reply = client.invoke_endpoint(
            EndpointName=endpoint_name,
            ContentType='application/json',
            Body=json.dumps(sample)
        )
        raw_body = reply['Body'].read()
        prediction = json.loads(raw_body.decode())

        print("✓ Prediction successful!")
        print(json.dumps(prediction, indent=2))
    except Exception as err:
        print(f"✗ Prediction failed: {err}")
        return None

    return prediction
# test_endpoint_from_kaggle()


# =====================================================
# COMPLETE WORKFLOW FOR KAGGLE
# =====================================================

def complete_kaggle_deployment():
    """
    Full deployment workflow for Kaggle notebooks.

    Authenticates, checks that a saved model is present, writes the inference
    files, packages and uploads the model, deploys the endpoint and finally
    sends it a test request. The function stops early, printing the reason,
    when authentication fails or no saved model is found. Nothing is returned.
    """
    print("="*60)
    print("KAGGLE → AWS SAGEMAKER DEPLOYMENT")
    print("="*60)
    
    # CONFIGURATION - CHANGE THESE!
    BUCKET_NAME="distilbert-routing-emails-ok-2026"
    ENDPOINT_NAME = 'distilbert-tickets'
    
    # Step 1: Setup credentials
    print("\n[1/7] Setting up AWS credentials...")
    account_id = setup_aws_credentials()
    if not account_id:
        print("✗ Failed to authenticate. Check your Kaggle secrets!")
        return
    
    # Execution role ARN. It is hard-coded (not derived from account_id), so
    # it must be changed when deploying from a different AWS account.
    ROLE_ARN ="arn:aws:iam::998821594730:role/service-role/AmazonSageMaker-ExecutionRole-20260106T015680"
    
    # Step 2: Save model
    print("\n[2/7] Saving model...")
    # Assuming model, tokenizer, label2id, id2label are in scope
    # save_dir = save_model_for_kaggle(model, tokenizer, label2id, id2label)
    save_dir = './deployment_model'  # If already saved
    
    # The save call above is disabled, so make sure a saved model really
    # exists. Otherwise an archive holding only the code directory would be
    # uploaded and deployed, failing minutes later on a billed endpoint.
    if not os.path.isfile(os.path.join(save_dir, 'config.json')):
        print(f"✗ No saved model found in {save_dir}. Run save_model_for_kaggle(...) first.")
        return
    
    # Step 3: Create inference files
    print("\n[3/7] Creating inference script...")
    create_inference_files(save_dir)
    
    # Step 4: Package model
    print("\n[4/7] Packaging model...")
    # (packaging happens in upload step)
    
    # Step 5: Upload to S3
    print("\n[5/7] Uploading to S3...")
    model_url = package_and_upload_to_s3(save_dir, BUCKET_NAME)
    
    # Step 6: Deploy
    print("\n[6/7] Deploying to SageMaker...")
    predictor = deploy_model_from_kaggle(model_url, ROLE_ARN, ENDPOINT_NAME)
    
    if predictor:
        # Step 7: Test
        print("\n[7/7] Testing endpoint...")
        test_endpoint_from_kaggle(ENDPOINT_NAME)
        
        print("\n" + "="*60)
        print("✓ DEPLOYMENT COMPLETE!")
        print("="*60)
        print(f"\nEndpoint name: {ENDPOINT_NAME}")
        print("\n⚠️  Don't forget to delete the endpoint when done!")
        print(f"   Run: delete_endpoint('{ENDPOINT_NAME}')")
    else:
        print("\n✗ Deployment failed. Check the errors above.")

# Runs the full deployment whenever this cell executes; comment the call out
# to define the helpers without deploying.
complete_kaggle_deployment()


# =====================================================
# UTILITY: Delete Endpoint
# =====================================================

def delete_endpoint(endpoint_name='distilbert-tickets'):
    """
    Remove a SageMaker endpoint so that it stops incurring charges.

    Args:
        endpoint_name: Name of the endpoint to delete.

    Failures are reported on stdout rather than raised.
    """
    import boto3

    client = boto3.client('sagemaker')

    try:
        print(f"Deleting endpoint: {endpoint_name}...")
        client.delete_endpoint(EndpointName=endpoint_name)
    except Exception as err:
        print(f"✗ Failed to delete endpoint: {err}")
    else:
        print("✓ Endpoint deleted successfully")



✓ AWS credentials loaded from Kaggle secrets
✓ Authenticated as: arn:aws:iam::998821594730:user/tester
✓ Account ID: 998821594730
KAGGLE → AWS SAGEMAKER DEPLOYMENT

[1/7] Setting up AWS credentials...
✓ AWS credentials loaded from Kaggle secrets
✓ Authenticated as: arn:aws:iam::998821594730:user/tester
✓ Account ID: 998821594730

[2/7] Saving model...

[3/7] Creating inference script...
✓ Inference files created in ./deployment_model/code

[4/7] Packaging model...

[5/7] Uploading to S3...
Creating model.tar.gz...
✓ Model packaged: 235.86 MB
Uploading to S3 (this may take a few minutes)...
  Uploaded 1.0 MB
✓ Upload complete!
  S3 URL: s3://distilbert-routing-emails-ok-2026/distilbert-tickets/model.tar.gz

[6/7] Deploying to SageMaker...
Creating SageMaker model: distilbert-tickets-model-1767810125
Region: eu-north-1
Trying image 1/3...
✓ Model created with image 1
✓ Endpoint config created: distilbert-tickets-config-1767810126
Creating endpoint: distilbert-tickets
Instance: ml.m5.xlar

In [21]:
delete_endpoint()

Deleting endpoint: distilbert-tickets...
✓ Endpoint deleted successfully
