# Spam detector using Jumpstart
Document: https://sagemaker.readthedocs.io/en/stable/algorithms/text/text_classification_tensorflow.html

In [1]:
import sagemaker, boto3, json
from sagemaker import get_execution_role

aws_role = get_execution_role()
aws_region = "ap-northeast-1"
sess = sagemaker.Session()

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [2]:
model_id = "tensorflow-tc-bert-multi-cased-L-12-H-768-A-12-2"

Training dataset is here.
https://www.kaggle.com/datasets/tmehul/spamcsv?resource=download

Create **training-datqsets/SST** folder in s3 and place **data.csv** in it.

In [3]:
from sagemaker.jumpstart.estimator import JumpStartEstimator
from sagemaker.jumpstart.utils import get_jumpstart_content_bucket

training_data_prefix = "training-datasets/SST/"
training_dataset_s3_path = f"s3://sagemaker-automated-execution-533267358966-ap-northeast-1/{training_data_prefix}"

Enabling debugger.

In [4]:
from sagemaker.debugger import Rule, rule_configs
debugger_rules = [
    Rule.sagemaker(rule_configs.loss_not_decreasing()),
    Rule.sagemaker(rule_configs.overfit()),
    Rule.sagemaker(rule_configs.vanishing_gradient()),
    ]

Choose instance from here.  https://docs.aws.amazon.com/ja_jp/sagemaker/latest/dg/notebooks-available-instance-types.html

In [5]:
estimator = JumpStartEstimator(
    model_id=model_id,
    hyperparameters={"epochs": "5", "batch_size": "64",  "use_fp16": "True", "train_only_top_layer":"True"},
    instance_type= "ml.m5.4xlarge",
    rules=debugger_rules
    )

Using model 'tensorflow-tc-bert-multi-cased-L-12-H-768-A-12-2' with wildcard version identifier '*'. You can pin to version '3.0.8' for more stable results. Note that models may have different input/output signatures after a major version upgrade.


In [6]:
estimator.hyperparameters()

{'epochs': '"5"',
 'batch_size': '"64"',
 'use_fp16': '"True"',
 'train_only_top_layer': '"True"',
 'optimizer': '"adamw"',
 'learning_rate': '"2e-05"',
 'warmup_steps_fraction': '"0.1"',
 'beta_1': '"0.9"',
 'beta_2': '"0.999"',
 'momentum': '"0.9"',
 'epsilon': '"1e-06"',
 'rho': '"0.95"',
 'initial_accumulator_value': '"0.1"',
 'early_stopping': '"False"',
 'early_stopping_patience': '"5"',
 'early_stopping_min_delta': '"0.0"',
 'dropout_rate': '"0.2"',
 'regularizers_l2': '"0.01"',
 'validation_split_ratio': '"0.2"',
 'reinitialize_top_layer': '"Auto"'}

In [7]:
estimator.fit({"training": training_dataset_s3_path}, logs=True)

INFO:sagemaker:Creating training-job with name: tf-tc-bert-multi-cased-l-12-h-768-a-12--2025-03-10-17-57-13-595


2025-03-10 17:57:19 Starting - Starting the training job...
2025-03-10 17:57:47 Starting - Preparing the instances for trainingLossNotDecreasing: InProgress
Overfit: InProgress
VanishingGradient: InProgress
...
2025-03-10 17:58:18 Downloading - Downloading the training image......
2025-03-10 17:59:07 Training - Training image download completed. Training in progress..[34m2025-03-10 17:59:23.048027: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.[0m
[34m2025-03-10 17:59:23.048173: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:105] SageMaker Profiler is not enabled. The timeline writer thread will not be started, future recorded events will be dropped.[0m
[34m2025-03-10 17:59:23.073659: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.[0m
[34m2025-03-10 17:59:25,030 sagemaker-training-toolkit INFO     Imported framework sagemaker_tensorflow_container.training[0m
[

Checking debugger results.

In [15]:
for rule in estimator.latest_training_job.rule_job_summary():
    print(f"{rule['RuleConfigurationName']}:  {rule['RuleEvaluationStatus']}")

LossNotDecreasing:  NoIssuesFound
Overfit:  NoIssuesFound
VanishingGradient:  NoIssuesFound


Choose serverless endpoint.

In [None]:
from sagemaker.serverless import ServerlessInferenceConfig
serverless_config = ServerlessInferenceConfig(
    memory_size_in_mb=2048,  # Choose from 1024, 2048, 3072, 4096, 5120, 6144 MB
    max_concurrency=10       # Max concurrent invocations
    )

In [10]:
predictor = estimator.deploy(instance_type="ml.m5.xlarge", initial_instance_count=1, serverless_inference_config=serverless_config)

INFO:sagemaker:Creating model with name: tf-tc-bert-multi-cased-l-12-h-768-a-12--2025-03-06-02-48-07-356
INFO:sagemaker:Creating endpoint-config with name tf-tc-bert-multi-cased-l-12-h-768-a-12--2025-03-06-02-48-07-355
INFO:sagemaker:Creating endpoint with name tf-tc-bert-multi-cased-l-12-h-768-a-12--2025-03-06-02-48-07-355


-----!

In [34]:
texts = [
    "Thanks for your subscription to Ringtone UK your mobile will be charged", 
     "とにかく安い、安すぎる。今すぐクリック ", 
     "いまのところは雨は降ってないです。",
     "今日の学校は忙しかったですか？",
    "今ならお買い得、買うなら今！！"
    ]

In [35]:
for text in texts:
    query_response = predictor.predict(text)
    print(query_response)

{'probabilities': [0.614778221, 0.385221839]}
{'probabilities': [0.620334804, 0.379665226]}
{'probabilities': [0.422047585, 0.577952385]}
{'probabilities': [0.473016113, 0.526983917]}
{'probabilities': [0.511945069, 0.488054901]}


In [None]:
# Delete the SageMaker endpoint and the attached resources
predictor.delete_predictor()

**Code below also works**

In [None]:
import json
from sagemaker.jumpstart.estimator import JumpStartEstimator
from sagemaker.jumpstart.utils import get_jumpstart_content_bucket
from sagemaker.inputs import TrainingInput

training_input = TrainingInput(
    "s3://sagemaker-automated-execution-533267358966-ap-northeast-1/data.csv",
    content_type="text/csv"
    )

estimator = JumpStartEstimator(
    model_id=model_id,
    hyperparameters={"epochs": "1", "batch_size": "64",  "use_fp16": "True", "train_only_top_layer":"True"},
    instance_type= "ml.m5.4xlarge",
    )

estimator.hyperparameters()

estimator.fit({"training": training_input}, logs=True)