# Import library and set the roles and S3 buckets

In [1]:
%pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [2]:
import boto3
import sagemaker
import pandas as pd
import matplotlib.pyplot as plt
import sagemaker, boto3, json
from sagemaker import get_execution_role
import nltk
nltk.download('punkt')
%matplotlib inline

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


Matplotlib is building the font cache; this may take a moment.
[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [7]:
aws_role = get_execution_role()
aws_region = boto3.Session().region_name
sess = sagemaker.Session()
region = boto3.Session().region_name

# Specify S3 bucket and prefix where you have uploaded email_dataset.csv
training_data_bucket = "myemailspambucket"#"<specify s3 bucket>"
training_data_prefix = "trainingdata"#"<specify s3 prefix>"

training_dataset_s3_path = f"s3://{training_data_bucket}/{training_data_prefix}/email_dataset.csv"

output_bucket = sess.default_bucket()
output_prefix = ""#"<specify s3 prefix>"

s3_output_location = f"s3://{output_bucket}/{output_prefix}/output"

# Load Data

In [8]:
# load data
df = pd.read_csv(training_dataset_s3_path)
df.head()


Unnamed: 0,Category,Message
0,HAM,just wanted to check with you if you'll be sha...
1,HAM,"Hi,Job Title:- SAP ABAP Consultant Experience-..."
2,SPAM,Winter is here and so are the grand holidays
3,HAM,"Hi,We are looking for an expert in SAP for our..."
4,HAM,Your prepaid recharge is now successful.


In [9]:
# Check if dataset is balanced or not
df['Category'].value_counts()

Category
HAM     42
SPAM    38
Name: count, dtype: int64

# Prepare the Data

In [10]:
#Replace SPAM with 1 and HAM with 0
df['Category'] = df['Category'].apply(lambda x:1 if x=='SPAM' else 0)
df.head()

Unnamed: 0,Category,Message
0,0,just wanted to check with you if you'll be sha...
1,0,"Hi,Job Title:- SAP ABAP Consultant Experience-..."
2,1,Winter is here and so are the grand holidays
3,0,"Hi,We are looking for an expert in SAP for our..."
4,0,Your prepaid recharge is now successful.


In [11]:
def tokenize(message):
    # delete quotation marks and commas , apply tokenization and join back into a string separating by spaces
    return ' '.join([str(token) for token in nltk.word_tokenize(str(message).replace(',', '').replace('"', '').lower())])
    
def prepare_data(df):
    df['Category'] = df['Category'].map(lambda category : '__label__{}'.format(str(category).replace('__label__', '')))
    df['Message'] = df['Message'].map(lambda message : tokenize(message)) 
    return df

df_final = df[['Category', 'Message']].reset_index(drop=True)
df_final = prepare_data(df_final)
df_final.head()

Unnamed: 0,Category,Message
0,__label__0,just wanted to check with you if you 'll be sh...
1,__label__0,hijob title : - sap abap consultant experience...
2,__label__1,winter is here and so are the grand holidays
3,__label__0,hiwe are looking for an expert in sap for our ...
4,__label__0,your prepaid recharge is now successful .


In [12]:
# Split data into train and validation
from sklearn.model_selection import train_test_split
df_train, df_validation = train_test_split(df_final, 
                                           test_size=0.10,
                                           stratify=df_final['Category'])

In [13]:
#upload transformed data to S3 bucket
train_path = './train.csv'
df_train[['Category', 'Message']].to_csv(train_path, index=False, header=False, sep=' ')

validation_path = './validation.csv'
df_validation[['Category', 'Message']].to_csv(validation_path, index=False, header=False, sep=' ')

#Specify S3 bucket prefix
train_s3_uri = sess.upload_data(bucket=training_data_bucket, key_prefix='trainig', path=train_path)
validation_s3_uri = sess.upload_data(bucket=training_data_bucket, key_prefix='validation', path= validation_path)

# Train the Model

In [14]:
image_uri = sagemaker.image_uris.retrieve(
    region=region,
    framework='blazingtext'
)

In [15]:
estimator = sagemaker.estimator.Estimator(image_uri=image_uri, 
    role=aws_role, 
    instance_count=1, 
    instance_type='ml.m5.large',
    volume_size=30,
    max_run=7200,
    disable_profiler=True,                                      
    sagemaker_session=sess
)

In [16]:
#Hyperparameter
estimator.set_hyperparameters(mode='supervised',   
                              epochs=10,          
                              learning_rate=0.01,  
                              min_count=2,                          
                              vector_dim=300,      
                              word_ngrams=3)  

In [17]:
train_data = sagemaker.inputs.TrainingInput(
    train_s3_uri, 
    distribution='FullyReplicated', 
    content_type='text/plain', 
    s3_data_type='S3Prefix'
)
validation_data = sagemaker.inputs.TrainingInput(
    validation_s3_uri, 
    distribution='FullyReplicated', 
    content_type='text/plain', 
    s3_data_type='S3Prefix'
)

data_channels = {
    'train': train_data,
    'validation': validation_data 
}

In [18]:
estimator.fit(
    inputs=data_channels,
    wait=True
)

INFO:sagemaker:Creating training-job with name: blazingtext-2024-04-12-16-22-10-422


2024-04-12 16:22:10 Starting - Starting the training job...
2024-04-12 16:22:29 Starting - Preparing the instances for training...
2024-04-12 16:23:03 Downloading - Downloading input data...
2024-04-12 16:23:41 Downloading - Downloading the training image...
2024-04-12 16:24:01 Training - Training image download completed. Training in progress..[34mArguments: train[0m
  self.stdout = io.open(c2pread, 'rb', bufsize)[0m
[34m[04/12/2024 16:24:04 INFO 140365925304128] nvidia-smi took: 0.025211095809936523 secs to identify 0 gpus[0m
[34m[04/12/2024 16:24:04 INFO 140365925304128] Running single machine CPU BlazingText training using supervised mode.[0m
[34mNumber of CPU sockets found in instance is  1[0m
[34m[04/12/2024 16:24:04 INFO 140365925304128] Processing /opt/ml/input/data/train/train.csv . File size: 0.0045928955078125 MB[0m
[34m[04/12/2024 16:24:04 INFO 140365925304128] Processing /opt/ml/input/data/validation/validation.csv . File size: 0.00036334991455078125 MB[0m
[3

In [19]:
#Get the accuracy of the train and validation dataset
estimator.training_job_analytics.dataframe()



Unnamed: 0,timestamp,metric_name,value
0,0.0,train:accuracy,0.9722
1,0.0,validation:accuracy,0.75


# Deploy the Model

In [20]:
text_classifier = estimator.deploy(initial_instance_count=1,
                                   instance_type='ml.m5.large',
                                   serializer=sagemaker.serializers.JSONSerializer(),
                                   deserializer=sagemaker.deserializers.JSONDeserializer())
print()
print('Endpoint name:  {}'.format(text_classifier.endpoint_name))

INFO:sagemaker:Creating model with name: blazingtext-2024-04-12-16-27-05-972
INFO:sagemaker:Creating endpoint-config with name blazingtext-2024-04-12-16-27-05-972
INFO:sagemaker:Creating endpoint with name blazingtext-2024-04-12-16-27-05-972


-----!
Endpoint name:  blazingtext-2024-04-12-16-27-05-972


# Test the Model

In [24]:
messages = [
                # Spam
                'Click on below link, provide your details and win this award' ,
                'Best summer deal here',
                #ham
                'See you in the office.'

]

tokenized_message = [' '.join(nltk.word_tokenize(mesaage)) for mesaage in messages]
payload = {"instances" : tokenized_message}
print(payload)

{'instances': ['Click on below link , provide your details and win this award', 'Best summer deal here', 'See you in the office .']}


In [25]:
predictions = text_classifier.predict(data=payload)
for prediction in predictions:
    predicted_class = prediction['label'][0].lstrip('__label__')
    print('SPAM' if predicted_class == '1' else 'HAM')
    

SPAM
SPAM
HAM


# Delete the Model Endpoint

In [None]:
text_classifier.delete_endpoint()