# BlazingText

In this example we take the labelled set of news articles and use Amazon Blazing Text to build a multi-class classifier. We will evaluate it on an independent holdout set and look at accuracy includeing the top predicted topic, and the top two predicted topics.

This Notebook was run in Sagemaker Studio with The **Python 3 (Data Science)** Kernel.


In [1]:
import sagemaker
from sagemaker import get_execution_role
import json
import boto3

sess = sagemaker.Session()

role = get_execution_role()
print(role)  

arn:aws:iam::320389841409:role/service-role/AmazonSageMaker-ExecutionRole-20201022T141998
sagemaker-ap-southeast-2-320389841409


In [2]:
bucket = "funnybones"
prefix = "rural/topics/blazingtext"  

In [8]:
from random import shuffle
import multiprocessing
from multiprocessing import Pool
import csv
import nltk

nltk.download("punkt")



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Grab the training and testing datasets

These were prepped in the EDA Notebooks

In [28]:
import pandas as pd

In [88]:
test_file = "data/test.csv"
test_df = pd.read_csv(test_file, header=None)

In [89]:
train_file = "data/training.csv"
train_df = pd.read_csv(train_file, header=None)

In [91]:
def transform_instance(row):
    cur_row = []
    # Prefix the index-ed label with __label__
    label = "__label__" + row[0]
    cur_row.append(label)
    cur_row.extend(nltk.word_tokenize(row[1].lower()))
    return cur_row

In [92]:
def preprocess(input_file, output_file, keep=1):
    all_rows = []
    with open(input_file, "r") as csvinfile:
        csv_reader = csv.reader(csvinfile, delimiter=",")
        for row in csv_reader:
            all_rows.append(row)
    shuffle(all_rows)
    all_rows = all_rows[: int(keep * len(all_rows))]
    pool = Pool(processes=multiprocessing.cpu_count())
    transformed_rows = pool.map(transform_instance, all_rows)
    pool.close()
    pool.join()

    with open(output_file, "w") as csvoutfile:
        csv_writer = csv.writer(csvoutfile, delimiter=" ", lineterminator="\n")
        csv_writer.writerows(transformed_rows)



In [95]:
%%time

# Preparing the training dataset

# Since preprocessing the whole dataset might take a couple of mintutes,
# we keep 20% of the training dataset for this demo.
# Set keep to 1 if you want to use the complete dataset
preprocess(train_file, "blazingtext.train", keep=1)

# Preparing the validation dataset
preprocess(test_file, "blazingtext.test")

CPU times: user 202 ms, sys: 77.5 ms, total: 280 ms
Wall time: 4.25 s


In [96]:
!head -n 60 blazingtext.train > blazingtext.validation

In [97]:
!tail -n +61 blazingtext.train > blazingtext.training

In [99]:
!wc blazingtext.training

    887  584716 3154512 blazingtext.training


In [100]:
%%time

train_channel = prefix + "/train"
validation_channel = prefix + "/validation"

sess.upload_data(path="blazingtext.training", bucket=bucket, key_prefix=train_channel)
sess.upload_data(path="blazingtext.validation", bucket=bucket, key_prefix=validation_channel)

s3_train_data = "s3://{}/{}".format(bucket, train_channel)
s3_validation_data = "s3://{}/{}".format(bucket, validation_channel)

CPU times: user 57.6 ms, sys: 12.2 ms, total: 69.7 ms
Wall time: 384 ms


In [101]:
s3_train_data

's3://funnybones/rural/topics/blazingtext/train'

In [102]:
s3_output_location = "s3://{}/{}/output".format(bucket, prefix)

In [103]:
region_name = boto3.Session().region_name

In [104]:
container = sagemaker.image_uris.retrieve( "blazingtext", region_name, "latest")
print("Using SageMaker BlazingText container: {} ({})".format(container, region_name))

Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: latest.


Using SageMaker BlazingText container: 544295431143.dkr.ecr.ap-southeast-2.amazonaws.com/blazingtext:1 (ap-southeast-2)


In [192]:
bt_model = sagemaker.estimator.Estimator(
    container,
    role,
    instance_count=1,
    instance_type="ml.c4.4xlarge",
    volume_size=30,
    max_run=432000,
    input_mode="File",
    output_path=s3_output_location,
    hyperparameters={
        "mode": "supervised",
        "epochs": 5000,
        "min_count": 2,
        "learning_rate": 0.02,
        "vector_dim": 50,
        "early_stopping": True,
        "patience": 14,
        "min_epochs": 2000,
        "word_ngrams": 2,
    },
)


In [193]:
train_data = sagemaker.inputs.TrainingInput(
    s3_train_data,
    distribution="FullyReplicated",
    content_type="text/plain",
    s3_data_type="S3Prefix",
)
validation_data = sagemaker.inputs.TrainingInput(
    s3_validation_data,
    distribution="FullyReplicated",
    content_type="text/plain",
    s3_data_type="S3Prefix",
)
data_channels = {"train": train_data, "validation": validation_data}

In [194]:
%%time

bt_model.fit(inputs=data_channels, logs=True)

2021-07-06 06:20:18 Starting - Starting the training job...
2021-07-06 06:20:20 Starting - Launching requested ML instancesProfilerReport-1625552418: InProgress
...
2021-07-06 06:21:11 Starting - Preparing the instances for training.........
2021-07-06 06:22:47 Downloading - Downloading input data
2021-07-06 06:22:47 Training - Training image download completed. Training in progress...[34mArguments: train[0m
[34m[07/06/2021 06:22:48 INFO 139773318444416] nvidia-smi took: 0.025355100631713867 secs to identify 0 gpus[0m
[34m[07/06/2021 06:22:48 INFO 139773318444416] Running single machine CPU BlazingText training using supervised mode.[0m
[34mNumber of CPU sockets found in instance is  1[0m
[34m[07/06/2021 06:22:48 INFO 139773318444416] Processing /opt/ml/input/data/train/blazingtext.training . File size: 3.0083770751953125 MB[0m
[34m[07/06/2021 06:22:48 INFO 139773318444416] Processing /opt/ml/input/data/validation/blazingtext.validation . File size: 0.20688343048095703 MB[0

# Deployment and scoring

In [178]:
from sagemaker.serializers import JSONSerializer

text_classifier = bt_model.deploy(
    initial_instance_count=1, instance_type="ml.m4.xlarge", serializer=JSONSerializer()
)



-------------!

# Test the model on the independent set

This is data from a different newspaper, but categorised in the same way in the same human labelling process. This gives us a stronger sense of how the model will perform on **new** data

In [179]:
import numpy as np

In [180]:
sentences = list(test_df[1])

In [181]:
categories = list(test_df[0])

In [182]:
# using the same nltk tokenizer that we used during data preparation for training
tokenized_sentences = [" ".join(nltk.word_tokenize(sent)) for sent in sentences]

payload = {"instances": tokenized_sentences, "configuration": {"k": 2}}

response = text_classifier.predict(payload)
predictions = json.loads(response)
#print(json.dumps(predictions, indent=2))

preds = []
seconds = []
for elem in predictions:
    preds.append(elem["label"][0][9:])
    seconds.append(elem["label"][1][9:])
    
    

In [183]:
results = pd.DataFrame({"target":categories, "pred":preds, "second":seconds})


In [184]:
results["correct"] = np.where(results["target"]==results["pred"],1,0)
results["correct_in2"] = np.where( (results["correct"]==1) | (results["target"]==results["second"]),1,0)

In [185]:
results.head(20)

Unnamed: 0,target,pred,second,correct,correct_in2
0,business,business,realestate,1,1
1,politics,sport,realestate,0,0
2,health,health,business,1,1
3,lifestyle,business,realestate,0,0
4,environment,business,society,0,0
5,business,realestate,business,0,1
6,human,crime,accident,0,0
7,business,business,sport,1,1
8,sport,arts,society,0,0
9,sport,sport,lifestyle,1,1


In [186]:
print("first level accuracy")
acc = sum(results["correct"])/len(results)
print(acc)

print("second level accuracy")
acc = sum(results["correct_in2"])/len(results)
print(acc)


first level accuracy
0.37349397590361444
second level accuracy
0.5783132530120482


In [187]:
# Just for sport

sport_df = results[ results["target"]=="sport" ]
print("first level accuracy for sport")
acc = sum(sport_df["correct"])/len(sport_df)
print(acc)

print("second level accuracy for sport")
acc = sum(sport_df["correct_in2"])/len(sport_df)
print(acc)

first level accuracy for sport
0.40625
second level accuracy for sport
0.65625


# Results with params

first level accuracy
0.37349397590361444
second level accuracy
0.6024096385542169

    hyperparameters={
        "mode": "supervised",
        "epochs": 2000,
        "min_count": 2,
        "learning_rate": 0.05,
        "vector_dim": 30,
        "early_stopping": True,
        "patience": 10,
        "min_epochs": 200,
        "word_ngrams": 1,
    },

## N-gram 2 -- Lower learning

    hyperparameters={
        "mode": "supervised",
        "epochs": 5000,
        "min_count": 2,
        "learning_rate": 0.01,
        "vector_dim": 40,
        "early_stopping": True,
        "patience": 10,
        "min_epochs": 1000,
        "word_ngrams": 2,
    },

### Overall

first level accuracy
0.3855421686746988
second level accuracy
0.46987951807228917

### Sport
first level accuracy for sport
0.53125
second level accuracy for sport
0.65625



## N-gram 1 -- Longer training

    hyperparameters={
        "mode": "supervised",
        "epochs": 5000,
        "min_count": 2,
        "learning_rate": 0.03,
        "vector_dim": 50,
        "early_stopping": True,
        "patience": 12,
        "min_epochs": 1000,
        "word_ngrams": 1,
    },

### Overall
first level accuracy
0.37349397590361444
second level accuracy
0.5783132530120482

### Sport
first level accuracy for sport
0.40625
second level accuracy for sport
0.65625

