In [2]:
!pip install -qU pip transformers optuna

[0m

In [3]:
import boto3
import pandas as pd
from sklearn.model_selection import train_test_split
import io
import numpy as np
import os
import ast

import sagemaker
from sagemaker.session import Session
from sagemaker.feature_store.feature_group import FeatureGroup

from transformers import TFBertForSequenceClassification, BertTokenizer, BertConfig, TrainingArguments, Trainer, TextClassificationPipeline
import tensorflow as tf
from tensorflow.data import Dataset
from sklearn.utils import class_weight
import optuna
from sagemaker.huggingface import HuggingFace
from sagemaker.tuner import IntegerParameter, ContinuousParameter, CategoricalParameter
from sagemaker.tuner import HyperparameterTuner

2024-02-14 22:57:35.438419: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX512F
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-02-14 22:57:36.201539: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.
2024-02-14 22:57:36.206633: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:105] SageMaker Profiler is not enabled. The timeline writer thread will not be started, future recorded events will be dropped.
2024-02-14 22:57:36.654865: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.


In [4]:
region_name='us-west-2'
bucket_name = 'aai-540-final-data'

session = sagemaker.Session()
featurestore_runtime = session.boto_session.client(service_name='sagemaker-featurestore-runtime', region_name=region_name)
feature_group_name = 'emotion_feature_group_13_03_13_24_1707794028'

In [5]:
athena_client = boto3.client('athena', region_name=region_name)
query_string = f"""
SELECT * FROM "{feature_group_name}"
WHERE data_type = 'train'
"""
output_location = f's3://{bucket_name}/athena/results/'

response_train = athena_client.start_query_execution(
    QueryString=query_string,
    QueryExecutionContext={
        'Database': 'sagemaker_featurestore'
    },
    ResultConfiguration={
        'OutputLocation': output_location,
    }
)
train_location_id = response_train['QueryExecutionId']

In [6]:
val_query_string = f"""
SELECT * FROM "{feature_group_name}"
WHERE data_type = 'val'
"""

response_val = athena_client.start_query_execution(
    QueryString=val_query_string,
    QueryExecutionContext={
        'Database': 'sagemaker_featurestore'
    },
    ResultConfiguration={
        'OutputLocation': output_location,
    }
)
val_location_id = response_val['QueryExecutionId']

In [7]:
s3 = session.boto_session.client('s3')
s3_path = 'athena/results/'
s3_train_location = s3_path + train_location_id + '.csv'
s3_val_location = s3_path + val_location_id + '.csv'

In [8]:
train_data_obj = s3.get_object(Bucket=bucket_name, Key=s3_train_location)
val_data_obj = s3.get_object(Bucket=bucket_name, Key=s3_val_location)

df_train = pd.read_csv(io.BytesIO(train_data_obj['Body'].read()))
df_val = pd.read_csv(io.BytesIO(val_data_obj['Body'].read()))

In [9]:
df_train.head()

Unnamed: 0,input_ids,attention_mask,emotions,text,eventtime,id,data_type,write_time,api_invocation_time,is_deleted
0,"[101, 4067, 2017, 1010, 2008, 2515, 4025, 2000...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, ...",happy,"Thank You, that does seem to be the consensus.",1707794000.0,eemui59,train,2024-02-13 03:20:23.818,2024-02-13 03:14:47.000,False
1,"[101, 2471, 2066, 1031, 2171, 1033, 3226, 2003...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",anger,Almost like [NAME] culture is written to be of...,1707794000.0,eejenva,train,2024-02-13 03:20:23.818,2024-02-13 03:14:47.000,False
2,"[101, 2821, 2057, 1005, 2128, 2725, 2023, 2153...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, ...",anger,Oh we're doing this again huh,1707794000.0,ed4i059,train,2024-02-13 03:20:23.818,2024-02-13 03:14:48.000,False
3,"[101, 11867, 23644, 1010, 2008, 2001, 2919, 12...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",neutral,"Sphh, that was badass. I say that went well.",1707794000.0,eelexo7,train,2024-02-13 03:20:23.818,2024-02-13 03:14:48.000,False
4,"[101, 1045, 2293, 2043, 4268, 2131, 24995, 199...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",affectionate,I love when kids get cocky and then fall / get...,1707794000.0,eekzamr,train,2024-02-13 03:20:23.818,2024-02-13 03:14:48.000,False


In [10]:
emotion_categories = {
	"anger": ["anger", "annoyance", "disapproval"],
	"disgust": ["disgust"],
	"fear": ["fear", "nervousness"],
	"happy": ["joy", "amusement", "approval", "gratitude"],
	"optimistic": ["optimism", "relief", "pride", "excitement"],
	"affectionate": [ "love", "caring", "admiration",  "desire"],
	"sadness": ["sadness", "disappointment", "embarrassment", "grief",  "remorse"],
	"surprise": ["surprise", "realization", "confusion", "curiosity"],
	"neutral": ["neutral"]
}
category_to_index = {category: index for index, category in enumerate(emotion_categories)}

In [11]:
def feature_store_to_dataset(dataframe, category_to_index, shuffle=True, batch_size=16):
    dataframe = dataframe.copy()
    
    # Extract labels and convert to numerical values
    labels = dataframe.pop('emotions').apply(lambda x: category_to_index[x]).values
    
    # Parse 'input_ids' and 'attention_mask' from strings to lists of integers
    input_ids = dataframe['input_ids'].apply(ast.literal_eval).tolist()
    attention_mask = dataframe['attention_mask'].apply(ast.literal_eval).tolist()
    
    # Convert lists to TensorFlow tensors
    input_ids = tf.constant(input_ids, dtype=tf.int32)
    attention_mask = tf.constant(attention_mask, dtype=tf.int32)
    
    # Create a TensorFlow dataset
    ds = tf.data.Dataset.from_tensor_slices(({
        "input_ids": input_ids, 
        "attention_mask": attention_mask
    }, labels))
    
    # Shuffle and batch the dataset
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    
    return ds

In [12]:
batch_size = 16
train_dataset = feature_store_to_dataset(df_train, category_to_index, shuffle=True, batch_size=batch_size)
val_dataset = feature_store_to_dataset(df_val, category_to_index, shuffle=False, batch_size=batch_size)

2024-02-13 17:20:36.752821: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-02-13 17:20:36.762279: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-02-13 17:20:36.764106: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-02-13 17:20:36.766319: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX512F AVX512_VNNI
To enable them in other operations, rebu

In [13]:
# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load the BERT model
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=9)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
num_epochs = 4
decay_steps = num_epochs * (len(df_train) / batch_size)

lr_schedule = tf.keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=5e-5,
    decay_steps=decay_steps,
    power=1.0)
optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=lr_schedule)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

In [25]:
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

# Train the model with validation data
model.fit(train_dataset, 
          epochs=num_epochs, 
          validation_data=val_dataset)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7fe8ec14c190>

In [28]:
pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=True)

pipe("There is terrible news today")



[[{'label': 'LABEL_0', 'score': 0.004577369429171085},
  {'label': 'LABEL_1', 'score': 0.032892897725105286},
  {'label': 'LABEL_2', 'score': 0.9122949838638306},
  {'label': 'LABEL_3', 'score': 0.004681961145251989},
  {'label': 'LABEL_4', 'score': 0.0033336216583848},
  {'label': 'LABEL_5', 'score': 0.0009535696008242667},
  {'label': 'LABEL_6', 'score': 0.013790403492748737},
  {'label': 'LABEL_7', 'score': 0.004066551569849253},
  {'label': 'LABEL_8', 'score': 0.023408683016896248}]]

In [29]:
model_dir = 'models/'

In [30]:
model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)

('models/tokenizer_config.json',
 'models/special_tokens_map.json',
 'models/vocab.txt',
 'models/added_tokens.json')

In [31]:
tar_file = 'base_model.tar.gz'

command = f"tar -czvf {tar_file} -C {model_dir} ."
os.system(command)

./
./tokenizer_config.json
./.ipynb_checkpoints/
./config.json
./special_tokens_map.json
./vocab.txt
./tf_model.h5


0

In [32]:
local_tar_file_path = tar_file
s3_key = f'models/{tar_file}'
s3.upload_file(local_tar_file_path, bucket_name, s3_key)

## Train with best hyperparameters from Optuna Search

In [8]:
role = sagemaker.get_execution_role()
tensorflow_version = '2.6.3'
transformers_version = '4.17.0'
py_version = 'py38'

# Define your hyperparameters directly
hyperparameters = {
    'epochs': 4,  
    'initial_learning_rate': 9.4e-06,
    'batch_size': 12,
    'num_layers_to_freeze': 0,
    'dropout_prob': 0.1,
    'lr_scheduler': 'CosineDecay',
    'decay_steps': 7639
}

huggingface_estimator = HuggingFace(entry_point='train_script.py',
                                    source_dir='.',
                                    role=role,
                                    instance_type='ml.p2.xlarge',  
                                    instance_count=1,
                                    transformers_version=transformers_version,
                                    tensorflow_version=tensorflow_version,
                                    py_version=py_version,
                                    hyperparameters=hyperparameters)

# Launch the training job
huggingface_estimator.fit()

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: huggingface-tensorflow-training-2024-02-15-00-12-04-704


2024-02-15 00:13:58 Starting - Starting the training job......
2024-02-15 00:14:33 Starting - Preparing the instances for training............
2024-02-15 00:16:44 Downloading - Downloading input data......
2024-02-15 00:17:29 Downloading - Downloading the training image........................
2024-02-15 00:21:46 Training - Training image download completed. Training in progress....[34m2024-02-15 00:22:17.886002: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.[0m
[34m2024-02-15 00:22:17.886252: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:105] SageMaker Profiler is not enabled. The timeline writer thread will not be started, future recorded events will be dropped.[0m
[34m2024-02-15 00:22:17.939334: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.[0m
[34m2024-02-15 00:22:19,317 sagemaker-training-toolkit INFO     Imported framework sagemaker_tensorflow_container.