In [2]:
!pip install -qU pip transformers optuna plotly

[0m

In [3]:
import boto3
import pandas as pd
from sklearn.model_selection import train_test_split
import io
import numpy as np
import os
import ast
import time

import sagemaker
from sagemaker.session import Session
from sagemaker.feature_store.feature_group import FeatureGroup

from transformers import TFBertForSequenceClassification, BertTokenizer, BertConfig, TrainingArguments, Trainer, TextClassificationPipeline
import tensorflow as tf
from tensorflow.data import Dataset
from sklearn.utils import class_weight
import optuna
from sagemaker.huggingface import HuggingFace
from sagemaker.tuner import IntegerParameter, ContinuousParameter, CategoricalParameter
from sagemaker.tuner import HyperparameterTuner

2024-02-13 23:36:15.041795: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX512F
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-02-13 23:36:15.852786: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.
2024-02-13 23:36:15.858895: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:105] SageMaker Profiler is not enabled. The timeline writer thread will not be started, future recorded events will be dropped.
2024-02-13 23:36:16.238616: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.


In [4]:
region_name='us-west-2'
bucket_name = 'aai-540-final-data'

session = sagemaker.Session()
featurestore_runtime = session.boto_session.client(service_name='sagemaker-featurestore-runtime', region_name=region_name)
feature_group_name = 'emotion_feature_group_13_03_13_24_1707794028'

In [10]:
athena_client = boto3.client('athena', region_name=region_name)
query_string = f"""
SELECT * FROM "{feature_group_name}"
WHERE data_type = 'train'
"""
output_location = f's3://{bucket_name}/athena/results/'

response_train = athena_client.start_query_execution(
    QueryString=query_string,
    QueryExecutionContext={
        'Database': 'sagemaker_featurestore'
    },
    ResultConfiguration={
        'OutputLocation': output_location,
    }
)
train_location_id = response_train['QueryExecutionId']

In [6]:
val_query_string = f"""
SELECT * FROM "{feature_group_name}"
WHERE data_type = 'val'
"""

response_val = athena_client.start_query_execution(
    QueryString=val_query_string,
    QueryExecutionContext={
        'Database': 'sagemaker_featurestore'
    },
    ResultConfiguration={
        'OutputLocation': output_location,
    }
)
val_location_id = response_val['QueryExecutionId']

In [14]:
def wait_for_query_completion(client, query_execution_id):
    while True:
        # Get the query execution status
        response = client.get_query_execution(QueryExecutionId=query_execution_id)
        status = response['QueryExecution']['Status']['State']
        
        # If the query is finished, break from the loop
        if status in ['SUCCEEDED', 'FAILED', 'CANCELLED']:
            return status
        
        # Otherwise, wait a bit before checking again
        time.sleep(5)

In [15]:
s3 = session.boto_session.client('s3')
s3_path = 'athena/results/'

In [16]:
query_status_train = wait_for_query_completion(athena_client, train_location_id)
query_status_val = wait_for_query_completion(athena_client, val_location_id)

if query_status_train == 'SUCCEEDED' and query_status_val == 'SUCCEEDED':
    # Construct the S3 key for the query results
    s3_train_location = f"{s3_path}{train_location_id}.csv"
    s3_val_location = f"{s3_path}{val_location_id}.csv"
    
    # Now that the query has succeeded, we can safely access the results
    train_data_obj = s3.get_object(Bucket=bucket_name, Key=s3_train_location)
    val_data_obj = s3.get_object(Bucket=bucket_name, Key=s3_val_location)

    df_train = pd.read_csv(io.BytesIO(train_data_obj['Body'].read()))
    df_val = pd.read_csv(io.BytesIO(val_data_obj['Body'].read()))
else:
    print(f"Query failed with status '{query_status}'")

In [8]:
train_data_obj = s3.get_object(Bucket=bucket_name, Key=s3_train_location)
val_data_obj = s3.get_object(Bucket=bucket_name, Key=s3_val_location)

df_train = pd.read_csv(io.BytesIO(train_data_obj['Body'].read()))
df_val = pd.read_csv(io.BytesIO(val_data_obj['Body'].read()))

In [9]:
df_train.head()

Unnamed: 0,input_ids,attention_mask,emotions,text,eventtime,id,data_type,write_time,api_invocation_time,is_deleted
0,"[101, 1045, 2228, 2057, 2035, 2113, 1031, 2171...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",affectionate,I think we all know [NAME] has the best curves...,1707794000.0,efb3lzj,train,2024-02-13 03:20:25.533,2024-02-13 03:14:48.000,False
1,"[101, 1031, 2171, 1033, 7615, 2038, 2033, 5870...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ...",happy,[NAME] comment has me laughing,1707794000.0,eeqvwi1,train,2024-02-13 03:20:25.533,2024-02-13 03:14:48.000,False
2,"[101, 16507, 2015, 4148, 1024, 1052, 102, 0, 0...","[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...",neutral,coincidences happen :P,1707794000.0,eez5v64,train,2024-02-13 03:20:25.533,2024-02-13 03:14:49.000,False
3,"[101, 2009, 1005, 1055, 1996, 10439, 1010, 201...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",neutral,"It's the app, you get daily readings, you can ...",1707794000.0,ee8mfir,train,2024-02-13 03:20:25.533,2024-02-13 03:14:49.000,False
4,"[101, 2040, 2145, 4895, 9711, 20913, 2135, 275...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",anger,Who still unironically says SJWs? You look lik...,1707794000.0,eexwqqi,train,2024-02-13 03:20:25.533,2024-02-13 03:14:49.000,False


In [10]:
emotion_categories = {
	"anger": ["anger", "annoyance", "disapproval"],
	"disgust": ["disgust"],
	"fear": ["fear", "nervousness"],
	"happy": ["joy", "amusement", "approval", "gratitude"],
	"optimistic": ["optimism", "relief", "pride", "excitement"],
	"affectionate": [ "love", "caring", "admiration",  "desire"],
	"sadness": ["sadness", "disappointment", "embarrassment", "grief",  "remorse"],
	"surprise": ["surprise", "realization", "confusion", "curiosity"],
	"neutral": ["neutral"]
}
category_to_index = {category: index for index, category in enumerate(emotion_categories)}

In [11]:
def feature_store_to_dataset(dataframe, category_to_index, shuffle=True, batch_size=16):
    dataframe = dataframe.copy()
    
    # Extract labels and convert to numerical values
    labels = dataframe.pop('emotions').apply(lambda x: category_to_index[x]).values
    
    # Parse 'input_ids' and 'attention_mask' from strings to lists of integers
    input_ids = dataframe['input_ids'].apply(ast.literal_eval).tolist()
    attention_mask = dataframe['attention_mask'].apply(ast.literal_eval).tolist()
    
    # Convert lists to TensorFlow tensors
    input_ids = tf.constant(input_ids, dtype=tf.int32)
    attention_mask = tf.constant(attention_mask, dtype=tf.int32)
    
    # Create a TensorFlow dataset
    ds = tf.data.Dataset.from_tensor_slices(({
        "input_ids": input_ids, 
        "attention_mask": attention_mask
    }, labels))
    
    # Shuffle and batch the dataset
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    
    return ds

In [12]:
batch_size = 16
train_dataset = feature_store_to_dataset(df_train, category_to_index, shuffle=True, batch_size=batch_size)
val_dataset = feature_store_to_dataset(df_val, category_to_index, shuffle=False, batch_size=batch_size)

2024-02-13 17:20:36.752821: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-02-13 17:20:36.762279: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-02-13 17:20:36.764106: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-02-13 17:20:36.766319: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX512F AVX512_VNNI
To enable them in other operations, rebu

# Hyperparamter Tuning

In [14]:
MIN_EPOCHS = 3
MAX_EPOCHS = 5
LR_MIN = 5e-6
LR_CEIL = 1e-5
LR_SCHEDULERS = ["PolynomialDecay", "CosineDecay"]
BATCH_SIZE_MIN = 8
BATCH_SIZE_CEIL = 16
NUM_LAYERS_FREEZE_MIN = 0
NUM_LAYERS_FREEZE_CEIL = 4
DROP_OUT_PROP_MIN = 0.3
DROP_OUT_PROP_CEIL = 0.4
DECAY_STEPS_MIN = 6000
DECAY_STEPS_CEIL = 8000
POWER_EXP_MIN = 1.0
POWER_EXP_CEIL = 3.0

In [18]:
def objective(trial: optuna.Trial):     
    num_layers_to_freeze = trial.suggest_int("num_layers_to_freeze", NUM_LAYERS_FREEZE_MIN, NUM_LAYERS_FREEZE_CEIL)
    dropout_prob = trial.suggest_float("dropout_prob", DROP_OUT_PROP_MIN, DROP_OUT_PROP_CEIL)
    config = BertConfig.from_pretrained('bert-base-uncased', num_labels=9, hidden_dropout_prob=dropout_prob, attention_probs_dropout_prob=dropout_prob)
    
    model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', config=config)
    for layer in model.bert.encoder.layer[:num_layers_to_freeze]:
        layer.trainable = False
    
    lr_scheduler_name = trial.suggest_categorical("lr_scheduler", LR_SCHEDULERS)
    batch_size = trial.suggest_int("batch_size", BATCH_SIZE_MIN, BATCH_SIZE_CEIL)
    
    decay_steps = 0
    if lr_scheduler_name == "PolynomialDecay":
        initial_learning_rate = trial.suggest_float("initial_learning_rate_poly", LR_MIN, LR_CEIL)
        decay_steps = trial.suggest_int("decay_steps_poly", DECAY_STEPS_MIN, DECAY_STEPS_CEIL)
        power = trial.suggest_float("power_poly", POWER_EXP_MIN, POWER_EXP_CEIL)
        lr_schedule = tf.keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate=initial_learning_rate,
        decay_steps=decay_steps,
        power=power)
    elif lr_scheduler_name == "CosineDecay":
        initial_learning_rate = trial.suggest_float("initial_learning_rate_cosine", LR_MIN, LR_CEIL)
        decay_steps = trial.suggest_int("decay_steps_cosine", DECAY_STEPS_MIN, DECAY_STEPS_CEIL)
        lr_schedule = tf.keras.experimental.CosineDecay(
            initial_learning_rate=initial_learning_rate,
            decay_steps=decay_steps,
            alpha=0.0)
        
    num_epochs = trial.suggest_int("num_epochs", MIN_EPOCHS, MAX_EPOCHS)
    optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=lr_schedule)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    
    print(f"lr_scheduler: {lr_scheduler_name}, batch_size: {batch_size}, dropout: {dropout_prob}, num_frozen: {num_layers_to_freeze}, initial_learning_rate: {initial_learning_rate}, decay_steps: {decay_steps}, num_epochs: {num_epochs}")
    model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
    trainer = model.fit(train_dataset, 
              epochs=num_epochs, 
              validation_data=val_dataset)
    result = trainer.history
    
    return result['val_loss'][-1]

In [19]:
study = optuna.create_study(direction='minimize')

[I 2024-02-13 18:59:27,050] A new study created in memory with name: no-name-058c75ed-dfbd-4583-90b2-a10d1d47db74


In [None]:
study.optimize(objective, n_trials=10)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


lr_scheduler: CosineDecay, batch_size: 14, dropout: 0.2437810201978664, num_frozen: 6, initial_learning_rate: 2.1005659482580446e-05, decay_steps: 6308, num_epochs: 3
Epoch 1/3
Epoch 2/3
Epoch 3/3


[I 2024-02-13 19:19:27,926] Trial 0 finished with value: 1.1135334968566895 and parameters: {'num_layers_to_freeze': 6, 'dropout_prob': 0.2437810201978664, 'lr_scheduler': 'CosineDecay', 'batch_size': 14, 'initial_learning_rate_cosine': 2.1005659482580446e-05, 'decay_steps_cosine': 6308, 'num_epochs': 3}. Best is trial 0 with value: 1.1135334968566895.
All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


lr_scheduler: CosineDecay, batch_size: 12, dropout: 0.359094842080101, num_frozen: 0, initial_learning_rate: 4.823502706062804e-05, decay_steps: 7735, num_epochs: 3
Epoch 1/3
Epoch 2/3
Epoch 3/3


[I 2024-02-13 19:42:15,859] Trial 1 finished with value: 1.1633527278900146 and parameters: {'num_layers_to_freeze': 0, 'dropout_prob': 0.359094842080101, 'lr_scheduler': 'CosineDecay', 'batch_size': 12, 'initial_learning_rate_cosine': 4.823502706062804e-05, 'decay_steps_cosine': 7735, 'num_epochs': 3}. Best is trial 0 with value: 1.1135334968566895.
All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


lr_scheduler: CosineDecay, batch_size: 16, dropout: 0.3253139877174047, num_frozen: 4, initial_learning_rate: 1.0311930946365403e-05, decay_steps: 5829, num_epochs: 5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


[I 2024-02-13 20:16:43,688] Trial 2 finished with value: 1.1953095197677612 and parameters: {'num_layers_to_freeze': 4, 'dropout_prob': 0.3253139877174047, 'lr_scheduler': 'CosineDecay', 'batch_size': 16, 'initial_learning_rate_cosine': 1.0311930946365403e-05, 'decay_steps_cosine': 5829, 'num_epochs': 5}. Best is trial 0 with value: 1.1135334968566895.
All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


lr_scheduler: CosineDecay, batch_size: 8, dropout: 0.3707144434421056, num_frozen: 4, initial_learning_rate: 2.0628070736326217e-05, decay_steps: 6270, num_epochs: 5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


[I 2024-02-13 20:51:16,954] Trial 3 finished with value: 1.2030062675476074 and parameters: {'num_layers_to_freeze': 4, 'dropout_prob': 0.3707144434421056, 'lr_scheduler': 'CosineDecay', 'batch_size': 8, 'initial_learning_rate_cosine': 2.0628070736326217e-05, 'decay_steps_cosine': 6270, 'num_epochs': 5}. Best is trial 0 with value: 1.1135334968566895.
All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


lr_scheduler: PolynomialDecay, batch_size: 16, dropout: 0.2187157338850907, num_frozen: 3, initial_learning_rate: 1.2666993453461297e-05, decay_steps: 7438, num_epochs: 5
Epoch 1/5
Epoch 2/5
Epoch 3/5

In [None]:
study.best_params

In [None]:
optuna.visulatization.plot_optimization_history(study)