In [2]:
!pip install -qU pip transformers

[0m

In [3]:
import boto3
import pandas as pd
from sklearn.model_selection import train_test_split
import io
import numpy as np
import os

from transformers import TFBertForSequenceClassification, BertTokenizer
import tensorflow as tf
from tensorflow.data import Dataset
from sklearn.utils import class_weight

2024-02-05 19:48:43.292166: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX512F AVX512_VNNI
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-02-05 19:48:43.416197: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.
2024-02-05 19:48:43.416291: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:105] SageMaker Profiler is not enabled. The timeline writer thread will not be started, future recorded events will be dropped.
2024-02-05 19:48:43.416481: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-02-0

In [4]:
region_name='us-west-2'

s3 = boto3.client('s3')
bucket_name = 'aai-540-final-data'
s3_path = 'data/pre_processed_data.tsv'

In [5]:
data_obj = s3.get_object(Bucket=bucket_name, Key=s3_path)

df = pd.read_csv(io.BytesIO(data_obj['Body'].read()), delimiter='\t')
df.head()

Unnamed: 0,text,emotions,id
0,"He isn't as big, but he's still quite popular....",0,eczuekb
1,that's adorable asf,0,ef961hv
2,"I have, and now that you mention it, I think t...",27,ed9w1hm
3,"I wanted to downvote this, but it's not your f...",27,ee52cjs
4,Build a wall? /jk,27,edsqvyx


In [6]:
# split into train, text, val
X = df.drop('emotions', axis=1)
y = df['emotions']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Further split the training set into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.125, random_state=42)

In [7]:
y_train

12418     3
22289     2
20074    27
33910    13
25163     2
         ..
51824     4
16566     1
40366    27
13497     3
39242    18
Name: emotions, Length: 42731, dtype: int64

In [8]:
# Loading the file with emotion label names
emotion_labels_obj = s3.get_object(Bucket=bucket_name, Key='data/emotions.txt')
emotion_labels_data = emotion_labels_obj['Body'].read()

# Converting the data from bytes to string and splitting by lines
emotion_labels = emotion_labels_data.decode('utf-8').splitlines()
# Split emotions column to get count of each emotion individually
split_emotions = y_train.astype(str).str.split(',')
all_emotions = split_emotions.explode().astype(int)
# Creating a mapping of indices to emotion labels
emotion_index_to_label = {index: label for index, label in enumerate(emotion_labels)}

# Applying the mapping to the emotions dataset
labeled_emotions = all_emotions.map(emotion_index_to_label)

# Counting occurrences of each emotion label
labeled_emotion_counts = labeled_emotions.value_counts()

emotion_categories = {
	"anger": ["anger", "annoyance", "disapproval"],
	"disgust": ["disgust"],
	"fear": ["fear", "nervousness"],
	"happy": ["joy", "amusement", "approval", "gratitude"],
	"optimistic": ["optimism", "relief", "pride", "excitement"],
	"affectionate": [ "love", "caring", "admiration",  "desire"],
	"sadness": ["sadness", "disappointment", "embarrassment", "grief",  "remorse"],
	"surprise": ["surprise", "realization", "confusion", "curiosity"],
	"neutral": ["neutral"]
} 

emotion_to_category = {}
for category, emotions in emotion_categories.items():
	for emotion in emotions:
		emotion_to_category[emotion] = category

category_counts = pd.Series(dtype=int).reindex(emotion_categories.keys(), fill_value=0)

for emotion, count in labeled_emotion_counts.items():
	category = emotion_to_category[emotion]
	if category:
		category_counts[category] += count

category_counts

anger            5291
disgust           572
fear              617
happy            7911
optimistic       1787
affectionate     7079
sadness          2538
surprise         4403
neutral         12533
dtype: int64

In [9]:
# count number of labels
num_labels = len(category_counts)
num_labels

9

In [10]:
# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load the BERT model
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=9)

2024-02-02 04:08:53.510858: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-02-02 04:08:53.518353: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-02-02 04:08:53.520155: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-02-02 04:08:53.522293: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX512F AVX512_VNNI
To enable them in other operations, rebuild TensorFlow with the appropriate com

In [11]:
# Convert the labels given the emotion_categories mapping
y_train = y_train.apply(lambda x: emotion_to_category[emotion_index_to_label[x]])

In [12]:
y_val = y_val.apply(lambda x: emotion_to_category[emotion_index_to_label[x]])
y_test = y_test.apply(lambda x: emotion_to_category[emotion_index_to_label[x]])

In [13]:
y_train.value_counts()

neutral         12533
happy            7911
affectionate     7079
anger            5291
surprise         4403
sadness          2538
optimistic       1787
fear              617
disgust           572
Name: emotions, dtype: int64

In [14]:
# Tokenize the input data
train_encodings = tokenizer(X_train['text'].tolist(), truncation=True, padding=True, max_length=50, return_tensors='tf')
val_encodings = tokenizer(X_val['text'].tolist(), truncation=True, padding=True, max_length=50, return_tensors='tf')
test_encodings = tokenizer(X_test['text'].tolist(), truncation=True, padding=True, max_length=50, return_tensors='tf')

In [15]:
# Create a dictionary that maps each category to its index
category_to_index = {category: index for index, category in enumerate(emotion_categories)}

In [16]:
# Convert the categories in y_train to indices
y_train_num = y_train.apply(lambda x: category_to_index[x])
# Convert the categories in y_val and y_test to indices
y_val_num = y_val.apply(lambda x: category_to_index[x])
y_test_num = y_test.apply(lambda x: category_to_index[x])

In [17]:
y_train

12418           anger
22289           anger
20074         neutral
33910      optimistic
25163           anger
             ...     
51824           happy
16566           happy
40366         neutral
13497           anger
39242    affectionate
Name: emotions, Length: 42731, dtype: object

In [18]:
train_dataset = Dataset.from_tensor_slices((
	dict(train_encodings),
	y_train_num.values
))
val_dataset = Dataset.from_tensor_slices((
    dict(val_encodings),
    y_val_num.values
))
test_dataset = Dataset.from_tensor_slices((
    dict(test_encodings),
    y_test_num.values
))

In [19]:
# Calculate class weights
class_weights = class_weight.compute_class_weight(class_weight='balanced', classes=np.unique(y_train_num), y=y_train_num)

# Convert class weights to dictionary for compatibility with Keras
class_weights_dict = dict(enumerate(class_weights))

In [20]:
batch_size = 16
num_epochs = 4
decay_steps = num_epochs * (len(X_train) / batch_size)

lr_schedule = tf.keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=5e-5,
    decay_steps=decay_steps,
    power=1.0)
optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=lr_schedule)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

In [21]:
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

# Train the model with validation data
model.fit(train_dataset.shuffle(1000).batch(batch_size), 
          epochs=num_epochs, 
          validation_data=val_dataset.batch(batch_size))

Epoch 1/4
Extension horovod.torch has not been built: /usr/local/lib/python3.9/site-packages/horovod/torch/mpi_lib/_mpi_lib.cpython-39-x86_64-linux-gnu.so not found
If this is not expected, reinstall Horovod with HOROVOD_WITH_PYTORCH=1 to debug the build error.
[2024-02-02 04:09:18.079 tensorflow-2-10-1-g-ml-g4dn-xlarge-e8596bac38df100bfc7a3aaf537e:20 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None
[2024-02-02 04:09:18.287 tensorflow-2-10-1-g-ml-g4dn-xlarge-e8596bac38df100bfc7a3aaf537e:20 INFO profiler_config_parser.py:111] Unable to find config at /opt/ml/input/config/profilerconfig.json. Profiler is disabled.




Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7f34a9c40460>

In [5]:
model_dir = 'models/'

In [8]:
model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)

('models/tokenizer/tokenizer_config.json',
 'models/tokenizer/special_tokens_map.json',
 'models/tokenizer/vocab.txt',
 'models/tokenizer/added_tokens.json')

In [8]:
import os
tar_file = 'model.tar.gz'

command = f"tar -czvf {tar_file} -C {model_dir} ."
os.system(command)

./
./tokenizer_config.json
./.ipynb_checkpoints/
./config.json
./special_tokens_map.json
./vocab.txt
./tf_model.h5


0

In [9]:
local_tar_file_path = tar_file
s3_key = f'models/{tar_file}'
s3.upload_file(local_tar_file_path, bucket_name, s3_key)