In [None]:
# Connect to Google Drive for getting data and saving models
# Mounts the Drive at /content/drive; the CSV paths and the checkpoint path used
# later in this notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import os

# Create train set
# Writes one .txt file per sentence into /content/train/<test_lang>/<emotion>/,
# i.e. one sub-directory per class label.
emotions = ["anger", "anticipation", "fear", "joy", "sadness"]
data_file_train = "/content/drive/MyDrive/ling490/TermProject/data/small.csv"
train_lang = "en"
test_lang = "en"
train_split = .8

# exist_ok=True keeps this cell re-runnable: without it a second execution
# raises FileExistsError because the directories already exist.
# NOTE(review): files left over from an earlier run are overwritten by index but
# never deleted, so a smaller dataset would leave stale files behind.
for e in emotions: os.makedirs(f"/content/train/{test_lang}/{e}", exist_ok=True)

df = pd.read_csv(data_file_train)
# One bucket per emotion, sized from the list instead of a hard-coded five.
sentences = [[] for _ in emotions]
for idx,row in df.loc[df['language'] == train_lang].iterrows():
  # Positional access: column 1 is looked up in `emotions`, column 0 is the text
  # written to disk. `.iloc` is explicit; plain row[0] on a label-indexed Series
  # relies on a deprecated positional fallback.
  # emotions.index() raises ValueError for a label that is not in `emotions`.
  sentences[emotions.index(row.iloc[1])].append(row.iloc[0])

for i,e_sents in enumerate(sentences):
  # The first `train_split` fraction of each emotion's sentences is the train part.
  train_sents = e_sents[:int(len(e_sents)*train_split)]
  for j,sent in enumerate(train_sents):
    # Explicit UTF-8 so the files do not depend on the platform default encoding.
    with open(f"/content/train/{test_lang}/{emotions[i]}/{train_lang}_{j}.txt", 'w', encoding='utf-8') as f: f.write(sent)

In [None]:
# Create test set
# Writes one .txt file per sentence into /content/test/<test_lang>/<emotion>/.
# Uses `train_split` from the train-set cell, so that cell must run first.
emotions = ["anger", "anticipation", "fear", "joy", "sadness"]
data_file_test = "/content/drive/MyDrive/ling490/TermProject/data/small.csv"
test_lang = "en"

# exist_ok=True keeps this cell re-runnable: without it a second execution
# raises FileExistsError because the directories already exist.
for e in emotions: os.makedirs(f"/content/test/{test_lang}/{e}", exist_ok=True)

df = pd.read_csv(data_file_test)
# One bucket per emotion, sized from the list instead of a hard-coded five.
sentences = [[] for _ in emotions]
for idx,row in df.loc[df['language'] == test_lang].iterrows():
  # Positional access: column 1 is looked up in `emotions`, column 0 is the text
  # written to disk. `.iloc` is explicit; plain row[0] on a label-indexed Series
  # relies on a deprecated positional fallback.
  sentences[emotions.index(row.iloc[1])].append(row.iloc[0])

for i,e_sents in enumerate(sentences):
  # Everything after the first `train_split` fraction is the held-out test part.
  test_sents = e_sents[int(len(e_sents)*train_split):]
  for j,sent in enumerate(test_sents):
    # Explicit UTF-8 so the files do not depend on the platform default encoding.
    with open(f"/content/test/{test_lang}/{emotions[i]}/{j}.txt", 'w', encoding='utf-8') as f: f.write(sent)

In [None]:
# Install pinned dependencies (quiet mode).
# NOTE(review): tensorflow-text is pinned to 2.8.* while tf-models-official is pinned
# to 2.7.0 — confirm that pip resolves these to one compatible TensorFlow version.
!pip install -q -U "tensorflow-text==2.8.*"
!pip install -q tf-models-official==2.7.0

[K     |████████████████████████████████| 4.9 MB 4.7 MB/s 
[K     |████████████████████████████████| 498.0 MB 10.0 kB/s 
[K     |████████████████████████████████| 1.4 MB 63.7 MB/s 
[K     |████████████████████████████████| 462 kB 83.1 MB/s 
[K     |████████████████████████████████| 5.8 MB 76.0 MB/s 
[K     |████████████████████████████████| 1.8 MB 5.1 MB/s 
[K     |████████████████████████████████| 352 kB 83.5 MB/s 
[K     |████████████████████████████████| 1.1 MB 59.9 MB/s 
[K     |████████████████████████████████| 118 kB 87.9 MB/s 
[K     |████████████████████████████████| 1.3 MB 57.4 MB/s 
[K     |████████████████████████████████| 238 kB 89.2 MB/s 
[K     |████████████████████████████████| 43 kB 2.2 MB/s 
[?25h  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


In [None]:
import os
import shutil
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optimizer
import tensorflow_addons as tfa

# Raise the TensorFlow logger threshold to ERROR to keep cell output quiet.
tf.get_logger().setLevel('ERROR')

In [None]:
# Check for GPU
# Training is only attempted when the first GPU is visible; otherwise abort early.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
  print('Found GPU at: {}'.format(device_name))
else:
  raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


In [None]:
# Convert to tf datasets
# Reads the per-emotion directories written by the train/test cells above.
AUTOTUNE = tf.data.AUTOTUNE
batch_size = 1
seed = 42

# label_mode='categorical' yields one-hot labels, matching the
# CategoricalCrossentropy loss and the softmax classifier head.
raw_train_ds = tf.keras.utils.text_dataset_from_directory(
    f'/content/train/{test_lang}',
    batch_size=batch_size,
    seed=seed,label_mode='categorical')
train_ds = raw_train_ds.cache().prefetch(buffer_size=AUTOTUNE)

# shuffle=False: the default (shuffle=True, no seed) puts the test samples in a
# random order, so the rows returned by predict(test_ds) could not be tied back
# to files or labels. Evaluation metrics do not depend on sample order.
test_ds = tf.keras.utils.text_dataset_from_directory(
    f'/content/test/{test_lang}',
    batch_size=batch_size,label_mode='categorical',
    shuffle=False)
test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)

Found 11109 files belonging to 5 classes.
Found 2947 files belonging to 5 classes.


In [None]:
# Load model and preprocessor
# TF Hub handles for the XLM-RoBERTa encoder and its matching text preprocessor.
# They are only URLs here; build_classifier_model() wraps them in hub.KerasLayer.
tfhub_handle_encoder = "https://tfhub.dev/jeongukjae/xlm_roberta_multi_cased_L-12_H-768_A-12/1"
tfhub_handle_preprocess = "https://tfhub.dev/jeongukjae/xlm_roberta_multi_cased_preprocess/1"

# Preprocess sanity check
# bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess)
# text_test = ['this is such an amazing movie!']
# text_preprocessed = bert_preprocess_model(text_test)
# print(f'Keys       : {list(text_preprocessed.keys())}')

# Model sanity check
# xlmr_model = hub.KerasLayer(tfhub_handle_encoder)
# xlmr_results = xlmr_model(text_preprocessed)
# print(f'Pooled Outputs Shape:{xlmr_results["pooled_output"].shape}')

In [None]:
# Define model architecture
def build_classifier_model(num_classes=5):
  """Build the text classifier: raw string -> preprocessing -> encoder -> softmax.

  The preprocessing and encoder layers are loaded from the module-level
  `tfhub_handle_preprocess` and `tfhub_handle_encoder` handles. The encoder is
  created with trainable=True, so its weights are updated during training.

  Args:
    num_classes: Number of units in the softmax output layer. Defaults to 5,
      the value that was previously hard-coded.

  Returns:
    An uncompiled tf.keras.Model that maps a batch of strings to a
    (batch, num_classes) tensor of class probabilities.
  """
  text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
  preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
  encoder_inputs = preprocessing_layer(text_input)
  encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='xlmr_encoder')
  outputs = encoder(encoder_inputs)
  # Only the pooled output of the encoder feeds the classification head.
  net = outputs['pooled_output']
  # Softmax is applied here, so the loss must be configured with from_logits=False.
  net = tf.keras.layers.Dense(num_classes, activation=tf.keras.activations.softmax, name='classifier')(net)
  return tf.keras.Model(text_input, net)

In [None]:
# Instantiate the classifier defined by build_classifier_model() (not yet compiled).
classifier_model = build_classifier_model()

# Classifier sanity check
# bert_raw_result = classifier_model(tf.constant(text_test))
# print(tf.sigmoid(bert_raw_result))

In [None]:
# The classifier head already applies softmax, hence from_logits=False.
loss = tf.keras.losses.CategoricalCrossentropy(from_logits=False)
# Order matters: evaluate() results are unpacked later as (loss, accuracy, f1).
metrics = [
    tf.metrics.CategoricalAccuracy(),
    tfa.metrics.F1Score(num_classes=len(emotions), average='weighted'),
]

In [None]:
# Training length and optimizer.
epochs = 12
# Number of batches in one pass over the training set.
steps_per_epoch = train_ds.cardinality().numpy()
num_train_steps = epochs * steps_per_epoch
# The first 10% of all training steps are configured as warm-up steps.
num_warmup_steps = int(num_train_steps * 0.1)

init_lr = 1e-5
optimizer = optimization.create_optimizer(
    init_lr=init_lr,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    optimizer_type='adamw',
)

In [None]:
# Compile model
# Binds the optimizer, loss and metrics defined in the cells above.
classifier_model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

In [None]:
# Train model
history = classifier_model.fit(x=train_ds,
                               epochs=epochs)
# Save model
# The checkpoint name is "<test_lang><epochs>". Deriving the suffix from `epochs`
# (instead of a hard-coded "12") keeps the name truthful when the epoch count is
# changed; with epochs = 12 the resulting path is identical to the previous one.
# include_optimizer=False stores the model without the optimizer state.
classifier_model.save(f"/content/drive/MyDrive/ling490/TermProject/checkpoints/{test_lang}{epochs}", include_optimizer=False)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6




In [None]:
# Evaluate model
# evaluate() is unpacked as the loss followed by the two compiled metrics.
loss_m, accuracy, f1 = classifier_model.evaluate(test_ds)

# One print call, one value per line.
print(f'Loss: {loss_m}', f'Accuracy: {accuracy}', f'F1: {f1}', sep='\n')

Loss: 3.7874960899353027
Accuracy: 0.4876145124435425
F1: 0.47029101848602295


In [None]:
# Get class predictions
# `res` holds one row of softmax outputs per test sample.
res = classifier_model.predict(test_ds)
# Vectorised argmax over the class axis replaces the per-row list/max/index scan.
# On ties it picks the first maximum, as before; .tolist() keeps `pred` a plain
# list of Python ints.
pred = res.argmax(axis=1).tolist()