In [None]:
# Mount Google Drive so the data files and model checkpoint used below are
# reachable under /content/drive.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
!pip install -q -U "tensorflow-text==2.8.*"
!pip install -q tf-models-official==2.7.0

[K     |████████████████████████████████| 4.9 MB 8.3 MB/s 
[K     |████████████████████████████████| 498.0 MB 13 kB/s 
[K     |████████████████████████████████| 5.8 MB 86.5 MB/s 
[K     |████████████████████████████████| 462 kB 86.8 MB/s 
[K     |████████████████████████████████| 1.4 MB 83.7 MB/s 
[K     |████████████████████████████████| 1.8 MB 8.5 MB/s 
[K     |████████████████████████████████| 118 kB 84.8 MB/s 
[K     |████████████████████████████████| 352 kB 83.3 MB/s 
[K     |████████████████████████████████| 238 kB 87.4 MB/s 
[K     |████████████████████████████████| 1.1 MB 68.1 MB/s 
[K     |████████████████████████████████| 1.3 MB 61.0 MB/s 
[K     |████████████████████████████████| 43 kB 2.4 MB/s 
[?25h  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


In [None]:
import pandas as pd
import os
import shutil
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optimizer
import tensorflow_addons as tfa

# Only let TensorFlow log messages of severity ERROR through.
tf.get_logger().setLevel('ERROR')

In [None]:
# Stop immediately unless TensorFlow reports the first GPU device.
device_name = tf.test.gpu_device_name()
gpu_found = device_name == '/device:GPU:0'
if not gpu_found:
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [None]:
# Create train set
# Writes the first 80% of each emotion's sentences for `train_lang` to
# /content/train/<lang>/<emotion>/<n>.txt (one folder per class).
emotions = ["anger", "anticipation", "fear", "joy", "sadness"]
data_file_train = "/content/drive/MyDrive/ling490/TermProject/data/small.csv"
train_lang = "en"

# exist_ok=True keeps the cell re-runnable: without it a second run raises
# FileExistsError because the folders are already there.
# NOTE: files left over from an earlier run with different data are not removed.
for emotion in emotions:
  os.makedirs(f"/content/train/{train_lang}/{emotion}", exist_ok=True)

df = pd.read_csv(data_file_train)

# One bucket per emotion, in the same order as `emotions`.
sentences = [[] for _ in emotions]
for _, row in df.loc[df['language'] == train_lang].iterrows():
  # The code treats column 0 as the sentence and column 1 as its emotion label.
  # .iloc makes the positional access explicit (plain row[0] on a
  # string-labelled Series relies on a deprecated fallback).
  # An unknown label raises ValueError from emotions.index().
  sentences[emotions.index(row.iloc[1])].append(row.iloc[0])

for emotion, emotion_sents in zip(emotions, sentences):
  # Keep the first 80% of this emotion's sentences for training.
  train_sents = emotion_sents[:int(len(emotion_sents)*.8)]
  for j, sent in enumerate(train_sents):
    # Explicit UTF-8 so the output does not depend on the platform's locale.
    with open(f"/content/train/{train_lang}/{emotion}/{j}.txt", 'w', encoding='utf-8') as f:
      f.write(sent)

In [None]:
# Create test set for large languages from file 1
# For every language in `test_langs`, writes the sentences after the first 80%
# of each emotion to /content/test/<lang>/<emotion>/<n>.txt.
emotions = ["anger", "anticipation", "fear", "joy", "sadness"]
data_file_test = "/content/drive/MyDrive/ling490/TermProject/data/small.csv"
test_langs = ["en", "tl", "zh", "pt"]

# The CSV is the same for every language, so read it once instead of once per
# loop iteration.
df = pd.read_csv(data_file_test)

for test_lang in test_langs:
  # exist_ok=True keeps the cell re-runnable (no FileExistsError on a second run).
  for emotion in emotions:
    os.makedirs(f"/content/test/{test_lang}/{emotion}", exist_ok=True)

  # One bucket per emotion, in the same order as `emotions`.
  sentences = [[] for _ in emotions]
  for _, row in df.loc[df['language'] == test_lang].iterrows():
    # The code treats column 0 as the sentence and column 1 as its emotion
    # label; .iloc makes the positional access explicit.
    sentences[emotions.index(row.iloc[1])].append(row.iloc[0])

  for emotion, sents in zip(emotions, sentences):
    # Everything after the first 80% of this emotion's sentences.
    test_sents = sents[int(len(sents)*.8):]
    for j, sent in enumerate(test_sents):
      # Explicit UTF-8 so the output does not depend on the platform's locale.
      with open(f"/content/test/{test_lang}/{emotion}/{j}.txt", 'w', encoding='utf-8') as f:
        f.write(sent)

In [None]:
# Create test set for large languages from file 2
# For every language in `test_langs`, writes the sentences after the first 80%
# of each emotion to /content/test/<lang>/<emotion>/<n>.txt.
emotions = ["anger", "anticipation", "fear", "joy", "sadness"]
data_file_test = "/content/drive/MyDrive/ling490/TermProject/data/low-resource.csv"
test_langs = ["de", "fr", "id", "th", "vi"]

# The CSV is the same for every language, so read it once instead of once per
# loop iteration.
df = pd.read_csv(data_file_test)

for test_lang in test_langs:
  # exist_ok=True keeps the cell re-runnable (no FileExistsError on a second run).
  for emotion in emotions:
    os.makedirs(f"/content/test/{test_lang}/{emotion}", exist_ok=True)

  # One bucket per emotion, in the same order as `emotions`.
  sentences = [[] for _ in emotions]
  for _, row in df.loc[df['language'] == test_lang].iterrows():
    # The code treats column 0 as the sentence and column 1 as its emotion
    # label; .iloc makes the positional access explicit.
    sentences[emotions.index(row.iloc[1])].append(row.iloc[0])

  for emotion, sents in zip(emotions, sentences):
    # Everything after the first 80% of this emotion's sentences.
    test_sents = sents[int(len(sents)*.8):]
    for j, sent in enumerate(test_sents):
      # Explicit UTF-8 so the output does not depend on the platform's locale.
      with open(f"/content/test/{test_lang}/{emotion}/{j}.txt", 'w', encoding='utf-8') as f:
        f.write(sent)

In [None]:
# Create test set for small languages from file 2
# For every language in `test_langs`, writes ALL sentences of each emotion
# (no 80/20 slice here) to /content/test/<lang>/<emotion>/<n>.txt.
emotions = ["anger", "anticipation", "fear", "joy", "sadness"]
data_file_test = "/content/drive/MyDrive/ling490/TermProject/data/low-resource.csv"
test_langs = ["km", "bn", "my"]

# The CSV is the same for every language, so read it once instead of once per
# loop iteration.
df = pd.read_csv(data_file_test)

for test_lang in test_langs:
  # exist_ok=True keeps the cell re-runnable (no FileExistsError on a second run).
  for emotion in emotions:
    os.makedirs(f"/content/test/{test_lang}/{emotion}", exist_ok=True)

  # One bucket per emotion, in the same order as `emotions`.
  sentences = [[] for _ in emotions]
  for _, row in df.loc[df['language'] == test_lang].iterrows():
    # The code treats column 0 as the sentence and column 1 as its emotion
    # label; .iloc makes the positional access explicit.
    sentences[emotions.index(row.iloc[1])].append(row.iloc[0])

  for emotion, sents in zip(emotions, sentences):
    for j, sent in enumerate(sents):
      # Explicit UTF-8 so the output does not depend on the platform's locale.
      with open(f"/content/test/{test_lang}/{emotion}/{j}.txt", 'w', encoding='utf-8') as f:
        f.write(sent)

In [None]:
# Build the training tf.data pipeline from the per-emotion folders on disk.
seed = 42
batch_size = 1
AUTOTUNE = tf.data.AUTOTUNE

train_dir = f'/content/train/{train_lang}'
raw_train_ds = tf.keras.utils.text_dataset_from_directory(
    train_dir,
    label_mode='categorical',
    batch_size=batch_size,
    seed=seed,
)

# Cache the loaded examples, then prefetch with an auto-tuned buffer.
cached_train_ds = raw_train_ds.cache()
train_ds = cached_train_ds.prefetch(buffer_size=AUTOTUNE)

Found 2357 files belonging to 5 classes.
Found 953 files belonging to 5 classes.


In [None]:
# Loss, metrics and optimizer that the classifier is compiled with.
loss = tf.keras.losses.CategoricalCrossentropy(from_logits=False)
metrics = [
    tf.metrics.CategoricalAccuracy(),
    tfa.metrics.F1Score(num_classes=len(emotions), average='weighted'),
]

# Schedule length: one step per batch in train_ds, repeated for every epoch.
epochs = 12
init_lr = 1e-5
steps_per_epoch = tf.data.experimental.cardinality(train_ds).numpy()
num_train_steps = steps_per_epoch * epochs
# The first 10% of the steps are passed to the optimizer as warm-up steps.
num_warmup_steps = int(0.1*num_train_steps)

optimizer = optimization.create_optimizer(
    init_lr=init_lr,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    optimizer_type='adamw',
)

In [None]:
# Restore the saved classifier from Drive, then compile it with the optimizer,
# loss and metrics defined earlier in the notebook.
checkpoint_dir = f"/content/drive/MyDrive/ling490/TermProject/checkpoints/en12"
classifier_model = tf.keras.models.load_model(checkpoint_dir)
classifier_model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

In [None]:
# Evaluate the model on each test language and print its loss, accuracy and F1.
eval_langs = ["en","fr","pt","zh","id","vi","th","bn","de","my","tl","km"]
for test_lang in eval_langs:
  raw_test_ds = tf.keras.utils.text_dataset_from_directory(
    f'/content/test/{test_lang}',
    label_mode='categorical',
    batch_size=batch_size)
  test_ds = raw_test_ds.cache().prefetch(buffer_size=AUTOTUNE)

  # The result is unpacked as loss first, then the two compiled metrics.
  loss_m, accuracy, f1 = classifier_model.evaluate(test_ds)
  print(test_lang,
        f'Loss: {loss_m}',
        f'Accuracy: {accuracy}',
        f'F1: {f1}',
        sep='\n')

Found 590 files belonging to 5 classes.
en
Loss: 2.4969141483306885
Accuracy: 0.6016949415206909
F1: 0.5992365479469299
Found 1314 files belonging to 5 classes.
fr
Loss: 2.6729252338409424
Accuracy: 0.5616438388824463
F1: 0.5563379526138306
Found 592 files belonging to 5 classes.
pt
Loss: 2.6303935050964355
Accuracy: 0.5844594836235046
F1: 0.5779013633728027
Found 592 files belonging to 5 classes.
zh
Loss: 2.0476770401000977
Accuracy: 0.6689189076423645
F1: 0.6397345662117004
Found 1243 files belonging to 5 classes.
id
Loss: 2.6769630908966064
Accuracy: 0.56154465675354
F1: 0.5483831763267517
Found 794 files belonging to 5 classes.
vi
Loss: 3.2410783767700195
Accuracy: 0.48236775398254395
F1: 0.46206215023994446
Found 763 files belonging to 5 classes.
th
Loss: 2.606219530105591
Accuracy: 0.5779816508293152
F1: 0.5500348210334778
Found 869 files belonging to 5 classes.
bn
Loss: 3.4861316680908203
Accuracy: 0.45339471101760864
F1: 0.42842793464660645
Found 1182 files belonging to 5 class