In [1]:
from utils import *
import pandas as pd
from IPython.display import clear_output
from naive_models import LSTM, SimpleRNN
import numpy as np
import random as python_random
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping

# Seed NumPy, Python's `random` module and TensorFlow so that repeated runs
# start from the same RNG state. Note that TensorFlow is seeded with 1234
# while the other two generators use 123.
np.random.seed(123)
python_random.seed(123)
tf.random.set_seed(1234)

# Module-level vectorizer shared by the cells below. `tokenizer=list` hands
# Python's built-in `list` to TextVectorizer (presumably so each string is
# split into single characters -- confirm against utils.TextVectorizer).
# do_train_and_evaluate() captures `V` and `len(V)` as default-argument values
# when it is defined, so this cell must run before that definition.
V = TextVectorizer(tokenizer=list)
V.build_vocab(['a', 'b', 'c', 'd'])

Two vocabulary dictionaries have been built!
Please call X.vocab_to_idx | X.idx_to_vocab to find out more, where [X] stands for the name you used for this TextVectorizer class.


In [2]:
def get_model(model_name, embd_dim, vocab_size, num_class):
    """Create and compile one of the two supported recurrent classifiers.

    Args:
        model_name: Either 's-rnn' or 'lstm'; matched case-insensitively.
        embd_dim: Embedding dimension, handed to the model class as its
            third positional argument.
        vocab_size: Vocabulary size, handed to the model class as its first
            positional argument.
        num_class: Number of classes, handed to the model class as its second
            positional argument.

    Returns:
        The compiled model: Adam (learning rate 5e-4, clipnorm 1.0), binary
        cross-entropy loss, and binary accuracy / precision / recall metrics.

    Raises:
        ValueError: If `model_name` is neither 's-rnn' nor 'lstm'.
    """
    architecture_key = model_name.lower()
    if architecture_key not in ('s-rnn', 'lstm'):
        raise ValueError("Only s-rnn and lstm models are allowed.")

    architecture = SimpleRNN if architecture_key == 's-rnn' else LSTM
    network = architecture(vocab_size, num_class, embd_dim)

    optimizer = keras.optimizers.Adam(5e-4, clipnorm=1.0)
    loss_fn = keras.losses.BinaryCrossentropy()
    # The order here matters to callers that slice model.evaluate() output
    # positionally: accuracy first, then precision, then recall.
    tracked_metrics = [keras.metrics.BinaryAccuracy(),
                       keras.metrics.Precision(),
                       keras.metrics.Recall()]

    network.compile(optimizer=optimizer,
                    loss=loss_fn,
                    metrics=[tracked_metrics])
    return network


def do_train_and_evaluate(model_name, 
                          embd_dim, 
                          train_path,  
                          epoch_num,
                          num_class=2,
                          batch_size=128,
                          encoder = V,
                          transform=transform,
                          vocab_size=len(V),
                          max_text_len=None,
                          val_split=0.,
                          earlystop=False, 
                          monitor='val_loss', 
                          patience=10):
    """Train one model on `train_path` and evaluate it on its two test sets.

    The two test-file paths are derived from `train_path` via
    get_two_test_fpathes(); all three files are loaded with load_dataset()
    and encoded with `transform`.

    Note: the defaults for `encoder`, `transform` and `vocab_size` are bound
    once, when this function is defined, to the module-level `V` /
    `transform` that exist at that moment.

    Args:
        model_name: 's-rnn' or 'lstm' (see get_model).
        embd_dim: Embedding dimension forwarded to get_model.
        train_path: Path of the training file.
        epoch_num: Maximum number of training epochs.
        num_class: Number of classes forwarded to `transform` and get_model.
        batch_size: Mini-batch size used by model.fit.
        encoder: Encoder object handed to `transform`.
        transform: Callable turning a loaded dataset into an (X, Y) pair.
        vocab_size: Vocabulary size forwarded to get_model.
        max_text_len: Forwarded to `transform` as `max_len` for every split.
        val_split: Fraction of the training data held out for validation.
            When `earlystop` is true and no positive value is given, 0.2 is
            used so the monitored quantity has data to be computed on.
        earlystop: Whether to attach an EarlyStopping callback.
        monitor: Quantity watched by EarlyStopping.
        patience: EarlyStopping patience, in epochs.

    Returns:
        A pair (test1_res, test2_res): the model.evaluate() outputs for the
        two test sets with the leading loss value dropped.
    """
    if earlystop:
        callbacks = [EarlyStopping(monitor=monitor, patience=patience)]
        # Respect an explicit split from the caller; only fall back to 0.2
        # when none was requested.
        if not val_split or val_split <= 0:
            val_split = 0.2
    else:
        callbacks = None

    test1_path, test2_path = get_two_test_fpathes(train_path)
    train, test1, test2 = load_dataset([train_path, 
                                        test1_path, 
                                        test2_path])

    train_X, train_Y = transform(train, encoder, num_class,
                                 shuffle=True, max_len=max_text_len)

    # Pass max_len by keyword, exactly like the training call above, so the
    # length cap cannot be swallowed by another positional parameter
    # (e.g. `shuffle`) of `transform`.
    test1_X, test1_Y = transform(test1, encoder, num_class,
                                 max_len=max_text_len)

    test2_X, test2_Y = transform(test2, encoder, num_class,
                                 max_len=max_text_len)

    model = get_model(model_name, embd_dim, vocab_size, num_class)
    model.fit(train_X, train_Y, epochs=epoch_num, 
              validation_split=val_split,
              batch_size=batch_size, callbacks=callbacks)

    # evaluate() returns the loss first; keep only the metric values.
    test1_res = model.evaluate(test1_X, test1_Y, batch_size=1000, verbose=0)[1:]
    test2_res = model.evaluate(test2_X, test2_Y, batch_size=1000, verbose=0)[1:]

    return test1_res, test2_res


def get_results(train_path, model_name, embd_dim, r, 
                epoch_num, batch_size, 
                earlystop=False, patience=0):
    """Run one train/evaluate round and format it as two result rows.

    Args:
        train_path: Path of the training file. Its '/'-separated components
            from index 2 up to (excluding) the file name become the leading
            metadata fields of each row.
        model_name: Model identifier forwarded to do_train_and_evaluate.
        embd_dim: Embedding dimension forwarded to do_train_and_evaluate.
        r: Round number recorded in the row.
        epoch_num: Maximum number of training epochs.
        batch_size: Training mini-batch size.
        earlystop: Whether early stopping is enabled for this run.
        patience: Early-stopping patience.

    Returns:
        A list of two rows, one for 'Test1' and one for 'Test2', each made of
        the metadata, the test-set label and the metric values returned by
        do_train_and_evaluate.
    """
    metadata = train_path.split('/')[2:-1] + [model_name, embd_dim, 
                                              earlystop, r]

    # batch_size was previously accepted but never forwarded, so every run
    # silently trained with do_train_and_evaluate's default batch size.
    test1_res, test2_res = do_train_and_evaluate(model_name, 
                                                 embd_dim, 
                                                 train_path, 
                                                 epoch_num, 
                                                 batch_size=batch_size,
                                                 patience=patience,
                                                 earlystop=earlystop)

    # list() keeps the concatenation valid for any sequence of metric values.
    res1 = metadata + ['Test1'] + list(test1_res)
    res2 = metadata + ['Test2'] + list(test2_res)
    return [res1, res2]

In [3]:
# Collect every 'Training.txt' file below the data directory.
filepathes = get_filepathes_from_dir('Experimental Data/Data', 
                                     include_sub_dir=True, 
                                     file_format='Training.txt')

filepathes = sort_filepathes(filepathes)

# The three metric columns must follow the order in which get_model()
# registers its metrics (BinaryAccuracy, Precision, Recall), because
# do_train_and_evaluate() returns model.evaluate()[1:] positionally.
# They were previously labelled 'Recall', 'Precision', which swapped the two.
columns = ['Lang Class', 'Lang Subclass', 'Train Size', 
           'Model', 'Embd Dim', 'EarlyStop', 'Round #', 
           'Test Set', 'Accuracy', 'Precision', 'Recall']

results = []


# Only the last three training files are processed in this run.
for fpath in filepathes[-3:]:
    sl, s = fpath.split('/')[3:-1]
    size = int(s.replace('k', ''))

    # Larger training sets get fewer rounds and epochs, a shorter patience
    # and a larger batch size. `end` is the exclusive upper bound of the
    # round counter, so rounds run from 1 to end - 1.
    if size == 100:
        end, epoch_num, patience, batch_size = 2, 20, 2, 2048

    elif size == 10:
        end, epoch_num, patience, batch_size = 3, 70, 5, 512

    else:
        end, epoch_num, patience, batch_size = 4, 200, 10, 128

    for model in ['lstm', 's-rnn']:
        for dim in [10, 30, 100]:
            for r in range(1, end):
                for earlystop in [False, True]:
                    print(f"{'=' * 20} Round # {r} {'=' * 20}\n")
                    print(f"SubLang: {sl}; Size: {s}; Model: {model}; Embd Dim: {dim}; Earlystop: {earlystop}\n")

                    result = get_results(fpath, model, dim, r, 
                                         epoch_num, batch_size, 
                                         earlystop, patience)

                    results.extend(result)
                    clear_output(wait=True)

        # Checkpoint everything gathered so far after each model type, so a
        # crash later in the sweep does not lose the finished runs.
        pd.DataFrame(results, columns=columns).to_csv('tf_results/SP8_results_naive.csv', index=False)


SubLang: SP8; Size: 100k; Model: s-rnn; Embd Dim: 100; Earlystop: True

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
