## Accessing the project directory on my Google Drive

In [1]:
import os
from google.colab import drive

# Mount Google Drive at /drive so the project folder is reachable from this
# runtime; force_remount=False does not force a remount if it is already mounted.
drive.mount('/drive', force_remount=False)

Mounted at /drive


In [2]:
# Project root inside the Drive mounted at /drive.
project_dir = "/drive/My Drive/RNN_seq2seq"

# Work from project_dir: the relative paths used later ("data/...",
# "experiments/...") are resolved against the current working directory.
# NOTE(review): the `scripts` package imports presumably rely on this too — confirm.
os.chdir(project_dir)

## Dependencies

In [3]:
import torch
import torch.nn as nn

from os import makedirs
from os.path import join

import random
import pandas as pd 
import matplotlib.pyplot as plt
from string import ascii_lowercase

from scripts.data import get_input_spec_red_pair, n_words_of_length
from scripts.utils import read_data, read_json, save_ds_in_txt, save_dict_as_json
from scripts.dataloader import get_text_encoder_decoder, customize_dataloader_func
from scripts.pytorch_utils import get_model, customize_predictor, metrics, get_results

## Log time

In [4]:
from datetime import datetime

# Capture the start time once; `now` is reused further down to date-stamp the
# file name of the generated dataset.
now = datetime.now()
print("Time stamp:", now.strftime("%Y-%m-%d %H:%M:%S"))

Time stamp: 2023-02-13 04:18:03


## Creating new data and processing

In [5]:
gen_size = 5000
out_distribution_ranges = [(31, 50)]

# gen_size words for every length in each (inclusive) range, in ascending
# length order
gen_X = [
    word
    for low, high in out_distribution_ranges
    for length in range(low, high + 1)
    for word in n_words_of_length(gen_size, length, ascii_lowercase)
]

# one pair per (n, word) with n = 1, 2, 3; the n loop is the outer one, so the
# dataset is laid out as consecutive runs that share both n and word length
# NOTE(review): n is presumably the number of reduplications — confirm in scripts.data
gen = [
    get_input_spec_red_pair(word, num_red)
    for num_red in range(1, 4)
    for word in gen_X
]

# keep the generated data on disk (date-stamped) so the evaluation can be reproduced
date = now.strftime("%Y_%m_%d")
save_ds_in_txt(gen, f"data/input_spec_red/gen_{date}")

data/input_spec_red/gen_2023_02_13 saved!


In [6]:
# Sanity check: 20 lengths (31-50) x 5000 words x 3 values of n = 300000 pairs.
len(gen)

300000

In [7]:
# Input and output sides use the same symbol set: the lowercase letters plus "@".
# NOTE(review): "@" looks like the marker that evaluate_input_spec_red counts to
# get the reduplication number of a batch — confirm against scripts.data.
in_vocab = ascii_lowercase + "@"
out_vocab = ascii_lowercase + "@"
# One (encoder, decoder) pair per side, built from the corresponding vocabulary.
in_seq_encoder, in_seq_decoder = get_text_encoder_decoder(in_vocab)
out_seq_encoder, out_seq_decoder = get_text_encoder_decoder(out_vocab)

In [8]:
# batch_size=gen_size together with shuffle=False matters here: `gen` was built
# as consecutive runs of gen_size items that share the same n and word length,
# and evaluate_input_spec_red labels a whole batch from its first sequence only.
# NOTE(review): this assumes the dataloader keeps the dataset order when
# shuffle=False, so that each batch is exactly one such run — confirm.
dataloader_func = customize_dataloader_func(in_seq_encoder, 
                                            out_seq_encoder, 
                                            padding_idx=1, 
                                            batch_size=gen_size, 
                                            shuffle=False)

gen_dl = dataloader_func(gen)

## Re-evaluation

We redefine the evaluation and visualization functions so that they work with the results of the input-specified reduplication experiments.

In [9]:
def get_trained_model(task_name, rnn_type, attention, run_num):
    """Rebuild a trained model from a saved experiment run.

    The run directory is ``experiments/<task_name>/<rnn_type>/<attn|attn-less>/<run_num>``;
    it must contain ``ModelConfig.json`` (passed to ``get_model``) and
    ``model.pt`` (a state dict loaded into the model).

    Args:
        task_name: name of the experiment folder under ``experiments/``.
        rnn_type: name of the RNN sub-folder (e.g. "SRNN").
        attention: truthy selects the "attn" sub-folder, falsy "attn-less".
        run_num: run identifier used as the last path component.

    Returns:
        The model returned by ``get_model`` with the saved weights loaded.
        The model is not switched to eval mode here.
    """
    attention = "attn" if attention else "attn-less"
    run_dir = f'experiments/{task_name}/{rnn_type}/{attention}/{run_num}'

    ModelConfig = read_json(f'{run_dir}/ModelConfig.json')
    model = get_model(ModelConfig)

    # Deserialize onto the CPU first: a checkpoint saved from a GPU run would
    # otherwise fail to load on a CPU-only runtime. load_state_dict then copies
    # the values into the model's own parameters, wherever they live.
    state_dict = torch.load(f'{run_dir}/model.pt', map_location="cpu")
    model.load_state_dict(state_dict)
    return model

In [10]:
def evaluate_input_spec_red(model, dataloader, criterion, red_symbol_idx=28):
    """Evaluate ``model`` batch by batch and log the metrics per batch group.

    Every batch is labelled ``"Red-<k>"`` / ``"Len-<m>"`` using its FIRST input
    sequence only (``X[:, 0]``), so each batch is assumed to hold sequences
    with the same reduplication number and the same length. If two batches get
    the same label, the later one overwrites the earlier one in the log.

    Args:
        model: called as ``model(X, Y, teacher_forcing_ratio=0.0)`` and
            expected to return ``(logits, _)``; must expose ``.device``.
        dataloader: iterable of ``(X, Y)`` batches that supports ``len()``.
            ``X`` and ``Y`` are indexed as (sequence position, batch index).
        criterion: loss function, called as ``criterion(logits, Y[1:].view(-1))``.
        red_symbol_idx: encoded index whose occurrences in the first input
            sequence give ``k``. Defaults to 28, the value previously
            hard-coded here — presumably the index of "@"; TODO confirm
            against the text encoder.

    Returns:
        ``(aggr_perf, perf_log)``: ``aggr_perf`` maps each metric name to its
        mean over batches; ``perf_log`` maps ``"Red-<k>"`` to
        ``{"Len-<m>": batch metrics}`` and additionally stores ``aggr_perf``
        under ``"Aggregated"``.

    Raises:
        ZeroDivisionError: if ``dataloader`` yields no batches.
    """
    model.eval()
    perf_log = dict()

    # aggregate performance (summed per batch, averaged at the end)
    aggr_perf = {"loss": 0.0,
                 "full sequence accuracy": 0.0,
                 "first n-symbol accuracy": 0.0,
                 "overlap rate": 0.0}

    with torch.no_grad():
        for X, Y in dataloader:
            num_red = (X[:, 0] == red_symbol_idx).sum().item()
            x_seq_len = X.shape[0] - 2 - num_red # not counting <s>, </s>, and @
            seq_len, batch_size = Y.shape
            seq_len -= 1 # logits does not have <s>

            X = X.to(model.device)
            Y = Y.to(model.device)
            logits, _ = model(X, Y, teacher_forcing_ratio=0.0)

            Ypred = logits.view(seq_len, batch_size, -1).argmax(2)
            full_seq_accu, first_n_accu, overlap_rate = metrics(Y[1:], Ypred)
            loss = criterion(logits, Y[1:].view(-1))

            batch_perf = {"loss": loss.item(),
                          "full sequence accuracy": full_seq_accu,
                          "first n-symbol accuracy": first_n_accu,
                          "overlap rate": overlap_rate}

            for name, value in batch_perf.items():
                aggr_perf[name] += value

            perf_log.setdefault(f"Red-{num_red}", {})[f"Len-{x_seq_len}"] = batch_perf

    aggr_perf = {k: v / len(dataloader) for k, v in aggr_perf.items()}
    perf_log["Aggregated"] = aggr_perf

    return aggr_perf, perf_log

In [11]:
def plot_performances_per_seq_len(perfs, show_plot=True, saved_plot_fp=None, 
                                  metric="full sequence accuracy"):
    """Plot one metric against sequence length, one line per "Red-<k>" group.

    Args:
        perfs: mapping shaped like ``{"Red-<k>": {"Len-<m>": {metric: value}}}``.
            Only top-level keys containing "Red" are plotted; any other key
            (e.g. "Aggregated") is ignored.
        show_plot: if True the figure is shown, otherwise it is closed.
        saved_plot_fp: if not None, the figure is saved to this path
            (dpi=600, tight bounding box) and a confirmation is printed.
        metric: key of the value to plot inside each "Len-<m>" entry.
    """
    red_nums = [rn for rn in perfs if "Red" in rn]

    # Draw on a dedicated figure so saving/closing cannot touch other figures.
    fig, ax = plt.subplots()

    # Ticks cover the lengths of every series, and stay defined (empty) when
    # there is no "Red" entry to plot.
    all_lengths = set()

    for red_num in red_nums:
        rn_dict = perfs[red_num]
        lengths = [int(len_key.split("-")[-1]) for len_key in rn_dict]
        res = [rn_dict[f"Len-{length}"][metric] for length in lengths]
        ax.plot(lengths, res, ".-", label=red_num)
        all_lengths.update(lengths)

    ax.set_xlabel("Length")
    ax.set_ylabel(metric.title())
    ax.set_xticks(sorted(all_lengths))
    ax.tick_params(axis="x", labelsize=6, labelrotation=-25)
    ax.grid(True, alpha=0.1)
    if red_nums:
        # legend() warns when there is nothing labelled to show
        ax.legend()

    if saved_plot_fp is not None:
        fig.savefig(saved_plot_fp, dpi=600, bbox_inches='tight')
        print(f"{saved_plot_fp} saved!")

    if show_plot:
        plt.show()
    else:
        plt.close(fig)

In [12]:
task_name = "input_spec_red"

# everything produced by this re-evaluation goes under its own experiments folder
task_folder = join("experiments", f"{task_name}_ATTN_SRNN_GEN")

main_res_col = ["Run #", "RNN", "Attention", "Dataset", "Loss", 
                "Full Sequence Accuracy", "First N-symbol Accuracy", "Overlap Rate"]
main_results = []

for rnn_type in ["SRNN"]:
    for use_attn in [True]:
        # sub-folder name for the attention setting
        attn = "attn" if use_attn else "attn-less"

        for run_num in range(1, 4):
            print(f"RNN type: {rnn_type}; use attention: {use_attn}; run num: {run_num}\n")

            model = get_trained_model(task_name, rnn_type, use_attn, run_num)
            criterion = nn.CrossEntropyLoss()

            folder = join(task_folder, rnn_type, attn, str(run_num))
            makedirs(folder, exist_ok=True)

            # per-batch log plus averages on the newly generated dataset
            gen_aggr, perfs_gen = evaluate_input_spec_red(model, gen_dl, criterion)

            main_results.append([run_num, rnn_type, use_attn, "Gen"]
                                + get_results(gen_aggr, train_log=False))

            save_dict_as_json(perfs_gen, join(folder, "perfs_gen.json"))
            plot_performances_per_seq_len(perfs_gen, False, join(folder, "perfs_gen.png"))

# one summary row per run, written next to the per-run folders
main_results_df = pd.DataFrame(main_results, columns=main_res_col)
main_results_df.to_csv(join(task_folder, "main_results.csv"), index=False)

RNN type: SRNN; use attention: True; run num: 1

The model has 1,467,165 trainable parameters
experiments/input_spec_red_ATTN_SRNN_GEN/SRNN/attn/1/perfs_gen.json saved!
experiments/input_spec_red_ATTN_SRNN_GEN/SRNN/attn/1/perfs_gen.png saved!
RNN type: SRNN; use attention: True; run num: 2

The model has 1,467,165 trainable parameters
experiments/input_spec_red_ATTN_SRNN_GEN/SRNN/attn/2/perfs_gen.json saved!
experiments/input_spec_red_ATTN_SRNN_GEN/SRNN/attn/2/perfs_gen.png saved!
RNN type: SRNN; use attention: True; run num: 3

The model has 1,467,165 trainable parameters
experiments/input_spec_red_ATTN_SRNN_GEN/SRNN/attn/3/perfs_gen.json saved!
experiments/input_spec_red_ATTN_SRNN_GEN/SRNN/attn/3/perfs_gen.png saved!


## Automatically disconnect and delete the runtime 

In [13]:
# Last step of the notebook: release the Colab runtime (disconnect and delete
# it, per the section heading) once all results above have been written to Drive.
from google.colab import runtime
runtime.unassign()