# Prerequisites

In [None]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

import numpy as np

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise run everything on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

# Load Excel File

In [None]:
import pandas as pd
# Read the training spreadsheet; the path is relative to the notebook's working directory.
master_df = pd.read_excel('./DATASETS/Training_Dataset.xlsx')
master_df.head()  # preview the first rows

In [None]:
# Keep only the label column ('E1') and the raw text column; .copy() gives an
# independent frame so later edits do not touch master_df.
model_df = master_df[['E1','Text']].copy()
model_df.head()

In [None]:
model_df.info()

In [None]:
# Rows whose E1 label equals 1, and how many there are.
model_df_label1 = model_df.query('E1 == 1')
len(model_df_label1)

In [None]:
# Rows whose E1 label equals 0, and how many there are.
model_df_label0 = model_df.query('E1 == 0')
len(model_df_label0)

In [None]:
# Keep at most the first 1500 label-1 rows (a positional slice, not a random sample)
# together with every label-0 row.
# NOTE(review): presumably this caps the label-1 class to balance the dataset — confirm.
model_df = pd.concat([model_df_label1[:1500],model_df_label0])

# Data process and tokenizer

In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed so the train/test partition, and every metric derived from it, is
# reproducible across notebook runs.
SPLIT_SEED = 42

# Hold out 10% for testing. Stratifying on the label keeps the class ratio of the
# hold-out set equal to that of the full dataset.
train_df, test_df = train_test_split(
    model_df,
    test_size=0.1,
    shuffle=True,
    random_state=SPLIT_SEED,
    stratify=model_df['E1'],
)

# One-shot iterators over (label, text) tuples; each can be consumed only once.
train_iter = iter(list(train_df.itertuples(index=False, name=None)))
test_iter = iter(list(test_df.itertuples(index=False, name=None)))

In [None]:
len(train_df)

## Pre-trained embeddings GloVe

In [None]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

tokenizer = get_tokenizer("basic_english")

def yield_tokens(data_iter):
    """Lazily yield the token list of each text in *data_iter*.

    *data_iter* must produce two-element items; the first element (the label)
    is ignored and the second (the text) is passed to the module-level
    ``tokenizer``.
    """
    yield from (tokenizer(text) for _label, text in data_iter)

# Build the vocabulary from the training texts only, with "<unk>" as the sole
# special token. Iterating here uses up the one-shot train_iter.
# NOTE(review): presumably "<unk>" receives index 0 (specials first) — confirm
# against the torchtext version in use.
vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"])
# Tokens missing from the vocabulary are looked up as "<unk>".
vocab.set_default_index(vocab["<unk>"])

In [None]:
# !wget http://nlp.stanford.edu/data/glove.6B.zip

2024-04-12 10:22:30 (4.53 MB/s) - ‘glove.6B.zip’ saved [862182613/862182613]

2024-04-12 12:03:44 (3.88 MB/s) - ‘glove.42B.300d.zip’ saved [1877800501/1877800501]

2024-04-12 12:12:06 (4.40 MB/s) - ‘glove.840B.300d.zip’ saved [2176768927/2176768927]

In [None]:
# !unzip -d glove glove.6B.zip

Archive:  glove.6B.zip
  inflating: glove/glove.6B.50d.txt  
  inflating: glove/glove.6B.100d.txt  
  inflating: glove/glove.6B.200d.txt  
  inflating: glove/glove.6B.300d.txt  
  
Archive:  glove.42B.300d.zip
  inflating: glove/glove.42B.300d.txt 
  
Archive:  glove.840B.300d.zip
  inflating: glove/glove.840B.300d.txt


In [None]:
# GLOVE_NAME = "840B"
# GLOVE_NAME = "42B"
GLOVE_NAME = "6B"

GLOVE_DIM = 100
# GLOVE_DIM = 300

# Directory holding the unzipped GloVe text files.
GLOVE_DIR = '/home/it/environments/Genety/glove'


def load_embs_npa(glove_name=GLOVE_NAME, glove_dim=GLOVE_DIM, glove_dir=GLOVE_DIR, tokens=None):
    """Read a pre-downloaded GloVe text file into a NumPy embedding matrix.

    The file ``glove.{glove_name}.{glove_dim}d.txt`` inside *glove_dir* is read
    line by line. Each line is ``<token> <v1> ... <v_dim>``; the last
    *glove_dim* fields are taken as the vector, so tokens that themselves
    contain spaces are parsed correctly.

    Args:
        glove_name: Corpus tag used in the file name (e.g. ``"6B"``).
        glove_dim: Vector dimensionality; also part of the file name.
        glove_dir: Directory containing the GloVe file.
        tokens: Optional sequence of token strings. When given, the result is
            aligned to it instead of to the file order (see Returns).

    Returns:
        With ``tokens=None``: an array of shape ``(n_file_rows + 2, dim)`` where
        row 0 is an all-zero ``<pad>`` vector, row 1 is the mean of all GloVe
        vectors (used for ``<unk>``) and the remaining rows follow the file
        order. With *tokens*: an array of shape ``(len(tokens), dim)`` whose
        row ``i`` is the GloVe vector of ``tokens[i]``, or the mean vector when
        that token is not in the file.

    Raises:
        ValueError: If the file contains no embedding rows.
    """
    path = os.path.join(glove_dir, f'glove.{glove_name}.{glove_dim}d.txt')

    embeddings = []
    # Token strings are only kept when an aligned matrix was requested.
    words = [] if tokens is not None else None

    # Stream the file instead of loading it whole; GloVe files are UTF-8.
    with open(path, 'rt', encoding='utf-8') as fi:
        for line in fi:
            line = line.rstrip('\n')
            if not line.strip():
                continue  # tolerate blank lines
            parts = line.split(' ')
            # Take the vector from the right: some tokens contain spaces.
            embeddings.append([float(val) for val in parts[-glove_dim:]])
            if words is not None:
                words.append(' '.join(parts[:-glove_dim]))

    if not embeddings:
        raise ValueError(f'no embeddings found in {path}')

    # Convert to numpy
    embs_npa = np.array(embeddings)

    pad_emb_npa = np.zeros((1, embs_npa.shape[1]))             # embedding for '<pad>' token.
    unk_emb_npa = np.mean(embs_npa, axis=0, keepdims=True)     # embedding for '<unk>' token.

    if tokens is None:
        # Insert embeddings for pad and unk tokens at top of embs_npa.
        return np.vstack((pad_emb_npa, unk_emb_npa, embs_npa))

    # Map each GloVe token to its first row, then gather rows in `tokens` order.
    row_of = {}
    for row, word in enumerate(words):
        row_of.setdefault(word, row)

    aligned = np.repeat(unk_emb_npa, len(tokens), axis=0)
    for row, token in enumerate(tokens):
        source_row = row_of.get(token)
        if source_row is not None:
            aligned[row] = embs_npa[source_row]
    return aligned

In [None]:
# Load the default GloVe table once to check the file is readable and to inspect
# the (rows, dimension) shape of the resulting matrix.
embs_npa = load_embs_npa()
print(embs_npa.shape)

# Dataset iterator

In [None]:
# Re-create the one-shot (label, text) iterators; the earlier train_iter was
# consumed while building the vocabulary.
train_iter = iter(list(train_df.itertuples(index=False, name=None)))
test_iter = iter(list(test_df.itertuples(index=False, name=None)))

In [None]:
next(train_iter)

In [None]:
def pad_tensor(x, max_len):
    """Return a ``(max_len, GLOVE_DIM)`` float tensor holding the leading rows of *x*.

    Rows beyond ``len(x)`` stay zero; rows of *x* beyond *max_len* are dropped.
    """
    out = torch.zeros(max_len, GLOVE_DIM)
    keep = min(len(x), max_len)
    out[:keep] = x[:keep]
    return out

def pad_sequences(x, max_len):
    """Return an int64 array of length *max_len* holding the leading items of *x*.

    Shorter inputs are right-padded with zeros; longer inputs are truncated.
    """
    out = np.zeros((max_len), dtype=np.int64)
    keep = min(len(x), max_len)
    out[:keep] = x[:keep]
    return out

In [None]:
def text_pipeline(x):
    """Tokenize *x* and map the tokens to vocabulary indices."""
    return vocab(tokenizer(x))


def label_pipeline(x):
    """Coerce a raw label value to a Python int."""
    return int(x)

In [None]:
from torch.utils.data import DataLoader

MAX_LENGTH = 100

def collate_batch(batch):
    """Turn a list of ``(label, text)`` pairs into two tensors on ``device``.

    Returns:
        ``(labels, texts)``: an int64 tensor with one entry per sample, and a
        long tensor with one row of ``MAX_LENGTH`` token indices per sample
        (zero-padded or truncated by ``pad_sequences``).
    """
    encoded = [
        (label_pipeline(raw_label), pad_sequences(text_pipeline(raw_text), MAX_LENGTH))
        for raw_label, raw_text in batch
    ]
    labels = np.array([label for label, _ in encoded])
    token_rows = np.array([row for _, row in encoded])

    label_tensor = torch.tensor(labels, dtype=torch.int64)
    text_tensor = torch.tensor(token_rows, dtype=torch.long)
    return label_tensor.to(device), text_tensor.to(device)

In [None]:
# Smoke test: collate the first two training samples and inspect the tensors.
# Note that this advances train_iter by two items.
train_iter = iter(list(train_df.itertuples(index=False, name=None)))
first = next(train_iter)
second = next(train_iter)

print(first)
print(second)

label, text = collate_batch([first, second])

print(label)
print(label.shape)
print(text)
print(text.shape)


# Define the model

In [None]:
class GRU_Model(nn.Module):
    """Bidirectional multi-layer GRU classifier on top of pre-trained GloVe embeddings.

    Args:
        vocab_size: Accepted for interface compatibility; not used, because the
            embedding table size is fixed by the GloVe matrix.
        embed_size: Embedding dimensionality; must equal the width of the matrix
            returned by ``load_embs_npa()``.
        hidden: Hidden size of each GRU direction.
        layers: Number of stacked GRU layers.
        num_classes: Number of output logits.
        dropout: Dropout applied between stacked GRU layers.

    NOTE(review): the embedding rows follow the order produced by
    ``load_embs_npa()`` (a pad row, an unk row, then the GloVe file order),
    while this notebook's token ids come from a vocabulary built on the training
    texts. Unless those ids are mapped onto GloVe rows somewhere, a token id
    does not select that token's GloVe vector — confirm and align the two.
    """

    def __init__(self, vocab_size, embed_size, hidden, layers, num_classes, dropout):
        super().__init__()

        self.embs_npa = load_embs_npa()

        # from_pretrained keeps the embedding weights frozen by default.
        self.embedding_layer = nn.Embedding.from_pretrained(
            torch.from_numpy(self.embs_npa).float()
        )

        self.rnn = nn.GRU(
            input_size=embed_size,
            hidden_size=hidden,
            num_layers=layers,
            batch_first=True,
            bidirectional=True,
            dropout=dropout,
        )

        self.dropout = nn.Dropout(0.5)

        # Twice the hidden size: forward and backward summaries are concatenated.
        self.fc = nn.Linear(in_features=hidden * 2, out_features=num_classes)
        nn.init.kaiming_normal_(self.fc.weight)

    def forward(self, x):
        """Return class logits for a batch of token-index sequences.

        Args:
            x: Long tensor of token indices, one row per sample.

        Returns:
            Tensor of shape ``(batch, num_classes)``.
        """
        embedded = self.embedding_layer(x)
        _, h = self.rnn(embedded)
        # Summarise each sequence with the final hidden state of the top layer in
        # both directions. Taking the output at the last timestep instead would
        # give the backward direction a state that has seen only the last
        # (typically padding) position.
        summary = torch.cat((h[-2], h[-1]), dim=-1)
        return self.fc(self.dropout(summary))

In [None]:
# Count the distinct labels straight from the training frame. Counting from the
# shared one-shot train_iter would depend on how far earlier cells had already
# advanced it and could miss labels.
num_class = train_df['E1'].nunique()
num_class

In [None]:
# Fresh pass over the training tuples to count the distinct labels.
train_iter = iter(list(train_df.itertuples(index=False, name=None)))
num_class = len({label for label, _ in train_iter})

# Model hyper-parameters for the baseline run.
vocab_size = len(vocab)
emsize = GLOVE_DIM
hidden_size, num_layers, dropout = 100, 5, 0.1

model = GRU_Model(
    vocab_size=vocab_size,
    embed_size=emsize,
    hidden=hidden_size,
    layers=num_layers,
    num_classes=num_class,
    dropout=dropout,
)

In [None]:
model

# Train and eval functions

In [None]:
from datetime import datetime, date
import time

def train(model, dataloader, epoch):
    """Run one training epoch and return ``(accuracy, loss)``.

    Uses the module-level ``optimizer`` and ``criterion``. Gradients are clipped
    to a norm of 0.1 before each optimizer step, and progress is printed every
    5 batches (skipping batch 0).

    Returns:
        ``(correct / seen, summed batch losses / seen)`` over the whole epoch.

    NOTE: a later cell runs ``from ray import train, tune``, which rebinds the
    name ``train``; re-define this function before calling it again afterwards.
    """
    progress_template = (
        "| epoch {:3d} | {:5d}/{:5d} batches "
        "| accuracy {:8.3f} | {}"
    )
    log_interval = 5

    model.train()
    correct = 0
    seen = 0
    loss_sum = 0

    for step, batch in enumerate(dataloader):
        targets, inputs = batch

        optimizer.zero_grad()
        logits = model(inputs)
        batch_loss = criterion(logits, targets)
        batch_loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()

        correct += (logits.argmax(1) == targets).sum().item()
        seen += targets.size(0)
        loss_sum += batch_loss.item()

        if step > 0 and step % log_interval == 0:
            print(
                progress_template.format(
                    epoch, step, len(dataloader), correct / seen, datetime.now().isoformat()
                )
            )

    return correct / seen, loss_sum / seen
        

def evaluate(model, dataloader):
    """Evaluate *model* on *dataloader* and return ``(accuracy, loss)``.

    Uses the module-level ``criterion``. The loss is accumulated over every
    batch and normalised the same way as in ``train`` (summed batch losses
    divided by the number of samples), so the two curves are comparable.

    Raises:
        ValueError: If *dataloader* yields no samples.
    """
    model.eval()
    total_acc, total_count = 0, 0
    total_loss = 0.0

    with torch.no_grad():
        for label, text in dataloader:
            predicted_label = model(text)
            # Accumulate every batch; keeping only the last batch's loss would
            # make the reported value depend on batch order.
            total_loss += criterion(predicted_label, label).item()
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)

    if total_count == 0:
        raise ValueError("evaluate() received a dataloader with no samples")

    return total_acc / total_count, total_loss / total_count

# Split the dataset and run the model

In [None]:
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset

BATCH_SIZE = 64  # batch size for training

# Materialise the (label, text) tuples as map-style datasets.
# test_iter must not have been consumed by an earlier cell.
train_iter = iter(list(train_df.itertuples(index=False, name=None)))
train_dataset = to_map_style_dataset(train_iter)
test_dataset = to_map_style_dataset(test_iter)

# 80/20 random split of the training data into train and validation parts.
num_train = int(len(train_dataset) * 0.8)
num_valid = len(train_dataset) - num_train
split_train_, split_valid_ = random_split(train_dataset, [num_train, num_valid])

# All three loaders share the same batching options.
loader_options = dict(batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
train_dataloader = DataLoader(split_train_, **loader_options)
valid_dataloader = DataLoader(split_valid_, **loader_options)
test_dataloader = DataLoader(test_dataset, **loader_options)

In [None]:
today = date.today().isoformat()

# Hyperparameters
EPOCHS = 20  # epoch
LR = 0.01 # learning rate

# Loss, optimizer and scheduler are module-level because train() and evaluate()
# read them as globals.
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
# step_size is an integer count of scheduler.step() calls; pass it by keyword as
# an int rather than positionally as the float 1.0.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)
    
def train_with_hist(model, checkpoint_path = './models/simple_embeddings_baseline'):
    """Train *model* for ``EPOCHS`` epochs, save it, and return the metric history.

    Uses the module-level data loaders, ``scheduler`` and ``today``. The
    scheduler takes a step only in epochs where validation accuracy falls below
    the best value seen so far.

    Args:
        model: Model to train; it is moved to ``device`` first.
        checkpoint_path: Directory that receives the pickled model. It is
            created if missing.

    Returns:
        ``(loss_hist_train, loss_hist_valid, accuracy_hist_train,
        accuracy_hist_valid)``, one entry per epoch.

    NOTE: this calls the notebook's ``train`` function; a later cell runs
    ``from ray import train, tune``, which rebinds that name.
    """
    # Create the target directory before training so that a missing folder
    # cannot make torch.save fail after all epochs have already run.
    os.makedirs(checkpoint_path, exist_ok=True)

    model = model.to(device)

    total_accu = None

    loss_hist_train, loss_hist_valid, accuracy_hist_train, accuracy_hist_valid = [], [], [], []
    for epoch in range(1, EPOCHS + 1):
        epoch_start_time = time.time()
        accu_train, loss_train = train(model, train_dataloader, epoch)
        accu_val, loss_val = evaluate(model, valid_dataloader)
        if total_accu is not None and total_accu > accu_val:
            # Validation accuracy regressed: decay the learning rate.
            scheduler.step()
            print("Learning rate took a step by the scheduler {:8.3f} > {:8.3f}".format(total_accu, accu_val))
        else:
            total_accu = accu_val
        print("-" * 59)
        print(
            "| end of epoch {:3d} | time: {:5.2f}s | "
            "valid accuracy {:8.3f} ".format(
                epoch, time.time() - epoch_start_time, accu_val
            )
        )
        print("-" * 59)

        loss_hist_train.append(loss_train)
        loss_hist_valid.append(loss_val)
        accuracy_hist_train.append(accu_train)
        accuracy_hist_valid.append(accu_val)

    torch.save(model, f'{checkpoint_path}/{today}_bidirectional_gru_glove.pt')
    return loss_hist_train, loss_hist_valid, accuracy_hist_train, accuracy_hist_valid

In [None]:
hist = train_with_hist(model, './models/bidirectional_gru_glove')

In [None]:
import numpy as np
import matplotlib.pyplot as plt
def plot_learning_curves(hist):
    """Plot loss (left) and accuracy (right) curves against the epoch number.

    Args:
        hist: Indexable of four equal-length sequences, in the order
            train loss, validation loss, train accuracy, validation accuracy.
    """
    train_loss, valid_loss = hist[0], hist[1]
    train_acc, valid_acc = hist[2], hist[3]
    epochs = np.arange(len(train_loss)) + 1

    fig = plt.figure(figsize=(12, 4))

    # Left panel: loss.
    loss_ax = fig.add_subplot(1, 2, 1)
    loss_ax.plot(epochs, train_loss, '-o', label='Train loss')
    loss_ax.plot(epochs, valid_loss, '--<', label='Validation loss')
    loss_ax.legend(fontsize=15)

    # Right panel: accuracy.
    acc_ax = fig.add_subplot(1, 2, 2)
    acc_ax.plot(epochs, train_acc, '-o', label='Train acc.')
    acc_ax.plot(epochs, valid_acc, '--<', label='Validation acc.')
    acc_ax.legend(fontsize=15)
    acc_ax.set_xlabel('Epoch', size=15)
    acc_ax.set_ylabel('Accuracy', size=15)

    plt.show()

In [None]:
plot_learning_curves(hist)

### Evaluate the model with test dataset

Checking the results of the test dataset…



In [None]:
# Report accuracy on the held-out test loader; the loss value returned by
# evaluate() is discarded.
print("Checking the results of test dataset.")
accu_test, _ = evaluate(model, test_dataloader)
print("test accuracy {:8.3f}".format(accu_test))

In [None]:
# create confusion matrix
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns

def plot_confusion_matrix(model, dataloader):
    """Draw the confusion matrix and print the classification report for *model*.

    Args:
        model: Trained classifier; it is put into eval mode.
        dataloader: Yields ``(label, text)`` tensor batches. Inputs are moved to
            the model's device, so CPU batches work with a GPU model.

    Raises:
        ValueError: If *dataloader* yields no batches.
    """
    model.eval()
    model_device = next(model.parameters()).device

    true_batches, predicted_batches = [], []
    with torch.no_grad():
        for label, text in dataloader:
            predicted_label = model(text.to(model_device))
            # Keep integer class ids so the report is keyed by 0/1, not 0.0/1.0.
            true_batches.append(label.cpu().numpy())
            predicted_batches.append(predicted_label.argmax(1).cpu().numpy())

    if not true_batches:
        raise ValueError("plot_confusion_matrix() received an empty dataloader")

    # Concatenate once instead of re-allocating the arrays on every batch.
    y_test = np.concatenate(true_batches)
    y_predict = np.concatenate(predicted_batches)

    cm = confusion_matrix(y_test, y_predict)
    sns.heatmap(cm, annot=True, fmt = "d")
    print(classification_report(y_test, y_predict))

In [None]:
plot_confusion_matrix(model, test_dataloader)

In [None]:
def predict(text):
    """Return the predicted class index for a single raw *text* string.

    Uses the module-level ``model``, ``text_pipeline``, ``pad_sequences`` and
    ``MAX_LENGTH``.
    """
    # Inference must run in eval mode, otherwise dropout makes predictions random.
    model.eval()
    # Follow the model's own device; the global `device` may differ from it,
    # e.g. after the model has been re-created on the CPU.
    model_device = next(model.parameters()).device

    with torch.no_grad():
        token_ids = text_pipeline(text)
        padded = torch.tensor(pad_sequences(token_ids, MAX_LENGTH), dtype=torch.long).to(model_device)
        output = model(padded.unsqueeze(0))  # add the batch dimension
        return output.argmax(1).item()

In [None]:
ex_text_str = 'The ePump Software shall define Fault ID 1 as follows:'

print("This is a %s" % predict(ex_text_str))

In [None]:
pred_text = "The IO Service shall select the XLR-PW DEV_INFO_DATA file if HPP_XLR_WIRING is grounded (logical 1) and bits AC_TYPE_BIT1 - AC_TYPE_BIT6 do not indicate a CFM engine configuration. NOTE: HPP_XLR_WIRING and bits AC_TYPE_BIT[1-6] are discrete inputs which are received on constant pins between hardware configurations. See 282100-ICD-x for more details."
predict(pred_text)

In [None]:
pred_text = "I shall like waffles"
predict(pred_text)

In [None]:
pred_text = "Bumblebe is red"
predict(pred_text)

In [None]:
pred_text = "Bumblebee is red"
predict(pred_text)

# Hyperparameter tuning with Ray Tune

In [None]:
import time
from datetime import datetime, date

import ray
import ray.train.torch

from ray import train, tune
from ray.train import RunConfig, ScalingConfig, Checkpoint
from ray.train.torch import TorchTrainer

from ray.tune.schedulers import ASHAScheduler

In [None]:
ray.train.torch.get_device()

In [None]:
from torch.utils.data import DataLoader

MAX_LENGTH = 100

def collate_batch(batch):
    """Turn a list of ``(label, text)`` pairs into two CPU tensors.

    Unlike a device-placing collate function, this one leaves the tensors on
    the CPU; the training and evaluation loops move each batch to the device.

    Returns:
        ``(labels, texts)``: an int64 tensor with one entry per sample, and a
        long tensor with one row of ``MAX_LENGTH`` token indices per sample.
    """
    label_list, text_list = [], []
    for _label, _text in batch:
        label_list.append(label_pipeline(_label))
        processed_text = text_pipeline(_text)
        padded = pad_sequences(processed_text, MAX_LENGTH)
        text_list.append(padded)

    # Stack into one ndarray first: building a tensor directly from a list of
    # ndarrays is very slow and triggers a PyTorch warning.
    label_list = torch.tensor(np.array(label_list), dtype=torch.int64)
    text_list = torch.tensor(np.array(text_list), dtype=torch.long)
    return label_list, text_list

In [None]:
def train_func(model, optimizer, train_loader, max_norm):
    """Run one training epoch for a tuning trial and return ``(accuracy, loss)``.

    Batches are moved to CUDA when available, gradients are clipped to
    *max_norm*, and the loss comes from the module-level ``criterion``.

    Returns:
        ``(correct / seen, summed batch losses / seen)``.
    """
    target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.train()

    correct = 0
    seen = 0
    loss_sum = 0

    for targets, inputs in train_loader:
        targets = targets.to(target_device)
        inputs = inputs.to(target_device)

        optimizer.zero_grad()
        logits = model(inputs)
        batch_loss = criterion(logits, targets)
        batch_loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
        optimizer.step()

        correct += (logits.argmax(1) == targets).sum().item()
        seen += targets.size(0)
        loss_sum += batch_loss.item()

    return correct / seen, loss_sum / seen
        

def eval_func(model, data_loader):
    """Evaluate *model* for a tuning trial and return ``(accuracy, loss)``.

    Batches are moved to CUDA when available; the loss comes from the
    module-level ``criterion``. The loss is accumulated over every batch and
    normalised like in ``train_func`` (summed batch losses divided by the
    number of samples).

    Raises:
        ValueError: If *data_loader* yields no samples.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.eval()

    total_acc, total_count = 0, 0
    total_loss = 0.0

    with torch.no_grad():
        for label, text in data_loader:
            label, text = label.to(device), text.to(device)

            predicted_label = model(text)
            # Accumulate every batch; keeping only the last batch's loss would
            # make the reported value depend on the (shuffled) batch order.
            total_loss += criterion(predicted_label, label).item()
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)

    if total_count == 0:
        raise ValueError("eval_func() received a data loader with no samples")

    return total_acc / total_count, total_loss / total_count

In [None]:
import os
import tempfile

# Constants shared by the tuning cells below.
today = date.today().isoformat()  # date stamp used in saved file names
checkpoint_path = "./models/bidirectional_gru_glove"
model_name = "bidirectional_gru_glove"
# Hard-coded number of classes.
# NOTE(review): presumably the two E1 label values (0 and 1) — confirm.
num_class = 2
vocab_size = len(vocab)


def train_search(config):
    """Ray Tune trainable: train one GRU_Model with the sampled hyper-parameters.

    Reads the module-level ``split_train_``, ``split_valid_``, ``vocab_size``,
    ``num_class`` and ``GLOVE_DIM``. After every epoch the train/validation
    loss and accuracy are reported to Ray Tune; every 5th epoch the report also
    carries a checkpoint with the model's ``state_dict`` (``model.pth``).

    Args:
        config: Mapping with the keys ``hidden_size``, ``num_layers``,
            ``dropout``, ``batch_size``, ``lr``, ``momentum``, ``step_size``,
            ``lr_gamma``, ``max_norm`` and ``epochs``.

    NOTE: the loss function is the module-level ``criterion`` read by
    ``train_func`` and ``eval_func``; it must be defined before tuning starts.
    """
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    config_params = config

    # Search algorithms may sample integer parameters as NumPy integers; coerce
    # them to plain ints for the PyTorch constructors.
    num_layers = int(config_params["num_layers"])
    step_size = int(config_params["step_size"])
    batch_size = int(config_params["batch_size"])
    epochs = int(config_params["epochs"])

    # Embeddings size depends on the GloVe embeddings defined for the tokenizer
    model = GRU_Model(vocab_size,
                      GLOVE_DIM,
                      config_params["hidden_size"],
                      num_layers,
                      num_class,
                      config_params["dropout"],
                     )
    model = model.to(device)

    train_dataloader = DataLoader(
        split_train_, batch_size=batch_size, shuffle=True, collate_fn=collate_batch
    )
    # Validation metrics do not depend on sample order, so no shuffling.
    valid_dataloader = DataLoader(
        split_valid_, batch_size=batch_size, shuffle=False, collate_fn=collate_batch
    )

    optimizer = torch.optim.SGD(model.parameters(), lr=config_params["lr"], momentum=config_params["momentum"])
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=config_params["lr_gamma"])

    for epoch in range(1, epochs + 1):
        accu_train, loss_train = train_func(model, optimizer, train_dataloader, config_params["max_norm"])
        accu_val, loss_val = eval_func(model, valid_dataloader)

        # Always let the scheduler take a step because it will be optimized by Hyperopt
        scheduler.step()

        # The temporary directory only has to live until train.report returns.
        with tempfile.TemporaryDirectory() as temp_checkpoint_dir:
            checkpoint = None
            if epoch % 5 == 0:
                # This saves the model to the trial directory
                torch.save(
                    model.state_dict(),
                    os.path.join(temp_checkpoint_dir, "model.pth")
                )
                checkpoint = Checkpoint.from_directory(temp_checkpoint_dir)

            # Send the current training result back to Ray Tune
            train.report({
                "loss_train": loss_train,
                "loss_val": loss_val,
                "accuracy_train": accu_train,
                "accuracy_val": accu_val,
            }, checkpoint=checkpoint)

In [None]:
import gc

gc.collect()
torch.cuda.empty_cache()

In [None]:
(free_memory, global_memory) = torch.cuda.mem_get_info()
print(f"Free(GB): {free_memory/1024/1024/1024}, Global(GB): {global_memory/1024/1024/1024}, Free(%): {free_memory/global_memory}")

In [None]:
from guppy import hpy
h=hpy()
h.heap()

In [None]:
import numpy as np
from hyperopt import hp
from ray.tune.search.hyperopt import HyperOptSearch

# Name of the Ray Tune experiment (passed to RunConfig below).
exp_name = "bidirectional_gru_glove"

# Hyperopt search space; the keys are the ones train_search reads from its config.
space = {
    # NOTE(review): hp.loguniform takes its bounds in log space, so this
    # presumably samples lr between exp(-3) and exp(1) (about 0.05 to 2.7) —
    # confirm this range is intended.
    "lr": hp.loguniform("lr", -3, 1),
    "momentum": hp.uniform("momentum", 0.1, 0.9),
    "hidden_size": hp.choice("hidden_size", [32, 64, 128]), 
    "num_layers": hp.randint("num_layers", 3, 10),
    "epochs": hp.choice("epochs", [30]),  # single option: fixed at 30 epochs
    "batch_size": hp.choice("batch_size", [16, 32]),
    "step_size": hp.randint("step_size", 1, 10),
    "lr_gamma": hp.uniform("lr_gamma", 0.1, 0.9),
    "max_norm": hp.uniform("max_norm", 0.1, 0.9),
    "dropout": hp.uniform("dropout", 0.1, 0.9),
}

# The search maximises the validation accuracy reported by train_search.
hyperopt_search = HyperOptSearch(space, metric="accuracy_val", mode="max")

# Uncomment this to enable distributed execution
# `ray.init(address="auto")`

# `train` below is ray.train (imported together with `tune`), not the notebook's
# own train() function.
tuner = tune.Tuner(
    # Each trial requests 8 CPUs and half a GPU.
    tune.with_resources(train_search, resources={"cpu":8, "gpu":0.5}),
    tune_config=tune.TuneConfig(
        num_samples=30,
        scheduler=ASHAScheduler(metric="accuracy_val", mode="max"), # Early stopping
        search_alg=hyperopt_search, # Hyperopt library for Hyper-parameter Optimization
    ),
    run_config=train.RunConfig(
        name=exp_name,
        checkpoint_config=train.CheckpointConfig(
            # Checkpoints are ranked by validation accuracy; two are retained.
            checkpoint_score_attribute="accuracy_val",
            num_to_keep=2,
            # checkpoint_at_end=True
        ),
    ),
)
results = tuner.fit()

In [None]:
dfs = {result.path: result.metrics_dataframe for result in results}
[d["accuracy_val"].plot() for d in dfs.values()]

In [None]:
best_result = results.get_best_result("accuracy_val", "max")
best_result.config

In [None]:
best_result.metrics_dataframe

In [None]:
import matplotlib.pyplot as plt

df = best_result.metrics_dataframe

plt.plot(df['accuracy_train'], label='accuracy_train')
plt.plot(df['accuracy_val'], label='accuracy_val')

plt.legend(title='')

plt.show()

In [None]:
best_checkpoints = best_result.best_checkpoints
best_checkpoints

In [None]:
best_checkpoint = best_result.get_best_checkpoint("accuracy_val", mode="max")
next(x for x in best_checkpoints if x[0].path == best_checkpoint.path)

In [None]:
# Load the weights onto the CPU so the checkpoint can be restored on a machine
# without the GPU it was trained on; the model below is built on the CPU too.
state_dict = torch.load(
    os.path.join(best_checkpoint.path, "model.pth"),
    map_location="cpu",
)

num_class = 2
vocab_size = len(vocab)

# Rebuild the architecture with the best trial's hyper-parameters.
model = GRU_Model(vocab_size,
                      GLOVE_DIM,
                      best_result.config["hidden_size"],
                      best_result.config["num_layers"],
                      num_class,
                      best_result.config["dropout"],
                     )

# The restored model is used for inference only, so disable dropout.
model.eval()

model.load_state_dict(state_dict)

In [None]:
# Build a file name from today's date plus the last two path components of the
# text that precedes the first "=" in the best trial's result path.
uq_path = today + "_" + "_".join(best_result.path.split("=")[0].split("/")[-2:])
save_path = os.path.join('./models/bidirectional_gru_glove', uq_path + "_model.pt")

In [None]:
torch.save(model, save_path)

In [None]:
test_df = pd.read_excel('TestDataset.xlsx')
# .copy() makes an independent frame, so the dtype conversion below does not
# assign into a slice of the raw sheet (avoids pandas' SettingWithCopyWarning).
test_df = test_df[['E1','Text']].copy()
#Convert E1 column values to integers
test_df['E1'] = test_df['E1'].astype(int)
test_df.head()

In [None]:
from torchtext.data.functional import to_map_style_dataset

# Wrap the external test rows as a map-style dataset of (label, text) tuples.
test_iter = iter(list(test_df.itertuples(index=False, name=None)))
test_dataset = to_map_style_dataset(test_iter)

# Fixed order and batches of 64 for the final evaluation.
test_dataloader = DataLoader(
    test_dataset,
    collate_fn=collate_batch,
    shuffle=False,
    batch_size=64,
)

In [None]:
plot_confusion_matrix(model, test_dataloader)