# Process 

* get train and test data 
* create tokenizer from data 
* create dataset 
* create dataloader 
* load model 
* training 
* save model
* testing

In [1]:
import torch 
import matplotlib.pyplot as plt 
import pandas as pd
from csv_dataset import CSV_Dataset
from tokenizer import Tokenizer
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
from model import SimpleNN
from tqdm import tqdm 

# Prepare data

In [2]:
train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')

In [3]:
train_ds = CSV_Dataset(train_data)
test_ds = CSV_Dataset(test_data)

In [5]:
train_loader = DataLoader(train_ds, batch_size=8, shuffle=True, num_workers=4)
test_loader = DataLoader(test_ds, batch_size=8, shuffle=False)

In [6]:
batch = next(iter(train_loader))
print(type(batch))


<class 'list'>


In [7]:
print(len(batch))
batch

8


['in the executive board of the state railways wanted to unk the operation of restaurant carriages and the most important station restaurants in norway under one management the board stated that they wanted to minimise the conflict of interest between the railway company and the dining car operator they also saw unk operations as a way to unk a larger share of the revenue to the railway company and to ensure a high quality of service on new lines at that time the unk line and dovre line were in the planning stages and the nsb intended to introduce dining services on these when they opened oslo east station and its restaurant were operated by the private norwegian trunk railway in an agreement signed on september both railway companies agreed that a new restaurant operator would be controlled by the norwegian trunk railway but this company had to abide by the nsb s decision of how many restaurant carriages to operate on any line',
 'razor ray guitar soloist vocals background vocals',
 '

# Tokenizer

In [8]:
tokenizer = Tokenizer(train_data)

# Model

In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
training_config = {"vocab_size": tokenizer.get_vocab_size(), 
                "embedding_dim": 100, 
                "hidden_dim": 512, 
                "learning_rate": 1e-3,
                "epochs": 30,
                "batch_size": 8,
                "window_size": 5,
                "device": device}


In [10]:
model = SimpleNN(vocab_size=training_config["vocab_size"], 
                 embedding_dim=training_config["embedding_dim"], 
                 hidden_dim=training_config["hidden_dim"], 
                 window_size=training_config["window_size"])

# Training

In [35]:
def get_Loss_func():
    return torch.nn.CrossEntropyLoss()

def generate_data(batch: list[str], tokenizer= None, config: dict= None):
    data = tokenizer.encode(batch)
    inputs = []
    targets = [] 
    for sample in data:
        for idx in range(len(sample) - config["window_size"]):
            inputs.append(sample[idx:idx+config["window_size"]])
            targets.append(sample[idx+config["window_size"]])
    return torch.tensor(inputs, dtype= torch.LongStorage), torch.tensor(targets, dtype= torch.long)

def model_step(batch, model, loss_func, config: dict):
    _input, target = batch
    _input = _input.to(config['device'])
    target = target.to(config['device'])
    outputs = model(_input)
    loss = loss_func(outputs, target)
    
    del _input, target, outputs
    return loss

def optimizer_step(optimizer, loss):
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

def train_epoch(model, train_loader, loss_func, optimizer, tokenizer, config: dict, epoch):
    model.train()
    pbar = tqdm(train_loader)
    total, total_loss = 0, 0
    for batch in pbar:
        inputs, targets = generate_data(batch, tokenizer, config)
        for idx in range(0, len(inputs) , config["batch_size"]):
            x = inputs[idx:idx+config["batch_size"]]
            y = targets[idx:idx+config["batch_size"]]
            loss= model_step((x, y), model, loss_func, config= config)
            optimizer_step(optimizer, loss)

            total += len(y)
            total_loss += loss.item()
        pbar.set_description(f"epoch = {epoch}, train/loss = {total_loss/total:.3f}")
            
    return (total_loss / total)    #loss


def plot_training_history(train_loss_history):
    plt.figure(figsize=(8, 8))
    plt.plot(train_loss_history, label='Training Loss')
    plt.legend(loc='upper right')
    plt.ylabel('Cross Entropy Loss')
    plt.title('Training Loss')
    plt.xlabel('epoch')
    plt.show()

def train(model, train_loader, tokenizer = None, plot_res = True, config: dict = None):
    train_loss_history = []
    loss_func = get_Loss_func()
    optimizer = torch.optim.Adam(model.parameters(), config["learning_rate"])
    model.to(config['device'])
    for epoch in range(config["epochs"]):
        ## train loop
        train_loss = train_epoch(model, train_loader, loss_func, optimizer, tokenizer, config, epoch)
        train_loss_history.append(train_loss)
            
    if(plot_res):
        plot_training_history(train_loss_history)
    return model, train_loss_history

def save_weight(model, output_file):
    print("save model to", output_file)
    torch.save(model.state_dict(), output_file)
    


In [36]:
print(model)

SimpleNN(
  (embedding): Embedding(27159, 100)
  (relu1): LeakyReLU(negative_slope=0.15)
  (flaten): Flatten(start_dim=1, end_dim=-1)
  (fc1): Linear(in_features=500, out_features=512, bias=True)
  (relu2): LeakyReLU(negative_slope=0.15)
  (fc2): Linear(in_features=512, out_features=512, bias=True)
  (relu3): LeakyReLU(negative_slope=0.15)
  (fc3): Linear(in_features=512, out_features=27159, bias=True)
)


In [37]:
print(type(training_config['learning_rate']))

<class 'float'>


In [40]:
#calculate total params 
total_params = sum(p.numel() for p in model.parameters())
total_params

17167635

In [38]:
model, training_history = train(model = model, train_loader= train_loader, tokenizer = tokenizer, config= training_config)



epoch = 0, train/loss = 1.746:   3%|▎         | 86/3067 [00:57<32:57,  1.51it/s] 


KeyboardInterrupt: 

In [None]:
def evaluate(model, test_loader, tokenizer, config):
    loss_func = get_Loss_func()
    model.eval()
    total, total_loss = 0, 0
    with torch.no_grad():
        for batch in test_loader:
            inputs, targets = generate_data(batch, tokenizer, config)
        for idx in range(0, len(inputs) , config["batch_size"]):
            x = inputs[idx:idx+config["batch_size"]]
            y = targets[idx:idx+config["batch_size"]]
            loss= model_step((x, y), model, loss_func, config= config)
            
            total += 1
            total_loss += loss.item()
    res_loss = total_loss / total
    perplexity = np.exp(res_loss)
    return perplexity, res_loss

In [None]:
def predict(net: nn.Module, tokenizer, text: str, window_size: int):
    text_id = tokenizer.encode(text)
    generated_text = [] 
    if len(text_id) < window_size:
        text_id = [[tokenizer.encode('<pad>')]] * (window_size - len(text_id)) + text_id
    for i in range(100):
        inputs = text_id + generated_text
        inputs = inputs[-window_size:]
        inputs = torch.tensor(inputs, dtype= torch.long, device= training_config['device']).unsqueeze(0)
        prob = net(inputs)
        prob = torch.softmax(prob, dim=1)
        next_word_id = torch.argmax(prob).item()
        generated_text.append(next_word_id)
        
    return tokenizer.decode(generated_text)