In [1]:
# # for google colab
# from google.colab import drive
# drive.mount('/content/gdrive/')

In [2]:
import os
# On Google Colab, uncomment the next line to work from the mounted Drive folder.
# os.chdir('gdrive/My Drive/AI')
# Display the current working directory (last expression of the cell).
os.getcwd()

'/Users/johncalab/Dropbox/gitstuff/deepShowerThoughts/ai'

In [3]:
from charvocabulary import charVocabulary
from charvectorizer import charVectorizer
from chardataset import charDataset
from charmodel import charModel
from charsample import gen_samp

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import tqdm
import pickle

In [4]:
# Training data: a CSV file inside the training_data/ folder.
DATASET = 'may15nov17_above130_less100_light.csv'
csv_path = os.path.join('training_data', DATASET)

# All artifacts of this run (vocabulary, weights, losses, params) live under
# `rootpath`. exist_ok=True makes the cell safely re-runnable and, unlike a
# lookup in os.listdir(), also works when rootpath has several components.
rootpath = 'oren'
os.makedirs(rootpath, exist_ok=True)

dict_path = os.path.join(rootpath, 'dict.pkl')          # pickled token -> index mapping
model_path = os.path.join(rootpath, 'model.pt')         # model state_dict
losses_path = os.path.join(rootpath, 'losses.txt')      # one training loss per line
bestloss_path = os.path.join(rootpath, 'bestloss.txt')  # best average epoch loss so far
params_path = os.path.join(rootpath, 'params.pkl')      # pickled model hyperparameters

# Run switches.
RESUME = True    # reload saved artifacts from rootpath instead of starting fresh
NUM_EPOCHS = 2   # number of epochs to train in this run
CUDA = True      # request the GPU (only used when one is actually available)

In [5]:
# Post titles as unicode strings; this is the text the model is trained on.
posts = pd.read_csv(csv_path)['title'].astype('U')

if RESUME:
    # Reload the vocabulary and hyperparameters saved by a previous run so
    # they match the stored weights.
    # NOTE: pickle.load can execute arbitrary code; only load files that
    # were written by this notebook.
    with open(dict_path, 'rb') as f:
        token_to_idx = pickle.load(f)
    vocab = charVocabulary(token_to_idx=token_to_idx)

    with open(params_path, 'rb') as f:
        params = pickle.load(f)
else:
    # Fresh run: build the vocabulary from the posts and persist it.
    vocab = charVocabulary()
    vocab.add_series(df=posts)
    with open(dict_path, 'wb') as f:
        pickle.dump(vocab.token_to_idx, f)

    # Model hyperparameters; passed to charModel as keyword arguments.
    params = {
        'vocab_size': len(vocab),
        'embedding_dim': 128,
        'rnn_hidden_dim': 512,
        'num_layers': 2,
        'dropout_p': 0.5,
        'bidirectional': False,
    }
    with open(params_path, 'wb') as f:
        pickle.dump(params, f)


# Optimisation settings (not persisted with the model hyperparameters).
BATCH_SIZE = 64
LEARNING_RATE = 1e-3

In [6]:
import logging

# Send INFO-and-above records to a log file inside the run folder
# (filemode='w' starts the file afresh).
logpath = os.path.join(rootpath, 'logbook.log')
logging.basicConfig(
    filename=logpath,
    filemode='w',
    level=logging.INFO,
)

# --- model / optimisation hyperparameters ---
logging.info(f"Vocab size is {params['vocab_size']}.")
logging.info(f"Embedding dim is {params['embedding_dim']}.")
logging.info(f"RNN hidden dim is {params['rnn_hidden_dim']}.")
logging.info(f"I am using {params['num_layers']} RNN layers.")
if params['bidirectional']:
    logging.info("RNN unit is bidirectional.")
logging.info(f"Dropout is {params['dropout_p']}.")
logging.info(f"Batch size is {BATCH_SIZE}.")

# --- where everything lives ---
logging.info(f"Folder is {rootpath}.")
logging.info(f"Training on {csv_path}.")
logging.info(f"Dictionary is in {dict_path}.")
logging.info(f"Model is in {model_path}.")
logging.info(f"Loss info is in {losses_path}.")
logging.info(f"Best loss is in {bestloss_path}.")

# --- run mode ---
logging.info(f"I aspire to train for {NUM_EPOCHS} epochs.")
logging.info("I am resuming training." if RESUME else "I am training from scratch.")
if CUDA:
    logging.info("I will try to use CUDA.")

In [7]:
# Use the GPU only when it was requested AND is actually available.
device = 'cuda' if CUDA and torch.cuda.is_available() else 'cpu'
t_device = torch.device(device)

# Report the choice both in the log and in the cell output.
s = f"I am using {device}."
logging.info(s)
print(s)

I am using cpu.


In [8]:
# Index of the mask token in the vocabulary.
maskid = vocab.mask_idx
vectorizer = charVectorizer(vocab=vocab)

# Build the network from the stored hyperparameters.
model = charModel(**params)
if RESUME:
    # Restore previously saved weights, mapped onto the selected device.
    model.load_state_dict(
        torch.load(model_path, map_location=device)
    )

# Module.to() returns the module itself, so rebinding keeps the same object;
# the bare name on the last line displays the architecture as the cell output.
model = model.to(device)
model

charModel(
  (emb): Embedding(136, 128, padding_idx=0)
  (rnn): GRU(128, 512, num_layers=2, batch_first=True, dropout=0.5)
  (fc): Linear(in_features=512, out_features=136, bias=True)
)

In [9]:
# Dataset and batch loader over the posts.
ds = charDataset(vectorizer=vectorizer, posts=posts)
# NOTE(review): shuffle=False serves batches in the same order every epoch;
# confirm this is intended for training.
dl = DataLoader(ds, batch_size=BATCH_SIZE, shuffle=False)
# optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE)#, momentum=0.1)
# Pass the configured learning rate explicitly. It was previously ignored and
# Adam silently used its own default, so editing LEARNING_RATE had no effect.
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [None]:
# OK let's start training ----------------------------------------------------------
# For each epoch: run one pass over `dl`, append every batch loss to
# `losses_path`, save the weights whenever the mean epoch loss beats the best
# loss seen so far, then print a few generated samples.

# Bound before the try block so the except handler can always report it,
# even when the failure happens before the first epoch starts.
epochs_completed = 0

try:
    if RESUME:
        # Continue from the best loss recorded by a previous run.
        with open(bestloss_path, 'r') as f:
            bestloss = float(f.readline())
    else:
        bestloss = float('inf')

    for epoch in range(NUM_EPOCHS):
        ### training ----------
        model.train()

        batch_losses = []

        # Open the loss file once per epoch rather than once per batch.
        with open(losses_path, 'a') as losses_file:
            with tqdm.tqdm(total=len(dl)) as progress_bar:
                for x, y in dl:

                    optimizer.zero_grad()

                    x = x.to(device)
                    y = y.to(device)

                    y_pred = model(x)

                    # Collapse the first two dimensions of the prediction and
                    # flatten the targets so cross_entropy gets one row per target.
                    batch_size, seq_len, feats = y_pred.shape
                    y_pred_loss = y_pred.view(batch_size*seq_len, feats)
                    y_loss = y.view(-1)

                    # Targets equal to the mask index are excluded from the loss.
                    loss = F.cross_entropy(y_pred_loss, y_loss, ignore_index=maskid)
                    loss.backward()
                    optimizer.step()

                    # Read the scalar once and reuse it.
                    loss_value = loss.item()
                    batch_losses.append(loss_value)
                    losses_file.write('\n' + str(loss_value))
                    # Flush so the file can still be followed while training runs.
                    losses_file.flush()

                    progress_bar.update(1)

        # Plain Python float, so it is written to disk in a form float() can parse.
        avgloss = float(np.mean(batch_losses))

        # if we'll want to use a train/test split
        # model should be updated with VALIDATION losses, not training

        if avgloss < bestloss:
            bestloss = avgloss
            logging.info("Loss improved! I am saving this model.")
            torch.save(model.state_dict(), model_path)
            with open(bestloss_path, 'w') as f:
                f.write(str(bestloss))

        ### 'validating' -------------------
        # Generate a few samples on the CPU; gradients are not needed here.
        model.eval()
        model.to('cpu')
        print('\n')
        with torch.no_grad():
            for i in range(5):
                print(gen_samp(model=model,vocab=vocab,prompt=""))

        model.to(device)

        print(f"\nEpoch number {epoch+1} has concluded. The mean average loss was {avgloss}.\n")

        epochs_completed = epoch + 1

except Exception:
    # Best effort: record how far training got plus the full traceback, and
    # tell the notebook user that the run did not finish.
    logging.info(f"Model trained for {epochs_completed} full epochs.")
    logging.error("Something went wrong", exc_info=True)
    print("Training stopped early because of an error; see the log for the traceback.")

#logger.info("I'll save the latest model.")

#torch.save(model.state_dict(), model_path)

  6%|▋         | 21/334 [00:20<04:57,  1.05it/s]