In [1]:
import torch
import pandas as pd
from torch import nn
import pytorch_lightning as pl
import torch.utils.data as data_utils
import numpy as np
from os import cpu_count
from types import NoneType
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import math

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import wandb
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor, ModelCheckpoint, LearningRateFinder
from pytorch_lightning.loggers import WandbLogger
from gensim.models import Word2Vec

In [3]:
torch.set_float32_matmul_precision("medium")

In [4]:
df=pd.read_parquet('./data/books.par')
df.columns

Index(['title', 'genre', 'summary', 'input_ids', 'att_mask', 'label',
       'mapped_inputs'],
      dtype='object')

# tweets dataset

In [6]:
df=pd.read_csv('./data/tweets.csv', names=['index','genre','summary']).drop(columns='index')

In [7]:
from transformers import GPT2Tokenizer, AutoTokenizer
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2")
gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token 
tokenized_summary=gpt2_tokenizer(df.summary.map(lambda x: x.strip().lower()).tolist(),
                    return_tensors='pt',
                    padding=True)
tokenized_summary['input_ids'].shape

torch.Size([149985, 835])

In [8]:
tokenized_summary['input_ids'][0,0].item()

38690

In [9]:
tokens_dict={}
for i, t in enumerate(tokenized_summary['input_ids'].unique()):
    tokens_dict[t.item()]=i

KeyboardInterrupt: 

In [None]:
tokenized_summary['input_ids']=tokenized_summary['input_ids'].apply_(lambda x: tokens_dict[x])

In [None]:
tokenized_summary['input_ids'][0,0].item()

21301

In [None]:
tokenized_summary['attention_mask'].sum(axis=1).shape

torch.Size([149985])

In [None]:
df.genre.unique()

array([0, 1])

In [None]:
class TweetDataset(data_utils.Dataset):
    def __init__(self, tokenized_summary, labels, idxs):
        self.input_ids=tokenized_summary['input_ids'][idxs]
        self.lengths=tokenized_summary['attention_mask'].sum(axis=1)[idxs]
        self.y=torch.tensor(labels, dtype=torch.uint8)[idxs]
        self.no_classes=2
        self.max_len=self.input_ids.shape[1]

    def __len__(self):
        return self.y.shape[0]
    
    def __getitem__(self, indexes):
        return self.input_ids[indexes], self.lengths[indexes], self.y[indexes]
        

In [None]:
train_idxs=torch.rand(df.shape[0])>0.8
val_idxs=~train_idxs

In [None]:
train_data=TweetDataset(tokenized_summary, df.genre.values, train_idxs)
val_data=TweetDataset(tokenized_summary, df.genre.values, val_idxs)

train_dataloader=data_utils.DataLoader(train_data, batch_size=32, num_workers=cpu_count(),
                                       shuffle=True, drop_last=True)
val_dataloader=data_utils.DataLoader(val_data, batch_size=32, num_workers=cpu_count())

# dataset

In [5]:
class BookDataset(data_utils.Dataset):
    def __init__(self, df: pd.DataFrame):
        self.input_ids=torch.tensor(df.mapped_inputs, dtype=torch.int32)
        self.lengths=torch.tensor(df.att_mask.map(sum), dtype=torch.int32)
      #  self.att_masks=torch.tensor(df.att_mask, dtype=torch.float32)
        self.y=torch.tensor(df.label.values, dtype=torch.uint8)
        self.no_classes=df.label.nunique()
        self.max_len=self.input_ids.shape[1]

    def __len__(self):
        return self.y.shape[0]
    
    def __getitem__(self, indexes):
       # batch_max_len= torch.max(self.lengths[indexes])
        return self.input_ids[indexes], self.lengths[indexes], self.y[indexes]
        

In [6]:
class DataSampler(data_utils.Sampler):
    def __init__(self, dataset, batch_size, shuffle=False, drop_last=False):
        self.dataset = dataset
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.drop_last = drop_last

    def __len__(self):
        return self.dataset.length

    def __iter__(self):
        order = np.arange(len(self))
        if self.shuffle:
            np.random.shuffle(order)
        if len(self) % self.batch_size:
            for i in range(0, len(self) - self.batch_size, self.batch_size):
                yield order[i : i + self.batch_size]
            if not self.drop_last:
                yield order[-(len(self) % self.batch_size) :]
        else:
            for i in range(0, len(self), self.batch_size):
                yield order[i : i + self.batch_size]


In [7]:
class PackedDataset(data_utils.Dataset):
    def __init__(self, df: pd.DataFrame):
        self.input_ids=torch.tensor(df.mapped_inputs, dtype=torch.int32)
        self.lengths=torch.tensor(df.att_mask.map(sum), dtype=torch.int32)
        self.y=torch.tensor(df.label.values, dtype=torch.uint8)
        self.no_classes=df.label.nunique()
        self.max_len=self.input_ids.shape[1]

    def __len__(self):
        return self.y.shape[0]
    
    def __getitem__(self, indexes):
       
        return pack_padded_sequence(self.input_ids[indexes].view(-1, self.max_len), 
                                    self.lengths[indexes].view(-1), 
                                    batch_first=True, enforce_sorted=False), self.y[indexes]
        

In [8]:
def simple_elementwise_apply(fn, packed_sequence):
    """applies a pointwise function fn to each element in packed_sequence"""
    return torch.nn.utils.rnn.PackedSequence(fn(packed_sequence.data), packed_sequence.batch_sizes)

In [9]:
#pack_padded_sequence(emb(b[0]), b[1], batch_first=True, enforce_sorted=False)

In [10]:
#simple_elementwise_apply(emb, val_data[:32][0])

In [6]:
df=df.iloc[np.random.permutation(df.shape[0])].reset_index(drop=True)
split=int(df.shape[0]*0.9)

train_df=df.iloc[:split]
val_df=df.iloc[split:].reset_index(drop=True)

In [7]:
_,label_weights=np.unique(train_df.label, return_counts=True)
label_weights=1/label_weights
label_weights=label_weights/np.sum(label_weights)
label_weights

array([0.02315746, 0.03168706, 0.0408208 , 0.03396712, 0.03434735,
       0.01992444, 0.20685595, 0.18596141, 0.20920658, 0.21407185])

In [8]:
train_data=BookDataset(train_df)
val_data=BookDataset(val_df)

train_dataloader=data_utils.DataLoader(train_data, batch_size=32, num_workers=cpu_count(),
                                       shuffle=True, drop_last=True)
val_dataloader=data_utils.DataLoader(val_data, batch_size=32, num_workers=cpu_count())

  self.input_ids=torch.tensor(df.mapped_inputs, dtype=torch.int32)


# pl module

In [9]:
class BookGenreClassifier(pl.LightningModule):
    def __init__(self, model, lr=1e-2, loss=nn.CrossEntropyLoss(), l2=1e-5, lr_dc_step=3, lr_dc=0.1, weight_init='normal', **kwargs):
        super().__init__()
        self.save_hyperparameters(ignore=['model','loss'])
        if isinstance(loss.weight, NoneType):
            weighted_loss=False
        else: weighted_loss=True
        self.save_hyperparameters({'name':model.name, 
                                   'hid_dim':model.hid_dim,
                                   'dropout_p':model.dropout_p,
                                   'w2v_init':model.w2v_init,
                                   'num_layers':model.num_layers,
                                   'nonlinearity':model.nonlinearity,
                                   'bidirectional':model.bidirectional,
                                   'weighted_loss':weighted_loss})
        self.lr=lr
        self.loss=loss
        self.model=model
        if weight_init!='normal':
            self.reset_parameters(weight_init)

    def reset_parameters(self, weight_init):
        if weight_init == "uniform":
            stdv = 1.0 / math.sqrt(self.hidden_size)
            for weight in self.parameters():
                nn.init.uniform_(weight, -stdv, stdv)
        elif weight_init == "normal":
            for weight in self.parameters():
                nn.init.normal_(weight, 0, 0.1)
        elif weight_init == "xavier_normal":
            for weight in self.parameters():
                if len(weight.shape) < 2:
                    nn.init.normal_(weight, 0, 0.1)
                else:
                    nn.init.xavier_normal_(weight)
        else:
            raise ValueError(
                f"Weight initialization of type {weight_init} not implemented!"
            )
        
    def forward(self, x):
        return self.model(x)

    def training_step(self, batch, batch_idx):
        x = batch[:-1]
        y = batch[-1]
        logits=self(x)
        loss=self.loss(logits, y)

        self.log('train_loss', loss, prog_bar=True)
        return loss
    
    def evaluate(self, batch, mode=None):
        x = batch[:-1]
        y = batch[-1]
        logits=self(x)

        loss=self.loss(logits, y)

        preds=torch.argmax(logits, axis=1)
        acc=torch.sum(preds==y)/y.shape[0]
        # TODO add more metrics

        if mode:
            self.log(mode+'_loss', loss,  prog_bar=True)
            self.log(mode+'_acc', 100*acc,  prog_bar=True)

    def validation_step(self, batch, *args, **kwargs):
        return self.evaluate(batch, "val")
    def test_step(self, batch, *args, **kwargs):
        return self.evaluate(batch, "test")
    
    def configure_optimizers(self):
        optimizer = torch.optim.Adam(
            self.model.parameters(), lr=self.lr, weight_decay=self.hparams.l2
        )
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer,
            patience=self.hparams.lr_dc_step,
            factor=self.hparams.lr_dc,
            cooldown=1,
        )
        return {
            "optimizer": optimizer,
            "lr_scheduler": {
                "scheduler": scheduler,
                "monitor": "val_acc",
                "strict": False,
                "interval": "epoch",
                "frequency": 1,
                "name": "scheduler_lr",
            },
        }

# different models

## dummy example

In [10]:
class DummyModel(nn.Module): 
    def __init__(self, in_dim=7031, hid_dim=128, out_dim=10):
        # dummy model as an example, just one hidden layer straight from list of tokens
        super().__init__()
        self.name='DummyModel'
        self.dropout_p=0
        self.l1=nn.Linear(in_dim, hid_dim)
        self.nonlinear=nn.Tanh()
        self.l2=nn.Linear(hid_dim, out_dim)

    def forward(self, x):
        in_ids, att_mask = x
        x=in_ids*att_mask
        x=self.l1(x)
        x=self.nonlinear(x)
        return self.l2(x)

### train

In [11]:
dm_model=BookGenreClassifier(DummyModel(), loss=nn.CrossEntropyLoss(weight=torch.tensor(label_weights, dtype=torch.float32)))

In [12]:
trainer=pl.Trainer(max_epochs=10)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/szymon/pythonvenvs/rocmwork/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:75: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default


In [13]:
trainer.fit(dm_model, train_dataloader, val_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type             | Params | Mode 
---------------------------------------------------
0 | loss  | CrossEntropyLoss | 0      | train
1 | model | DummyModel       | 901 K  | train
---------------------------------------------------
901 K     Trainable params
0         Non-trainable params
901 K     Total params
3.606     Total estimated model params size (MB)


Sanity Checking: |                                                            | 0/? [00:00<?, ?it/s]

Training: |                                                                   | 0/? [00:00<?, ?it/s]

Validation: |                                                                 | 0/? [00:00<?, ?it/s]

Validation: |                                                                 | 0/? [00:00<?, ?it/s]

Validation: |                                                                 | 0/? [00:00<?, ?it/s]

Validation: |                                                                 | 0/? [00:00<?, ?it/s]

Validation: |                                                                 | 0/? [00:00<?, ?it/s]

Validation: |                                                                 | 0/? [00:00<?, ?it/s]

Validation: |                                                                 | 0/? [00:00<?, ?it/s]

Validation: |                                                                 | 0/? [00:00<?, ?it/s]

Validation: |                                                                 | 0/? [00:00<?, ?it/s]

Validation: |                                                                 | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=10` reached.


# global variables

In [10]:
train_data[0]

(tensor([ 3602,  6904,   173,  ..., 26305, 26305, 26305], dtype=torch.int32),
 tensor(1074, dtype=torch.int32),
 tensor(0, dtype=torch.uint8))

In [11]:
# for tweets
vocab_size=tokenized_summary['input_ids'].max()+1
bookwords = []
#for s in df.summary:
#    bookwords.append(list(tokenize(s, lowercase=True)))
for s, a in zip(tokenized_summary['input_ids'], tokenized_summary['attention_mask']):
    bookwords.append([str(i.item()) for i in s[a.to(bool)]]+[str(s[-1].item())])
print(bookwords[:1])


NameError: name 'tokenized_summary' is not defined

In [11]:
tokens_set=set()
for tokens in df.mapped_inputs.values:
    tokens_set=tokens_set.union(set(tokens))
vocab_size=len(tokens_set)
vocab_size

26306

In [12]:
bookwords = []
#for s in df.summary:
#    bookwords.append(list(tokenize(s, lowercase=True)))
for s, a in zip(df.mapped_inputs, df.att_mask):
    bookwords.append([str(i) for i in s[a.astype(bool)]]+[str(s[-1])])
print(bookwords[:1])

[['3602', '6904', '173', '6462', '592', '170', '1346', '527', '175', '1048', '11', '10380', '2707', '16953', '211', '151', '10878', '175', '151', '1176', '336', '447', '5583', '179', '203', '7293', '13', '222', '7410', '467', '213', '146', '1304', '179', '3605', '840', '246', '981', '9959', '165', '395', '1', '723', '222', '7677', '293', '333', '14305', '342', '2403', '1013', '9877', '13', '562', '6008', '173', '2638', '479', '12', '47', '270', '42', '179', '8048', '151', '1508', '232', '151', '3083', '175', '151', '4496', '11', '530', '203', '1599', '204', '1006', '3495', '173', '13080', '151', '9959', '165', '395', '11', '146', '10491', '175', '170', '4753', '14805', '175', '24600', '205', '11', '4920', '209', '172', '151', '1503', '4180', '13', '4920', '209', '172', '221', '4531', '336', '447', '13778', '176', '151', '10491', '179', '4753', '2772', '264', '255', '527', '173', '1636', '173', '1247', '539', '13', '10417', '11', '1656', '151', '145', '1099', '276', '11', '204', '151', 

In [13]:
del df
del train_df
del val_df

# w2v

In [15]:
class Word2VecSimple(nn.Module):
    def __init__(self, hid_dim, out_dim, vocab_size, dropout_p=0.5):
        super().__init__()
        self.name='Word2VecSimple'
        self.dropout_p=dropout_p
        self.nonlinearity='none'
        self.w2v_init=True
        self.num_layers=0
        self.bidirectional=False
        self.hid_dim=hid_dim

        w2vmodel = Word2Vec(bookwords, vector_size=hid_dim, min_count=0)
        self.emb = nn.Embedding(vocab_size, hid_dim, padding_idx=-1)

        emb_lst = []
        for v in range(vocab_size):
            emb_lst.append(w2vmodel.wv[str(v)])
        
        emb_mat = np.array(emb_lst)
        self.emb.load_state_dict({'weight': torch.from_numpy(emb_mat)})
        # load embeddings from pretrained word2vec
        #self.emb=nn.Embedding(vocab_size, hid_dim, padding_idx=-1)
        self.out_layer=nn.Linear(hid_dim, out_dim)
        #self.out_layer=nn.Sequential(nn.Dropout(p=dropout_p),
         #                            nn.Linear(hid_dim, hid_dim),
          #                           nn.Tanh(),
           #                          nn.Dropout(p=0.2),
            #                         nn.Linear(hid_dim, out_dim)
             #                        )

    def forward(self, x):
        inputs, lengths = x
        batch_max_len=torch.max(lengths)
        w2v_output=self.emb(inputs[:,:batch_max_len])
        #avg_output = w2v_output.mean(dim=1)
        avg_output = torch.stack([w2v_output[i, :lengths[i]].mean(dim=0) for i in range(lengths.shape[0])])
       # print(f"w2v_output:{w2v_output.shape}")
        #print(f"avg_output:{avg_output.shape}")
        return self.out_layer(avg_output)

In [16]:
w2v_simple_model = BookGenreClassifier(Word2VecSimple(64, 10, vocab_size=vocab_size), 
                             loss=nn.CrossEntropyLoss(),#weight=torch.tensor(label_weights, dtype=torch.float32)),
                             lr_dc=0.1,
                             lr_dc_step=4,
                             weight_init='xavier_normal',
                             )

In [17]:
wandb_logger = WandbLogger(
        project="ecoNLP", entity="kpuchalskixiv", log_model=False
    )

trainer=pl.Trainer(max_epochs=50,
                   callbacks=[
            EarlyStopping(
                monitor="val_acc", patience=10, mode="max", check_finite=True, check_on_train_epoch_end=False
            ),
            LearningRateMonitor(),
            ModelCheckpoint(monitor="val_acc", mode="max"),
            LearningRateFinder( num_training_steps=200)
            ],
            logger=wandb_logger,
        )

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [18]:
trainer.fit(w2v_simple_model, train_dataloader, val_dataloader)
wandb.finish()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mkpuchalskixiv[0m. Use [1m`wandb login --relogin`[0m to force relogin


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Finding best initial lr:  98%|█████████▊| 196/200 [00:03<00:00, 58.28it/s]`Trainer.fit` stopped: `max_steps=200` reached.
Finding best initial lr: 100%|██████████| 200/200 [00:03<00:00, 53.50it/s]
Learning rate set to 0.10964781961431852
Restoring states from the checkpoint path at /home/kacper/ecoNLP/.lr_find_75491b96-5ed3-4913-8034-cb5532c1e2e6.ckpt
Restored all states from the checkpoint at /home/kacper/ecoNLP/.lr_find_75491b96-5ed3-4913-8034-cb5532c1e2e6.ckpt

  | Name  | Type             | Params
-------------------------------------------
0 | loss  | CrossEntropyLoss | 0     
1 | model | Word2VecSimple   | 1.7 M 
-------------------------------------------
1.7 M     Trainable params
0         Non-trainable params
1.7 M     Total params
6.737     Total estimated model params size (MB)
Restored all states from the checkpoint at /home/kacper/ecoNLP/.lr_find_75491b96-5ed3-4913-8034-cb5532c1e2e6.ckpt


Epoch 1:   0%|          | 0/130 [00:00<?, ?it/s, v_num=olfa, train_loss=1.950]         

/home/kacper/anaconda3/envs/gpu_torch/lib/python3.12/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:382: `ModelCheckpoint(monitor='val_acc')` could not find the monitored key in the returned metrics: ['scheduler_lr', 'train_loss', 'epoch', 'step']. HINT: Did you call `log('val_acc', value)` in the `LightningModule`?
/home/kacper/anaconda3/envs/gpu_torch/lib/python3.12/site-packages/pytorch_lightning/loops/training_epoch_loop.py:381: ReduceLROnPlateau conditioned on metric val_acc which is not available but strict is set to `False`. Skipping learning rate update.


Epoch 15: 100%|██████████| 130/130 [00:02<00:00, 46.80it/s, v_num=olfa, train_loss=0.113, val_loss=1.020, val_acc=68.50] 




0,1
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇█████
scheduler_lr,███████▂▂▂▂▂▂▁▁▁
train_loss,█▇▆▄▃▃▂▃▃▂▂▂▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁
trainer/global_step,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
val_acc,▁▄▇▇█▆▆▇▇▆▇▆▆▆▇
val_loss,█▂▁▁▃▃▃▃▃▃▄▄▄▄▄

0,1
epoch,15.0
scheduler_lr,0.0001
train_loss,0.10797
trainer/global_step,2009.0
val_acc,68.45493
val_loss,1.02262


# RNN

In [45]:
class SimpleRNN(nn.Module):
    def __init__(self, hid_dim, out_dim, vocab_size, dropout_p=0, nonlinearity='tanh', w2v_init=False,
                 num_layers=1, bidirectional=False):
        super().__init__()
        self.name='SimpleRNN'
        self.dropout_p=dropout_p
        self.nonlinearity=nonlinearity
        self.w2v_init=w2v_init
        self.num_layers=num_layers
        self.bidirectional=bidirectional
        self.bimult=1+bidirectional
        # last token states 'end of string' and is repeated multiple time at the end of an input
        # therefore set its embedding to 0 with padding_idx=-1
        self.emb=nn.Embedding(vocab_size, hid_dim, padding_idx=-1) 
        if w2v_init:
            w2vmodel = Word2Vec(bookwords, vector_size=hid_dim, min_count=0)
            emb_lst = []
            for v in range(vocab_size):
                emb_lst.append(w2vmodel.wv[str(v)])
            emb_mat = np.array(emb_lst)
            self.emb.load_state_dict({'weight': torch.from_numpy(emb_mat)})
            
        self.model=nn.RNN(input_size=hid_dim, 
                          hidden_size=hid_dim, 
                          batch_first=True, 
                          dropout=dropout_p,
                          bidirectional=True,
                          nonlinearity=nonlinearity)
        self.out_layer= nn.Linear(self.bimult*num_layers*hid_dim+hid_dim, out_dim),
        #nn.Sequential(nn.Dropout(p=dropout_p),
         #                            nn.Linear(hid_dim, hid_dim),
          #                           nn.LeakyReLU(),
           #                          nn.Dropout(p=0.2),
            #                         nn.Linear(hid_dim, out_dim)
             #                        )

    def forward(self, x):
        inputs, lengths=x
        batch_max_len=torch.max(lengths)
        rnn_input=self.emb(inputs[:,:batch_max_len])

     #   avg_emb = torch.stack([rnn_input[i, :lengths[i]].mean(dim=0) for i in range(lengths.shape[0])])

        h0 = torch.randn(self.bimult*self.num_layers, rnn_input.shape[0], rnn_input.shape[-1], device=rnn_input.device)

        rnn_input=pack_padded_sequence(rnn_input,lengths.to('cpu').to(int), batch_first=True, enforce_sorted=False)
        hstates, hn = self.model(rnn_input, h0)
        padded_hstates, lengths=[x.to(inputs.device) for x in pad_packed_sequence(hstates, batch_first=True)]
        hstates_avg=padded_hstates.sum(dim=1).div(lengths.float().unsqueeze(dim=1))
        #return hstates
       # print(h_states.shape)
    #    hn[0]=torch.stack([hstates[e, int(i)-1] for e,i in enumerate(lengths)])
        hn=hn.view(inputs.shape[0], -1)
       # hn=hn.squeeze()
       #return self.out_layer(hn)
        return self.out_layer(torch.concat([hstates_avg, hn], dim=1))

In [46]:
rnn_model=BookGenreClassifier(SimpleRNN(32, 10, vocab_size=vocab_size,
                                        nonlinearity='tanh',
                                        dropout_p=0.5,
                                        w2v_init=True,
                                        ), 
                             loss=nn.CrossEntropyLoss(),#weight=torch.tensor(label_weights, dtype=torch.float32),
                             lr=1e-3,
                             lr_dc=0.5,
                             lr_dc_step=4,
                             )



In [47]:
wandb_logger = WandbLogger(
        project="ecoNLP", entity="kpuchalskixiv", log_model=True
    )

In [48]:
trainer=pl.Trainer(max_epochs=20,
                   callbacks=[
            EarlyStopping(
                monitor="val_acc", patience=10, mode="max", check_finite=True, check_on_train_epoch_end=False
            ),
            LearningRateMonitor(),
            ModelCheckpoint(monitor="val_acc", mode="max"),
         #   LearningRateFinder(min_lr=1e-4, num_training_steps=1000)
            ],
            logger=wandb_logger,
        )

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [49]:
trainer.fit(rnn_model, train_dataloader, val_dataloader)
wandb.finish()

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type             | Params
-------------------------------------------
0 | loss  | CrossEntropyLoss | 0     
1 | model | SimpleRNN        | 1.2 M 
-------------------------------------------
1.2 M     Trainable params
0         Non-trainable params
1.2 M     Total params
4.660     Total estimated model params size (MB)


Epoch 19: 100%|██████████| 130/130 [00:05<00:00, 22.49it/s, v_num=uo4k, train_loss=0.897, val_loss=2.350, val_acc=25.30]

`Trainer.fit` stopped: `max_epochs=20` reached.


Epoch 19: 100%|██████████| 130/130 [00:05<00:00, 22.48it/s, v_num=uo4k, train_loss=0.897, val_loss=2.350, val_acc=25.30]




0,1
epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇████
scheduler_lr,████████████▃▃▃▃▃▃▁▁
train_loss,█▇▇█▇█▇▇▇▇▆▇▆▆▆▇▆▆▅▅▃▅▄▄▄▃▃▃▃▄▄▂▃▂▁▁▂▂▂▂
trainer/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇███
val_acc,▁▃▄▁▃▄▅▅▆▆▄▆▅▆█▇▆▄▆▄
val_loss,▂▁▁▁▁▁▁▁▁▂▃▃▄▅▆▆▇▇██

0,1
epoch,19.0
scheduler_lr,0.00025
train_loss,0.89707
trainer/global_step,2599.0
val_acc,25.32189
val_loss,2.35457


In [24]:
wandb.finish()



0,1
epoch,▁▁▁▂▂▂▃▃▃▄▄▄▅▅▆▆▆▁▁▂▂▂▃▃▃▃▄▄▅▅▅▆▆▆▆▇▇███
scheduler_lr,███████▄▄▄▄▄▄▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁
train_loss,█▇▇███▇▇▆▆▆▆▆▅▅▄▅█▇███▇▆▅▇▄▄▃▄▃▂▂▂▂▂▁▁▂▁
trainer/global_step,▁▁▂▂▂▂▃▃▃▄▄▄▅▅▅▆▆▁▁▂▂▂▂▃▃▄▄▄▄▅▅▅▆▆▆▇▇▇██
val_acc,▂▄▄▂▃▃▁▂▃▄▃▄▃▂▆▄▄▆▇▆█▅▆▆▄▅▅▄▄▃▃
val_loss,▁▁▁▁▁▁▂▂▂▃▃▃▄▁▁▁▁▁▁▁▂▃▄▄▅▆▆▆▇▇█

0,1
epoch,17.0
scheduler_lr,0.00025
train_loss,0.14059
trainer/global_step,2339.0
val_acc,22.103
val_loss,4.35297


# LSTM

In [33]:
class SimpleLSTM(nn.Module):
    def __init__(self, hid_dim, out_dim, vocab_size, dropout_p=0, nonlinearity='none', w2v_init=False, num_layers=1, bidirectional=False):
        super().__init__()
        self.name='SimpleLSTM'
        self.dropout_p=dropout_p
        self.nonlinearity=nonlinearity
        self.w2v_init=w2v_init
        self.num_layers=num_layers
        self.bidirectional=bidirectional
        self.bimult=1+bidirectional
        self.hid_dim=hid_dim
        # last token states 'end of string' and is repeated multiple time at the end of an input
        # therefore set its embedding to 0 with padding_idx=-1
        self.emb=nn.Embedding(vocab_size, hid_dim, padding_idx=-1) 
        if w2v_init:
            w2vmodel = Word2Vec(bookwords, vector_size=hid_dim, min_count=0)
            emb_lst = []
            for v in range(vocab_size):
                emb_lst.append(w2vmodel.wv[str(v)])
            emb_mat = np.array(emb_lst)
            self.emb.load_state_dict({'weight': torch.from_numpy(emb_mat)})

        self.model=nn.LSTM(input_size=hid_dim, 
                           hidden_size=hid_dim, 
                           batch_first=True, 
                           dropout=dropout_p, 
                           num_layers=num_layers,
                           bidirectional=bidirectional)
      #  self.out_layer=nn.Linear(self.bimult*num_layers*hid_dim, out_dim)
        self.out_layer=nn.Sequential(nn.Dropout(p=dropout_p),
                                     nn.Linear(self.bimult*num_layers*hid_dim+hid_dim, hid_dim),
                                     nn.Tanh(),
                                     nn.Dropout(p=0.2),
                                     nn.Linear(hid_dim, out_dim)
                                     )

    def forward(self, x):
        inputs, lengths=x
        batch_max_len=torch.max(lengths)
        rnn_input=self.emb(inputs[:,:batch_max_len])
        rnn_input=rnn_input.div(rnn_input.sum(axis=2).view(inputs.shape[0],batch_max_len,1))

     #   avg_emb = torch.stack([rnn_input[i, :lengths[i]].mean(dim=0) for i in range(lengths.shape[0])])

        h0 = torch.randn(self.bimult*self.num_layers, rnn_input.shape[0], rnn_input.shape[-1], device=rnn_input.device)
        c0 = torch.randn(self.bimult*self.num_layers, rnn_input.shape[0], rnn_input.shape[-1], device=rnn_input.device)

        rnn_input=pack_padded_sequence(rnn_input,lengths.to('cpu').to(int), batch_first=True, enforce_sorted=False)
        hstates, (hn, _) = self.model(rnn_input, (h0, c0))
        padded_hstates, lengths=[x.to(inputs.device) for x in pad_packed_sequence(hstates, batch_first=True)]
        hstates_avg=padded_hstates.sum(dim=1).div(lengths.float().unsqueeze(dim=1))
        #return hstates
       # print(h_states.shape)
    #    hn[0]=torch.stack([hstates[e, int(i)-1] for e,i in enumerate(lengths)])
        hn=hn.view(inputs.shape[0], -1)
       # hn=hn.squeeze()
       #return self.out_layer(hn)
        return self.out_layer(torch.concat([hstates_avg, hn], dim=1))

In [34]:
lstm_model=BookGenreClassifier(SimpleLSTM(64, 10, vocab_size=vocab_size,
                                       # nonlinearity='tanh',
                                        dropout_p=0.5,
                                        w2v_init=True,
                                        num_layers=1,

                                        ), 
                             loss=nn.CrossEntropyLoss(),#weight=torch.tensor(label_weights, dtype=torch.float32)),
                             lr=1e-4,
                             lr_dc=0.25,
                             lr_dc_step=4,
                             weight_init='normal',
                             )



In [35]:
wandb_logger = WandbLogger(
        project="ecoNLP", entity="kpuchalskixiv", log_model=False
    )

In [36]:
trainer=pl.Trainer(max_epochs=50,
                   callbacks=[
            EarlyStopping(
                monitor="val_acc", patience=10, mode="max", check_finite=True, check_on_train_epoch_end=False
            ),
            LearningRateMonitor(),
            ModelCheckpoint(monitor="val_acc", mode="max"),
            LearningRateFinder(num_training_steps=200)
            ],
            logger=wandb_logger,
        )

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [37]:
trainer.fit(lstm_model, train_dataloader, val_dataloader)
wandb.finish()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mkpuchalskixiv[0m. Use [1m`wandb login --relogin`[0m to force relogin


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Finding best initial lr: 100%|██████████| 200/200 [00:12<00:00, 16.18it/s]
`Trainer.fit` stopped: `max_steps=200` reached.
Learning rate set to 0.005754399373371567
Restoring states from the checkpoint path at /home/kacper/ecoNLP/.lr_find_c09484b6-0937-4d8d-a898-63dc462fbe48.ckpt
Restored all states from the checkpoint at /home/kacper/ecoNLP/.lr_find_c09484b6-0937-4d8d-a898-63dc462fbe48.ckpt

  | Name  | Type             | Params
-------------------------------------------
0 | loss  | CrossEntropyLoss | 0     
1 | model | SimpleLSTM       | 1.7 M 
-------------------------------------------
1.7 M     Trainable params
0         Non-trainable params
1.7 M     Total params
6.903     Total estimated model params size (MB)
Restored all states from the checkpoint at /home/kacper/ecoNLP/.lr_find_c09484b6-0937-4d8d-a898-63dc462fbe48.ckpt


Epoch 1:   0%|          | 0/130 [00:00<?, ?it/s, v_num=0x61, train_loss=2.260]         

/home/kacper/anaconda3/envs/gpu_torch/lib/python3.12/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:382: `ModelCheckpoint(monitor='val_acc')` could not find the monitored key in the returned metrics: ['scheduler_lr', 'train_loss', 'epoch', 'step']. HINT: Did you call `log('val_acc', value)` in the `LightningModule`?
/home/kacper/anaconda3/envs/gpu_torch/lib/python3.12/site-packages/pytorch_lightning/loops/training_epoch_loop.py:381: ReduceLROnPlateau conditioned on metric val_acc which is not available but strict is set to `False`. Skipping learning rate update.


Epoch 14: 100%|██████████| 130/130 [00:08<00:00, 15.24it/s, v_num=0x61, train_loss=1.890, val_loss=1.870, val_acc=23.40]




0,1
epoch,▁▁▁▁▂▂▂▃▃▃▃▃▃▃▃▄▄▄▅▅▅▅▅▅▅▅▆▆▆▇▇▇▇▇▇▇▇▇██
scheduler_lr,███████████▁▁▁▁
train_loss,█▇▅▅▂▃▇▄▃▅▂▃▅▅▄▃▃▂▃▃▄▄▁▄▁▃▄▂▂▁▁▁▃▄▂▃▃
trainer/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
val_acc,▂▂▂█▁▃▅▄▅▅▄▅▃▃
val_loss,█▅▄▃█▇▂▂▁▁▁▁▁▁

0,1
epoch,14.0
scheduler_lr,3e-05
train_loss,1.82659
trainer/global_step,1879.0
val_acc,23.39056
val_loss,1.87253


# Simple Transformer

In [33]:
class TransformerModel(nn.Module):
    def __init__(self, hid_dim, out_dim, vocab_size, dropout_p=0, nonlinearity=nn.ReLU(), w2v_init=False, num_layers=1, bidirectional=False,
                 nheads=8,):
        super().__init__()
        self.name='SimpleTransformer'
        self.hid_dim=hid_dim
        self.dropout_p=dropout_p
        self.nonlinearity=str(nonlinearity)
        self.w2v_init=w2v_init
        self.num_layers=num_layers
        self.bidirectional=bidirectional
        self.nheads=nheads
        # last token states 'end of string' and is repeated multiple time at the end of an input
        # therefore set its embedding to 0 with padding_idx=-1
        self.emb=nn.Embedding(vocab_size, hid_dim, padding_idx=-1) 
        if w2v_init:
            w2vmodel = Word2Vec(bookwords, vector_size=hid_dim, min_count=0)
            emb_lst = []
            for v in range(vocab_size):
                emb_lst.append(w2vmodel.wv[str(v)])
            emb_mat = np.array(emb_lst)
            self.emb.load_state_dict({'weight': torch.from_numpy(emb_mat)})

        t_layer= nn.TransformerEncoderLayer(d_model=hid_dim, nhead=nheads, batch_first=True, dropout=dropout_p, activation=nonlinearity)
        self.model= nn.TransformerEncoder(t_layer, num_layers=num_layers)
      #  self.out_layer=nn.Linear(self.bimult*num_layers*hid_dim, out_dim)
        self.out_layer=nn.Sequential(nn.Dropout(p=dropout_p),
                                     nn.Linear(hid_dim+hid_dim, hid_dim),
                                     nn.Tanh(),
                                     nn.Dropout(p=0.2),
                                     nn.Linear(hid_dim, out_dim)
                                     )
        
    def forward(self, x):
        inputs, lengths=x
        max_len=min(512, torch.max(lengths))
        t_input=self.emb(inputs[:, :max_len])

        hstates=self.model(t_input)
        hstates_avg=hstates.sum(dim=1).div(lengths.float().unsqueeze(dim=1))
        hn=hstates[:,-1]

        return self.out_layer(torch.concat([hstates_avg, hn], dim=1))

In [34]:
trans_model=BookGenreClassifier(TransformerModel(256, 10, vocab_size=vocab_size,
                                       # nonlinearity='tanh',
                                        dropout_p=0.5,
                                        w2v_init=True,
                                        num_layers=2,
                                      #  nonlinearity=nn.Tanh()
                                        ), 
                             loss=nn.CrossEntropyLoss(),#weight=torch.tensor(label_weights, dtype=torch.float32)),
                             lr=1e-4,
                             lr_dc=0.25,
                             lr_dc_step=4,
                             weight_init='normal'
                             )

In [35]:
wandb_logger = WandbLogger(
        project="ecoNLP", entity="kpuchalskixiv", log_model=False
    )

In [36]:
trainer=pl.Trainer(max_epochs=50,
                   callbacks=[
            EarlyStopping(
                monitor="val_acc", patience=10, mode="max", check_finite=True, check_on_train_epoch_end=False
            ),
            LearningRateMonitor(),
            ModelCheckpoint(monitor="val_acc", mode="max"),
          #  LearningRateFinder(num_training_steps=200)
            ],
            logger=wandb_logger,
        )

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [37]:
trainer.fit(trans_model, train_dataloader, val_dataloader)
wandb.finish()

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type             | Params
-------------------------------------------
0 | loss  | CrossEntropyLoss | 0     
1 | model | TransformerModel | 9.5 M 
-------------------------------------------
9.5 M     Trainable params
0         Non-trainable params
9.5 M     Total params
37.994    Total estimated model params size (MB)


Epoch 23: 100%|██████████| 130/130 [00:14<00:00,  8.91it/s, v_num=cj56, train_loss=0.787, val_loss=1.780, val_acc=46.80]




0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇███
scheduler_lr,██████▃▃▃▃▃▃▁▁▁▁▁▁▁▁▁▁▁▁
train_loss,█▇▆▆▇▆▅▆▆▄▅▄▅▄▄▄▄▂▂▃▃▂▂▂▃▃▃▂▃▁▂▃▃▃▂▂▂▃▃▂
trainer/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
val_acc,▁▂▂▃▄▆▇▇█▇▇▆██▇▇▇▇▇▇▇▇▇▇
val_loss,█▇▆▆▄▃▂▁▁▂▂▅▃▃▄▅▅▅▆▅▅▆▅▆

0,1
epoch,23.0
scheduler_lr,0.0
train_loss,0.99748
trainer/global_step,3119.0
val_acc,46.78112
val_loss,1.78359


# Asses cost

In [47]:
api = wandb.Api()

In [62]:

run = api.run("kpuchalskixiv/ecoNLP/9f62gf68")
system_metrics = run.history(stream="events")

In [63]:
system_metrics

Unnamed: 0,system.network.sent,system.gpu.0.powerPercent,system.network.recv,system.cpu.1.cpu_percent,system.gpu.0.powerWatts,_wandb,system.disk.out,system.disk.\.usageGB,system.gpu.0.temp,system.gpu.0.memory,...,system.proc.memory.availableMB,system.cpu,system.proc.cpu.threads,system.memory,system.cpu.0.cpu_percent,system.proc.memory.percent,system.disk.\.usagePercent,system.disk.in,system.gpu.0.memoryAllocated,_timestamp
0,69511.07,32.63,53265.0,1.99,80.51,True,0.76,5689.88,41.27,15.27,...,28804.39,10.85,33,10.27,30.71,8.23,70.6,3.61,16.93,1718619000.0
1,531024.27,76.74,351139.33,30.5,192.76,True,1.86,5689.88,55.53,37.4,...,28892.33,25.05,33,10.0,32.0,8.23,70.6,6.68,19.75,1718619000.0
2,1070869.73,77.82,687942.73,60.65,195.28,True,2.6,5689.88,64.6,37.87,...,28891.84,25.05,33,10.0,24.99,8.23,70.6,6.68,19.59,1718619000.0
3,1644629.47,77.47,1044225.8,77.0,195.82,True,3.1,5689.88,72.07,37.93,...,28882.2,25.06,33,10.06,3.35,8.23,70.6,6.68,19.2,1718619000.0
4,2251116.73,79.24,1421881.67,50.45,200.6,True,3.53,5689.88,74.53,37.2,...,28869.29,25.05,33,10.1,39.68,8.23,70.6,6.68,19.12,1718619000.0
5,2902029.93,79.02,1824326.27,7.76,197.68,True,4.01,5689.88,74.0,37.53,...,28869.31,25.06,33,10.1,4.79,8.23,70.6,6.68,19.12,1718619000.0
6,4045751.0,82.02,2531673.8,22.7,205.82,True,600.41,5689.88,73.67,37.67,...,28899.5,24.69,33,10.0,24.78,8.32,70.6,6.68,19.84,1718619000.0
7,4371209.0,91.24,2796206.0,1.2,228.09,True,963.45,5689.88,72.2,31.0,...,28871.5,0.17,30,10.1,2.3,8.23,70.6,6.68,20.73,1718620000.0


In [64]:
system_metrics['system.gpu.0.powerWatts'].mean()*(511/3600)

26.553547222222225

In [65]:
1.15*system_metrics['system.gpu.0.powerWatts'].mean()*(511/3600)/1000

0.030536579305555554

# inference

In [88]:
from time import time

In [81]:
alldata=BookDataset(df)

whole_datalaoder=data_utils.DataLoader(alldata, batch_size=32, num_workers=cpu_count())

In [83]:
test_trainer=pl.Trainer(max_epochs=1,
                
        )

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/home/kacper/anaconda3/envs/gpu_torch/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:75: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default


In [114]:
w2v_simple_model.model.hid_dim

64

In [115]:
t0=time()
test_trainer.test(w2v_simple_model, whole_datalaoder)
infertime=(time()-t0)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing DataLoader 0: 100%|██████████| 146/146 [00:00<00:00, 236.01it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        test_acc             95.12561798095703
        test_loss           0.20292286574840546
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


In [116]:
(75*infertime*100000/df.shape[0])/3600

0.5918711148696778

In [117]:
((75*infertime*100000/df.shape[0])/3600)*1.15/1000

0.0006806517821001295