In [1]:
import logging
import sys
from pathlib import Path
import pandas as pd
import torch
from torchmetrics.classification import MultilabelAccuracy
from torchmetrics.regression import MeanSquaredError
from torch import nn

proj_path = Path('/cluster') / 'work' / 'jacobaal' / 'pers-pred'
proj_path = proj_path.resolve()
if proj_path not in sys.path: sys.path.append(str(proj_path))

from torch.utils.data import DataLoader
from src.ffn import Decoder
from src.datasets import SingleInputDataset, get_stl_dataloaders
from src.trainer import train, get_optimizer, get_scheduler
from src.utils import get_commons

  from .autonotebook import tqdm as notebook_tqdm


device: cpu


In [2]:
paths, constants, config, logger, device = get_commons(log=True)

2024-06-01 12:59:52,276 - ArgumentLogger - INFO - Arguments:
2024-06-01 12:59:52,276 - ArgumentLogger - INFO - Arguments:
2024-06-01 12:59:52,278 - ArgumentLogger - INFO - seed: 42
2024-06-01 12:59:52,278 - ArgumentLogger - INFO - seed: 42
2024-06-01 12:59:52,279 - ArgumentLogger - INFO - dataframe: {'generate': False, 'mbti_frac': 0.1, 'bigfive_c_frac': 1.0, 'bigfive_s_frac': 1.0}
2024-06-01 12:59:52,279 - ArgumentLogger - INFO - dataframe: {'generate': False, 'mbti_frac': 0.1, 'bigfive_c_frac': 1.0, 'bigfive_s_frac': 1.0}
2024-06-01 12:59:52,281 - ArgumentLogger - INFO - eda: {'generate': False}
2024-06-01 12:59:52,281 - ArgumentLogger - INFO - eda: {'generate': False}
2024-06-01 12:59:52,282 - ArgumentLogger - INFO - reduce: {'generate': False, 'use_full': False}
2024-06-01 12:59:52,282 - ArgumentLogger - INFO - reduce: {'generate': False, 'use_full': False}
2024-06-01 12:59:52,283 - ArgumentLogger - INFO - preprocessing: {'generate_features': False, 'generate_partially_cleaned': Fa

device: cpu


In [3]:
model_name = config['embeddings']['model']
embedding_size = constants['embedding_sizes'][model_name]
stats_size = 0

In [5]:
dataframes = {task: pd.read_csv(paths['split'][model_name][task], header=[0, 1], index_col=0).drop('STATS', axis='columns') for task in constants['tasks']}
datasets = {task: SingleInputDataset(dataframe) for task, dataframe in dataframes.items()}
dataloaders = {task: get_stl_dataloaders(dataset, config['split']['train'], config['split']['test'], config['dataloaders']) for task, dataset in datasets.items()}
dataloaders

{'mbti': {'train': <torch.utils.data.dataloader.DataLoader at 0x15087d9da190>,
  'test': <torch.utils.data.dataloader.DataLoader at 0x15087d9da3d0>,
  'val': <torch.utils.data.dataloader.DataLoader at 0x15087c3b6b10>},
 'bigfive_c': {'train': <torch.utils.data.dataloader.DataLoader at 0x150877843b50>,
  'test': <torch.utils.data.dataloader.DataLoader at 0x150876bc3350>,
  'val': <torch.utils.data.dataloader.DataLoader at 0x150876bc3290>},
 'bigfive_s': {'train': <torch.utils.data.dataloader.DataLoader at 0x150876bc31d0>,
  'test': <torch.utils.data.dataloader.DataLoader at 0x150876bc3050>,
  'val': <torch.utils.data.dataloader.DataLoader at 0x150876bc2f90>}}

In [None]:
input_size = [embedding_size + stats_size]
decoders = {
    'mbti': Decoder(input_size + config['stl-decoders']['hidden_nn'] + [4], final='sigmoid', dropout=config['stl-decoders']['dropout']).to(device),
    'bigfive_c': Decoder(input_size + config['stl-decoders']['hidden_nn'] + [5], final='sigmoid', dropout=config['stl-decoders']['dropout']).to(device),
    'bigfive_s': Decoder(input_size + config['stl-decoders']['hidden_nn'] + [5], final='none', dropout=config['stl-decoders']['dropout']).to(device)
    }

In [None]:
alg, optim_arg = get_optimizer(config['optim_param'])
optimizers = {task: alg(decoders[task].parameters(), **optim_arg) for task in constants["tasks"]}
schedulers = {k: get_scheduler(v, config['scheduler_param']) for k, v in optimizers.items()}

In [None]:
loss_fns = {
    'mbti': nn.BCELoss(),
    'bigfive_c': nn.BCELoss(),
    'bigfive_s': nn.MSELoss()
    }

metrics = {
    'mbti': MultilabelAccuracy(num_labels=4),
    'bigfive_c': MultilabelAccuracy(num_labels=5),
    'bigfive_s': MeanSquaredError()
}

In [None]:
def exec_task(task:str):
    return train(
        decoders[task], 
        dataloaders[task], 
        optimizers[task], 
        loss_fns[task], 
        metric_fn=metrics[task],
        n_epochs=config['training']['epochs'], 
        checkpoint_name=config['training']['checkpoint_name'], 
        patience=config['training']['patience'],
        device=device,
        logger=logger,
        higher_is_better=True,
        scheduler=schedulers[task]
    )

In [None]:
exec_task('mbti')

2024-06-01 11:04:30,348 - ArgumentLogger - INFO - Model: Decoder(
  (model): Sequential(
    (0): Linear(in_features=768, out_features=4096, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=4096, out_features=1024, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.3, inplace=False)
    (6): Linear(in_features=1024, out_features=1024, bias=True)
    (7): ReLU()
    (8): Dropout(p=0.3, inplace=False)
    (9): Linear(in_features=1024, out_features=4, bias=True)
    (10): Sigmoid()
  )
)
2024-06-01 11:04:30,348 - ArgumentLogger - INFO - Model: Decoder(
  (model): Sequential(
    (0): Linear(in_features=768, out_features=4096, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=4096, out_features=1024, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.3, inplace=False)
    (6): Linear(in_features=1024, out_features=1024, bias=True)
    (7): ReLU()
    (8): Dropout(p=0.3, inplace=False)
    (9): Linear(in_feat

Dataloader train: 14192
Dataloader test: 1774
Dataloader val: 1775


EPOCH 11: Loss train: 0.155, val: 0.155 | Metric train: 0.670, val: 0.666:  11%|█         | 11/100 [01:23<11:19,  7.63s/it]

Early stopped after 10 epochs of no progress. Best validation metric 0.666





Training finished after 0:01:24.997666. Test metric 0.676


2024-05-20 09:32:15,232 - ArgumentLogger - INFO - <generator object train.<locals>.<genexpr> at 0x14a15c1fbb40>
2024-05-20 09:32:15,234 - ArgumentLogger - INFO - Model: Decoder(
  (model): Sequential(
    (0): Linear(in_features=772, out_features=2048, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.5, inplace=False)
    (3): Linear(in_features=2048, out_features=2048, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.5, inplace=False)
    (6): Linear(in_features=2048, out_features=512, bias=True)
    (7): ReLU()
    (8): Dropout(p=0.5, inplace=False)
    (9): Linear(in_features=512, out_features=4, bias=True)
    (10): Sigmoid()
  )
)
EPOCH 57: Loss train: 0.148, val: 0.148 | Metric train: 0.677, val: 0.679:  28%|██▊       | 57/200 [10:17<25:48, 10.83s/it]
Training finished after 0:10:18.366500. Test metric 0.676


2024-05-20 10:30:13,425 - ArgumentLogger - INFO - <generator object train.<locals>.<genexpr> at 0x152cac9ee640>
2024-05-20 10:30:13,428 - ArgumentLogger - INFO - Model: Decoder(
  (model): Sequential(
    (0): Linear(in_features=772, out_features=1024, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=1024, out_features=512, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.3, inplace=False)
    (6): Linear(in_features=512, out_features=4, bias=True)
    (7): Sigmoid()
  )
)
EPOCH 75: Loss train: 0.145, val: 0.144 | Metric train: 0.690, val: 0.694:  38%|███▊      | 75/200 [04:24<07:21,  3.53s/it]
Early stopped after 12 epochs of no progress. Best validation metric 0.698

Training finished after 0:04:25.947753. Test metric 0.687


In [None]:
exec_task('bigfive_c')

2024-06-01 11:05:56,491 - ArgumentLogger - INFO - Model: Decoder(
  (model): Sequential(
    (0): Linear(in_features=768, out_features=4096, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=4096, out_features=1024, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.3, inplace=False)
    (6): Linear(in_features=1024, out_features=1024, bias=True)
    (7): ReLU()
    (8): Dropout(p=0.3, inplace=False)
    (9): Linear(in_features=1024, out_features=5, bias=True)
    (10): Sigmoid()
  )
)
2024-06-01 11:05:56,491 - ArgumentLogger - INFO - Model: Decoder(
  (model): Sequential(
    (0): Linear(in_features=768, out_features=4096, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=4096, out_features=1024, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.3, inplace=False)
    (6): Linear(in_features=1024, out_features=1024, bias=True)
    (7): ReLU()
    (8): Dropout(p=0.3, inplace=False)
    (9): Linear(in_feat

Dataloader train: 2172
Dataloader test: 271
Dataloader val: 272


EPOCH 11: Loss train: 0.139, val: 0.139 | Metric train: 0.500, val: 0.514:  11%|█         | 11/100 [00:27<03:42,  2.50s/it]

Early stopped after 10 epochs of no progress. Best validation metric 0.516





Training finished after 0:00:28.216062. Test metric 0.500


2024-05-20 09:42:34,083 - ArgumentLogger - INFO - <generator object train.<locals>.<genexpr> at 0x14a15bf8c940>
2024-05-20 09:42:34,085 - ArgumentLogger - INFO - Model: Decoder(
  (model): Sequential(
    (0): Linear(in_features=772, out_features=2048, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.5, inplace=False)
    (3): Linear(in_features=2048, out_features=2048, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.5, inplace=False)
    (6): Linear(in_features=2048, out_features=512, bias=True)
    (7): ReLU()
    (8): Dropout(p=0.5, inplace=False)
    (9): Linear(in_features=512, out_features=5, bias=True)
    (10): Sigmoid()
  )
)
EPOCH 13: Loss train: 0.141, val: 0.139 | Metric train: 0.504, val: 0.464:   6%|▋         | 13/200 [00:38<09:07,  2.93s/it]
Training finished after 0:00:38.808136. Test metric 0.485


In [None]:
exec_task('bigfive_s')

2024-06-01 11:06:25,229 - ArgumentLogger - INFO - Model: Decoder(
  (model): Sequential(
    (0): Linear(in_features=768, out_features=4096, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=4096, out_features=1024, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.3, inplace=False)
    (6): Linear(in_features=1024, out_features=1024, bias=True)
    (7): ReLU()
    (8): Dropout(p=0.3, inplace=False)
    (9): Linear(in_features=1024, out_features=5, bias=True)
  )
)
2024-06-01 11:06:25,229 - ArgumentLogger - INFO - Model: Decoder(
  (model): Sequential(
    (0): Linear(in_features=768, out_features=4096, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=4096, out_features=1024, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.3, inplace=False)
    (6): Linear(in_features=1024, out_features=1024, bias=True)
    (7): ReLU()
    (8): Dropout(p=0.3, inplace=False)
    (9): Linear(in_features=1024, out_featu

Dataloader train: 1452
Dataloader test: 181
Dataloader val: 183


EPOCH 12: Loss train: 614.032, val: 586.109 | Metric train: 3070.159, val: 2930.544:  12%|█▏        | 12/100 [00:25<03:07,  2.13s/it]

Early stopped after 10 epochs of no progress. Best validation metric 13138.345





Training finished after 0:00:26.340916. Test metric 3125.406


2024-05-20 09:43:14,305 - ArgumentLogger - INFO - <generator object train.<locals>.<genexpr> at 0x14a161446a40>
2024-05-20 09:43:14,308 - ArgumentLogger - INFO - Model: Decoder(
  (model): Sequential(
    (0): Linear(in_features=772, out_features=2048, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.5, inplace=False)
    (3): Linear(in_features=2048, out_features=2048, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.5, inplace=False)
    (6): Linear(in_features=2048, out_features=512, bias=True)
    (7): ReLU()
    (8): Dropout(p=0.5, inplace=False)
    (9): Linear(in_features=512, out_features=5, bias=True)
  )
)
EPOCH 14: Loss train: 306.757, val: 258.280 | Metric train: 578421.500, val: 447273.906:   7%|▋         | 14/200 [00:31<06:59,  2.26s/it]
Training finished after 0:00:32.339656. Test metric 467401.594
