In [1]:
import os
import sys
sys.path.append(os.path.abspath(os.path.join(os.path.dirname('src'), '..')))

import pandas as pd
from sklearn.model_selection import train_test_split
import wandb

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torch.optim.lr_scheduler as lr

# Reproducibility setup: a single seed feeds the global torch RNG, the RNG of
# every CUDA device, and the dedicated generator handed to the DataLoaders.
seed_value = 42
torch.backends.cudnn.deterministic = True
torch.manual_seed(seed_value)
torch.cuda.manual_seed_all(seed_value)
# Generator.manual_seed returns the generator itself, so it can be chained.
generator = torch.Generator().manual_seed(seed_value)

from functools import partial

from src.trainer.trainer_classifier import Trainer_classifier
from src.trainer.trainer_VAE import Trainer_VAE
from src.models.classifiers import *
from src.trainer.model_class import Model_class
from src.trainer.loss_class import Loss_class

from src.models.autoencoders import *
from src.models.joinedModel import *

# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
# Width of the widest hidden layer of the encoder/decoder; the narrower layers
# are derived from it as HIDDEN_PARAM/2, /4, /8 and /16.
HIDDEN_PARAM = 512
# Size of the bottleneck between encoder output and decoder input.
LATENT_REPR = 5

# Batch size used by every DataLoader and forwarded to the trainers.
BATCH_SIZE = 1024

In [3]:
# Load the tabular dataset. It must contain the 'Machine failure' column, which
# train_cycle separates from the remaining (feature) columns.
df = pd.read_csv('../data/df_to_enc.csv')

In [4]:
def prepare_data_for_test(X_data, y_data, test_ratio):
    """Carve a stratified hold-out test set off the full dataset.

    Args:
        X_data: Feature table.
        y_data: Labels aligned with ``X_data``; also used as the stratification key.
        test_ratio: Forwarded to ``train_test_split`` as ``test_size``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    split_options = dict(
        test_size=test_ratio,
        shuffle=True,
        stratify=y_data,
        random_state=42,
    )
    train_X, test_X, train_y, test_y = train_test_split(X_data, y_data, **split_options)
    return train_X, test_X, train_y, test_y

In [5]:
def prepare_data_for_enc(X_data, y_data, autoenc_requared):
    """Split the training data into an autoencoder subset and a classifier pool.

    ``autoenc_requared`` rows (stratified on ``y_data``) are set aside for the
    autoencoder; these rows are then split 90/10 into autoencoder train and
    validation parts. The remaining rows and their labels are returned for the
    classifier.

    Args:
        X_data: Feature table.
        y_data: Labels aligned with ``X_data``.
        autoenc_requared: Number of rows to reserve for the autoencoder.

    Returns:
        Tuple ``(X_encoder_train, X_encoder_test, X_to_clas, y_to_clas)``.

    Raises:
        ValueError: If ``autoenc_requared`` is not smaller than ``len(X_data)``.
    """
    # Check that amount rows for enc less than length of data
    if autoenc_requared >= len(X_data):
        raise ValueError("The number of rows for autoencoder more than amount of X_train data")

    # Pass the row count itself instead of count/len: scikit-learn floors
    # `ratio * n_samples`, so a float ratio can come out one row short of the
    # requested amount because of floating-point rounding. An int train_size is
    # interpreted as the absolute number of rows.
    X_to_enc, X_to_clas, _, y_to_clas = train_test_split(
        X_data,
        y_data,
        shuffle=True,
        stratify=y_data,
        random_state=42,
        train_size=int(autoenc_requared),
    )

    # The autoencoder is unsupervised, so only the features are split here.
    X_encoder_train, X_encoder_test = train_test_split(
        X_to_enc,
        shuffle=True,
        random_state=42,
        train_size=0.9,
    )

    return X_encoder_train, X_encoder_test, X_to_clas, y_to_clas

In [6]:
def prepare_data_for_classif(X_data, y_data, classif_requared):
    """Draw a stratified labelled subset of a given size for the classifier.

    Args:
        X_data: Feature table to sample from.
        y_data: Labels aligned with ``X_data``; also the stratification key.
        classif_requared: Number of labelled rows to keep.

    Returns:
        Tuple ``(X_train, y_train)`` with ``classif_requared`` rows each; the
        rest of the data is discarded.

    Raises:
        ValueError: If ``classif_requared`` is not smaller than ``len(X_data)``.
    """
    # Check that the requested amount of rows is smaller than the available data
    if classif_requared >= len(X_data):
        raise ValueError("The number of rows for classifier more than amount of X_train data")

    # An int train_size is the absolute number of rows. Passing count/len as a
    # float ratio would be floored by scikit-learn and can lose one row to
    # floating-point rounding.
    X_train, _, y_train, _ = train_test_split(
        X_data,
        y_data,
        shuffle=True,
        stratify=y_data,
        random_state=42,
        train_size=int(classif_requared),
    )

    return X_train, y_train

In [7]:
def make_dataloader(*data, encoder_data = False):
    """Wrap one or more tables in a shuffling DataLoader.

    Args:
        *data: One table, or several pandas objects that are concatenated
            column-wise (``axis=1``) before being wrapped.
        encoder_data: When equal to ``False`` the table is wrapped in
            ``TableDatasetDF``; otherwise in ``EncoderDataset``.

    Returns:
        A ``DataLoader`` over the dataset with ``BATCH_SIZE`` batches, shuffling
        driven by the module-level seeded ``generator``.
    """
    if len(data) > 1:
        table = pd.concat(list(data), axis = 1)
    else:
        table = data[0]

    # `== False` (not `not encoder_data`) is kept on purpose so that non-bool
    # arguments select the same dataset class as before.
    dataset_cls = TableDatasetDF if encoder_data == False else EncoderDataset

    return DataLoader(
        dataset_cls(table),
        batch_size=BATCH_SIZE,
        shuffle=True,
        generator=generator,
    )

In [8]:
def prepare_data(X_data, y_data, test_ratio, autoenc_requared, classif_requared):
    """Run the full split pipeline and return the four DataLoaders.

    The data is split into train/test, the train part into an autoencoder
    subset and a classifier pool, and the pool is reduced to
    ``classif_requared`` labelled rows.

    Args:
        X_data: Feature table.
        y_data: Labels aligned with ``X_data``.
        test_ratio: Share of the data held out for testing.
        autoenc_requared: Number of training rows reserved for the autoencoder.
        classif_requared: Number of labelled rows kept for the classifier.

    Returns:
        Tuple ``(train_dl, test_dl, enc_train_dl, enc_test_dl)``.
    """
    X_trainval, X_holdout, y_trainval, y_holdout = prepare_data_for_test(
        X_data, y_data, test_ratio
    )
    X_enc_fit, X_enc_val, X_pool, y_pool = prepare_data_for_enc(
        X_trainval, y_trainval, autoenc_requared
    )
    X_labelled, y_labelled = prepare_data_for_classif(X_pool, y_pool, classif_requared)

    # Supervised loaders carry features and labels side by side.
    test_dl = make_dataloader(X_holdout, y_holdout)
    train_dl = make_dataloader(X_labelled, y_labelled)

    # Autoencoder loaders carry features only.
    enc_train_dl = make_dataloader(X_enc_fit, encoder_data=True)
    enc_test_dl = make_dataloader(X_enc_val, encoder_data=True)

    return train_dl, test_dl, enc_train_dl, enc_test_dl

In [9]:
def _build_mlp(dims):
    """Stack ``nn.Linear`` layers over consecutive ``dims`` with a ReLU between each pair."""
    layers = []
    for fan_in, fan_out in zip(dims[:-1], dims[1:]):
        if layers:
            layers.append(nn.ReLU())
        layers.append(nn.Linear(fan_in, fan_out))
    return nn.Sequential(*layers)


# Layer widths from input to bottleneck: every column of df except one
# (train_cycle drops the 'Machine failure' target), then HIDDEN_PARAM halved
# four times, then the latent size.
_encoder_dims = (
    [df.shape[1] - 1]
    + [int(HIDDEN_PARAM / 2 ** k) for k in range(5)]
    + [LATENT_REPR]
)

encoder = _build_mlp(_encoder_dims)

# The decoder mirrors the encoder: the same widths in reverse order.
decoder = _build_mlp(_encoder_dims[::-1])

In [10]:
def train_decoder(encoder, decoder, train_dl, test_ld, VAE = False):
    """Train a plain or variational autoencoder and return its decoder.

    The model is wrapped in ``Model_class`` and trained by
    ``Trainer_classifier``, configured with AdamW (lr 1e-2, weight decay 1e-3),
    an exponential LR schedule (gamma 0.95) and 30 epochs.

    Args:
        encoder: Encoder network handed to the autoencoder wrapper.
        decoder: Decoder network handed to the autoencoder wrapper.
        train_dl: DataLoader with the autoencoder training rows.
        test_ld: DataLoader with the autoencoder validation rows.
        VAE: When truthy, build the ``VAE`` model with the VAE loss; otherwise
            build ``Autoencoder`` with an MSE reconstruction loss.

    Returns:
        The ``decoder`` attribute of the model held by the trainer after training.
    """
    if VAE:
        # The `VAE` argument shadows the `VAE` model class inside this function,
        # so `VAE(encoder, decoder)` would try to call a bool and raise
        # TypeError. Fetch the class from the module namespace instead (it is
        # expected to come from the star imports at the top of the notebook).
        vae_model_cls = globals()["VAE"]
        autoencoder = vae_model_cls(encoder, decoder)
        loss = vae_loss(vae_loss_function)
        run_prefix = 'VAE'
        # NOTE(review): Trainer_VAE is imported but never used; confirm that
        # Trainer_classifier is the intended trainer for the VAE as well.
    else:
        autoencoder = Autoencoder(encoder, decoder)
        loss = Encoder_loss(nn.MSELoss())
        run_prefix = 'Autoencoder_simple'

    model_factory = partial(Model_class)
    optimizer_factory = partial(torch.optim.AdamW)
    scheduler_factory = partial(lr.ExponentialLR)

    model_params = dict(model=autoencoder,
                        device=device)

    optimizer_params = dict(weight_decay=1e-3, lr=1e-2)
    scheduler_params = dict(gamma=0.95)

    learning_params = dict(batch_size=BATCH_SIZE, num_epoch=30)

    # Only the run-name prefix differs between the two model kinds.
    wandb_init_params = dict(
        name=f'{run_prefix}_HidParam-{HIDDEN_PARAM}_Latent-{LATENT_REPR}',
        project="Internship_project",
        dir = '../logs/'
    )

    # Start training
    trainer = Trainer_classifier(train_dl,
                                 test_ld,
                                 loss,
                                 model_factory=model_factory,
                                 optimizer_factory=optimizer_factory,
                                 scheduler_factory=scheduler_factory,
                                 model_params=model_params,
                                 optimizer_params=optimizer_params,
                                 scheduler_params=scheduler_params,
                                 log=False,
                                 wandb_init_params=wandb_init_params,
                                 model_dir='../logs/nn_models/autoencoder/',
                                 saving_model=False
                                 )
    trainer.train_model(learning_params)
    wandb.finish()
    return trainer.model.model.decoder

In [11]:
def train_classifier(decoder, train_dl, test_dl, labels_amount, VAE = False):
    """Train a joined (pretrained part + classifier) model on the labelled data.

    A ``Simple_classifier`` is combined with ``decoder`` in a ``JoinedModel``
    (or ``JoinedModel_VAE`` when ``VAE`` is not equal to ``False``) and trained
    by ``Trainer_classifier`` with a focal loss (gamma 2), AdamW (lr 1e-2,
    weight decay 1e-3), an exponential LR schedule (gamma 0.95) and 20 epochs.

    Args:
        decoder: Pretrained network passed as the first part of the joined model.
        train_dl: DataLoader with the labelled training rows.
        test_dl: DataLoader with the hold-out rows.
        labels_amount: Number of labelled rows; used only in the wandb run name.
        VAE: Selects ``JoinedModel_VAE`` instead of ``JoinedModel``.

    Returns:
        None.

    NOTE(review): the recorded run of this notebook stops in this step with
    "mat1 and mat2 shapes cannot be multiplied (50x15 and 5x32)". The 5x32
    shape matches the first decoder layer (LATENT_REPR -> HIDDEN_PARAM/16), so
    the joined model presumably feeds raw features into ``decoder`` where an
    encoder is expected. Confirm against the JoinedModel implementation.
    """
    # `== False` is kept (instead of `not VAE`) so non-bool arguments pick the
    # same joined-model class as before.
    plain_model = VAE == False
    classifier = Simple_classifier(train_dl.dataset.data.shape[1], 50)
    if plain_model:
        jm = JoinedModel(decoder, classifier)
    else:
        jm = JoinedModel_VAE(decoder, classifier)

    criterion = Loss_class(FocalLoss(gamma=2))

    trainer_options = dict(
        model_factory=partial(Model_class),
        optimizer_factory=partial(torch.optim.AdamW),
        scheduler_factory=partial(lr.ExponentialLR),
        model_params=dict(model=jm, device=device),
        optimizer_params=dict(weight_decay=1e-3, lr=1e-2),
        scheduler_params=dict(gamma=0.95),
        log=False,
        wandb_init_params=dict(
            name=f'JM_NumLab-{labels_amount}_LatDim-{LATENT_REPR}',
            project="Internship_project",
            dir = '../logs/',
        ),
        model_dir='../logs/nn_models/joined_models/',
        saving_model=False,
    )

    trainer = Trainer_classifier(train_dl, test_dl, criterion, **trainer_options)
    trainer.train_model(dict(batch_size=BATCH_SIZE, num_epoch=20))
    wandb.finish()

In [12]:
def train_cycle(encoder, decoder, df, list_amount = (50, 100, 500, 1000, 1990),
                target_column = 'Machine failure', test_ratio = 0.2,
                autoenc_required = 6000):
    """Run the autoencoder and VAE experiments for several labelled-set sizes.

    For every entry of ``list_amount`` the data is re-split, then for both the
    plain autoencoder and the VAE a decoder is pretrained and a joined
    classifier is trained on top of it.

    Every pretraining run starts from a fresh deep copy of ``encoder`` and
    ``decoder``. Training the same module instances in place would carry the
    already-trained weights into the VAE phase and into every later label
    count, so the runs would not be comparable. The modules passed in are
    therefore left untouched.

    Args:
        encoder: Untrained encoder network used as the template for each run.
        decoder: Untrained decoder network used as the template for each run.
        df: DataFrame holding the features and the ``target_column``.
        list_amount: Iterable of labelled-set sizes to evaluate.
        target_column: Name of the label column in ``df``.
        test_ratio: Share of the data held out for testing.
        autoenc_required: Number of training rows reserved for the autoencoder.

    Returns:
        None.
    """
    import copy  # local import: needed only for the per-run model copies

    features = df.drop(columns = [target_column])
    target = df[target_column]

    for labels_amount in list_amount:
        ############################################################
        # PREPARE DATA
        ############################################################
        train_dl, test_dl, enc_train_dl, enc_test_dl = prepare_data(
            features, target, test_ratio, autoenc_required, labels_amount
        )

        ############################################################
        # Autoenc cycle (use_vae=False), then VAE cycle (use_vae=True)
        ############################################################
        for use_vae in (False, True):
            trained_decoder = train_decoder(copy.deepcopy(encoder),
                                            copy.deepcopy(decoder),
                                            enc_train_dl, enc_test_dl,
                                            VAE = use_vae)
            train_classifier(trained_decoder, train_dl, test_dl,
                             labels_amount, VAE = use_vae)

In [13]:
# Run the whole experiment sweep with train_cycle's default labelled-set sizes.
train_cycle(encoder, decoder, df)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


[34m[1mwandb[0m: Currently logged in as: [33mdmitrii_fomin[0m ([33mdmitrii_fomin_uga[0m). Use [1m`wandb login --relogin`[0m to force relogin


I'm studying hard now🧐, don't disturb!: 100%|██████████| 6/6 [00:00<00:00, 21.85it/s]
Let's see how good I am...: 100%|██████████| 1/1 [00:00<00:00, 48.15it/s]


Epoch: 1 of 30, 0.005 min


I'm studying hard now🧐, don't disturb!: 100%|██████████| 6/6 [00:00<00:00, 26.84it/s]
Let's see how good I am...: 100%|██████████| 1/1 [00:00<00:00, 65.27it/s]


Epoch: 2 of 30, 0.004 min


I'm studying hard now🧐, don't disturb!: 100%|██████████| 6/6 [00:00<00:00, 27.83it/s]
Let's see how good I am...: 100%|██████████| 1/1 [00:00<00:00, 57.88it/s]


Epoch: 3 of 30, 0.004 min


I'm studying hard now🧐, don't disturb!: 100%|██████████| 6/6 [00:00<00:00, 28.41it/s]
Let's see how good I am...: 100%|██████████| 1/1 [00:00<00:00, 32.12it/s]


Epoch: 4 of 30, 0.004 min


I'm studying hard now🧐, don't disturb!: 100%|██████████| 6/6 [00:00<00:00, 26.95it/s]
Let's see how good I am...: 100%|██████████| 1/1 [00:00<00:00, 57.51it/s]


Epoch: 5 of 30, 0.004 min


I'm studying hard now🧐, don't disturb!: 100%|██████████| 6/6 [00:00<00:00, 17.69it/s]
Let's see how good I am...: 100%|██████████| 1/1 [00:00<00:00, 57.82it/s]


Epoch: 6 of 30, 0.006 min


I'm studying hard now🧐, don't disturb!: 100%|██████████| 6/6 [00:00<00:00, 19.91it/s]
Let's see how good I am...: 100%|██████████| 1/1 [00:00<00:00, 59.37it/s]


Epoch: 7 of 30, 0.006 min


I'm studying hard now🧐, don't disturb!: 100%|██████████| 6/6 [00:00<00:00, 30.95it/s]
Let's see how good I am...: 100%|██████████| 1/1 [00:00<00:00, 73.39it/s]


Epoch: 8 of 30, 0.004 min


I'm studying hard now🧐, don't disturb!: 100%|██████████| 6/6 [00:00<00:00, 29.10it/s]
Let's see how good I am...: 100%|██████████| 1/1 [00:00<00:00, 58.38it/s]


Epoch: 9 of 30, 0.004 min


I'm studying hard now🧐, don't disturb!: 100%|██████████| 6/6 [00:00<00:00, 27.99it/s]
Let's see how good I am...: 100%|██████████| 1/1 [00:00<00:00, 55.05it/s]


Epoch: 10 of 30, 0.004 min


I'm studying hard now🧐, don't disturb!: 100%|██████████| 6/6 [00:00<00:00, 27.42it/s]
Let's see how good I am...: 100%|██████████| 1/1 [00:00<00:00, 56.68it/s]


Epoch: 11 of 30, 0.004 min


I'm studying hard now🧐, don't disturb!: 100%|██████████| 6/6 [00:00<00:00, 23.65it/s]
Let's see how good I am...: 100%|██████████| 1/1 [00:00<00:00, 55.78it/s]


Epoch: 12 of 30, 0.005 min


I'm studying hard now🧐, don't disturb!: 100%|██████████| 6/6 [00:00<00:00, 26.58it/s]
Let's see how good I am...: 100%|██████████| 1/1 [00:00<00:00, 56.05it/s]


Epoch: 13 of 30, 0.004 min


I'm studying hard now🧐, don't disturb!: 100%|██████████| 6/6 [00:00<00:00, 12.17it/s]
Let's see how good I am...: 100%|██████████| 1/1 [00:00<00:00, 48.31it/s]


Epoch: 14 of 30, 0.009 min


I'm studying hard now🧐, don't disturb!: 100%|██████████| 6/6 [00:00<00:00, 19.73it/s]
Let's see how good I am...: 100%|██████████| 1/1 [00:00<00:00, 68.96it/s]


Epoch: 15 of 30, 0.006 min


I'm studying hard now🧐, don't disturb!: 100%|██████████| 6/6 [00:00<00:00, 25.78it/s]
Let's see how good I am...: 100%|██████████| 1/1 [00:00<00:00, 39.18it/s]


Epoch: 16 of 30, 0.005 min


I'm studying hard now🧐, don't disturb!: 100%|██████████| 6/6 [00:00<00:00, 26.26it/s]
Let's see how good I am...: 100%|██████████| 1/1 [00:00<00:00, 58.74it/s]


Epoch: 17 of 30, 0.004 min


I'm studying hard now🧐, don't disturb!: 100%|██████████| 6/6 [00:00<00:00, 23.09it/s]
Let's see how good I am...: 100%|██████████| 1/1 [00:00<00:00, 59.79it/s]


Epoch: 18 of 30, 0.005 min


I'm studying hard now🧐, don't disturb!: 100%|██████████| 6/6 [00:00<00:00, 26.10it/s]
Let's see how good I am...: 100%|██████████| 1/1 [00:00<00:00, 66.72it/s]


Epoch: 19 of 30, 0.004 min


I'm studying hard now🧐, don't disturb!: 100%|██████████| 6/6 [00:00<00:00, 28.00it/s]
Let's see how good I am...: 100%|██████████| 1/1 [00:00<00:00, 56.05it/s]


Epoch: 20 of 30, 0.004 min


I'm studying hard now🧐, don't disturb!: 100%|██████████| 6/6 [00:00<00:00, 24.87it/s]
Let's see how good I am...: 100%|██████████| 1/1 [00:00<00:00, 44.72it/s]


Epoch: 21 of 30, 0.005 min


I'm studying hard now🧐, don't disturb!: 100%|██████████| 6/6 [00:00<00:00, 16.27it/s]
Let's see how good I am...: 100%|██████████| 1/1 [00:00<00:00, 54.40it/s]


Epoch: 22 of 30, 0.007 min


I'm studying hard now🧐, don't disturb!: 100%|██████████| 6/6 [00:00<00:00, 27.02it/s]
Let's see how good I am...: 100%|██████████| 1/1 [00:00<00:00, 48.60it/s]


Epoch: 23 of 30, 0.004 min


I'm studying hard now🧐, don't disturb!: 100%|██████████| 6/6 [00:00<00:00, 25.68it/s]
Let's see how good I am...: 100%|██████████| 1/1 [00:00<00:00, 48.38it/s]


Epoch: 24 of 30, 0.004 min


I'm studying hard now🧐, don't disturb!: 100%|██████████| 6/6 [00:00<00:00, 22.87it/s]
Let's see how good I am...: 100%|██████████| 1/1 [00:00<00:00, 42.91it/s]


Epoch: 25 of 30, 0.005 min


I'm studying hard now🧐, don't disturb!: 100%|██████████| 6/6 [00:00<00:00, 19.85it/s]
Let's see how good I am...: 100%|██████████| 1/1 [00:00<00:00, 30.87it/s]


Epoch: 26 of 30, 0.006 min


I'm studying hard now🧐, don't disturb!: 100%|██████████| 6/6 [00:00<00:00, 25.97it/s]
Let's see how good I am...: 100%|██████████| 1/1 [00:00<00:00, 57.42it/s]


Epoch: 27 of 30, 0.004 min


I'm studying hard now🧐, don't disturb!: 100%|██████████| 6/6 [00:00<00:00, 19.57it/s]
Let's see how good I am...: 100%|██████████| 1/1 [00:00<00:00, 57.89it/s]


Epoch: 28 of 30, 0.006 min


I'm studying hard now🧐, don't disturb!: 100%|██████████| 6/6 [00:00<00:00, 13.79it/s]
Let's see how good I am...: 100%|██████████| 1/1 [00:00<00:00, 59.61it/s]


Epoch: 29 of 30, 0.008 min


I'm studying hard now🧐, don't disturb!: 100%|██████████| 6/6 [00:00<00:00, 25.66it/s]
Let's see how good I am...: 100%|██████████| 1/1 [00:00<00:00, 61.00it/s]

Epoch: 30 of 30, 0.004 min





VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
lr,██▇▇▆▆▆▅▅▅▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁▁▁
test_loss,▆█▇▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁
train_loss,█▇▆▅▄▄▄▃▃▃▃▃▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁

0,1
epoch,29.0
lr,0.00215
test_loss,0.22725
train_loss,0.22744


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016723078350000028, max=1.0…

I'm studying hard now🧐, don't disturb!:   0%|          | 0/1 [00:00<?, ?it/s]


RuntimeError: mat1 and mat2 shapes cannot be multiplied (50x15 and 5x32)