- https://discuss.huggingface.co/t/training-a-regression-model-using-roberta-smiles-to-ccs-cheminformatics/1314

# Import

In [1]:
# !pip install -q rdkit
# !pip install -q albumentations
# !pip install accelerate -U
# !pip install -q tokenizer
# !pip install -q transformer

# import accelerate
# accelerate.__version__

In [2]:
import random
import os

from tqdm import tqdm

import numpy as np
import pandas as pd
pd.set_option('mode.chained_assignment', None)

import warnings
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from rdkit import DataStructs
from rdkit.Chem import PandasTools, AllChem

In [3]:
import os
import random
import numpy as np
import torch

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

In [4]:
class CFG:
    SEED = 0
    IMG_SIZE = 224
    BATCH_SIZE = 32
    EPOCHS = 512
    LEARNING_RATE = 0.003

<br></br>

# Data Load

In [5]:
train_df = pd.read_csv('./data/train.csv')
test_df  = pd.read_csv('./data/test.csv')

In [6]:
train_df.head()

Unnamed: 0,id,SMILES,MLM,HLM,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea
0,TRAIN_0000,CCOc1ccc(CNC(=O)c2cc(-c3sc(C)nc3C)n[nH]2)cc1OCC,26.01,50.68,3.259,400.495,5,2,8,3.259,117.37
1,TRAIN_0001,Cc1nc(C)c(CN2CC(C)C(=O)Nc3ccccc32)s1,29.27,50.59,2.169,301.407,2,1,2,2.172,73.47
2,TRAIN_0002,CCCN1CCN(c2nn3nnnc3c3ccccc23)CC1,5.586,80.892,1.593,297.358,5,0,3,1.585,62.45
3,TRAIN_0003,Cc1ccc(-c2ccc(-n3nc(C)c(S(=O)(=O)N4CCN(C5CCCCC...,5.71,2.0,4.771,494.652,6,0,5,3.475,92.6
4,TRAIN_0004,Cc1ccc2c(c1)N(C(=O)c1ccncc1)CC(C)O2,93.27,99.99,2.335,268.31,3,0,1,2.337,42.43


<br></br>

# EDA

In [7]:
# train_df.describe()

In [8]:
# num_features = train_df.columns[train_df.dtypes!='object'].tolist()
# for i,col in enumerate(num_features):

#     fig = plt.figure(figsize=(15,7))
#     fig.add_subplot(121)
#     sns.histplot(train_df[col],bins=20)
#     plt.grid()

#     fig.add_subplot(122)
#     sns.histplot(np.log(train_df[col]+1e-3),bins=20)
#     plt.grid()

#     plt.suptitle('[{}/{}] {}'.format(i+1,len(num_features),col))
#     plt.tight_layout()
#     plt.show()

# # -> ['Molecular_Weight','Molecular_PolarSurfaceArea']

In [9]:
# cols = ['AlogP','Molecular_Weight','Num_H_Acceptors','Num_H_Donors','Num_RotatableBonds','LogD','Molecular_PolarSurfaceArea']
# for col in cols:
#     print(col)
#     plt.figure(figsize=(15,7))
#     sns.scatterplot(x=train_df[col],y=train_df['HLM'])
#     plt.grid()
#     plt.show()

In [10]:
# cols = ['Num_H_Acceptors','Num_H_Donors','Num_RotatableBonds']
# for col in cols:
#     print(col)
#     plt.figure(figsize=(15,7))
#     sns.boxplot(x=train_df[col],y=train_df.MLM)
#     plt.show()

In [11]:
# train_df.nunique()

In [12]:
# lists = sorted(train_df['Num_H_Acceptors'].unique())
# for v in lists:
#     print('########',v)
#     d = train_df[train_df['Num_H_Acceptors']==v]
    
#     cols = ['AlogP','Molecular_Weight','Num_H_Acceptors','Num_H_Donors','Num_RotatableBonds','LogD','Molecular_PolarSurfaceArea']
#     for col in cols:
#         print(col)
#         plt.figure(figsize=(15,7))
#         sns.scatterplot(x=d[col],y=d['HLM'])
#         plt.grid()
#         plt.show()

<br></br>

# Pre-Processing

<br>

## Set target range to [0,100]

- [Dacon](https://dacon.io/competitions/official/236127/talkboard/409051?page=1&dtype=recent)에 따르면 100이 넘는 값도 나올 수 있음

In [13]:
# targets = ['MLM','HLM']
# for t in targets:
#     train_df[t] = [0 if x<0 else
#                    100 if x>100 else
#                    x for x in train_df[t]]

<br>

## Make molecule features

In [14]:
# Molecule to MorganFingerprint
def mol2fp(mol):
    #radius, nBits = 6, 4096
    radius, nBits = 12, (2**10)*3
    fp = AllChem.GetHashedMorganFingerprint(mol, radius=radius, nBits=nBits)
    ar = np.zeros((1,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(fp, ar)
    return ar

In [15]:
# (1) SMILES를 통해 Molecule(분자구조) 생성
PandasTools.AddMoleculeColumnToFrame(train_df,'SMILES','Molecule')
PandasTools.AddMoleculeColumnToFrame(test_df ,'SMILES','Molecule')

In [16]:
from sklearn.feature_selection import VarianceThreshold

In [17]:
# (2) Morgan Fingerprint column 추가
train_df["FPs"] = train_df.Molecule.apply(mol2fp)
test_df ["FPs"] = test_df .Molecule.apply(mol2fp)

In [18]:
# (3) Morgan Fingerprint 중, variance가 0.05보다 작은 컬럼들을 지우기
feature_select = VarianceThreshold(threshold=0.05)

# 일부사용
tr_fps_selected = feature_select.fit_transform(np.stack(train_df['FPs']))
te_fps_selected = feature_select.transform(np.stack(test_df['FPs']))
print(tr_fps_selected.shape[1])

# # 전체사용
# tr_fps_selected = np.stack(train_df['FPs'])
# te_fps_selected = np.stack(test_df ['FPs'])

fps_names = ['fps'+str(i+1) for i in range(tr_fps_selected.shape[1])]

train_df = pd.concat([train_df,pd.DataFrame(tr_fps_selected,columns=fps_names)],axis=1)
test_df  = pd.concat([test_df ,pd.DataFrame(te_fps_selected,columns=fps_names)],axis=1)

293


In [19]:
# 사용할 column만 추출
features = ['AlogP','Molecular_Weight','Num_H_Acceptors','Num_H_Donors',
            'Num_RotatableBonds','LogD','Molecular_PolarSurfaceArea']
fps_feature = 'FPs'
smiles_feature = 'SMILES'
targets  = ['MLM','HLM']

train_df = train_df[features+fps_names+[fps_feature,smiles_feature]+targets]
test_df  = test_df[features+fps_names+[fps_feature,smiles_feature]]

In [20]:
train_df.shape

(3498, 304)

In [21]:
train_df.head()

Unnamed: 0,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea,fps1,fps2,fps3,...,fps288,fps289,fps290,fps291,fps292,fps293,FPs,SMILES,MLM,HLM
0,3.259,400.495,5,2,8,3.259,117.37,0,0,4,...,0,0,2,0,0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",CCOc1ccc(CNC(=O)c2cc(-c3sc(C)nc3C)n[nH]2)cc1OCC,26.01,50.68
1,2.169,301.407,2,1,2,2.172,73.47,0,0,3,...,0,0,0,0,0,1,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, ...",Cc1nc(C)c(CN2CC(C)C(=O)Nc3ccccc32)s1,29.27,50.59
2,1.593,297.358,5,0,3,1.585,62.45,0,0,1,...,0,0,0,2,0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",CCCN1CCN(c2nn3nnnc3c3ccccc23)CC1,5.586,80.892
3,4.771,494.652,6,0,5,3.475,92.6,0,0,3,...,2,0,0,0,0,1,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Cc1ccc(-c2ccc(-n3nc(C)c(S(=O)(=O)N4CCN(C5CCCCC...,5.71,2.0
4,2.335,268.31,3,0,1,2.337,42.43,0,0,2,...,0,0,0,0,0,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Cc1ccc2c(c1)N(C(=O)c1ccncc1)CC(C)O2,93.27,99.99


<br>

## Imputaion

In [22]:
from sklearn.impute import SimpleImputer

In [23]:
def null_check(data):
    d = data.copy()
    null_info = d.isnull().sum()
    null_info = null_info[null_info!=0]
    display(null_info)

In [24]:
print('> train')
null_check(train_df)

print('> test')
null_check(test_df)

> train


AlogP    2
dtype: int64

> test


AlogP    1
dtype: int64

In [25]:
# null_features = ['AlogP']

# imputer = SimpleImputer(missing_values=np.nan, strategy='median')
# train_df[null_features] = imputer.fit_transform(train_df[null_features])
# test_df [null_features] = imputer.transform(test_df[null_features])

In [26]:
features = list(set(features)-set(['AlogP']))
train_df.drop('AlogP',axis=1,inplace=True)
test_df .drop('AlogP',axis=1,inplace=True)

In [27]:
print('> train')
null_check(train_df)

print('> test')
null_check(test_df)

> train


Series([], dtype: int64)

> test


Series([], dtype: int64)

<br>

## Train Test Split

In [28]:
from sklearn.model_selection import train_test_split

In [29]:
tr_df, va_df = train_test_split(train_df,test_size=0.2,shuffle=True,random_state=CFG.SEED)
te_df = test_df.copy()

In [30]:
len(tr_df), len(va_df), len(te_df)

(2798, 700, 483)

<br>

## Scaling

In [31]:
from sklearn.preprocessing import MinMaxScaler

scaling_features = features+fps_names
scalers = {}
for f in scaling_features:
    scaler = MinMaxScaler(feature_range=(-1,1))
    tr_df[f] = scaler.fit_transform(np.array(tr_df[f]).reshape(-1,1))
    va_df[f] = scaler.transform(np.array(va_df[f]).reshape(-1,1))
    te_df[f] = scaler.transform(np.array(te_df[f]).reshape(-1,1))
    scalers[f] = scaler

<br>

## Interaction Term

In [32]:
import warnings
from tqdm import trange

def get_abs_corr(x,y):
    return np.abs(np.corrcoef(x,y))[0,1]

class InteractionTerm:
    def __init__(self):
        pass
    
    def fit(self,data,num_features,corr_cutoff=0.7):
        warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
        warnings.filterwarnings("ignore", category=RuntimeWarning)
        
        d = data.copy()
        self.interaction_list = []
        for i in trange(len(num_features),desc='fitting...'):
            for j in range(len(num_features)):
                if i>j:
                    col_i = num_features[i]
                    col_j = num_features[j]
                    
                    # 상관계수가 cutoff보다 큰 경우에는 interaction을 생성하지 않음
                    if (get_abs_corr(d[col_i]*d[col_j],d[col_i])>=corr_cutoff) | (get_abs_corr(d[col_i]*d[col_j],d[col_j])>=corr_cutoff):
                        pass
                    else:
                        self.interaction_list.append(f'{col_i}*{col_j}')
    
    def transform(self,data):
        d = data.copy()
        print('> the number of interaction term:',len(self.interaction_list))
        for interaction in self.interaction_list:
            col_i,col_j = interaction.split('*')
            d[interaction] = d[col_i]*d[col_j]
        return d
    
    def fit_transform(self,data,num_features,corr_cutoff=0.7):
        self.fit(data,num_features,corr_cutoff)
        return self.transform(data)

In [33]:
# num_features = features + fps_features

In [34]:
# interaction_maker = InteractionTerm()
# interaction_maker.fit(
#     data=tr_df,
#     num_features=num_features,
#     corr_cutoff=0.15,
# )
# tr_df = interaction_maker.transform(tr_df)
# va_df = interaction_maker.transform(va_df)
# te_df = interaction_maker.transform(te_df)

In [35]:
tr_df.shape

(2798, 303)

<br>

## Target Transformation

In [36]:
# for t in targets:
#     tr_df[t] = np.log(tr_df[t]+1e-3)
#     va_df[t] = np.log(va_df[t]+1e-3)

# def inverse_transform(x):
#     return torch.exp(x)-1e-3

In [37]:
# from scipy.special import boxcox, inv_boxcox

# def boxcox_transform(x):
#     _lambda = 0.25
#     return boxcox(x,_lambda)

# def inverse_boxcox_transform(x):
#     _lambda = 0.25
#     return inv_boxcox(x,_lambda)

# tr_df[targets] = tr_df[targets].apply(boxcox_transform)
# va_df[targets] = va_df[targets].apply(boxcox_transform)

In [38]:
inverse_transform = None

<br></br>

In [39]:
class MultiRMSELoss(nn.Module):
    def __init__(self):
        super(MultiRMSELoss, self).__init__()

    def forward(self, output, target):
        loss1 = torch.sqrt(torch.mean((output[:,0]-target[:,0])**2))
        loss2 = torch.sqrt(torch.mean((output[:,1]-target[:,1])**2))
        loss = 0.5*loss1+0.5*loss2
        return loss

In [40]:
import gc
import numpy as np
import torch
import torch.nn as nn
import time

class EarlyStopping:
    """Early stops the training if validation loss doesn't improve after a given patience."""
    def __init__(self, patience=7, verbose=False, delta=0, path='checkpoint.pt', trace_func=print):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved.
                            Default: 7
            verbose (bool): If True, prints a message for each validation loss improvement. 
                            Default: False
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                            Default: 0
            path (str): Path for the checkpoint to be saved to.
                            Default: 'checkpoint.pt'
            trace_func (function): trace print function.
                            Default: print            
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = np.Inf
        self.delta = delta
        self.path = path
        self.trace_func = trace_func
        
    def __call__(self, val_loss, model):

        score = -val_loss

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            if self.verbose:
                self.trace_func(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        '''Saves model when validation loss decrease.'''
        if self.verbose:
            self.trace_func(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        if self.path!='':
            torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss

        
def train(
    model, criterion, optimizer, train_loader, valid_loader, epochs,
    early_stopping, device='cpu', scheduler=None, metric_period=1, verbose=True, 
    save_model_path = './mc/best_model.pt', final_model_path = './mc/final_model.pt',
    use_best_model=True,
    inverse_transform=None,
):  
    seed_everything(CFG.SEED)
    model.to(device)

    best_loss  = 999999999
    best_epoch = 1
    best_model = None
    is_best    = np.nan
    
    start_time = time.time()
    epoch_s = time.time()
    
    model.train()
    for epoch in range(1, epochs+1):
        gc.collect()
        
        #model.train()
        train_loss = []
        for ids,target in train_loader:
            ids = ids.to(device)
            target = target.to(device)

            optimizer.zero_grad()
            output = model(ids).float()
            
            if inverse_transform is not None:
                output = inverse_transform(output)
                target = inverse_transform(target)
            
            loss = criterion(output, target)
            loss.backward()  # Getting gradients
            optimizer.step() # Updating parameters

            train_loss.append(loss.item())

        if valid_loader is not None:
            valid_loss = validation(model, valid_loader, criterion, device, inverse_transform)
        else:
            valid_loss = loss
            
        epoch_e = time.time()
            
        if scheduler is not None:
            scheduler.step(valid_loss)

        # update the best epoch & best loss
        if (best_loss > valid_loss) | (epoch==1):
            best_epoch = epoch
            best_loss = valid_loss
            best_model = model
            is_best = 1
            torch.save(best_model.state_dict(), save_model_path)
        else:
            is_best = 0
            if not use_best_model:
                torch.save(best_model.state_dict(), save_model_path)
            
        # 결과물 printing
        if (verbose) & (epoch % metric_period == 0):
            mark = '*' if is_best else ' '
            epoch_str = str(epoch).zfill(len(str(epochs)))
            if valid_loader is not None:
                progress = '{}[{}/{}] loss: {:.5f}, val_loss: {:.5f}, best_epoch: {}, elapsed: {:.2f}s, total: {:.2f}s, remaining: {:.2f}s'\
                    .format(
                        mark,
                        epoch_str,
                        epochs,
                        np.mean(train_loss),
                        valid_loss,
                        best_epoch,
                        epoch_e-epoch_s,
                        epoch_e-start_time,
                        (epoch_e-epoch_s)*(epochs-epoch)/metric_period,
                    )
            else:
                progress = '{}[{}/{}] loss: {:.5f}, best_epoch: {}, elapsed: {:.2f}s, total: {:.2f}s, remaining: {:.2f}s'\
                    .format(
                        mark,
                        epoch_str,
                        epochs,
                        np.mean(train_loss),
                        best_epoch,
                        epoch_e-epoch_s,
                        epoch_e-start_time,
                        (epoch_e-epoch_s)*(epochs-epoch)/metric_period,
                    )
            epoch_s = time.time()
            print(progress)

        # early stopping 여부를 체크. 현재 과적합 상황 추적
        if early_stopping is not None:
            early_stopping(valid_loss, model)
            if early_stopping.early_stop:
                break
                
        torch.save(best_model.state_dict(), final_model_path)

    return best_model

def validation(model, valid_loader, criterion, device, inverse_transform):
    valid_loss = []
    with torch.no_grad():
        for ids,target in valid_loader:
            ids = ids.to(device)
            target = target.to(device)
            
            output = model(ids).float()
            
            if inverse_transform is not None:
                output = inverse_transform(output)
                target = inverse_transform(target)

            loss = criterion(output, target)
            valid_loss.append(loss.item())

    return np.mean(valid_loss)

In [41]:
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

In [42]:
train_df['SMILES'].to_csv('./out/smiles.txt',index=False,header=False)

In [43]:
special_tokens = ["<s>","<PAD>","<MASK>"]

tokenizer = ByteLevelBPETokenizer()
tokenizer.train('./out/smiles.txt', vocab_size=800, min_frequency=1, special_tokens=special_tokens)
tokenizer.save_model('./mc/ByteLevelBPETokenizer')

tokenizer = ByteLevelBPETokenizer(
    "./mc/ByteLevelBPETokenizer/vocab.json",
    "./mc/ByteLevelBPETokenizer/merges.txt",
)

tokenizer._tokenizer.post_processor = BertProcessing(
    ("<PAD>" , tokenizer.token_to_id("<PAD>")),
    ("<MASK>", tokenizer.token_to_id("<MASK>")),
)

tokenizer.enable_padding(length=600)
tokenizer.save






<bound method BaseTokenizer.save of Tokenizer(vocabulary_size=564, model=ByteLevelBPE, add_prefix_space=False, lowercase=False, dropout=None, unicode_normalizer=None, continuing_subword_prefix=None, end_of_word_suffix=None, trim_offsets=False)>

In [44]:
from tokenizers import ByteLevelBPETokenizer
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn

# Dataset 정의
class SMILESDataset(Dataset):
    def __init__(self, data, smiles, targets, tokenizer, is_test):
        self.smiles = data[smiles].values
        self.tokenizer = tokenizer
        self.is_test = is_test
        if not self.is_test:
            self.targets = data[targets].values
    
    def __len__(self):
        return len(self.smiles)

    def __getitem__(self, idx):
        smile = self.smiles[idx]
        encoded = self.tokenizer.encode(smile)
        
        if self.is_test:
            return (
                torch.tensor(encoded.ids),
            )
        else:
            return (
                torch.tensor(encoded.ids),
                torch.tensor(self.targets[idx]),
            )

In [54]:
# DataLoader 설정
train_dataset = SMILESDataset(tr_df, 'SMILES', ['MLM','HLM'], tokenizer, False)
valid_dataset = SMILESDataset(va_df, 'SMILES', ['MLM','HLM'], tokenizer, False)
test_dataset  = SMILESDataset(te_df, 'SMILES', ['MLM','HLM'], tokenizer, True)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=False, pin_memory=True, num_workers=0)
valid_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False, pin_memory=True, num_workers=0)
test_loader  = DataLoader(test_dataset , batch_size=64, shuffle=False, pin_memory=True, num_workers=0)

In [55]:
# [ids for ids,target in train_loader][0].shape

In [70]:
class RegressionModel(nn.Module):
    def __init__(self, vocab_size, hidden_size):
        super(RegressionModel, self).__init__()
        self.emb_layer = nn.Embedding(vocab_size, hidden_size)
        self.emb_sequence = nn.Sequential(
            #nn.BatchNorm1d(hidden_size),
            nn.LeakyReLU(0.1),
            #nn.Dropout(self.dropout_rate),
            nn.Linear(hidden_size, 2),
        )

    def forward(self, x):
        x = self.emb_layer(x).mean(dim=1)
        x = self.emb_sequence(x)
        return x

In [71]:
# 모델 생성
model = RegressionModel(vocab_size=800, hidden_size=512)

In [72]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [73]:
CFG.LEARNING_RATE = 0.01

In [74]:
criterion = MultiRMSELoss()
# optimizer= torch.optim.Adam(model.parameters(), lr=CFG.LEARNING_RATE)#, weight_decay=5e-4)
optimizer = torch.optim.AdamW(model.parameters(), lr=CFG.LEARNING_RATE)
# optimizer = torch.optim.SGD(model.parameters(), lr=CFG.LEARNING_RATE, momentum=0.9)
# scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
#     optimizer, mode='min', factor=0.5, patience=2, threshold_mode='abs', min_lr=1e-7, verbose=True)
scheduler = None
# early_stopping = EarlyStopping(patience=10,verbose=False,path='')
early_stopping = None

In [75]:
torch.cuda.empty_cache()
gc.collect()

2013

In [76]:
best_model = train(
    model, criterion, optimizer, train_loader, valid_loader,
    1024, early_stopping, device, scheduler,
    metric_period=1, verbose=True,
    save_model_path='./mc/best_model.pt',
    final_model_path='./mc/final_model.pt',
    use_best_model=True,
    inverse_transform=inverse_transform,
)

*[0001/1024] loss: 40.12185, val_loss: 36.15742, best_epoch: 1, elapsed: 0.99s, total: 0.99s, remaining: 1008.32s
*[0002/1024] loss: 35.83322, val_loss: 36.05397, best_epoch: 2, elapsed: 1.01s, total: 2.00s, remaining: 1035.62s
*[0003/1024] loss: 35.77955, val_loss: 36.03536, best_epoch: 3, elapsed: 0.98s, total: 2.99s, remaining: 1000.66s
*[0004/1024] loss: 35.76068, val_loss: 36.01447, best_epoch: 4, elapsed: 1.00s, total: 3.99s, remaining: 1024.42s
*[0005/1024] loss: 35.74170, val_loss: 35.99284, best_epoch: 5, elapsed: 0.99s, total: 4.98s, remaining: 1004.79s
*[0006/1024] loss: 35.71864, val_loss: 35.96947, best_epoch: 6, elapsed: 0.98s, total: 5.97s, remaining: 998.20s
*[0007/1024] loss: 35.69179, val_loss: 35.94370, best_epoch: 7, elapsed: 0.98s, total: 6.95s, remaining: 1001.15s
*[0008/1024] loss: 35.66010, val_loss: 35.91384, best_epoch: 8, elapsed: 0.98s, total: 7.94s, remaining: 995.31s
*[0009/1024] loss: 35.62270, val_loss: 35.87650, best_epoch: 9, elapsed: 0.98s, total: 8.9

KeyboardInterrupt: 

<br></br>

# Inference

In [None]:
def predict(best_model,loader,device,inverse_transform):
    best_model.to(device)

    true_list = []
    pred_list = []
    with torch.no_grad():
        for ids,target in iter(loader):
            ids = ids.to(device)
            target = target.to(device)

            output = best_model(ids)
            
            if inverse_transform is not None:
                output = inverse_transform(output)
                target = inverse_transform(target)

            true_list.append(target)
            pred_list.append(output)

    trues = torch.cat(true_list,dim=0)
    preds = torch.cat(pred_list,dim=0)

    trues = trues.cpu().numpy()
    preds = preds.cpu().numpy()

    return trues, preds

def predict_test(best_model,loader,device,inverse_transform):
    best_model.to(device)

    #true_list = []
    pred_list = []
    with torch.no_grad():
        for ids in iter(loader):
            ids = ids.to(device)
            #target = target.to(device)

            output = best_model(ids)
            
            if inverse_transform is not None:
                output = inverse_transform(output)
                #target = inverse_transform(target)

            #true_list.append(target)
            pred_list.append(output)

    #trues = torch.cat(true_list,dim=0)
    preds = torch.cat(pred_list,dim=0)

    #trues = trues.cpu().numpy()
    preds = preds.cpu().numpy()

    return preds

In [None]:
# best_model = MultiTaskModel(feature_input_size,output_size,hidden_size,dropout_rate)
best_model = RegressionModel(vocab_size=800, hidden_size=512)
best_model.load_state_dict(torch.load('./mc/best_model.pt'))

In [None]:
tr_true, tr_pred = predict(best_model,train_loader,device,inverse_transform)
va_true, va_pred = predict(best_model,valid_loader,device,inverse_transform)

In [None]:
(MultiRMSELoss()(torch.tensor(tr_true),torch.tensor(tr_pred)),
 MultiRMSELoss()(torch.tensor(va_true),torch.tensor(va_pred)))

In [None]:
tr_true[:10].round(1), tr_pred[:10].round(1)
# va_true[:10].round(1), va_pred[:10].round(1)

In [None]:
def abline(intercept,slope):
    axes = plt.gca()
    x_vals = np.array(axes.get_xlim())
    y_vals = intercept + slope * x_vals
    plt.plot(x_vals, y_vals, linestyle='--', color='red')

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

fig = plt.figure(figsize=(15,7))
fig.add_subplot(121)
sns.scatterplot(x=tr_true[:,0],y=tr_pred[:,0])
abline(0,1)
plt.xlabel('true')
plt.ylabel('pred')
plt.grid()
fig.add_subplot(122)
sns.scatterplot(x=tr_true[:,1],y=tr_pred[:,1])
abline(0,1)
plt.xlabel('true')
plt.ylabel('pred')
plt.grid()
plt.suptitle('train',fontsize=20)
plt.tight_layout()
plt.show()

fig = plt.figure(figsize=(15,7))
fig.add_subplot(121)
sns.scatterplot(x=va_true[:,0],y=va_pred[:,0])
abline(0,1)
plt.xlabel('true')
plt.ylabel('pred')
plt.grid()
fig.add_subplot(122)
sns.scatterplot(x=va_true[:,1],y=va_pred[:,1])
abline(0,1)
plt.xlabel('true')
plt.ylabel('pred')
plt.grid()
plt.suptitle('validation',fontsize=20)
plt.tight_layout()
plt.show()

In [None]:
te_pred = predict_test(best_model,test_loader,device,inverse_transform)

In [None]:
submit = pd.read_csv('./data/sample_submission.csv')
submit[targets] = te_pred
submit.head()

In [None]:
submit.to_csv('./out/mlt_efficientnet_mol&fps.csv',index=False)