# Import

In [1]:
# !pip install -q rdkit
# !pip install -q albumentations
# !pip install accelerate -U
# !pip install -q tokenizer
# !pip install -q transformer

# import accelerate
# accelerate.__version__

In [2]:
import random
import os

from tqdm import tqdm

import numpy as np
import pandas as pd
pd.set_option('mode.chained_assignment', None)

import warnings
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from rdkit import DataStructs
from rdkit.Chem import PandasTools, AllChem

In [3]:
import os
import random
import numpy as np
import torch

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

In [4]:
class CFG:
    SEED = 0
    IMG_SIZE = 224
    BATCH_SIZE = 32
    EPOCHS = 512
    LEARNING_RATE = 0.01

<br></br>

# Data Load

In [5]:
train_df = pd.read_csv('./data/train.csv')
test_df  = pd.read_csv('./data/test.csv')

In [6]:
train_df.head()

Unnamed: 0,id,SMILES,MLM,HLM,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea
0,TRAIN_0000,CCOc1ccc(CNC(=O)c2cc(-c3sc(C)nc3C)n[nH]2)cc1OCC,26.01,50.68,3.259,400.495,5,2,8,3.259,117.37
1,TRAIN_0001,Cc1nc(C)c(CN2CC(C)C(=O)Nc3ccccc32)s1,29.27,50.59,2.169,301.407,2,1,2,2.172,73.47
2,TRAIN_0002,CCCN1CCN(c2nn3nnnc3c3ccccc23)CC1,5.586,80.892,1.593,297.358,5,0,3,1.585,62.45
3,TRAIN_0003,Cc1ccc(-c2ccc(-n3nc(C)c(S(=O)(=O)N4CCN(C5CCCCC...,5.71,2.0,4.771,494.652,6,0,5,3.475,92.6
4,TRAIN_0004,Cc1ccc2c(c1)N(C(=O)c1ccncc1)CC(C)O2,93.27,99.99,2.335,268.31,3,0,1,2.337,42.43


<br></br>

# EDA

In [7]:
# train_df.describe()

In [8]:
# num_features = train_df.columns[train_df.dtypes!='object'].tolist()
# for i,col in enumerate(num_features):

#     fig = plt.figure(figsize=(15,7))
#     fig.add_subplot(121)
#     sns.histplot(train_df[col],bins=20)
#     plt.grid()

#     fig.add_subplot(122)
#     sns.histplot(np.log(train_df[col]+1e-3),bins=20)
#     plt.grid()

#     plt.suptitle('[{}/{}] {}'.format(i+1,len(num_features),col))
#     plt.tight_layout()
#     plt.show()

# # -> ['Molecular_Weight','Molecular_PolarSurfaceArea']

In [9]:
# cols = ['AlogP','Molecular_Weight','Num_H_Acceptors','Num_H_Donors','Num_RotatableBonds','LogD','Molecular_PolarSurfaceArea']
# for col in cols:
#     print(col)
#     plt.figure(figsize=(15,7))
#     sns.scatterplot(x=train_df[col],y=train_df['HLM'])
#     plt.grid()
#     plt.show()

In [10]:
# cols = ['Num_H_Acceptors','Num_H_Donors','Num_RotatableBonds']
# for col in cols:
#     print(col)
#     plt.figure(figsize=(15,7))
#     sns.boxplot(x=train_df[col],y=train_df.MLM)
#     plt.show()

In [11]:
# train_df.nunique()

In [12]:
# lists = sorted(train_df['Num_H_Acceptors'].unique())
# for v in lists:
#     print('########',v)
#     d = train_df[train_df['Num_H_Acceptors']==v]
    
#     cols = ['AlogP','Molecular_Weight','Num_H_Acceptors','Num_H_Donors','Num_RotatableBonds','LogD','Molecular_PolarSurfaceArea']
#     for col in cols:
#         print(col)
#         plt.figure(figsize=(15,7))
#         sns.scatterplot(x=d[col],y=d['HLM'])
#         plt.grid()
#         plt.show()

<br></br>

# Pre-Processing

<br>

## Set target range to [0,100]

- [Dacon](https://dacon.io/competitions/official/236127/talkboard/409051?page=1&dtype=recent)에 따르면 100이 넘는 값도 나올 수 있음

In [13]:
# targets = ['MLM','HLM']
# for t in targets:
#     train_df[t] = [0 if x<0 else
#                    100 if x>100 else
#                    x for x in train_df[t]]

<br>

## Make molecule features

In [14]:
# Molecule to MorganFingerprint
def mol2fp(mol):
    #radius, nBits = 6, 4096
    radius, nBits = 12, (2**10)*3
    fp = AllChem.GetHashedMorganFingerprint(mol, radius=radius, nBits=nBits)
    ar = np.zeros((1,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(fp, ar)
    return ar

In [15]:
# (1) SMILES를 통해 Molecule(분자구조) 생성
PandasTools.AddMoleculeColumnToFrame(train_df,'SMILES','Molecule')
PandasTools.AddMoleculeColumnToFrame(test_df ,'SMILES','Molecule')

In [16]:
from sklearn.feature_selection import VarianceThreshold

In [17]:
# (2) Morgan Fingerprint column 추가
train_df["FPs"] = train_df.Molecule.apply(mol2fp)
test_df ["FPs"] = test_df .Molecule.apply(mol2fp)

In [18]:
# (3) Morgan Fingerprint 중, variance가 0.05보다 작은 컬럼들을 지우기
feature_select = VarianceThreshold(threshold=0.05)

# 일부사용
tr_fps_selected = feature_select.fit_transform(np.stack(train_df['FPs']))
te_fps_selected = feature_select.transform(np.stack(test_df['FPs']))
print(tr_fps_selected.shape[1])

# # 전체사용
# tr_fps_selected = np.stack(train_df['FPs'])
# te_fps_selected = np.stack(test_df ['FPs'])

fps_names = ['fps'+str(i+1) for i in range(tr_fps_selected.shape[1])]

train_df = pd.concat([train_df,pd.DataFrame(tr_fps_selected,columns=fps_names)],axis=1)
test_df  = pd.concat([test_df ,pd.DataFrame(te_fps_selected,columns=fps_names)],axis=1)

293


In [19]:
# 사용할 column만 추출
features = ['AlogP','Molecular_Weight','Num_H_Acceptors','Num_H_Donors',
            'Num_RotatableBonds','LogD','Molecular_PolarSurfaceArea']
fps_feature = 'FPs'
smiles_feature = 'SMILES'
targets  = ['MLM','HLM']

train_df = train_df[features+[fps_feature,smiles_feature]+targets] #fps_names
test_df  = test_df[features+[fps_feature,smiles_feature]]          #fps_names

In [20]:
train_df.shape

(3498, 11)

In [21]:
train_df.head()

Unnamed: 0,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea,FPs,SMILES,MLM,HLM
0,3.259,400.495,5,2,8,3.259,117.37,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",CCOc1ccc(CNC(=O)c2cc(-c3sc(C)nc3C)n[nH]2)cc1OCC,26.01,50.68
1,2.169,301.407,2,1,2,2.172,73.47,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, ...",Cc1nc(C)c(CN2CC(C)C(=O)Nc3ccccc32)s1,29.27,50.59
2,1.593,297.358,5,0,3,1.585,62.45,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",CCCN1CCN(c2nn3nnnc3c3ccccc23)CC1,5.586,80.892
3,4.771,494.652,6,0,5,3.475,92.6,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Cc1ccc(-c2ccc(-n3nc(C)c(S(=O)(=O)N4CCN(C5CCCCC...,5.71,2.0
4,2.335,268.31,3,0,1,2.337,42.43,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Cc1ccc2c(c1)N(C(=O)c1ccncc1)CC(C)O2,93.27,99.99


<br>

## Imputaion

In [22]:
from sklearn.impute import SimpleImputer

In [23]:
def null_check(data):
    d = data.copy()
    null_info = d.isnull().sum()
    null_info = null_info[null_info!=0]
    display(null_info)

In [24]:
print('> train')
null_check(train_df)

print('> test')
null_check(test_df)

> train


AlogP    2
dtype: int64

> test


AlogP    1
dtype: int64

In [25]:
# null_features = ['AlogP']

# imputer = SimpleImputer(missing_values=np.nan, strategy='median')
# train_df[null_features] = imputer.fit_transform(train_df[null_features])
# test_df [null_features] = imputer.transform(test_df[null_features])

In [26]:
features = list(set(features)-set(['AlogP']))
train_df.drop('AlogP',axis=1,inplace=True)
test_df .drop('AlogP',axis=1,inplace=True)

In [27]:
print('> train')
null_check(train_df)

print('> test')
null_check(test_df)

> train


Series([], dtype: int64)

> test


Series([], dtype: int64)

<br></br>

# Interaction Term

In [28]:
# interaction_maker = InteractionTerm()
# interaction_maker.fit(
#     data=train_df,
#     num_features=features,
#     corr_cutoff=0.8,
# )
# train_df = interaction_maker.transform(train_df)
# test_df  = interaction_maker.transform(test_df)

<br></br>

# Tokenizer Save

In [29]:
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

In [30]:
train_df['SMILES'].to_csv('./out/smiles.txt',index=False,header=False)

In [31]:
vocab_size = 200
special_tokens = ["<s>","<PAD>","<MASK>"]

tokenizer = ByteLevelBPETokenizer()
tokenizer.train('./out/smiles.txt', vocab_size=vocab_size, min_frequency=1, special_tokens=special_tokens)
tokenizer.save_model('./mc/ByteLevelBPETokenizer')

tokenizer = ByteLevelBPETokenizer(
    "./mc/ByteLevelBPETokenizer/vocab.json",
    "./mc/ByteLevelBPETokenizer/merges.txt",
)

tokenizer._tokenizer.post_processor = BertProcessing(
    ("<PAD>" , tokenizer.token_to_id("<PAD>")),
    ("<MASK>", tokenizer.token_to_id("<MASK>")),
)

tokenizer.enable_padding(length=600)
tokenizer.save






<bound method BaseTokenizer.save of Tokenizer(vocabulary_size=259, model=ByteLevelBPE, add_prefix_space=False, lowercase=False, dropout=None, unicode_normalizer=None, continuing_subword_prefix=None, end_of_word_suffix=None, trim_offsets=False)>

<br></br>

# Custom Dataset

In [32]:
from rdkit import Chem
from rdkit.Chem import Draw

class CustomDataset(Dataset):
    def __init__(self, data, targets, smiles, fps, fps_maximum, tokenizer, transforms=[None,None], is_test=False):
        self.data = data.copy()
        self.targets = targets
        self.smiles = smiles
        self.fps = fps
        self.tokenizer = tokenizer
        self.smiles_transforms, self.fps_transforms = transforms
        self.is_test = is_test
        
        # self.smiles_features = []
        # for s in tqdm(data[smiles].values,desc='Setup SMILES... '):
        #     m = Chem.MolFromSmiles(s)
        #     img = Draw.MolToImage(m)#, size=(224,224))
        #     img = np.array(img)
        #     if self.smiles_transforms is not None:
        #         img = self.smiles_transforms(image=img)['image']
        #     self.smiles_features.append(img)
        # self.smiles_features = torch.stack(self.smiles_features)
        
        # fps_values = (np.stack(data[fps])/fps_maximum)*255 # max -> 255
        # self.fps_features = []
        # for f in fps_values:
        #     img = f.reshape(32,32,3)
        #     if self.fps_transforms is not None:
        #         img = self.fps_transforms(image=img)['image']
        #     self.fps_features.append(img)
        
        self.embedding = []
        for s in data[smiles].values:
            encoded = self.tokenizer.encode(s)
            self.embedding.append(encoded.ids)
            
        if not self.is_test:
            self.target_features = self.data[self.targets].values
            self.num_features = self.data.drop(columns=targets+[fps,smiles],axis=1).values
        else:
            self.num_features = self.data.drop(columns=[fps,smiles],axis=1).values

    def __getitem__(self, index):
        if self.is_test:
            return (
                torch.Tensor(self.num_features[index]),
                #torch.Tensor(self.smiles_features[index]),
                #torch.Tensor(self.fps_features[index]),
                torch.tensor(self.embedding[index]),
            )
        else:
            return (
                torch.Tensor(self.num_features[index]),
                #torch.Tensor(self.smiles_features[index]),
                #torch.Tensor(self.fps_features[index]),
                torch.tensor(self.embedding[index]),
                torch.Tensor(self.target_features[index]),
            )
        
    def __len__(self):
        return len(self.data)

In [33]:
import albumentations as A
from albumentations.pytorch.transforms import ToTensorV2
import torchvision.models as models

In [34]:
# import matplotlib.pyplot as plt

# # (1) smiles
# s = tr_df['SMILES'].values[0]
# m = Chem.MolFromSmiles(s)
# img = Draw.MolToImage(m)#, size=(224,224))
# img = np.array(img)
# img = smiles_transform(image=img)['image']
# img.shape
# plt.imshow(img.T)

# # (2) fps
# fps_maximum = max(np.stack(tr_df.FPs).max(),np.stack(te_df.FPs).max())
# fps_value = (np.stack(tr_df['FPs'])/fps_maximum)*255 # max -> 255

# f = fps_value[0]
# img = f.reshape(32,32,3)
# img = img.astype(np.float32)
# img = fps_transform(image=img)['image']
# plt.imshow(img.T)

In [35]:
#  이미지 변환
smiles_transform = A.Compose([
    A.Resize(CFG.IMG_SIZE,CFG.IMG_SIZE),
    A.ToGray(p=1),
    A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225), max_pixel_value=255.0, always_apply=False, p=1.0),
    ToTensorV2(),
])

fps_transform = A.Compose([
    A.Resize(CFG.IMG_SIZE,CFG.IMG_SIZE),
    #A.ToGray(p=1),
    A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225), max_pixel_value=255.0, always_apply=False, p=1.0),
    ToTensorV2(),
])

<br></br>

# Define Model

In [36]:
import torchvision.models as models
from copy import deepcopy

In [37]:
# class MultiTaskModel(nn.Module):
#     def __init__(self, feature_input_size, output_size, hidden_size, dropout_rate):
#         super(MultiTaskModel, self).__init__()
#         self.feature_output_size = 500
#         self.dropout_rate = dropout_rate
        
#         # BatchNorm1d
#         self.feature_layer = nn.Sequential(
#             nn.Linear(feature_input_size,hidden_size),
#             #nn.BatchNorm1d(hidden_size),
#             nn.LeakyReLU(0.1),
#             #nn.Dropout(self.dropout_rate),
#             nn.Linear(hidden_size,hidden_size),
#             #nn.BatchNorm1d(hidden_size),
#             nn.LeakyReLU(0.1),
#             #nn.Dropout(self.dropout_rate),
#             # nn.Linear(hidden_size,hidden_size),
#             # nn.BatchNorm1d(hidden_size),
#             # nn.LeakyReLU(0.1),
#             # nn.Dropout(self.dropout_rate),
#             # nn.Linear(hidden_size,hidden_size),
#             # nn.BatchNorm1d(hidden_size),
#             # nn.LeakyReLU(0.1),
#             # nn.Dropout(self.dropout_rate),
#             # nn.Linear(hidden_size,hidden_size),
#         )
#         self.mlm_layer = deepcopy(self.feature_layer)
#         self.hlm_layer = deepcopy(self.feature_layer)
        
#         fc_input_size = 2*hidden_size
#         self.fc = nn.Sequential(
#             nn.Linear(fc_input_size,fc_input_size//2),
#             #nn.BatchNorm1d(fc_input_size//2),
#             nn.LeakyReLU(0.1),
#             #nn.Dropout(self.dropout_rate),
#             # nn.Linear(fc_input_size//2,fc_input_size//4),
#             # nn.BatchNorm1d(fc_input_size//4),
#             # nn.LeakyReLU(0.1),
#             # nn.Dropout(self.dropout_rate),
#             # nn.Linear(fc_input_size//4,fc_input_size//8),
#             # nn.BatchNorm1d(fc_input_size//8),
#             # nn.LeakyReLU(0.1),
#             # nn.Dropout(self.dropout_rate),
#             nn.Linear(fc_input_size//2,output_size),
#         )
        
#     def forward(self, feat, mol, fps):
        
#         mlm = self.mlm_layer(feat)
#         hlm = self.hlm_layer(feat)
        
#         combined = torch.cat((mlm,hlm),dim=1)
#         combined = self.fc(combined)
        
#         output = combined
        
#         # output = self.tanh(output)
#         # output = (output+1)/2
        
#         #output = self.sigmoid(output)
        
#         return output

In [38]:
# class MultiTaskModel(nn.Module):
#     def __init__(self, feature_input_size, output_size, hidden_size, dropout_rate):
#         super(MultiTaskModel, self).__init__()
#         self.feature_output_size = 1000
#         self.dropout_rate = dropout_rate
        
#         # BatchNorm1d
#         self.feature_layer = nn.Sequential(
#             nn.Linear(feature_input_size,hidden_size),
#             nn.BatchNorm1d(hidden_size),
#             nn.LeakyReLU(0.1),
#             nn.Dropout(self.dropout_rate),
#             nn.Linear(hidden_size,hidden_size),
#             nn.BatchNorm1d(hidden_size),
#             nn.LeakyReLU(0.1),
#             nn.Dropout(self.dropout_rate),
#             nn.Linear(hidden_size,hidden_size),
#             nn.BatchNorm1d(hidden_size),
#             nn.LeakyReLU(0.1),
#             nn.Dropout(self.dropout_rate),
#             nn.Linear(hidden_size,hidden_size),
#             nn.BatchNorm1d(hidden_size),
#             nn.LeakyReLU(0.1),
#             nn.Dropout(self.dropout_rate),
#             nn.Linear(hidden_size,self.feature_output_size),
#         )
        
#         fc_input_size = self.feature_output_size
#         self.fc = nn.Sequential(
#             nn.Linear(fc_input_size,fc_input_size//2),
#             nn.BatchNorm1d(fc_input_size//2),
#             nn.LeakyReLU(0.1),
#             nn.Dropout(self.dropout_rate),
#             nn.Linear(fc_input_size//2,fc_input_size//4),
#             nn.BatchNorm1d(fc_input_size//4),
#             nn.LeakyReLU(0.1),
#             nn.Dropout(self.dropout_rate),
#             nn.Linear(fc_input_size//4,fc_input_size//8),
#             nn.BatchNorm1d(fc_input_size//8),
#             nn.LeakyReLU(0.1),
#             nn.Dropout(self.dropout_rate),
#             nn.Linear(fc_input_size//8,output_size),
#         )
        
#     def forward(self, feat, mol, fps):
#         feat = self.feature_layer(feat)
#         #mol = self.mol_layer(mol)
#         #fps = self.fps_layer(fps)
#         feat = self.fc(feat)
        
#         output = feat
        
#         # output = self.tanh(output)
#         # output = (output+1)/2
        
#         #output = self.sigmoid(output)
        
#         return output

In [39]:
# class MultiTaskModel(nn.Module):
#     def __init__(self, feature_input_size, output_size, hidden_size, dropout_rate):
#         super(MultiTaskModel, self).__init__()
#         self.hidden_size = hidden_size
#         self.dropout_rate = dropout_rate
#         self.image_output_size = 50
#         self.feature_output_size = 100
        
#         self.lstm1 = nn.LSTM(feature_input_size,hidden_size,num_layers=1,batch_first=True)
#         self.lstm2 = nn.LSTM(hidden_size,hidden_size,num_layers=1,batch_first=True)
#         self.lstm3 = nn.LSTM(hidden_size,hidden_size,num_layers=1,batch_first=True)
#         self.fc = nn.Linear(hidden_size,output_size)
#         self.bn = nn.BatchNorm1d(hidden_size)
#         self.dropout = nn.Dropout(dropout_rate)
#         self.activation = nn.LeakyReLU(0.01)

#     def forward(self, feat, mol, fps):
#         x = feat
#         h0 = torch.zeros(1, self.hidden_size).to(x.device) 
#         c0 = torch.zeros(1, self.hidden_size).to(x.device)
        
#         x, (hn,cn) = self.lstm1(x,(h0,c0))
#         x = self.dropout(x)
#         x = self.bn(x)
#         x = self.activation(x)
        
#         x, (hn,cn) = self.lstm2(x,(hn,cn))
#         x = self.dropout(x)
#         x = self.bn(x)
#         x = self.activation(x)
        
#         x, (hn,cn) = self.lstm3(x,(hn,cn))
#         x = self.dropout(x)
#         x = self.bn(x)
#         x = self.activation(x)
        
#         x = self.fc(x)

#         return x

In [40]:
# class MultiTaskModel(nn.Module):
#     def __init__(self, feature_input_size, output_size, hidden_size, dropout_rate):
#         super(MultiTaskModel, self).__init__()
#         self.image_output_size = 50
#         self.feature_output_size = 100
#         self.dropout_rate = dropout_rate
        
#         self.activation = nn.Tanh()
        
#         # efficientnet
#         self.backbone = models.efficientnet_b0(pretrained=True)
#         self.backbone.classifier = nn.Sequential(
#             nn.Dropout(p=0.2,inplace=True),
#             nn.Linear(self.backbone.classifier[-1].in_features,self.image_output_size),
#         )
#         self.mol_backbone = deepcopy(self.backbone)
#         self.fps_backbone = deepcopy(self.backbone)
        
#         # # resnet
#         # self.backbone = models.resnet18(pretrained=True)
#         # self.backbone.fc = nn.Linear(self.backbone.fc.in_features,self.image_output_size)
        
#         self.mol_layer = nn.Sequential(
#             self.mol_backbone,
#             #nn.BatchNorm1d(self.image_output_size),
#             #self.activation,
#             #nn.Dropout(self.dropout_rate),
#         )
#         self.fps_layer = nn.Sequential(
#             self.fps_backbone,
#             #nn.BatchNorm1d(self.image_output_size),
#             #self.activation,
#             #nn.Dropout(self.dropout_rate),
#         )
        
#         self.feature_layer = nn.Sequential(
#             nn.Linear(feature_input_size,hidden_size),
#             #nn.BatchNorm1d(hidden_size),
#             self.activation,
#             #nn.Dropout(self.dropout_rate),
#             nn.Linear(hidden_size,self.feature_output_size),
#         )
        
#         combined_input_size = self.feature_output_size # 2*self.image_output_size+
#         self.lstm = nn.LSTM(combined_input_size, combined_input_size//4, num_layers=2, batch_first=True)
#         self.fc = nn.Linear(combined_input_size//4,output_size)
        
#     def forward(self, feat, mol, fps):
#         feat = self.feature_layer(feat)
#         mol = self.mol_layer(mol)
#         fps = self.fps_layer(fps)
#         combined = torch.cat((feat,mol,fps),dim=1)
#         combined, _ = self.lstm(combined)
#         combined = self.activation(combined)
#         combined = self.fc(combined)
#         return combined

In [51]:
class MultiTaskModel(nn.Module):
    def __init__(self, feature_input_size, vocab_size, output_size, hidden_size, dropout_rate):
        super(MultiTaskModel, self).__init__()
        self.image_output_size = 16
        self.feature_output_size = 32
        self.embedding_output_size = 32
        self.dropout_rate = dropout_rate
        
        # efficientnet
        self.backbone = models.efficientnet_b0(pretrained=True)
        self.backbone.classifier = nn.Sequential(
            nn.Dropout(p=0.2,inplace=True),
            nn.Linear(self.backbone.classifier[-1].in_features,self.image_output_size),
        )
        
        # # resnet
        # self.backbone = models.resnet18(pretrained=True)
        # self.backbone.fc = nn.Linear(self.backbone.fc.in_features,self.image_output_size)
        
        self.mol_layer = nn.Sequential(
            deepcopy(self.backbone),
            #nn.BatchNorm1d(self.image_output_size),
            nn.GELU(),
            #nn.Dropout(self.dropout_rate),
        )
        self.fps_layer = nn.Sequential(
            deepcopy(self.backbone),
            #nn.BatchNorm1d(self.image_output_size),
            nn.GELU(),
            #nn.Dropout(self.dropout_rate),
        )
        
        self.feature_layer = nn.Sequential(
            nn.Linear(feature_input_size,hidden_size),
            #nn.BatchNorm1d(hidden_size),
            nn.GELU(),
            #nn.Dropout(self.dropout_rate),
            nn.Linear(hidden_size,hidden_size),
            #nn.BatchNorm1d(hidden_size),
            nn.GELU(),
            nn.Dropout(self.dropout_rate),
            nn.Linear(hidden_size, self.embedding_output_size),
        )
        
        self.emb_layer = nn.Embedding(vocab_size, hidden_size)
        self.emb_sequence = nn.Sequential(
            #nn.BatchNorm1d(hidden_size),
            nn.GELU(),
            #nn.Dropout(self.dropout_rate),
            nn.Linear(hidden_size, hidden_size),
            #nn.BatchNorm1d(hidden_size),
            nn.GELU(),
            nn.Dropout(self.dropout_rate),
            nn.Linear(hidden_size, self.embedding_output_size),
        )
        
        fc_input_size = self.feature_output_size+self.embedding_output_size #2*self.image_output_size
        self.fc = nn.Sequential(
            nn.Linear(fc_input_size,2),
        )
        
    def forward(self, feat, emb):
        feat = self.feature_layer(feat)
        #mol = self.mol_layer(mol)
        #fps = self.fps_layer(fps)
        emb = self.emb_layer(emb).mean(dim=1)
        emb = self.emb_sequence(emb)
        
        combined = torch.cat((feat,emb),dim=1)
        output = self.fc(combined)
        
        # output = self.tanh(output)
        # output = (output+1.0)/2.0
        # output = output*100.0
        
        # output = self.sigmoid(output)
        # output = output*100
        
        return output

<br>

# Train & Validation function

In [52]:
class MultiRMSELoss(nn.Module):
    def __init__(self):
        super(MultiRMSELoss, self).__init__()

    def forward(self, output, target):
        loss1 = torch.sqrt(torch.mean((output[:,0]-target[:,0])**2))
        loss2 = torch.sqrt(torch.mean((output[:,1]-target[:,1])**2))
        loss = 0.5*loss1+0.5*loss2
        return loss

In [53]:
import gc
import numpy as np
import torch
import torch.nn as nn
import time

class EarlyStopping:
    """Early stops the training if validation loss doesn't improve after a given patience."""
    def __init__(self, patience=7, verbose=False, delta=0, path='checkpoint.pt', trace_func=print):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved.
                            Default: 7
            verbose (bool): If True, prints a message for each validation loss improvement. 
                            Default: False
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                            Default: 0
            path (str): Path for the checkpoint to be saved to.
                            Default: 'checkpoint.pt'
            trace_func (function): trace print function.
                            Default: print            
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = np.Inf
        self.delta = delta
        self.path = path
        self.trace_func = trace_func
        
    def __call__(self, val_loss, model):

        score = -val_loss

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            if self.verbose:
                self.trace_func(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        '''Saves model when validation loss decrease.'''
        if self.verbose:
            self.trace_func(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        if self.path!='':
            torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss

        
def train(
    model, criterion, optimizer, train_loader, valid_loader, epochs,
    early_stopping, device='cpu', scheduler=None, metric_period=1, verbose=True, 
    save_model_path = './mc/best_model.pt', final_model_path = './mc/final_model.pt',
    use_best_model=True,
    inverse_transform=None,
):  
    seed_everything(CFG.SEED)
    model.to(device)

    best_loss  = 999999999
    best_epoch = 1
    best_model = None
    is_best    = np.nan
    
    start_time = time.time()
    epoch_s = time.time()
    
    model.train()
    for epoch in range(1, epochs+1):
        gc.collect()
        
        #model.train()
        train_loss = []
        for feat,emb,target in train_loader:
            feat = feat.to(device)
            #mol = mol.to(device)
            #fps = fps.to(device)
            emb = emb.to(device)
            target = target.to(device)

            optimizer.zero_grad()
            output = model(feat,emb).float()
            
            if inverse_transform is not None:
                output = inverse_transform(output)
                target = inverse_transform(target)
            
            loss = criterion(output, target)
            loss.backward()  # Getting gradients
            optimizer.step() # Updating parameters

            train_loss.append(loss.item())

        if valid_loader is not None:
            valid_loss = validation(model, valid_loader, criterion, device, inverse_transform)
        else:
            valid_loss = loss
            
        epoch_e = time.time()
            
        if scheduler is not None:
            scheduler.step(valid_loss)

        # update the best epoch & best loss
        if (best_loss > valid_loss) | (epoch==1):
            best_epoch = epoch
            best_loss = valid_loss
            best_model = model
            is_best = 1
            torch.save(best_model.state_dict(), save_model_path)
        else:
            is_best = 0
            if not use_best_model:
                torch.save(best_model.state_dict(), save_model_path)
            
        # 결과물 printing
        if (verbose) & (epoch % metric_period == 0):
            mark = '*' if is_best else ' '
            epoch_str = str(epoch).zfill(len(str(epochs)))
            if valid_loader is not None:
                progress = '{}[{}/{}] loss: {:.5f}, val_loss: {:.5f}, best_epoch: {}, elapsed: {:.2f}s, total: {:.2f}s, remaining: {:.2f}s'\
                    .format(
                        mark,
                        epoch_str,
                        epochs,
                        np.mean(train_loss),
                        valid_loss,
                        best_epoch,
                        epoch_e-epoch_s,
                        epoch_e-start_time,
                        (epoch_e-epoch_s)*(epochs-epoch)/metric_period,
                    )
            else:
                progress = '{}[{}/{}] loss: {:.5f}, best_epoch: {}, elapsed: {:.2f}s, total: {:.2f}s, remaining: {:.2f}s'\
                    .format(
                        mark,
                        epoch_str,
                        epochs,
                        np.mean(train_loss),
                        best_epoch,
                        epoch_e-epoch_s,
                        epoch_e-start_time,
                        (epoch_e-epoch_s)*(epochs-epoch)/metric_period,
                    )
            epoch_s = time.time()
            print(progress)

        # early stopping 여부를 체크. 현재 과적합 상황 추적
        if early_stopping is not None:
            early_stopping(valid_loss, model)
            if early_stopping.early_stop:
                break
                
        torch.save(best_model.state_dict(), final_model_path)

    return best_model

def validation(model, valid_loader, criterion, device, inverse_transform):
    valid_loss = []
    with torch.no_grad():
        for feat,emb,target in valid_loader:
            feat = feat.to(device)
            #mol = mol.to(device)
            #fps = fps.to(device)
            emb = emb.to(device)
            target = target.to(device)
            
            output = model(feat,emb).float()
            
            if inverse_transform is not None:
                output = inverse_transform(output)
                target = inverse_transform(target)

            loss = criterion(output, target)
            valid_loss.append(loss.item())

    return np.mean(valid_loss)

<br></br>

# K-Fold Fitting

In [54]:
import warnings
from tqdm import trange

def get_abs_corr(x,y):
    return np.abs(np.corrcoef(x,y))[0,1]

class InteractionTerm:
    def __init__(self):
        pass
    
    def fit(self,data,num_features,corr_cutoff=0.7):
        warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
        warnings.filterwarnings("ignore", category=RuntimeWarning)
        
        d = data.copy()
        self.interaction_list = []
        for i in trange(len(num_features),desc='fitting...'):
            for j in range(len(num_features)):
                if i>j:
                    col_i = num_features[i]
                    col_j = num_features[j]
                    
                    # 상관계수가 cutoff보다 큰 경우에는 interaction을 생성하지 않음
                    if (get_abs_corr(d[col_i]*d[col_j],d[col_i])>=corr_cutoff) | (get_abs_corr(d[col_i]*d[col_j],d[col_j])>=corr_cutoff):
                        pass
                    else:
                        self.interaction_list.append(f'{col_i}*{col_j}')
    
    def transform(self,data):
        d = data.copy()
        print('> the number of interaction term:',len(self.interaction_list))
        for interaction in self.interaction_list:
            col_i,col_j = interaction.split('*')
            d[interaction] = d[col_i]*d[col_j]
        return d
    
    def fit_transform(self,data,num_features,corr_cutoff=0.7):
        self.fit(data,num_features,corr_cutoff)
        return self.transform(data)

In [55]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold

In [56]:
interaction_features = [col for col in train_df.columns if col.find('*')>=0]

In [59]:
fps_maximum = max(np.stack(train_df.FPs).max(),np.stack(test_df.FPs).max())

feature_input_size = len(features) + len(interaction_features)
output_size = 2
hidden_size = 32
dropout_rate = 0.00
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [60]:
n_splits = 5
kf = KFold(n_splits=n_splits,random_state=0,shuffle=True)

k=0
for tr_idx,va_idx in kf.split(train_df):
    k+=1
    print('-'*100)
    print('> K-Fold: {}'.format(k))
    print('-'*100)
    
    tr_df, va_df = train_df.iloc[tr_idx], train_df.iloc[va_idx]
    te_df = test_df.copy()
    
    # (1) scaling
    for f in features:
        scaler = MinMaxScaler(feature_range=(-1,1))
        tr_df[f] = scaler.fit_transform(np.array(tr_df[f]).reshape(-1,1))
        va_df[f] = scaler.transform(np.array(va_df[f]).reshape(-1,1))
        te_df[f] = scaler.transform(np.array(te_df[f]).reshape(-1,1))
    
    # (2) target transform
    inverse_transform = None
    
    # (3) custom dataset
    transform = [smiles_transform,fps_transform]
    train_dataset = CustomDataset(tr_df, ['MLM','HLM'], 'SMILES', 'FPs', fps_maximum, tokenizer, transform, False)
    val_dataset   = CustomDataset(va_df, ['MLM','HLM'], 'SMILES', 'FPs', fps_maximum, tokenizer, transform, False)
    test_dataset  = CustomDataset(te_df, ['MLM','HLM'], 'SMILES', 'FPs', fps_maximum, tokenizer, transform, True)
    
    # (4) dataloader
    train_loader = DataLoader(train_dataset, batch_size=CFG.BATCH_SIZE, shuffle=False, pin_memory=True, num_workers=0)
    val_loader   = DataLoader(val_dataset  , batch_size=CFG.BATCH_SIZE, shuffle=False, pin_memory=True, num_workers=0)
    test_loader  = DataLoader(test_dataset , batch_size=CFG.BATCH_SIZE, shuffle=False, pin_memory=True, num_workers=0)
    
    # (5) model define
    model = MultiTaskModel(feature_input_size,vocab_size,output_size,hidden_size,dropout_rate)

    criterion = MultiRMSELoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=CFG.LEARNING_RATE)
    # optimizer = torch.optim.SGD(model.parameters(), lr=CFG.LEARNING_RATE, momentum=0.9)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', factor=0.5, patience=5, threshold_mode='abs', min_lr=1e-5, verbose=False)
    # scheduler = None
    # early_stopping = EarlyStopping(patience=10,verbose=False,path='')
    early_stopping = None
    
    # (6) modeling
    torch.cuda.empty_cache(), gc.collect(), gc.collect()
    
    best_model = train(
        model, criterion, optimizer, train_loader, val_loader,
        CFG.EPOCHS, early_stopping, device, scheduler,
        metric_period=1, verbose=True,
        save_model_path=f'./mc/best_model_k{k}.pt',
        final_model_path=f'./mc/final_model_k{k}.pt',
        use_best_model=True,
        inverse_transform=inverse_transform,
    )

----------------------------------------------------------------------------------------------------
> K-Fold: 1
----------------------------------------------------------------------------------------------------




*[001/512] loss: 37.77555, val_loss: 33.85527, best_epoch: 1, elapsed: 0.83s, total: 0.83s, remaining: 426.67s
 [002/512] loss: 32.58034, val_loss: 34.02709, best_epoch: 1, elapsed: 1.02s, total: 2.05s, remaining: 519.40s
 [003/512] loss: 32.44486, val_loss: 33.87916, best_epoch: 1, elapsed: 1.02s, total: 3.07s, remaining: 521.42s
*[004/512] loss: 32.35245, val_loss: 33.75582, best_epoch: 4, elapsed: 1.02s, total: 4.10s, remaining: 519.63s
*[005/512] loss: 32.27905, val_loss: 33.64662, best_epoch: 5, elapsed: 1.01s, total: 5.30s, remaining: 514.14s
*[006/512] loss: 32.21673, val_loss: 33.53580, best_epoch: 6, elapsed: 1.02s, total: 6.50s, remaining: 515.74s
*[007/512] loss: 32.15913, val_loss: 33.43187, best_epoch: 7, elapsed: 1.01s, total: 7.71s, remaining: 512.38s
*[008/512] loss: 32.10520, val_loss: 33.34493, best_epoch: 8, elapsed: 1.02s, total: 8.91s, remaining: 512.11s
*[009/512] loss: 32.05656, val_loss: 33.26248, best_epoch: 9, elapsed: 1.02s, total: 10.12s, remaining: 512.69s



KeyboardInterrupt



<br></br>

# Inference

In [None]:
def predict(best_model,loader,device,inverse_transform):
    best_model.to(device)

    true_list = []
    pred_list = []
    with torch.no_grad():
        for feat,mol,fps,emb,target in iter(loader):
            feat = feat.to(device)
            mol = mol.to(device)
            fps = fps.to(device)
            emb = emb.to(device)
            target = target.to(device)

            output = best_model(feat,mol,fps,emb)
            
            if inverse_transform is not None:
                output = inverse_transform(output)
                target = inverse_transform(target)

            true_list.append(target)
            pred_list.append(output)

    trues = torch.cat(true_list,dim=0)
    preds = torch.cat(pred_list,dim=0)

    trues = trues.cpu().numpy()
    preds = preds.cpu().numpy()

    return trues, preds

def predict_test(best_model,loader,device,inverse_transform):
    best_model.to(device)

    #true_list = []
    pred_list = []
    with torch.no_grad():
        for feat,mol,fps,emb in iter(loader):
            feat = feat.to(device)
            mol = mol.to(device)
            fps = fps.to(device)
            emb = emb.to(device)
            #target = target.to(device)

            output = best_model(feat,mol,fps,emb)
            
            if inverse_transform is not None:
                output = inverse_transform(output)
                #target = inverse_transform(target)

            #true_list.append(target)
            pred_list.append(output)

    #trues = torch.cat(true_list,dim=0)
    preds = torch.cat(pred_list,dim=0)

    #trues = trues.cpu().numpy()
    preds = preds.cpu().numpy()

    return preds

In [None]:
best_model = MultiTaskModel(feature_input_size,vocab_size,output_size,hidden_size,dropout_rate)
best_model.load_state_dict(torch.load('./mc/best_model.pt'))

In [None]:
tr_true, tr_pred = predict(best_model,train_loader,device,inverse_transform)
va_true, va_pred = predict(best_model,val_loader,device,inverse_transform)

In [None]:
(MultiRMSELoss()(torch.tensor(tr_true),torch.tensor(tr_pred)),
 MultiRMSELoss()(torch.tensor(va_true),torch.tensor(va_pred)))

In [None]:
# tr_pred = tr_pred.mean(axis=1).tolist()
# tr_pred = np.array([tr_pred,tr_pred]).T

# va_pred = va_pred.mean(axis=1).tolist()
# va_pred = np.array([va_pred,va_pred]).T

# (MultiRMSELoss()(torch.tensor(tr_true),torch.tensor(tr_pred)),
#  MultiRMSELoss()(torch.tensor(va_true),torch.tensor(va_pred)))

In [None]:
tr_true[:10].round(1), tr_pred[:10].round(1)
# va_true[:10].round(1), va_pred[:10].round(1)

In [None]:
def abline(intercept,slope):
    axes = plt.gca()
    x_vals = np.array(axes.get_xlim())
    y_vals = intercept + slope * x_vals
    plt.plot(x_vals, y_vals, linestyle='--', color='red')

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

fig = plt.figure(figsize=(15,7))
fig.add_subplot(121)
sns.scatterplot(x=tr_true[:,0],y=tr_pred[:,0])
abline(0,1)
plt.xlabel('true')
plt.ylabel('pred')
plt.grid()
fig.add_subplot(122)
sns.scatterplot(x=tr_true[:,1],y=tr_pred[:,1])
abline(0,1)
plt.xlabel('true')
plt.ylabel('pred')
plt.grid()
plt.suptitle('train',fontsize=20)
plt.tight_layout()
plt.show()

fig = plt.figure(figsize=(15,7))
fig.add_subplot(121)
sns.scatterplot(x=va_true[:,0],y=va_pred[:,0])
abline(0,1)
plt.xlabel('true')
plt.ylabel('pred')
plt.grid()
fig.add_subplot(122)
sns.scatterplot(x=va_true[:,1],y=va_pred[:,1])
abline(0,1)
plt.xlabel('true')
plt.ylabel('pred')
plt.grid()
plt.suptitle('validation',fontsize=20)
plt.tight_layout()
plt.show()

In [None]:
te_pred = predict_test(best_model,test_loader,device,inverse_transform)

In [None]:
submit = pd.read_csv('./data/sample_submission.csv')
submit[targets] = te_pred
submit.head()

In [None]:
submit.to_csv('./out/mlt_efficientnet_mol&fps.csv',index=False)