# Import

In [1]:
import random
import os

from tqdm import tqdm

import numpy as np
import pandas as pd
pd.set_option('mode.chained_assignment',  None)

from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from rdkit import DataStructs
from rdkit.Chem import PandasTools, AllChem

In [2]:
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)

seed_everything(42) # Seed 고정

In [3]:
class CFG:
    SEED = 0
    IMG_SIZE = 224
    BATCH_SIZE = 64
    EPOCHS = 256
    LEARNING_RATE = 0.003

<br></br>

# Data Load

In [4]:
train_df = pd.read_csv('./data/train.csv')
test_df  = pd.read_csv('./data/test.csv')

In [5]:
train_df.head()

Unnamed: 0,id,SMILES,MLM,HLM,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea
0,TRAIN_0000,CCOc1ccc(CNC(=O)c2cc(-c3sc(C)nc3C)n[nH]2)cc1OCC,26.01,50.68,3.259,400.495,5,2,8,3.259,117.37
1,TRAIN_0001,Cc1nc(C)c(CN2CC(C)C(=O)Nc3ccccc32)s1,29.27,50.59,2.169,301.407,2,1,2,2.172,73.47
2,TRAIN_0002,CCCN1CCN(c2nn3nnnc3c3ccccc23)CC1,5.586,80.892,1.593,297.358,5,0,3,1.585,62.45
3,TRAIN_0003,Cc1ccc(-c2ccc(-n3nc(C)c(S(=O)(=O)N4CCN(C5CCCCC...,5.71,2.0,4.771,494.652,6,0,5,3.475,92.6
4,TRAIN_0004,Cc1ccc2c(c1)N(C(=O)c1ccncc1)CC(C)O2,93.27,99.99,2.335,268.31,3,0,1,2.337,42.43


<br></br>

# Pre-Processing

## 분자구조 관련 컬럼 생성

In [6]:
# Molecule to MorganFingerprint
def mol2fp(mol):
    fp = AllChem.GetHashedMorganFingerprint(mol, 6, nBits=4096)
    ar = np.zeros((1,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(fp, ar)
    return ar

In [7]:
# (1) SMILES를 통해 Molecule(분자구조) 생성
PandasTools.AddMoleculeColumnToFrame(train_df,'SMILES','Molecule')
PandasTools.AddMoleculeColumnToFrame(test_df ,'SMILES','Molecule')

In [8]:
from sklearn.feature_selection import VarianceThreshold

In [9]:
# (2) Morgan Fingerprint column 추가
train_df["FPs"] = train_df.Molecule.apply(mol2fp)
test_df ["FPs"] = test_df .Molecule.apply(mol2fp)

In [10]:
# (3) Morgan Fingerprint 중, variance가 0.05보다 작은 컬럼들을 지우기
feature_select = VarianceThreshold(threshold=0.05)

# 일부사용
tr_fps_selected = feature_select.fit_transform(np.stack(train_df['FPs']))
te_fps_selected = feature_select.transform(np.stack(test_df['FPs']))

# # 전체사용
# tr_fps_selected = np.stack(train_df['FPs'])
# te_fps_selected = np.stack(test_df ['FPs'])

fps_names = ['fps'+str(i+1) for i in range(tr_fps_selected.shape[1])]

train_df = pd.concat([
    train_df.drop('FPs',axis=1),
    pd.DataFrame(tr_fps_selected,columns=fps_names),
],axis=1)

test_df = pd.concat([
    test_df.drop('FPs',axis=1),
    pd.DataFrame(te_fps_selected,columns=fps_names),
],axis=1)

In [11]:
# 사용할 column만 추출
features = ['AlogP','Molecular_Weight','Num_H_Acceptors','Num_H_Donors',
            'Num_RotatableBonds','LogD','Molecular_PolarSurfaceArea']
fps_features = [col for col in train_df.columns if col.find('fps')==0]
smiles_feature = 'SMILES'
targets  = ['MLM','HLM']

train_df = train_df[features+fps_features+[smiles_feature]+targets]
test_df  = test_df[features+fps_features+[smiles_feature]]

<br>

## Imputaion

In [12]:
from sklearn.impute import SimpleImputer

In [13]:
def null_check(data):
    d = data.copy()
    null_info = d.isnull().sum()
    null_info = null_info[null_info!=0]
    display(null_info)

In [14]:
print('> train')
null_check(train_df)

print('> test')
null_check(test_df)

> train


AlogP    2
dtype: int64

> test


AlogP    1
dtype: int64

In [15]:
null_features = ['AlogP']

imputer = SimpleImputer()
train_df[null_features] = imputer.fit_transform(train_df[null_features])
test_df [null_features] = imputer.transform(test_df[null_features])

<br>

## Train Test Split

In [16]:
from sklearn.model_selection import train_test_split

In [17]:
tr_df, va_df = train_test_split(train_df,test_size=0.2,shuffle=True,random_state=CFG.SEED)
te_df = test_df.copy()

<br>

## Scaling

In [18]:
from sklearn.preprocessing import MinMaxScaler

scaling_features = list(set(features)-set(['FPs']))
scalers = {}
for f in scaling_features:
    scaler = MinMaxScaler()
    tr_df[f] = scaler.fit_transform(np.array(tr_df[f]).reshape(-1,1))
    va_df[f] = scaler.transform(np.array(va_df[f]).reshape(-1,1))
    te_df[f] = scaler.transform(np.array(te_df[f]).reshape(-1,1))
    scalers[f] = scaler

<br></br>

# Custom Dataset

In [19]:
from rdkit import Chem
from rdkit.Chem import Draw

class CustomDataset(Dataset):
    def __init__(self, data, targets, smiles, transforms=None, is_test=False):
        self.data = data.copy()
        self.targets = targets
        self.smiles = smiles
        self.transforms = transforms
        self.is_test = is_test
        
        self.smiles_features = []
        for s in tqdm(data[smiles].values):
            m = Chem.MolFromSmiles(s)
            img = Draw.MolToImage(m, size=(224,224))
            img = np.array(img)
            if self.transforms is not None:
                img = self.transforms(image=img)['image']
            self.smiles_features.append(img)
            
        if not self.is_test:
            self.target_features = self.data[self.targets].values
            self.num_features = self.data.drop(columns=targets+[smiles],axis=1).values
        else:
            self.num_features = self.data.drop(columns=[smiles],axis=1).values

    def __getitem__(self, index):
        if self.is_test:
            return (
                torch.Tensor(self.num_features[index]),
                torch.Tensor(self.smiles_features[index]),
            )
        else:
            return (
                torch.Tensor(self.num_features[index]),
                torch.Tensor(self.smiles_features[index]),
                torch.Tensor(self.target_features[index]),
            )
        
    def __len__(self):
        return len(self.data)

In [20]:
import albumentations as A
from albumentations.pytorch.transforms import ToTensorV2
import torchvision.models as models

In [21]:
#  이미지 변환
transform = A.Compose([
    A.Resize(CFG.IMG_SIZE,CFG.IMG_SIZE),
    A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225), max_pixel_value=255.0, always_apply=False, p=1.0),
    ToTensorV2(),
])

In [22]:
train_dataset = CustomDataset(tr_df, ['MLM','HLM'], 'SMILES', transform, False)
val_dataset   = CustomDataset(va_df, ['MLM','HLM'], 'SMILES', transform, False)
test_dataset  = CustomDataset(te_df, ['MLM','HLM'], 'SMILES', transform, True)

100%|██████████| 2798/2798 [00:25<00:00, 111.87it/s]
100%|██████████| 700/700 [00:06<00:00, 112.41it/s]
100%|██████████| 483/483 [00:04<00:00, 110.61it/s]


In [23]:
train_loader = DataLoader(train_dataset, batch_size=CFG.BATCH_SIZE, shuffle=True)
val_loader   = DataLoader(val_dataset  , batch_size=CFG.BATCH_SIZE, shuffle=True)
test_loader  = DataLoader(test_dataset , batch_size=CFG.BATCH_SIZE, shuffle=True)

In [24]:
# [feat for feat,smiles,target in train_dataset][0]
# [smiles for feat,smiles in test_dataset][0]

<br></br>

# Model

In [25]:
import torchvision.models as models

In [26]:
class MultiTaskModel(nn.Module):
    def __init__(self, feature_input_size, output_size, hidden_size):
        super(MultiTaskModel, self).__init__()
        self.image_output_size = 200
        self.feature_output_size = 50
        
        self.efficientnet = models.efficientnet_b0()
        self.efficientnet.classifier = nn.Sequential(
            nn.Dropout(p=0.2,inplace=True),
            nn.Linear(self.efficientnet.classifier[-1].in_features,self.image_output_size,bias=True),
        )
        
        self.image_layer = nn.Sequential(
            self.efficientnet,
            nn.BatchNorm1d(self.image_output_size),
            nn.LeakyReLU(0.01),
            nn.Dropout(0.5),
        )
        
        self.feature_layer = nn.Sequential(
            nn.Linear(feature_input_size,hidden_size),
            nn.BatchNorm1d(hidden_size), # LayerNorm
            nn.LeakyReLU(0.01),
            nn.Dropout(0.5),
            nn.Linear(hidden_size,hidden_size),
            nn.BatchNorm1d(hidden_size), # LayerNorm
            nn.LeakyReLU(0.01),
            nn.Dropout(0.5),
            nn.Linear(hidden_size,self.feature_output_size),
        )
        
        self.fc = nn.Linear(self.image_output_size+self.feature_output_size,output_size)
        
    def forward(self, image, feature):
        image_output = self.image_layer(image)
        feature_output = self.feature_layer(feature)
        combined = torch.cat((image_output,feature_output),dim=1)
        output = self.fc(combined)
        return output

In [27]:
class MultiRMSELoss(nn.Module):
    def __init__(self):
        super(MultiRMSELoss, self).__init__()

    def forward(self, output, target):
        loss1 = torch.sqrt(torch.mean((output[:,0]-target[:,0])**2))
        loss2 = torch.sqrt(torch.mean((output[:,1]-target[:,1])**2))
        loss = 0.5*loss1+0.5*loss2
        return loss

In [28]:
# import time

# def train(model, train_loader, valid_loader, criterion, optimizer, epochs, device):
#     model = model.to(device)
    
#     for epoch in range(1,epochs+1):
#         epoch_st = time.time()
        
#         model.train()
#         running_loss = 0
#         for feature, img, target in train_loader:
#             feature, img, target = feature.to(device), img.to(device), target.to(device)
#             optimizer.zero_grad()
            
#             output = model(img,feature)
#             loss = criterion(output, target)
#             loss.backward()
#             optimizer.step()
            
#             running_loss += loss.item()
        
#         if epoch % 1 == 0:
#             valid_loss = 0
#             with torch.no_grad():
#                 for feature, img, target in valid_loader:
#                     feature, img, target = feature.to(device), img.to(device), target.to(device)
#                     output = model(img,feature)
#                     loss = criterion(output, target)
#                     valid_loss += loss.item()
            
#             str_epoch = str(epoch).zfill(len(str(epochs)))
#             progress = 'Epoch: {}/{}, Loss: {:.4f}, val_loss: {:.4f}'\
#                 .format(str_epoch,epochs,running_loss/len(train_loader),valid_loss/len(valid_loader))
#             print(progress)
    
#     return model

In [29]:
import numpy as np
import torch
import torch.nn as nn
import time

class EarlyStopping:
    """Early stops the training if validation loss doesn't improve after a given patience."""
    def __init__(self, patience=7, verbose=False, delta=0, path='checkpoint.pt', trace_func=print):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved.
                            Default: 7
            verbose (bool): If True, prints a message for each validation loss improvement. 
                            Default: False
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                            Default: 0
            path (str): Path for the checkpoint to be saved to.
                            Default: 'checkpoint.pt'
            trace_func (function): trace print function.
                            Default: print            
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = np.Inf
        self.delta = delta
        self.path = path
        self.trace_func = trace_func
        
    def __call__(self, val_loss, model):

        score = -val_loss

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            if self.verbose:
                self.trace_func(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        '''Saves model when validation loss decrease.'''
        if self.verbose:
            self.trace_func(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        if self.path!='':
            torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss

        
def train(
    model, criterion, optimizer, train_loader, valid_loader, epochs,
    early_stopping, device='cpu', scheduler=None, metric_period=1, 
    verbose=True, save_model_path = './mc/best_model.pt',
):
    model.to(device)

    best_loss  = 999999999
    best_epoch = 1
    best_model = None
    is_best    = np.nan
    
    start_time = time.time()
    epoch_s = time.time()
    for epoch in range(1, epochs+1):
        
        model.train()
        train_loss = []
        for feat,img,target in tqdm(train_loader):
            feat = feat.to(device)
            img = img.to(device)
            target = target.to(device)

            optimizer.zero_grad()
            output = model(img,feat)#.float()
            
            loss = criterion(output, target)
            loss.backward()  # Getting gradients
            optimizer.step() # Updating parameters

            train_loss.append(loss.item())

        valid_loss = validation(model, valid_loader, criterion, device)
        epoch_e = time.time()
            
        if scheduler is not None:
            scheduler.step(valid_loss)

        # update the best epoch & best loss
        if (best_loss > valid_loss) | (epoch==1):
            best_epoch = epoch
            best_loss = valid_loss
            best_model = model
            is_best = 1
            torch.save(best_model.state_dict(), save_model_path)
        else:
            is_best = 0
            
        # 결과물 printing
        if (verbose) & (epoch % metric_period == 0):
            mark = '*' if is_best else ' '
            epoch_str = str(epoch).zfill(len(str(epochs)))
            progress = '{}[{}/{}] loss: {:.5f}, val_loss: {:.5f}, best_epoch: {}, elapsed: {:.2f}s, total: {:.2f}s, remaining: {:.2f}s'\
                .format(
                    mark,
                    epoch_str,
                    epochs,
                    np.mean(train_loss),
                    valid_loss,
                    best_epoch,
                    epoch_e-epoch_s,
                    epoch_e-start_time,
                    (epoch_e-epoch_s)*(epochs-epoch)/metric_period,
                )
            epoch_s = time.time()
            print(progress)

        # early stopping 여부를 체크. 현재 과적합 상황 추적
        if early_stopping is not None:
            early_stopping(valid_loss, model)
            if early_stopping.early_stop:
                break

    return best_model

def validation(model, valid_loader, criterion, device): #transform_y
    model.eval()
    valid_loss = []
    with torch.no_grad():
        for feat,img,target in tqdm(valid_loader):
            feat = feat.to(device)
            img = img.to(device)
            target = target.to(device)
            
            output = model(img,feat)#.float()
            
            loss = criterion(output, target)
            valid_loss.append(loss.item())

    return np.mean(valid_loss)

In [30]:
feature_input_size = [feat for feat,smiles,target in train_dataset][0].shape[0]
output_size = 2
hidden_size = 64

In [31]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = MultiTaskModel(feature_input_size,output_size,hidden_size)
criterion = MultiRMSELoss()
optimizer= torch.optim.Adam(model.parameters(), lr=CFG.LEARNING_RATE, weight_decay=5e-4)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.5, patience=2, threshold_mode='abs',min_lr=1e-7, verbose=True)
early_stopping = EarlyStopping(patience=10,verbose=False,path='')

In [31]:
best_model = train(
    model, criterion, optimizer, train_loader, val_loader,
    CFG.EPOCHS, early_stopping,
    device='cpu', scheduler=scheduler, metric_period=1, 
    verbose=True, save_model_path = './mc/best_model.pt',
)

100%|██████████| 44/44 [09:58<00:00, 13.60s/it]
100%|██████████| 11/11 [00:32<00:00,  2.91s/it]


*[001/256] loss: 47.67812, val_loss: 34.79943, best_epoch: 1, elapsed: 630.48s, total: 630.48s, remaining: 160771.21s


100%|██████████| 44/44 [09:49<00:00, 13.40s/it]
100%|██████████| 11/11 [00:31<00:00,  2.86s/it]


*[002/256] loss: 33.44755, val_loss: 33.62227, best_epoch: 2, elapsed: 621.18s, total: 1251.72s, remaining: 157779.06s


100%|██████████| 44/44 [09:51<00:00, 13.44s/it]
100%|██████████| 11/11 [00:31<00:00,  2.88s/it]


 [003/256] loss: 32.16778, val_loss: 33.80289, best_epoch: 2, elapsed: 622.92s, total: 1874.70s, remaining: 157599.76s


 16%|█▌        | 7/44 [01:48<09:35, 15.54s/it]
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/Users/khj/.pyenv/versions/3.8.10/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3343, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-32-2b9654d68ee6>", line 1, in <module>
    best_model = train(
  File "<ipython-input-29-b3890e9e95e0>", line 86, in train
    loss.backward()  # Getting gradients
  File "/Users/khj/.pyenv/versions/3.8.10/lib/python3.8/site-packages/torch/_tensor.py", line 487, in backward
    torch.autograd.backward(
  File "/Users/khj/.pyenv/versions/3.8.10/lib/python3.8/site-packages/torch/autograd/__init__.py", line 200, in backward
    Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/khj/.pyenv/versions/3.8.10/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 204

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/Users/khj/.pyenv/versions/3.8.10/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3343, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-32-2b9654d68ee6>", line 1, in <module>
    best_model = train(
  File "<ipython-input-29-b3890e9e95e0>", line 86, in train
    loss.backward()  # Getting gradients
  File "/Users/khj/.pyenv/versions/3.8.10/lib/python3.8/site-packages/torch/_tensor.py", line 487, in backward
    torch.autograd.backward(
  File "/Users/khj/.pyenv/versions/3.8.10/lib/python3.8/site-packages/torch/autograd/__init__.py", line 200, in backward
    Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/khj/.pyenv/versions/3.8.10/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 204

TypeError: object of type 'NoneType' has no len()

In [None]:
# best_model = train(
#     model,
#     train_loader,
#     val_loader,
#     criterion,
#     optimizer,
#     CFG.EPOCHS,
#     device,
# )