In [15]:
!pip install rdkit




[notice] A new release of pip is available: 23.0 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [16]:
import random
import os

import numpy as np
import pandas as pd

from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from rdkit import DataStructs
from rdkit.Chem import PandasTools, AllChem

In [17]:
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)

seed_everything(42) # Seed 고정

In [63]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [64]:
PandasTools.AddMoleculeColumnToFrame(train,'SMILES','Molecule')
PandasTools.AddMoleculeColumnToFrame(test,'SMILES','Molecule')

In [148]:
def mol2fp(mol):
    fp = AllChem.GetHashedMorganFingerprint(mol, 6, nBits=4096)
    ar = np.zeros((1,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(fp, ar)
    return ar

In [149]:
# FPs column 추가
train["FPs"] = train.Molecule.apply(mol2fp)
test["FPs"] = test.Molecule.apply(mol2fp)

In [22]:
# 사용할 column만 추출
train = train[['FPs','MLM', 'HLM']]
test = test[['FPs']]

In [23]:
class CustomDataset(Dataset):
    def __init__(self, df, target, transform, is_test=False):
        self.df = df
        self.target = target # HLM or MLM
        self.is_test = is_test # train,valid / test

        self.feature_select = transform
        if not self.is_test: 
            self.fp = self.feature_select.fit_transform(np.stack(df['FPs']))
        else: # valid or test
            self.fp = self.feature_select.transform(np.stack(df['FPs']))

    def __getitem__(self, index):
        fp = self.fp[index]
        if not self.is_test: # test가 아닌 경우(label 존재)
            label = self.df[self.target][index]
            return torch.tensor(fp).float(), torch.tensor(label).float().unsqueeze(dim=-1) # feature, label

        else: # test인 경우
            return torch.tensor(fp).float() # feature
        
    def __len__(self):
        return len(self.df)

In [24]:
transform = VarianceThreshold(threshold=0.05)

train_MLM = CustomDataset(df=train, target='MLM', transform=transform, is_test=False)
train_HLM = CustomDataset(df=train, target='HLM', transform=transform, is_test=False)

input_size = train_MLM.fp.shape[1]
input_size

251

In [25]:
# Hyperparameter
CFG = {'BATCH_SIZE': 256,
       'EPOCHS': 1000,
       'INPUT_SIZE': input_size,
       'HIDDEN_SIZE': 1024,
       'OUTPUT_SIZE': 1,
       'DROPOUT_RATE': 0.8,
       'LEARNING_RATE': 0.001}

In [26]:
# train,valid split
train_MLM_dataset, valid_MLM_dataset = train_test_split(train_MLM, test_size=0.2, random_state=42)
train_HLM_dataset, valid_HLM_dataset = train_test_split(train_HLM, test_size=0.2, random_state=42)

In [27]:
train_MLM_loader = DataLoader(dataset=train_MLM_dataset,
                              batch_size=CFG['BATCH_SIZE'],
                              shuffle=True)

valid_MLM_loader = DataLoader(dataset=valid_MLM_dataset,
                              batch_size=CFG['BATCH_SIZE'],
                              shuffle=False)


train_HLM_loader = DataLoader(dataset=train_HLM_dataset,
                              batch_size=CFG['BATCH_SIZE'],
                              shuffle=True)

valid_HLM_loader = DataLoader(dataset=valid_HLM_dataset,
                              batch_size=CFG['BATCH_SIZE'],
                              shuffle=False)

In [28]:
class Net(nn.Module):
    def __init__(self, input_size, hidden_size, dropout_rate, out_size):
        super(Net, self).__init__()
        
        # fc 레이어 3개와 출력 레이어
        self.fc1 = nn.Linear(input_size, hidden_size) 
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, hidden_size)
        self.fc_out = nn.Linear(hidden_size, out_size)
        
        # 정규화
        self.ln1 = nn.LayerNorm(hidden_size)
        self.ln2 = nn.LayerNorm(hidden_size)
        self.ln3 = nn.LayerNorm(hidden_size)        
        
        # 활성화 함수
        self.activation = nn.LeakyReLU()
        
        # Dropout
        self.dropout = nn.Dropout(dropout_rate)
     
    def forward(self, x):
        out = self.fc1(x)
        out = self.ln1(out)
        out = self.activation(out)
        out = self.dropout(out)
        
        out = self.fc2(out)
        out = self.ln2(out)
        out = self.activation(out)
        out = self.dropout(out)
        
        out = self.fc3(out)
        out = self.ln3(out)
        out = self.activation(out)
        out = self.dropout(out)

        out = self.fc_out(out)
        return out

In [29]:
model_MLM = Net(CFG['INPUT_SIZE'],CFG['HIDDEN_SIZE'],CFG['DROPOUT_RATE'],CFG['OUTPUT_SIZE'])
model_HLM = Net(CFG['INPUT_SIZE'],CFG['HIDDEN_SIZE'],CFG['DROPOUT_RATE'],CFG['OUTPUT_SIZE'])

In [30]:
criterion = nn.MSELoss()
optimizer_MLM = torch.optim.Adam(model_MLM.parameters(), lr=CFG['LEARNING_RATE'])
optimizer_HLM = torch.optim.Adam(model_HLM.parameters(), lr=CFG['LEARNING_RATE'])

In [31]:
def train(train_loader, valid_loader, model, criterion, optimizer, epochs):
    model.train()
    
    for epoch in range(epochs):
        running_loss = 0
        for inputs, targets in train_loader:
            optimizer.zero_grad()
            
            output = model(inputs)
            loss = criterion(output, targets)
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item()
        
        if epoch % 100 == 0:
            valid_loss = 0
            with torch.no_grad():
                for inputs, targets in valid_loader:
                    output = model(inputs)
                    loss = criterion(output, targets)
                    valid_loss += loss.item()
                    
            print(f'Epoch: {epoch}/{epochs}, Train Loss: {running_loss/len(train_loader)}, Valid Loss: {valid_loss/len(valid_HLM_loader)}')
            
            model.train()
    
    return model

In [32]:
print("Training Start: MLM")
model_MLM = train(train_MLM_loader, valid_MLM_loader, model_MLM, criterion, optimizer_MLM, epochs=CFG['EPOCHS'])

print("Training Start: HLM")
model_HLM = train(train_HLM_loader, valid_HLM_loader, model_HLM, criterion, optimizer_HLM, epochs=CFG['EPOCHS'])  

Training Start: MLM
Epoch: 0/1000, Train Loss: 2092.469082919034, Valid Loss: 1900.72998046875
Epoch: 100/1000, Train Loss: 304.99756414240056, Valid Loss: 1549.8810221354167
Epoch: 200/1000, Train Loss: 183.7533652565696, Valid Loss: 1527.1312255859375
Epoch: 300/1000, Train Loss: 139.0046615600586, Valid Loss: 1504.3356119791667
Epoch: 400/1000, Train Loss: 117.46295166015625, Valid Loss: 1422.7367757161458
Epoch: 500/1000, Train Loss: 96.76879952170633, Valid Loss: 1534.8275553385417
Epoch: 600/1000, Train Loss: 89.51383625377308, Valid Loss: 1606.5711669921875
Epoch: 700/1000, Train Loss: 80.1250325983221, Valid Loss: 1591.6677652994792
Epoch: 800/1000, Train Loss: 78.11895509199663, Valid Loss: 1497.6765543619792
Epoch: 900/1000, Train Loss: 68.56684216586027, Valid Loss: 1556.7668050130208
Training Start: HLM
Epoch: 0/1000, Train Loss: 3230.5836958451705, Valid Loss: 2635.5508626302085
Epoch: 100/1000, Train Loss: 390.78552800958806, Valid Loss: 1298.9443359375
Epoch: 200/1000, T

In [34]:
test_MLM = CustomDataset(df=test, target=None, transform=transform, is_test=True)
test_HLM = CustomDataset(df=test, target=None, transform=transform, is_test=True)

test_MLM_loader = DataLoader(dataset=test_MLM,
                             batch_size=CFG['BATCH_SIZE'],
                             shuffle=False)

test_HLM_loader = DataLoader(dataset=test_HLM,
                             batch_size=CFG['BATCH_SIZE'],
                             shuffle=False)

In [56]:
def inference(test_loader, model):
    model.eval()
    preds = []
    
    with torch.no_grad():
        for inputs in test_loader:
            output = model(inputs)
            preds.extend(output.cpu().numpy().flatten().tolist())
    
    return preds

In [57]:
predictions_MLM = inference(test_MLM_loader, model_MLM)
predictions_HLM = inference(test_HLM_loader, model_HLM)

In [59]:
submission = pd.read_csv('data/sample_submission.csv')
submission

Unnamed: 0,id,MLM,HLM
0,TEST_000,0,0
1,TEST_001,0,0
2,TEST_002,0,0
3,TEST_003,0,0
4,TEST_004,0,0
...,...,...,...
478,TEST_478,0,0
479,TEST_479,0,0
480,TEST_480,0,0
481,TEST_481,0,0


In [61]:
submission['MLM'] = predictions_MLM
submission['HLM'] = predictions_HLM
submission

Unnamed: 0,id,MLM,HLM
0,TEST_000,2.676011,87.764236
1,TEST_001,52.746513,84.391464
2,TEST_002,38.338795,64.148926
3,TEST_003,30.253155,62.875374
4,TEST_004,22.506796,88.883690
...,...,...,...
478,TEST_478,35.174438,74.771637
479,TEST_479,95.249947,96.846245
480,TEST_480,38.255692,77.573036
481,TEST_481,37.676563,67.066414


In [62]:
submission.to_csv('data/baseline_submission.csv', index=False)

In [421]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-1.7.6-py3-none-win_amd64.whl (70.9 MB)
     ---------------------------------------- 70.9/70.9 MB 9.3 MB/s eta 0:00:00
Installing collected packages: xgboost
Successfully installed xgboost-1.7.6



[notice] A new release of pip is available: 23.0 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [495]:
# xgboost를 활용해보쟈
import xgboost

In [497]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [498]:
# 사용할 column만 추출
train_X = train[[ 'AlogP',
 'Molecular_Weight',
 'Num_H_Acceptors',
 'Num_H_Donors',
 'Num_RotatableBonds',
 'LogD',
 'Molecular_PolarSurfaceArea']]

train_MLM = train['MLM']

train_HLM = train['HLM']

In [499]:
test_X = test[[ 'AlogP',
 'Molecular_Weight',
 'Num_H_Acceptors',
 'Num_H_Donors',
 'Num_RotatableBonds',
 'LogD',
 'Molecular_PolarSurfaceArea']]

In [500]:
xgb_model = xgboost.XGBRegressor()

In [501]:
xgb_model.fit(train_X,train_MLM)
predictionsMLM = xgb_model.predict(test_X)

In [503]:
xgb_model.fit(train_X,train_HLM)
predictionsHLM = xgb_model.predict(test_X)

In [505]:
submission['MLM'] = (submission['MLM'] + predictionsMLM) / 2

In [506]:
submission['HLM'] = (submission['HLM'] + predictionsHLM) / 2

In [507]:
submission.to_csv('data/baselineWithXGBoost_submission.csv', index=False)

In [508]:
submission

Unnamed: 0,id,MLM,HLM
0,TEST_000,30.428961,58.627215
1,TEST_001,68.585991,85.519079
2,TEST_002,33.438704,45.191609
3,TEST_003,46.035476,71.820112
4,TEST_004,52.580194,76.973137
...,...,...,...
478,TEST_478,23.392993,34.866105
479,TEST_479,80.482122,93.362574
480,TEST_480,42.539534,70.121817
481,TEST_481,65.389289,83.094238
