In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# 무시
pd.set_option('mode.chained_assignment',  None) # <==== 경고를 끈다

In [2]:
# seed
seed = 42

# define rmse
rmse = lambda x, y: np.mean((x - y) ** 2) ** 0.5 

# features and targets
features = ["AlogP", "Molecular_Weight", "Num_H_Acceptors", "Num_H_Donors", "Num_RotatableBonds", 
            "LogD", "Molecular_PolarSurfaceArea"]
mlm_target = "MLM"
hlm_target = "HLM"

# load data
df = pd.read_csv("data/train.csv")
df["AlogP"] = np.where(pd.isna(df["AlogP"]), df["LogD"], df["AlogP"])


In [3]:
df

Unnamed: 0,id,SMILES,MLM,HLM,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea
0,TRAIN_0000,CCOc1ccc(CNC(=O)c2cc(-c3sc(C)nc3C)n[nH]2)cc1OCC,26.010,50.680,3.259,400.495,5,2,8,3.259,117.37
1,TRAIN_0001,Cc1nc(C)c(CN2CC(C)C(=O)Nc3ccccc32)s1,29.270,50.590,2.169,301.407,2,1,2,2.172,73.47
2,TRAIN_0002,CCCN1CCN(c2nn3nnnc3c3ccccc23)CC1,5.586,80.892,1.593,297.358,5,0,3,1.585,62.45
3,TRAIN_0003,Cc1ccc(-c2ccc(-n3nc(C)c(S(=O)(=O)N4CCN(C5CCCCC...,5.710,2.000,4.771,494.652,6,0,5,3.475,92.60
4,TRAIN_0004,Cc1ccc2c(c1)N(C(=O)c1ccncc1)CC(C)O2,93.270,99.990,2.335,268.310,3,0,1,2.337,42.43
...,...,...,...,...,...,...,...,...,...,...,...
3493,TRAIN_3493,Cn1nc(CNC(=O)Cn2nc(C(F)(F)F)c3c2CCC3)c(Cl)c1Cl,1.556,3.079,3.409,396.195,3,1,5,3.409,64.74
3494,TRAIN_3494,CCn1[nH]cc/c1=N\C(=O)c1nn(-c2ccccc2)c(=O)c2ccc...,35.560,47.630,1.912,359.381,4,1,3,1.844,77.37
3495,TRAIN_3495,CCOC(=O)CCCc1nc2cc(N)ccc2n1C,56.150,1.790,1.941,261.320,3,1,6,2.124,70.14
3496,TRAIN_3496,Nc1cc(C(=O)OCCC2CCOC2=O)cnc1Cl,0.030,2.770,0.989,284.696,5,1,5,0.989,91.51


In [11]:
# train
scores = []
# reg_mlms = []
# reg_hlms = []
lgb_mlms = []
lgb_hlms = []

kf = KFold(n_splits = 50, random_state = seed, shuffle = True)
for i, (train_index, valid_index) in enumerate(kf.split(df)):
    df_train = df.iloc[train_index]
    df_valid = df.iloc[valid_index]

    for col in df_train.iloc[:,4:].columns:
        # Standardization _ Standard
        st_scaler = StandardScaler()
        st_scaled = st_scaler.fit_transform(df_train[[col]])
        df_train[[col]] = st_scaled
        # Normalization _ MinMax
        mm_scaler = MinMaxScaler()
        mm_scaled = mm_scaler.fit_transform(df_train[[col]])
        df_train[[col]] = mm_scaled

        # Same Scaler to validation Data
        st_scaled_t = st_scaler.transform(df_valid[[col]])
        df_valid[[col]] = st_scaled_t
        mm_scaled_t = mm_scaler.transform(df_valid[[col]])
        df_valid[[col]] = mm_scaled_t

    x_train = df_train[features].values
    y_mlm_train = df_train[mlm_target].values
    y_hlm_train = df_train[hlm_target].values

    x_valid = df_valid[features].values
    y_mlm_valid = df_valid[mlm_target].values
    y_hlm_valid = df_valid[hlm_target].values
    
    # rf
    # reg_mlm = RandomForestRegressor(random_state = seed)
    # reg_mlm.fit(x_train, y_mlm_train)
    # p_mlm = reg_mlm.predict(x_valid)
    # reg_hlm = RandomForestRegressor(random_state = seed)
    # reg_hlm.fit(x_train, y_hlm_train)
    # p_hlm = reg_hlm.predict(x_valid)
    
    # lgbm
    lgb_mlm = LGBMRegressor(random_state = seed)
    lgb_mlm.fit(x_train, y_mlm_train)
    p_mlm = lgb_mlm.predict(x_valid)
    lgb_hlm = LGBMRegressor(random_state = seed)
    lgb_hlm.fit(x_train, y_hlm_train)
    p_hlm = lgb_hlm.predict(x_valid)

    score = 0.5 * rmse(y_mlm_valid, p_mlm) + 0.5 * rmse(y_hlm_valid, p_hlm)

    # reg_mlms.append(reg_mlm)
    # reg_hlms.append(reg_hlm)
    lgb_mlms.append(lgb_mlm)
    lgb_hlms.append(lgb_hlm)

    scores.append(score)
    print(f"Fold {i+1:2d}: {score:.5f}")

print(f"CV score: {np.mean(scores):.5f}")

Fold  1: 34.57636
Fold  2: 34.47936
Fold  3: 35.37335
Fold  4: 32.90915
Fold  5: 32.93524
Fold  6: 37.69387
Fold  7: 34.51076
Fold  8: 29.14137
Fold  9: 35.81904
Fold 10: 30.66150
Fold 11: 34.04569
Fold 12: 35.18424
Fold 13: 30.64775
Fold 14: 31.26443
Fold 15: 31.66998
Fold 16: 36.06186
Fold 17: 34.09457
Fold 18: 34.20904
Fold 19: 33.68007
Fold 20: 36.43316
Fold 21: 31.06512
Fold 22: 32.84196
Fold 23: 29.58454
Fold 24: 33.43205
Fold 25: 31.55590
Fold 26: 31.12873
Fold 27: 34.78837
Fold 28: 32.54276
Fold 29: 35.00113
Fold 30: 31.77342
Fold 31: 29.95142
Fold 32: 30.51791
Fold 33: 34.58580
Fold 34: 30.14678
Fold 35: 31.41830
Fold 36: 33.59675
Fold 37: 28.10005
Fold 38: 34.50438
Fold 39: 32.65692
Fold 40: 31.72874
Fold 41: 32.07022
Fold 42: 31.05745
Fold 43: 31.66654
Fold 44: 27.78824
Fold 45: 31.72982
Fold 46: 30.40680
Fold 47: 31.39558
Fold 48: 34.43388
Fold 49: 35.06008
Fold 50: 31.81573
CV score: 32.67472


In [14]:
# load data
df = pd.read_csv("data/test.csv")
df["AlogP"] = np.where(pd.isna(df["AlogP"]), df["LogD"], df["AlogP"])

# predict
df_submission = pd.read_csv("data/sample_submission.csv")
# df_submission["MLM"] = np.mean([reg_mlm.predict(df[features].values) for reg_mlm in reg_mlms], axis = 0)
# df_submission["HLM"] = np.mean([reg_hlm.predict(df[features].values) for reg_hlm in reg_hlms], axis = 0)
df_submission["MLM"] = np.mean([lgb_mlm.predict(df[features].values) for lgb_mlm in lgb_mlms], axis = 0)
df_submission["HLM"] = np.mean([lgb_hlm.predict(df[features].values) for lgb_hlm in lgb_hlms], axis = 0)


In [15]:
df_submission

Unnamed: 0,id,MLM,HLM
0,TEST_000,60.436658,62.350984
1,TEST_001,13.437314,26.474688
2,TEST_002,60.436658,62.350984
3,TEST_003,45.243695,39.785297
4,TEST_004,70.541544,58.292353
...,...,...,...
478,TEST_478,60.436658,62.350984
479,TEST_479,54.937294,65.892655
480,TEST_480,60.436658,62.350984
481,TEST_481,76.598476,58.091959


In [16]:
df_submission.to_csv("submission_lgbm.csv", index = False, encoding = "utf-8-sig")