In [None]:
!pip install mordred

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import math
import os

from sklearn.model_selection import KFold, train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.optimizers.schedules import ExponentialDecay
from tensorflow.keras.models import load_model

from rdkit import Chem
from rdkit.Chem import PandasTools, AllChem, Draw, ChemicalFeatures
from rdkit.Chem import Descriptors
from rdkit import RDConfig
from rdkit import DataStructs

from mordred import Calculator, descriptors


# Descriptor function definitions

In [None]:
#1 Descriptors provided by RDKit
def getMolDescriptors(mol, missingVal=None):
    """Evaluate every descriptor listed in ``Descriptors._descList`` on one molecule.

    Args:
        mol: Molecule object handed unchanged to each descriptor function.
        missingVal: Value stored for a descriptor whose function raises.

    Returns:
        dict mapping descriptor name -> computed value (``missingVal`` on failure).
    """
    import traceback  # only needed to report a failing descriptor

    res = {}
    for nm, fn in Descriptors._descList:
        # Some descriptor functions raise when they fail; catch those here.
        try:
            val = fn(mol)
        except Exception:
            # `Exception` (not a bare except) so that KeyboardInterrupt / SystemExit
            # still stop a long-running cell instead of being recorded as a
            # missing descriptor.
            traceback.print_exc()
            val = missingVal
        res[nm] = val
    return res

#2 Atomic environments: each atom's neighbouring atoms, hybridization and ring
#  membership; occurrences of each environment are later counted as features.
def extract_fragments(smiles, hybridization_type=None, ring=False):
    """Build one small fragment molecule per selected atom of a SMILES string.

    An atom is selected when its hybridization equals ``hybridization_type``
    (any hybridization if ``None``) and its ring membership matches ``ring``
    (``ring=True`` -> ring atoms only, ``ring=False`` -> non-ring atoms only).
    Each fragment holds the atom plus its direct neighbours, joined with the
    bond types found in the parent molecule.

    Returns:
        list of fragment molecules, or ``None`` if the SMILES cannot be parsed.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    def _environment(center):
        # Centre atom first, then each bonded neighbour with its bond type.
        editable = Chem.EditableMol(Chem.Mol())
        center_idx = editable.AddAtom(center)
        for nbr in center.GetNeighbors():
            bond = mol.GetBondBetweenAtoms(center.GetIdx(), nbr.GetIdx())
            if not bond:
                continue
            nbr_idx = editable.AddAtom(nbr)
            editable.AddBond(center_idx, nbr_idx, bond.GetBondType())
        return editable.GetMol()

    want_ring = bool(ring)
    fragments = []
    for atom in mol.GetAtoms():
        if hybridization_type is not None and not (atom.GetHybridization() == hybridization_type):
            continue
        # Ring atoms when `ring` is truthy, non-ring atoms otherwise.
        if bool(atom.IsInRing()) == want_ring:
            fragments.append(_environment(atom))

    return fragments

def count_fragments(full, column_prefix, hybridization_type=None, ring=False):
    """Count atomic-environment fragments per molecule and return them as features.

    Args:
        full: DataFrame with a 'SMILES' column.
        column_prefix: Prefix put in front of every fragment SMILES to form the
            output column names (``f'{column_prefix}_{fragment_smiles}'``).
        hybridization_type: Forwarded to ``extract_fragments``.
        ring: Forwarded to ``extract_fragments``.

    Returns:
        DataFrame with a fresh 0..n-1 index, exactly one row per input row (in
        input order) and one column per distinct fragment SMILES; absent
        fragments are 0. Rows whose SMILES cannot be parsed are all zeros.
    """
    rows = []
    # Iterate the column directly: one output row per input row, even if the
    # index of `full` contains repeated labels.
    for smiles in full['SMILES']:
        # extract_fragments returns None for an unparsable SMILES; treat that as
        # "no fragments" so the output stays row-aligned with the input.
        fragments = extract_fragments(smiles, hybridization_type, ring) or []

        fragment_count = {}
        for fragment in fragments:
            fragment_smiles = Chem.MolToSmiles(fragment)
            fragment_count[fragment_smiles] = fragment_count.get(fragment_smiles, 0) + 1

        # Placeholder key so that a molecule without any matching fragment still
        # yields a row; the column is dropped again below.
        fragment_count['SMILES'] = smiles
        rows.append(fragment_count)

    # errors='ignore' keeps an empty input from raising on the missing placeholder.
    result_df = pd.DataFrame(rows).fillna(0).drop(columns='SMILES', errors='ignore')
    result_df.columns = [f'{column_prefix}_{column}' for column in result_df.columns]

    return result_df

#3 Stereocenter count, fraction of aromatic atoms, and formal charge
def calculate_molecular_properties(full):
    """Compute three whole-molecule properties for every row of ``full``.

    Args:
        full: DataFrame with a 'SMILES' column.

    Returns:
        DataFrame with a fresh 0..n-1 index and the columns 'stereocenters',
        'aromatic_proportion' and 'formal_charge', one row per input row in
        input order. Callers join the result to their frame by position
        (``pd.concat(..., axis=1)``), so rows are never skipped: an unparsable
        SMILES gives a row of missing values, and a molecule with zero atoms
        gets a missing 'aromatic_proportion'.
    """
    columns = ["stereocenters", "aromatic_proportion", "formal_charge"]
    properties_data = []

    for smiles in full['SMILES']:
        # Parse once and reuse the molecule for all three properties.
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            properties_data.append(dict.fromkeys(columns))
            continue

        total_atoms = mol.GetNumAtoms()
        aromatic_atoms = sum(1 for atom in mol.GetAtoms() if atom.GetIsAromatic())
        charge = Chem.rdmolops.GetFormalCharge(mol)
        # includeUnassigned=True also counts stereocenters without a specified
        # configuration.
        chiral_centers = Chem.FindMolChiralCenters(mol, includeUnassigned=True)

        properties_data.append({
            "stereocenters": len(chiral_centers),
            "aromatic_proportion": aromatic_atoms / total_atoms if total_atoms else None,
            "formal_charge": charge,
        })

    # Explicit columns so that an empty input still yields the expected schema.
    properties_df = pd.DataFrame(properties_data, columns=columns)

    return properties_df

# Data processing

In [None]:
# Competition training set; a missing AlogP is replaced by the row's LogD.
train = pd.read_csv("/content/drive/MyDrive/metabolism_dacon/train.csv")
train["AlogP"] = train["AlogP"].fillna(train["LogD"])

# ChEMBL data (only the HLM augmentation file is loaded)
# MLM = pd.read_csv("/content/drive/MyDrive/metabolism_dacon/train_aug_MLM_0906.csv")
HLM = pd.read_csv("/content/drive/MyDrive/metabolism_dacon/train_aug_HLM_0906.csv")

# Competition test set, with the same AlogP fallback.
test = pd.read_csv("/content/drive/MyDrive/metabolism_dacon/test.csv")
test["AlogP"] = test["AlogP"].fillna(test["LogD"])

# Stack train, test and ChEMBL rows under a fresh 0..n-1 index and parse every
# SMILES into an RDKit molecule.
full = pd.concat([train, test, HLM], axis=0, ignore_index=True)
full['Molecule'] = [Chem.MolFromSmiles(smiles) for smiles in full['SMILES']]

In [None]:
# All rows whose SMILES occurs more than once in `full`, sorted so that equal
# SMILES are adjacent.
is_repeated_smiles = full.duplicated(subset='SMILES', keep=False)
duplicates = full.loc[is_repeated_smiles].sort_values(by='SMILES').reset_index(drop=True)

# Properties are joined by position, which relies on both frames having a
# 0..n-1 index and the same number of rows.
prop = calculate_molecular_properties(duplicates).reset_index(drop=True)
dupli_df = pd.concat([duplicates, prop], axis=1).loc[:, ['SMILES', 'MLM', 'HLM', 'stereocenters']]

# Keep only rows whose 'stereocenters' value is 0
filtered_df = dupli_df.loc[dupli_df['stereocenters'].eq(0)]

In [None]:
# Per-SMILES mean of each target over the stereocenter-free duplicate rows;
# the result is a two-column frame (SMILES, target) with a 0..n-1 index.
mean_mlm = filtered_df.groupby('SMILES', as_index=False)['MLM'].mean()
mean_hlm = filtered_df.groupby('SMILES', as_index=False)['HLM'].mean()

In [None]:
# Consolidate repeated measurements of the same molecule.
#
# Rows whose id contains 'TEST' are kept out of the whole procedure: they carry
# no labels, and if they took part in the de-duplication a test molecule that
# also occurs in the training data would be dropped (or would displace the
# labelled row), leaving fewer predictions than submission rows.
is_test = full['id'].astype(str).str.contains('TEST')
labeled = full.loc[~is_test]

duplicates = (
    labeled[labeled.duplicated(subset='SMILES', keep=False)]
    .sort_values(by='SMILES')
    .reset_index(drop=True)
)
prop = calculate_molecular_properties(duplicates).reset_index(drop=True)
# The column-wise concat pairs rows by position, so both frames must line up.
assert len(prop) == len(duplicates), "property rows do not line up with the duplicate rows"
# reindex (rather than plain selection) keeps the schema when there are no duplicates.
dupli_df = pd.concat([duplicates, prop], axis=1).reindex(
    columns=['SMILES', 'MLM', 'HLM', 'stereocenters']
)

# Keep only rows whose 'stereocenters' value is 0. The copy makes the
# assignments below write to an independent frame instead of a slice.
filtered_df = dupli_df[dupli_df['stereocenters'] == 0].copy()

# Group by 'SMILES' and compute the mean of 'MLM' and 'HLM'
mean_mlm = filtered_df.groupby('SMILES')['MLM'].mean().reset_index()
mean_hlm = filtered_df.groupby('SMILES')['HLM'].mean().reset_index()

# Replace every measurement by the mean of its SMILES group.
filtered_df['MLM'] = filtered_df['SMILES'].map(mean_mlm.set_index('SMILES')['MLM'])
filtered_df['HLM'] = filtered_df['SMILES'].map(mean_hlm.set_index('SMILES')['HLM'])

mean_duplicate = filtered_df.drop_duplicates().reset_index(drop=True)

# For molecules without stereocenters, the experimental values of identical
# molecules are replaced by their mean in `full` (labelled rows only).
mean_by_smiles = mean_duplicate.drop_duplicates(subset='SMILES').set_index('SMILES')
averaged = ~is_test & full['SMILES'].isin(mean_by_smiles.index)
full.loc[averaged, 'MLM'] = full.loc[averaged, 'SMILES'].map(mean_by_smiles['MLM'])
full.loc[averaged, 'HLM'] = full.loc[averaged, 'SMILES'].map(mean_by_smiles['HLM'])

# Drop labelled rows that repeat an earlier (SMILES, HLM, MLM) combination;
# test rows are always kept.
redundant = full.loc[~is_test].duplicated(subset=['SMILES', 'HLM', 'MLM'], keep='first')
keep_row = ~redundant.reindex(full.index, fill_value=False)
filtered_full = full.loc[keep_row].reset_index(drop=True)

In [None]:
# Show the combined frame; its MLM/HLM columns were updated in place by the previous cell.
full

Unnamed: 0,id,SMILES,MLM,HLM,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea,Molecule
0,TRAIN_0000,CCOc1ccc(CNC(=O)c2cc(-c3sc(C)nc3C)n[nH]2)cc1OCC,26.010,50.680,3.259,400.495,5,2,8,3.25900,117.37,<rdkit.Chem.rdchem.Mol object at 0x7acde4b22b90>
1,TRAIN_0001,Cc1nc(C)c(CN2CC(C)C(=O)Nc3ccccc32)s1,29.270,50.590,2.169,301.407,2,1,2,2.17200,73.47,<rdkit.Chem.rdchem.Mol object at 0x7acde4b22ab0>
2,TRAIN_0002,CCCN1CCN(c2nn3nnnc3c3ccccc23)CC1,5.586,80.892,1.593,297.358,5,0,3,1.58500,62.45,<rdkit.Chem.rdchem.Mol object at 0x7acde4b22c00>
3,TRAIN_0003,Cc1ccc(-c2ccc(-n3nc(C)c(S(=O)(=O)N4CCN(C5CCCCC...,5.710,2.000,4.771,494.652,6,0,5,3.47500,92.60,<rdkit.Chem.rdchem.Mol object at 0x7acde4b22c70>
4,TRAIN_0004,Cc1ccc2c(c1)N(C(=O)c1ccncc1)CC(C)O2,93.270,99.990,2.335,268.310,3,0,1,2.33700,42.43,<rdkit.Chem.rdchem.Mol object at 0x7acde4b22ce0>
...,...,...,...,...,...,...,...,...,...,...,...,...
5573,CHEMBL3286701,Cc1[nH]c2ccccc2c1CCNC(=O)c1ccc(N(C)C)cc1,,17.000,3.510,321.420,2,2,5,3.51482,,<rdkit.Chem.rdchem.Mol object at 0x7acde49d89e0>
5574,CHEMBL215387,O=C(O)CCCCCCCCCCCNC(=O)NC12CC3CC(CC(C3)C1)C2,,83.500,5.240,392.580,2,3,13,5.24000,,<rdkit.Chem.rdchem.Mol object at 0x7acde49d8a50>
5575,CHEMBL242459,O=C(NC12CC3CC(CC(C3)C1)C2)N[C@H]1CC[C@H](Oc2cc...,,83.000,4.340,412.530,3,3,5,4.34280,,<rdkit.Chem.rdchem.Mol object at 0x7acde49d8ac0>
5576,CHEMBL3735279,CCOC(=O)c1nc(C)c2c(c1N)C(=O)N(Cc1ccccc1)C2=O,,14.000,1.950,339.350,6,1,4,1.94512,,<rdkit.Chem.rdchem.Mol object at 0x7acde49d8b30>


In [None]:
# Show the de-duplicated frame that the feature-generation cells below operate on.
filtered_full

Unnamed: 0,id,SMILES,MLM,HLM,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea,Molecule
0,TRAIN_0000,CCOc1ccc(CNC(=O)c2cc(-c3sc(C)nc3C)n[nH]2)cc1OCC,26.010,50.680,3.259,400.495,5,2,8,3.25900,117.37,<rdkit.Chem.rdchem.Mol object at 0x7acde4b22b90>
1,TRAIN_0001,Cc1nc(C)c(CN2CC(C)C(=O)Nc3ccccc32)s1,29.270,50.590,2.169,301.407,2,1,2,2.17200,73.47,<rdkit.Chem.rdchem.Mol object at 0x7acde4b22ab0>
2,TRAIN_0002,CCCN1CCN(c2nn3nnnc3c3ccccc23)CC1,5.586,80.892,1.593,297.358,5,0,3,1.58500,62.45,<rdkit.Chem.rdchem.Mol object at 0x7acde4b22c00>
3,TRAIN_0003,Cc1ccc(-c2ccc(-n3nc(C)c(S(=O)(=O)N4CCN(C5CCCCC...,5.710,2.000,4.771,494.652,6,0,5,3.47500,92.60,<rdkit.Chem.rdchem.Mol object at 0x7acde4b22c70>
4,TRAIN_0004,Cc1ccc2c(c1)N(C(=O)c1ccncc1)CC(C)O2,93.270,99.990,2.335,268.310,3,0,1,2.33700,42.43,<rdkit.Chem.rdchem.Mol object at 0x7acde4b22ce0>
...,...,...,...,...,...,...,...,...,...,...,...,...
5491,CHEMBL2177757,O=C(Cc1ccccc1)Nc1nnc(CCSCCc2nnc(NC(=O)Cc3ccccc...,,56.000,4.270,524.700,9,2,12,4.27040,,<rdkit.Chem.rdchem.Mol object at 0x7acde49d8900>
5492,CHEMBL3286635,CCN(CC)c1ccc(C(=O)NCCn2c(C)cc3ccccc32)cc1,,9.000,4.230,349.480,3,1,7,4.22592,,<rdkit.Chem.rdchem.Mol object at 0x7acde49d8970>
5493,CHEMBL3286701,Cc1[nH]c2ccccc2c1CCNC(=O)c1ccc(N(C)C)cc1,,17.000,3.510,321.420,2,2,5,3.51482,,<rdkit.Chem.rdchem.Mol object at 0x7acde49d89e0>
5494,CHEMBL3735279,CCOC(=O)c1nc(C)c2c(c1N)C(=O)N(Cc1ccccc1)C2=O,,14.000,1.950,339.350,6,1,4,1.94512,,<rdkit.Chem.rdchem.Mol object at 0x7acde49d8b30>


In [None]:
# Mordred descriptors (3D descriptors excluded), one row per molecule of filtered_full
calc = Calculator(descriptors, ignore_3D=True)
mord_desc_df = calc.pandas(list(map(Chem.MolFromSmiles, filtered_full.SMILES)))
mord_desc_df.columns = [f'mord_{column}' for column in mord_desc_df.columns]

# Remove every column stored with object dtype
# (presumably descriptors for which Mordred returned non-numeric values — TODO confirm).
object_columns = mord_desc_df.select_dtypes(include=['object']).columns
mord_desc_df.drop(columns=object_columns, inplace=True)

# Boolean descriptors become 0/1 integers.
bool_columns = mord_desc_df.select_dtypes(include=['bool']).columns
mord_desc_df[bool_columns] = mord_desc_df[bool_columns].astype(int)

print(mord_desc_df.head())

100%|██████████| 5496/5496 [09:41<00:00,  9.45it/s]


    mord_ABC  mord_ABCGG  mord_nAcid  mord_nBase  mord_nAromAtom  \
0  21.379612   17.449011           0           0              16   
1  16.539255   14.049653           0           0              11   
2  17.475469   13.660693           2           1              13   
3  27.857311   20.034364           0           1              17   
4  15.722758   12.817176           0           0              12   

   mord_nAromBond  mord_nAtom  mord_nHeavyAtom  mord_nSpiro  mord_nBridgehead  \
0              16          52               28            0                 0   
1              11          40               21            0                 0   
2              15          41               22            0                 0   
3              17          69               35            0                 0   
4              12          36               20            0                 0   

   ...  mord_SRW09  mord_SRW10  mord_TSRW10     mord_MW  mord_AMW  mord_WPath  \
0  ...    7.390799   10

In [None]:
# One RDKit descriptor dictionary per molecule, collected into a DataFrame
# (one row per molecule, one column per descriptor name).
allDescrs = list(map(getMolDescriptors, filtered_full['Molecule']))
full_Descrs = pd.DataFrame(data=allDescrs)

In [None]:
# Atomic-environment count features, one frame per hybridization / ring-membership class.
_hybridization = Chem.HybridizationType

result_df_sp3 = count_fragments(
    filtered_full, 'ring_sp3', hybridization_type=_hybridization.SP3, ring=True)
result_df_non_ring_sp3 = count_fragments(
    filtered_full, 'non_ring_sp3', hybridization_type=_hybridization.SP3)
result_df_sp2 = count_fragments(
    filtered_full, 'ring_sp2', hybridization_type=_hybridization.SP2, ring=True)
result_df_non_ring_sp2 = count_fragments(
    filtered_full, 'non_ring_sp2', hybridization_type=_hybridization.SP2)
# With the default ring=False only non-ring SP atoms are counted here.
result_df_sp = count_fragments(
    filtered_full, 'sp', hybridization_type=_hybridization.SP)

# Stereocenter count, aromatic proportion and formal charge per molecule.
prop_df = calculate_molecular_properties(filtered_full)

In [None]:
# 9/14: features contributed by instructor Hanri
# Wiener Index
def calculate_wiener_index(smiles):
    """Return the Wiener index of the molecule given as SMILES.

    The Wiener index is half the sum of all entries of the molecule's distance
    matrix (the matrix counts every atom pair twice).

    Returns:
        The index as a float, or ``None`` if the SMILES cannot be parsed.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    # Get distance matrix for molecule
    dist_matrix = Chem.GetDistanceMatrix(mol)

    # ndarray.sum() adds all entries in one call and also handles a 0x0 matrix,
    # where the nested builtin sum(sum(...)) raises TypeError.
    wiener = dist_matrix.sum() / 2.0
    return wiener


In [None]:


# Feature factory built from the BaseFeatures.fdef file in RDKit's data directory.
fdefName = os.path.join(RDConfig.RDDataDir, 'BaseFeatures.fdef')
factory = ChemicalFeatures.BuildFeatureFactory(fdefName)

# Row label of filtered_full -> {feature type: count}; filled further down in this cell.
feature_counts = dict()

def extract_features(smiles):
    """Count the chemical features of a molecule by feature type.

    Uses the module-level ``factory`` to enumerate the features of the parsed
    molecule and tallies them by ``GetType()``.

    Returns:
        dict mapping feature type -> number of occurrences (in order of first
        occurrence), or ``None`` if the SMILES cannot be parsed.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    feature_count = {}
    for feat in factory.GetFeaturesForMol(mol):
        kind = feat.GetType()
        feature_count[kind] = feature_count.get(kind, 0) + 1

    return feature_count

# Count the feature types of every molecule in filtered_full.
for index, row in filtered_full.iterrows():
    smiles = row['SMILES']
    # extract_features returns None for an unparsable SMILES; store an empty
    # count instead so the row is kept (all zeros) and the loop below does not
    # fail when it adds the 'SMILES' key.
    feature_count = extract_features(smiles) or {}
    feature_counts[index] = feature_count

# One dict per molecule; the 'SMILES' key guarantees a row even for molecules
# without any feature and is dropped again right after building the frame.
result_data = []
for index, counts in feature_counts.items():
    counts['SMILES'] = filtered_full.loc[index, 'SMILES']
    result_data.append(counts)

result_df_features = pd.DataFrame(result_data).fillna(0).drop(columns='SMILES')

result_df_features.columns = ['feature_' + column for column in result_df_features.columns]

print(result_df_features)


      feature_SingleAtomDonor  feature_SingleAtomAcceptor  feature_Arom5  \
0                         3.0                         5.0            2.0   
1                         1.0                         2.0            1.0   
2                         1.0                         4.0            1.0   
3                         1.0                         5.0            1.0   
4                         0.0                         3.0            0.0   
...                       ...                         ...            ...   
5491                      2.0                         6.0            2.0   
5492                      1.0                         1.0            1.0   
5493                      2.0                         1.0            1.0   
5494                      2.0                         5.0            0.0   
5495                      3.0                         4.0            1.0   

      feature_Arom6  feature_ThreeWayAttach  feature_RH6_6  \
0               1.0      

In [None]:
# RDKit descriptor columns that are taken from full_Descrs into the final feature table.
Desc_feature = [
    # E-state indices
    'MaxAbsEStateIndex',
    'MaxEStateIndex',
    'MinAbsEStateIndex',
    'MinEStateIndex',
    # drug-likeness, weight, electrons
    'qed',
    'HeavyAtomMolWt',
    'NumValenceElectrons',
    'NumRadicalElectrons',
    # partial charges
    'MaxPartialCharge',
    'MinPartialCharge',
    'MaxAbsPartialCharge',
    'MinAbsPartialCharge',
    # fingerprint densities
    'FpDensityMorgan1',
    'FpDensityMorgan2',
    'FpDensityMorgan3',
    # topology and polarity
    'HallKierAlpha',
    'Ipc',
    'NHOHCount',
    'NOCount',
    'TPSA',
]

In [None]:
# Column-wise merge of the base table with every generated feature block.
# All blocks are joined on their index, so each must be row-aligned with filtered_full.
_feature_blocks = [
    filtered_full,
    full_Descrs[Desc_feature],
    mord_desc_df,
    result_df_sp3,
    result_df_non_ring_sp3,
    result_df_sp2,
    result_df_non_ring_sp2,
    result_df_sp,
    prop_df,
    result_df_features,
]
Data = pd.concat(_feature_blocks, axis=1).drop(columns=['Molecule'])

# Fill gaps in the provided columns from computed equivalents:
# polar surface area from RDKit's TPSA, AlogP from LogD.
Data["Molecular_PolarSurfaceArea"] = Data["Molecular_PolarSurfaceArea"].fillna(Data["TPSA"])
Data["AlogP"] = Data["AlogP"].fillna(Data["LogD"])

In [None]:
# Drug-likeness criteria (mainly related to membrane permeability and solubility):
# Lipinski's rule of five plus additional criteria.
# 'beyond_Lipinski' is the number of criteria below that a molecule meets.
_beyond_criteria = [
    Data['Molecular_Weight'] >= 500,
    Data['LogD'] >= 5,
    Data['Num_H_Donors'] >= 5,
    Data['Num_H_Acceptors'] >= 10,
    Data['Num_RotatableBonds'] >= 10,
    (Data['Num_H_Donors'] + Data['Num_H_Acceptors']) >= 12,
    Data['TPSA'] >= 140,
    Data['mord_FCSP3'] <= 0.3,
    Data['mord_naRing'] >= 5,
    Data['NOCount'] >= 10,
    Data['NHOHCount'] >= 5,
]

# Each criterion contributes 1 where it holds (a comparison with NaN is False).
Data['beyond_Lipinski'] = sum(criterion.astype('int64') for criterion in _beyond_criteria)

In [None]:
# Features related to the membrane permeability of the drug
_radius = Data['mord_Radius']
_labute_asa = Data['mord_LabuteASA']

# H-bond acceptors times sqrt(H-bond donors), per unit of Labute surface area
Data['ACDNSA'] = (Data['Num_H_Acceptors'] * np.sqrt(Data['Num_H_Donors'])) / _labute_asa
# LogD relative to (radius * molecular weight)
Data['LogD_Radius'] = Data['LogD'] / (_radius * Data['Molecular_Weight'])
# Squared radius per unit of Labute surface area
Data['globe'] = (_radius * _radius) / _labute_asa

In [None]:
# Show the assembled feature table, including the engineered columns added above.
Data

Unnamed: 0,id,SMILES,MLM,HLM,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,...,feature_PosN,feature_RH3_3,feature_Nitro2,feature_ZnBinder6,feature_Arom7,feature_ZnBinder4,beyond_Lipinski,ACDNSA,LogD_Radius,globe
0,TRAIN_0000,CCOc1ccc(CNC(=O)c2cc(-c3sc(C)nc3C)n[nH]2)cc1OCC,26.010,50.680,3.259,400.495,5,2,8,3.25900,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.042175,0.001017,0.381723
1,TRAIN_0001,Cc1nc(C)c(CN2CC(C)C(=O)Nc3ccccc32)s1,29.270,50.590,2.169,301.407,2,1,2,2.17200,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.015580,0.001441,0.194752
2,TRAIN_0002,CCCN1CCN(c2nn3nnnc3c3ccccc23)CC1,5.586,80.892,1.593,297.358,5,0,3,1.58500,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.000000,0.000888,0.281136
3,TRAIN_0003,Cc1ccc(-c2ccc(-n3nc(C)c(S(=O)(=O)N4CCN(C5CCCCC...,5.710,2.000,4.771,494.652,6,0,5,3.47500,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.000000,0.000703,0.481757
4,TRAIN_0004,Cc1ccc2c(c1)N(C(=O)c1ccncc1)CC(C)O2,93.270,99.990,2.335,268.310,3,0,1,2.33700,...,0.0,0.0,0.0,0.0,0.0,0.0,1,0.000000,0.001742,0.212654
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5491,CHEMBL2177757,O=C(Cc1ccccc1)Nc1nnc(CCSCCc2nnc(NC(=O)Cc3ccccc...,,56.000,4.270,524.700,9,2,12,4.27040,...,0.0,0.0,0.0,0.0,0.0,0.0,3,0.059154,0.000678,0.669256
5492,CHEMBL3286635,CCN(CC)c1ccc(C(=O)NCCn2c(C)cc3ccccc32)cc1,,9.000,4.230,349.480,3,1,7,4.22592,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.019357,0.001512,0.412956
5493,CHEMBL3286701,Cc1[nH]c2ccccc2c1CCNC(=O)c1ccc(N(C)C)cc1,,17.000,3.510,321.420,2,2,5,3.51482,...,0.0,0.0,0.0,0.0,0.0,0.0,1,0.019913,0.001562,0.344973
5494,CHEMBL3735279,CCOC(=O)c1nc(C)c2c(c1N)C(=O)N(Cc1ccccc1)C2=O,,14.000,1.950,339.350,6,1,4,1.94512,...,0.0,0.0,0.0,0.0,0.0,0.0,1,0.041680,0.000819,0.340385


In [None]:
# Rows whose 'id' contains the string 'TEST' form the test set; every other row
# (competition training data and ChEMBL rows) forms the training set.
# .copy() gives each split its own data, so the later column assignments
# (feature scaling) do not write into a slice of `Data` and no longer trigger
# SettingWithCopyWarning.
_is_test_row = Data['id'].str.contains('TEST')

test = Data[_is_test_row].copy()
print(len(test))

train = Data[~_is_test_row].copy()
print(len(train))

483
5013


In [None]:
from sklearn.preprocessing import StandardScaler, RobustScaler

# Min-max normalisation (the standard and robust scalers are instantiated but
# not applied in this cell).
scaler = MinMaxScaler()
standard_scaler = StandardScaler()
robust_scaler = RobustScaler()
non_features = ['id', 'SMILES', 'MLM', 'HLM','Fingerprint']
features = [column for column in train.columns if column not in non_features]

# `train` and `test` were created by boolean indexing of `Data`; take
# independent copies before assigning columns so the writes are unambiguous
# and pandas does not emit SettingWithCopyWarning.
train = train.copy()
test = test.copy()

# Fit on the training rows only, then apply the same scaling to the test rows.
train[features] = scaler.fit_transform(train[features])

test[features] = scaler.transform(test[features])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train[features] = scaler.fit_transform(train[features])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test[features] = scaler.transform(test[features])


In [None]:
# Because ChEMBL rows were added, the MLM and HLM training sets differ.
# HLM set: the MLM column is removed first, so only rows with a missing HLM or
# a missing feature value are dropped.
train_HLM = train.drop(columns='MLM').dropna(axis=0, how='any')
# MLM set: any missing value (including MLM or HLM) removes the row.
train_MLM = train.dropna(axis=0, how='any')

In [None]:
# Define features and targets
non_features = ['id', 'SMILES', 'MLM', 'HLM', 'Fingerprint']
features = [column for column in train.columns if column not in non_features]
mlm_target = "MLM"
hlm_target = "HLM"

# Initialize KFold
seed = 42
n_splits = 10
kf = KFold(n_splits=n_splits, random_state=seed, shuffle=True)

# Seed Python, NumPy and TensorFlow so weight initialisation, dropout and batch
# shuffling are repeatable, not only the fold assignment.
tf.keras.utils.set_random_seed(seed)

# Containers for the HLM models and their per-fold RMSE. Only the HLM
# containers are (re)initialised here, so re-running this cell does not wipe
# results produced by the MLM training cell.
reg_hlms = []
hlm_rmse_scores = []

# Loop through KFold splits
for i, (train_index, valid_index) in enumerate(kf.split(train_HLM)):
    df_train = train_HLM.iloc[train_index]
    df_valid = train_HLM.iloc[valid_index]

    x_train_num = df_train[features].values
    y_hlm_train = df_train[hlm_target].values

    x_valid_num = df_valid[features].values
    y_hlm_valid = df_valid[hlm_target].values

    # Fully connected regressor: four Dense -> BatchNorm -> Dropout stages, a
    # final Dense(128) -> BatchNorm stage, and a single linear output unit.
    # `tf.keras` is used throughout because a bare `keras` name is never imported.
    x1_input = tf.keras.Input(shape=(x_train_num.shape[1],))
    x1 = x1_input
    for units in (256, 1024, 512, 256):
        x1 = layers.Dense(units, activation='relu')(x1)
        x1 = layers.BatchNormalization()(x1)
        x1 = layers.Dropout(0.2)(x1)
    x1 = layers.Dense(128, activation='relu')(x1)
    x1 = layers.BatchNormalization()(x1)

    outputs = layers.Dense(1)(x1)

    model_hlm = tf.keras.Model(inputs=x1_input, outputs=outputs)

    # SGD whose learning rate is multiplied by 0.98 every 8 optimizer steps.
    lr_schedule = ExponentialDecay(
        initial_learning_rate=0.001,
        decay_steps=8,
        decay_rate=0.98)
    optimizer_decay = tf.keras.optimizers.SGD(learning_rate=lr_schedule)

    model_hlm.compile(optimizer=optimizer_decay, loss='mean_squared_error', metrics=[RootMeanSquaredError()])

    # Save the best epoch (lowest val_loss) per fold and stop after 20 epochs
    # without improvement, restoring the best weights into the in-memory model.
    checkpoint_hlm = ModelCheckpoint(f'model_hlm_fold{i}.h5', monitor='val_loss', verbose=0, save_best_only=True)
    early_stopping_hlm = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)

    model_hlm.fit(x_train_num, y_hlm_train, epochs=500, batch_size=16, verbose=1, validation_data=(x_valid_num, y_hlm_valid),
                  callbacks=[checkpoint_hlm, early_stopping_hlm])

    reg_hlms.append(model_hlm)

    # Calculate RMSE for HLM predictions on the validation fold
    y_hlm_pred = model_hlm.predict(x_valid_num)
    hlm_rmse = math.sqrt(mean_squared_error(y_hlm_valid, y_hlm_pred))
    print(hlm_rmse)
    hlm_rmse_scores.append(hlm_rmse)

In [None]:
# Define features and targets
non_features = ['id', 'SMILES', 'MLM', 'HLM', 'Fingerprint']
features = [column for column in train.columns if column not in non_features]
mlm_target = "MLM"
hlm_target = "HLM"

# Initialize KFold
seed = 42
n_splits = 10
kf = KFold(n_splits=n_splits, random_state=seed, shuffle=True)

# Seed Python, NumPy and TensorFlow so weight initialisation, dropout and batch
# shuffling are repeatable, not only the fold assignment.
tf.keras.utils.set_random_seed(seed)

# Containers for the MLM models and their per-fold RMSE. The HLM containers
# (reg_hlms, hlm_rmse_scores) are deliberately NOT reset here: they hold the
# results of the HLM training cell and would otherwise be wiped.
reg_mlms = []
mlm_rmse_scores = []

# Loop through KFold splits
for i, (train_index, valid_index) in enumerate(kf.split(train_MLM)):
    df_train = train_MLM.iloc[train_index]
    df_valid = train_MLM.iloc[valid_index]

    x_train_num = df_train[features].values
    y_mlm_train = df_train[mlm_target].values

    x_valid_num = df_valid[features].values
    y_mlm_valid = df_valid[mlm_target].values

    # Fully connected regressor: four Dense -> BatchNorm -> Dropout stages, a
    # final Dense(128) -> BatchNorm stage, and a single linear output unit.
    # `tf.keras` is used throughout because a bare `keras` name is never imported.
    x1_input = tf.keras.Input(shape=(x_train_num.shape[1],))
    x1 = x1_input
    for units in (256, 1024, 512, 256):
        x1 = layers.Dense(units, activation='relu')(x1)
        x1 = layers.BatchNormalization()(x1)
        x1 = layers.Dropout(0.2)(x1)
    x1 = layers.Dense(128, activation='relu')(x1)
    x1 = layers.BatchNormalization()(x1)

    outputs = layers.Dense(1)(x1)

    model = tf.keras.Model(inputs=x1_input, outputs=outputs)

    # SGD whose learning rate is multiplied by 0.98 every 8 optimizer steps.
    lr_schedule = ExponentialDecay(
        initial_learning_rate=0.001,
        decay_steps=8,
        decay_rate=0.98)
    optimizer_decay = tf.keras.optimizers.SGD(learning_rate=lr_schedule)

    model.compile(optimizer=optimizer_decay, loss='mean_squared_error', metrics=[RootMeanSquaredError()])

    # Train the model: save the best epoch (lowest val_loss) per fold and stop
    # after 20 epochs without improvement, restoring the best weights.
    checkpoint_mlm = ModelCheckpoint(f'model_mlm_fold{i}.h5', monitor='val_loss', verbose=0, save_best_only=True)
    early_stopping_mlm = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)

    model.fit(x_train_num, y_mlm_train, epochs=500, batch_size=16, verbose=1, validation_data=(x_valid_num, y_mlm_valid),
              callbacks=[checkpoint_mlm, early_stopping_mlm])

    reg_mlms.append(model)

    # Calculate RMSE for MLM predictions on the validation fold
    y_mlm_pred = model.predict(x_valid_num)
    mlm_rmse = math.sqrt(mean_squared_error(y_mlm_valid, y_mlm_pred))
    print(mlm_rmse)
    mlm_rmse_scores.append(mlm_rmse)


In [None]:


# Reload the per-fold checkpoints written during training, for both targets.
mlm_models = [load_model(f'model_mlm_fold{fold}.h5') for fold in range(n_splits)]
hlm_models = [load_model(f'model_hlm_fold{fold}.h5') for fold in range(n_splits)]

x_test = test[features].values

# One prediction array per fold and target.
mlm_predictions, hlm_predictions = [], []
for model_mlm, model_hlm in zip(mlm_models, hlm_models):
    mlm_predictions += [model_mlm.predict(x_test)]
    hlm_predictions += [model_hlm.predict(x_test)]

# Stack the per-fold predictions along a new leading (fold) axis ...
mlm_predictions = np.asarray(mlm_predictions)
hlm_predictions = np.asarray(hlm_predictions)

# ... and average over the folds to obtain the ensemble prediction.
mlm_ensemble_prediction = np.mean(mlm_predictions, axis=0)
hlm_ensemble_prediction = np.mean(hlm_predictions, axis=0)

In [None]:
df_submission = pd.read_csv("/content/drive/MyDrive/metabolism_dacon/sample_submission.csv")

# The models end in Dense(1), so the fold-averaged predictions are expected to
# have shape (n_samples, 1); flatten them to plain 1-D columns instead of
# relying on pandas to accept a 2-D array for a single column.
mlm_column = np.ravel(mlm_ensemble_prediction)
hlm_column = np.ravel(hlm_ensemble_prediction)

# One prediction per submission row is required; fail with a clear message otherwise.
assert len(mlm_column) == len(df_submission) == len(hlm_column), (
    f"prediction count ({len(mlm_column)} MLM / {len(hlm_column)} HLM) does not match "
    f"the {len(df_submission)} submission rows"
)

df_submission["MLM"] = mlm_column
df_submission["HLM"] = hlm_column
df_submission.to_csv("submission.csv", index = False, encoding = "utf-8-sig")

In [None]:
# Show the submission table that was just written to submission.csv.
df_submission

Unnamed: 0,id,MLM,HLM
0,TEST_000,20.674702,47.023026
1,TEST_001,68.731430,79.063332
2,TEST_002,44.597523,67.481194
3,TEST_003,40.368557,71.071121
4,TEST_004,59.678509,75.166687
...,...,...,...
478,TEST_478,6.414800,22.438740
479,TEST_479,73.108261,85.277214
480,TEST_480,48.360413,62.935143
481,TEST_481,48.941154,71.435020
