In [None]:
!pip install autogluon

In [47]:
from autogluon.tabular import TabularDataset, TabularPredictor

In [2]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2023.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.7/29.7 MB[0m [31m51.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit
Successfully installed rdkit-2023.3.3


In [7]:
import random
import os

import numpy as np
import pandas as pd

from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from rdkit import DataStructs
from rdkit.Chem import PandasTools, AllChem

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [8]:
def seed_everything(seed):
    """Seed every random number generator this notebook imports.

    Seeds Python's ``random`` module, NumPy's legacy global RNG and PyTorch's
    global RNG, and exports ``PYTHONHASHSEED`` to the environment.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    # torch is imported alongside random/numpy, so seed its global RNG as well.
    torch.manual_seed(seed)
    # Exported for child processes; the hash seed of the already-running
    # interpreter is fixed at start-up and is not changed by this assignment.
    os.environ["PYTHONHASHSEED"] = str(seed)

seed_everything(42)  # fix the seed

In [49]:
!unzip '/content/drive/MyDrive/데이콘_신약개발/데이콘 신약데이터.zip'

train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')
PandasTools.AddMoleculeColumnToFrame(train,'SMILES','Molecule')
PandasTools.AddMoleculeColumnToFrame(test,'SMILES','Molecule')
def mol2fp(mol):
    """Convert an RDKit molecule into a NumPy Morgan-fingerprint array.

    Calls ``AllChem.GetHashedMorganFingerprint`` with 6 as its second
    argument and ``nBits=4096``, then copies the result into an ``int8``
    array via ``DataStructs.ConvertToNumpyArray``.

    Args:
        mol: Molecule object accepted by ``GetHashedMorganFingerprint``.

    Returns:
        The ``int8`` NumPy array filled by ``ConvertToNumpyArray``.
    """
    fingerprint = AllChem.GetHashedMorganFingerprint(mol, 6, nBits=4096)
    # One-element placeholder handed to RDKit; presumably resized to the
    # fingerprint length by ConvertToNumpyArray — confirm against RDKit docs.
    vector = np.zeros((1,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(fingerprint, vector)
    return vector

# Add the fingerprint column
train["FPs"] = train.Molecule.apply(mol2fp)
test["FPs"] = test.Molecule.apply(mol2fp)

# Keep only the columns that are used downstream
train = train[['FPs','MLM', 'HLM']]
test = test[['FPs']]

X = train['FPs']
y = train[['MLM', 'HLM']]

# Stack the per-molecule fingerprint arrays into one 2-D array and wrap it in
# a single DataFrame. Concatenating one transposed one-row frame per molecule
# is slow and gives every row the index label 0; this form yields a unique
# 0..n-1 row index with the same integer column labels and dtype.
X_X = pd.DataFrame(np.stack(X.tolist()))


# Candidate model - random forest regression
# from sklearn.ensemble import RandomForestRegressor
# rfr = RandomForestRegressor()
# rfr.fit(X_X,y)

# Same construction for the test set, so its rows are uniquely indexed too.
test_X = pd.DataFrame(np.stack(test['FPs'].tolist()))
# test_X_predict = rfr.predict(test_X)

Archive:  /content/drive/MyDrive/데이콘_신약개발/데이콘 신약데이터.zip
replace sample_submission.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


In [51]:
# Every fingerprint row needs exactly one label row; a column-wise concat of
# frames with different row sets would otherwise pad silently with NaN.
assert len(X_X) == len(y), f"feature/label row mismatch: {len(X_X)} != {len(y)}"

# One training table per target: the fingerprint columns plus that target's
# label as the last column. The feature index is reset so rows line up with y.
# Feature column labels are left as integers so they match test_X's columns.
X2_MLM = pd.concat([X_X.reset_index(drop=True),y['MLM']],axis=1)
X2_HLM = pd.concat([X_X.reset_index(drop=True),y['HLM']],axis=1)

In [52]:
# Wrap both per-target training tables as AutoGluon datasets.
train_X2_MLM, train_X2_HLM = TabularDataset(X2_MLM), TabularDataset(X2_HLM)

# MLM predictor, saved under ./automl_MLM.
# No eval_metric is passed (e.g. 'mae'), so the library default applies.
save_path = './automl_MLM'
predictor_MLM = TabularPredictor(label='MLM', path=save_path)

# HLM predictor, saved under ./automl_HLM (again without an explicit eval_metric).
save_path = './automl_HLM'
predictor_HLM = TabularPredictor(label='HLM', path=save_path)





In [None]:
# Fit one predictor per target; each fit is capped with time_limit=60 (seconds).
predictor_MLM.fit(train_data=train_X2_MLM, time_limit=60)
predictor_HLM.fit(train_data=train_X2_HLM, time_limit=60)

# Predict both targets for the test fingerprints.
predict_MLM = predictor_MLM.predict(test_X)
predict_HLM = predictor_HLM.predict(test_X)

Beginning AutoGluon training ... Time limit = 60s
AutoGluon will save models to "./automl_MLM/"
AutoGluon Version:  0.8.2
Python Version:     3.10.12
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Fri Jun 9 10:57:30 UTC 2023
Disk Space Avail:   49.25 GB / 83.96 GB (58.7%)
Train Data Rows:    3498
Train Data Columns: 4096
Label Column: MLM
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (131.72, 0.0, 37.38474, 35.69599)
	If 'regression' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    10578.95 MB
	Train Data (Original)  Memory Usage: 14.33 M

In [None]:
# Write the predictions into the submission template.
df_submission = pd.read_csv("./sample_submission.csv")

# Predictions must map one-to-one onto the submission rows.
assert len(predict_MLM) == len(df_submission), "MLM prediction count differs from submission rows"
assert len(predict_HLM) == len(df_submission), "HLM prediction count differs from submission rows"

# Assign by position, not by index. Assigning a Series aligns on its index,
# and predictions that carry a non-unique index (test_X built by concatenating
# one-row frames labels every row 0) make pandas raise
# "cannot reindex on an axis with duplicate labels".
df_submission["MLM"] = np.asarray(predict_MLM)
df_submission["HLM"] = np.asarray(predict_HLM)
df_submission.to_csv("automl_result.csv", index = False, encoding = "utf-8-sig")