In [1]:
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier, SGDRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.svm import SVC
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn import metrics
import pandas as pd
from pathlib import Path
import numpy as np

In [2]:
# File-system locations used by the notebook, both relative to the working directory.
results_path: Path = Path('results/')  # directory for the CSVs this notebook writes
data_path: Path = Path('data/preprocessed/KlekFP_preprocessed.csv')  # input table read by the next cell

In [3]:
# Read the preprocessed table into memory.
#   header=0        -> column names come from the first line of the file
#   index_col=False -> pandas does not use the first CSV column as the index, so that
#                      column stays in the frame (it shows up as "Unnamed: 0" below)
df = pd.read_csv(
    data_path,
    index_col=False,
    header=0,
)
# Bare last expression: lets the notebook render the frame preview.
df

Unnamed: 0,IC50,KRFP2,KRFP8,KRFP10,KRFP13,KRFP14,KRFP16,KRFP17,KRFP18,KRFP20,...,KRFP4830,KRFP4831,KRFP4832,KRFP4833,KRFP4835,KRFP4852,KRFP4853,KRFP4856,KRFP4857,KRFP4858
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10630,11499,1,1,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
10631,11500,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
10632,11501,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
10633,11502,0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Target vector: the IC50 column of the loaded frame.
y = df["IC50"]

# Feature matrix: every column except the target.
# "Unnamed: 0" is removed as well: the CSV is read with index_col=False, so the file's
# unnamed leading column lands in the frame as data. In the preview it is a steadily
# increasing row counter, not a KRFP descriptor, and must not be offered to the models
# as a feature. errors="ignore" keeps the cell working if that column is absent.
X = df.drop(columns=["IC50"]).drop(columns=["Unnamed: 0"], errors="ignore")

# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [None]:
# Search space for hyper-parameter tuning: each key is a fresh (unfitted) estimator
# instance and each value is the parameter grid to explore for it. Entries are added
# in the order the models are meant to be searched.
# NOTE(review): SGDRegressor is the only regressor among classifiers here — confirm this
# is intended (SGDClassifier is imported at the top of the notebook).
models = {}

models[LogisticRegression()] = {
    "max_iter": [1000],
    "C": [0.1, 1.0, 10.0],
    'solver': ['newton-cg', 'saga'],
}

models[SGDRegressor()] = {
    'alpha': [0.0, 0.01, 0.001],
    'learning_rate': ['constant', 'optimal', 'invscaling'],
    'random_state': [123],
}

models[SVC()] = {
    'C': [0.1, 1, 10],
    'gamma': ["scale", "auto"],
    'kernel': ['rbf', 'poly'],
}

models[RandomForestClassifier()] = {
    'n_estimators': [10, 50, 200],
}

models[AdaBoostClassifier()] = {
    'n_estimators': [10, 200, 500],
    'learning_rate': [0.05, 0.1, 1.0],
}

models[GradientBoostingClassifier()] = {
    'learning_rate': [0.005, 0.01, 0.1, 1.0],
    'n_estimators': [50, 200, 500],
}

models[ExtraTreesClassifier()] = {
    'n_estimators': [50, 200, 500],
    'criterion': ['gini', 'entropy'],
    'max_features': ['sqrt', 'log2'],
}

In [None]:
# Run an exhaustive grid search for every estimator in `models` and save each search's
# full cross-validation table as a CSV under `results_path`.

# to_csv does not create missing directories, so make sure the target exists first.
results_path.mkdir(parents=True, exist_ok=True)

for model, params in models.items():
    # Fit on the training split only: the rows held out in X_test / y_test must not
    # influence which hyper-parameters are selected, otherwise a later evaluation on
    # the test set is optimistically biased.
    sh = GridSearchCV(model, params, cv=2).fit(X_train, y_train)

    # One row per parameter combination, with the scores and timings GridSearchCV recorded.
    results = pd.DataFrame(sh.cv_results_)

    # The file name embeds the repr of the best estimator found for this model.
    # NOTE(review): that repr contains parentheses, quotes and spaces; it is kept
    # unchanged here in case later cells locate the files by this name — confirm.
    results.to_csv(results_path.joinpath(f"KlekFP_{sh.best_estimator_}.csv"), encoding='utf-8', header=True)

In [None]:
# Lookup from an estimator's class name (as a string) to the class itself, so that a
# model can be instantiated from its name. Same set of estimators, in the same order,
# as the grid-search cell.
model_creator = dict([
    ("LogisticRegression", LogisticRegression),
    ("SGDRegressor", SGDRegressor),
    ("SVC", SVC),
    ("RandomForestClassifier", RandomForestClassifier),
    ("AdaBoostClassifier", AdaBoostClassifier),
    ("GradientBoostingClassifier", GradientBoostingClassifier),
    ("ExtraTreesClassifier", ExtraTreesClassifier),
])