In [1]:
import pandas as pd


# Part 1: Prepare data for the shallow ML models

In [2]:
# The sklearn training/prediction code cannot be told to ignore extra columns,
# so only the SMILES string and the two hit labels are kept.
label_columns = ['hit_inh', 'hit_kill']
df = pd.read_csv('../out/data_for_ml.csv')
df = df[['SMILES'] + label_columns]

# Binarize the labels: a value equal to 1.0 becomes 1, anything else
# (including missing values) becomes 0.
for label in label_columns:
    df[label] = (df[label] == 1.0).astype('int64')

df.to_csv('../out/data_for_sklearn.csv')
df

Unnamed: 0,SMILES,hit_inh,hit_kill
0,CCCC[C@H](CC)CNC(=N)N=C(N)NCCCCCCNC(N)=NC(=N)N...,1,1
1,NC(Nc1ccc(Cl)cc1)=NC(=N)NCCCCCCNC(=N)N=C(N)Nc1...,1,1
2,CC[C@@H](C)CCCCC(=O)N[C@@H](CCN)C(=O)N[C@@H]([...,1,1
3,OCCN[C@H]1CCCc2c1[nH]c1ccc(cc21)-c1ccccc1 |&1:...,1,1
4,NC[C@H]1O[C@H](O[C@@H]2[C@@H](N)C[C@@H](N)[C@H...,1,1
...,...,...,...
6715,CC(O)(CC(O)=O)CC(O)=O,0,0
6716,CCCCCCCCN[C@H](C)[C@@H](O)c1ccc(SC(C)C)cc1 |&1...,0,0
6717,CCc1nn(CCCN2CCN(CC2)c2cccc(Cl)c2)c(=O)n1CCOc1c...,0,0
6718,NC[C@H]1O[C@H](O[C@@H]2[C@@H](N)C[C@@H](N)[C@H...,0,0


# Part 2: Generate the hyperparameter-search shell scripts

In [3]:
# Generate the shell script that runs the random-forest hyperparameter grid
# search: one `sklearn_train.py` invocation per combination of fingerprint
# size, fingerprint radius, class weighting and number of trees. Each run
# saves into its own numbered directory under `models_root`.
bash_dir = 'script_2A_rfc_hyperopt.sh'  # path of the script file being written
models_root = '../../nontoxic_stat_phase_killing_abx/models/rfc_hyperopt_final/'

prefix = 'python sklearn_train.py '
suffix = ' --data_path ../../nontoxic_stat_phase_killing_abx/out/data_for_sklearn.csv --num_folds 3 --dataset_type classification --features_path ../../nontoxic_stat_phase_killing_abx/out/data_prep_for_ml_fullset.npz --no_features_scaling --split_type scaffold_balanced --split_sizes 0.8 0.1 0.1 --smiles_columns SMILES --target_columns hit_inh hit_kill --model_type random_forest '

numbits = [512, 1024, 2048, 4096]
radius = [2, 3, 4]
classweight = ['balanced']
numtrees = [50, 100, 250, 500, 750, 1000, 2000]

# Flatten the grid in the same order as four nested loops
# (numbits outermost, numtrees innermost).
grid = [
    (bits, rad, weight, trees)
    for bits in numbits
    for rad in radius
    for weight in classweight
    for trees in numtrees
]

# Open with 'w' rather than 'a': in append mode every re-run of this cell
# added a second copy of the header and of all commands to the script.
# A single `with` block also guarantees the file is closed on error.
with open(bash_dir, 'w') as file1:
    # Header line: the commands below use paths relative to the chemprop checkout.
    file1.write('cd ../../chemprop/; mkdir ' + models_root + ' ')
    file1.write('\n')

    for model_dir, (bits, rad, weight, trees) in enumerate(grid):
        save_dir = models_root + str(model_dir)
        command = (
            'mkdir ' + save_dir + '; ' + prefix
            + '--num_bits ' + str(bits)
            + ' --radius ' + str(rad)
            + ' --class_weight ' + str(weight)
            + ' --num_trees ' + str(trees)
            + ' --save_dir ' + save_dir + suffix
        )
        file1.write(command)
        file1.write('\n')

# Leave the counter at the number of generated runs, as a manual counter would.
model_dir = len(grid)

In [4]:
# Generate the shell script that runs the SVM hyperparameter grid search for
# the `hit_inh` target: one `sklearn_train.py` invocation per combination of
# fingerprint size, fingerprint radius and class weighting. Each run saves
# into its own numbered directory under `models_root`.
bash_dir = 'script_2A_svm_inh_hyperopt.sh'  # path of the script file being written
models_root = '../../nontoxic_stat_phase_killing_abx/models/svm_inh_hyperopt_final/'

prefix = 'python sklearn_train.py '
suffix = ' --data_path ../../nontoxic_stat_phase_killing_abx/out/data_for_sklearn.csv --num_folds 3 --dataset_type classification --features_path ../../nontoxic_stat_phase_killing_abx/out/data_prep_for_ml_fullset.npz  --no_features_scaling --split_type scaffold_balanced --split_sizes 0.8 0.1 0.1 --smiles_columns SMILES --target_columns hit_inh --model_type svm '

numbits = [512, 1024, 2048, 4096]
radius = [2, 3, 4]
classweight = ['balanced']

# The grid covers only the parameters that appear in the command. An extra
# innermost loop over `numtrees` used to wrap the command: that name is not
# defined in this cell and its value never reached the command line, so each
# setting was written 7 times over. Runs are now numbered 0-11.
grid = [
    (bits, rad, weight)
    for bits in numbits
    for rad in radius
    for weight in classweight
]

# Open with 'w' rather than 'a': in append mode every re-run of this cell
# added a second copy of the header and of all commands to the script.
with open(bash_dir, 'w') as file1:
    # Header line: the commands below use paths relative to the chemprop checkout.
    file1.write('cd ../../chemprop/; mkdir ' + models_root + ' ')
    file1.write('\n')

    for model_dir, (bits, rad, weight) in enumerate(grid):
        save_dir = models_root + str(model_dir)
        command = (
            'mkdir ' + save_dir + '; ' + prefix
            + '--num_bits ' + str(bits)
            + ' --radius ' + str(rad)
            + ' --class_weight ' + str(weight)
            + ' --save_dir ' + save_dir + suffix
        )
        file1.write(command)
        file1.write('\n')

# Leave the counter at the number of generated runs.
model_dir = len(grid)

In [5]:
# Generate the shell script that runs the SVM hyperparameter grid search for
# the `hit_kill` target: one `sklearn_train.py` invocation per combination of
# fingerprint size, fingerprint radius and class weighting. Each run saves
# into its own numbered directory under `models_root`.
bash_dir = 'script_2A_svm_kill_hyperopt.sh'  # path of the script file being written
models_root = '../../nontoxic_stat_phase_killing_abx/models/svm_kill_hyperopt_final/'

prefix = 'python sklearn_train.py '
# NOTE(review): --features_path here is out/data_for_sklearn.npz, whereas the
# random-forest and SVM-inhibition cells pass out/data_prep_for_ml_fullset.npz.
# Left unchanged because the correct file cannot be determined from this
# notebook -- confirm which one is intended.
suffix = ' --data_path ../../nontoxic_stat_phase_killing_abx/out/data_for_sklearn.csv --num_folds 3 --dataset_type classification --features_path ../../nontoxic_stat_phase_killing_abx/out/data_for_sklearn.npz  --no_features_scaling --split_type scaffold_balanced --split_sizes 0.8 0.1 0.1 --smiles_columns SMILES --target_columns hit_kill --model_type svm '

numbits = [512, 1024, 2048, 4096]
radius = [2, 3, 4]
classweight = ['balanced']

# The grid covers only the parameters that appear in the command. An extra
# innermost loop over `numtrees` used to wrap the command: that name is not
# defined in this cell and its value never reached the command line, so each
# setting was written 7 times over. Runs are now numbered 0-11.
grid = [
    (bits, rad, weight)
    for bits in numbits
    for rad in radius
    for weight in classweight
]

# Open with 'w' rather than 'a': in append mode every re-run of this cell
# added a second copy of the header and of all commands to the script.
with open(bash_dir, 'w') as file1:
    # Header line: the commands below use paths relative to the chemprop checkout.
    file1.write('cd ../../chemprop/; mkdir ' + models_root + ' ')
    file1.write('\n')

    for model_dir, (bits, rad, weight) in enumerate(grid):
        save_dir = models_root + str(model_dir)
        command = (
            'mkdir ' + save_dir + '; ' + prefix
            + '--num_bits ' + str(bits)
            + ' --radius ' + str(rad)
            + ' --class_weight ' + str(weight)
            + ' --save_dir ' + save_dir + suffix
        )
        file1.write(command)
        file1.write('\n')

# Leave the counter at the number of generated runs.
model_dir = len(grid)

In [6]:
# Generate the shell script that runs the feed-forward-network hyperparameter
# grid search: one `train.py` invocation per combination of number of FFN
# layers, hidden size and dropout. Each run saves into its own numbered
# directory under `models_root`.
bash_dir = 'script_2A_ffn_hyperopt.sh'  # path of the script file being written
models_root = '../../nontoxic_stat_phase_killing_abx/models/ffn_hyperopt_final/'

prefix = 'python train.py '
# NOTE(review): --data_path here is out/round3finalval08162021_FULL_for_sklearn.csv,
# not the out/data_for_sklearn.csv written in Part 1 and used by the sklearn
# cells. Left unchanged -- confirm that this is the intended input file.
suffix = ' --data_path ../../nontoxic_stat_phase_killing_abx/out/round3finalval08162021_FULL_for_sklearn.csv --num_folds 3 --dataset_type classification --features_generator morgan --no_features_scaling --split_type scaffold_balanced --split_sizes 0.8 0.1 0.1 --smiles_columns SMILES --target_columns hit_inh hit_kill --depth 0 --features_only '

ffn_num_layers = [1, 2, 3, 4]
hidden_size = [500, 1000, 1500]
dropout = [0.1, 0.2, 0.3]

# Flatten the grid in the same order as three nested loops
# (ffn_num_layers outermost, dropout innermost).
grid = [
    (layers, hidden, drop)
    for layers in ffn_num_layers
    for hidden in hidden_size
    for drop in dropout
]

# Open with 'w' rather than 'a': in append mode every re-run of this cell
# added a second copy of the header and of all commands to the script.
# A single `with` block also guarantees the file is closed on error.
with open(bash_dir, 'w') as file1:
    # Header line: the commands below use paths relative to the chemprop checkout.
    file1.write('cd ../../chemprop/; mkdir ' + models_root + ' ')
    file1.write('\n')

    for model_dir, (layers, hidden, drop) in enumerate(grid):
        save_dir = models_root + str(model_dir)
        command = (
            'mkdir ' + save_dir + '; ' + prefix
            + '--ffn_num_layers ' + str(layers)
            + ' --hidden_size ' + str(hidden)
            + ' --dropout ' + str(drop)
            + ' --save_dir ' + save_dir + suffix
        )
        file1.write(command)
        file1.write('\n')

# Leave the counter at the number of generated runs, as a manual counter would.
model_dir = len(grid)