#### AutoGluon results for models built with 2259 imputed features, from which 15 balanced training datasets were obtained; the models were evaluated on 10 imbalanced test sets

In [None]:
import pandas as pd
# Root directory holding one sub-directory per (test set, balanced training set) run.
output_path = '/home/seguraab/ara-kinase-prediction/output_clf/ara_m_autogluon_2259_imp_feats'

res_frames = []   # one results table per run, each labelled with its run tag
imp_frames = {}   # run tag -> feature-importance table for that run
for i in range(10): # test sets
    for j in range(15): # balanced training sets
        tag = f'ara_m_autogluon_test_{i}_balanced_{j}'
        run_dir = f'{output_path}/{tag}'

        # Results: the first CSV column is used as the index; 'Tag' becomes the first column.
        run_res = pd.read_csv(f'{run_dir}/{tag}_RESULTS.csv', index_col=0)
        run_res.insert(0, 'Tag', tag)
        res_frames.append(run_res)

        # Importance: kept keyed by tag so the tag ends up in the column labels below.
        imp_frames[tag] = pd.read_csv(f'{run_dir}/{tag}_IMPORTANCE.csv', index_col=0)

# Stack the per-run results row-wise into a single table.
res = pd.concat(res_frames, axis=0)
# res.to_csv(f'{output_path}/ara_m_autogluon_2259_imp_feats_balanced_RESULTS.csv', index=False)

# Place the per-run importance tables side by side; concatenating a dict yields
# two-level columns (tag, original column), which are flattened to '<tag>_<column>'.
imp = pd.concat(imp_frames, axis=1)
imp.columns = [f'{run_tag}_{column}' for run_tag, column in imp.columns]
# imp.to_csv(f'{output_path}/ara_m_autogluon_2259_imp_feats_balanced_IMPORTANCE.csv', index=False)

### Feature selection results for XGBoost models built using the 2259 imputed features, 15 balanced training sets, and 10 imbalanced test sets. Feature selection was performed with RandomForest.

In [None]:
import pandas as pd
import datatable as dt
import numpy as np

# Load every XGBoost feature-selection run (one row per run).
clf_res = dt.fread('/home/seguraab/ara-kinase-prediction/output_clf/ara_m_kfold_ht_2259_imp_feats_fs/RESULTS_xgboost.tsv').to_pandas()

# The run Tag is split on '_' once; fields 8 and 10 are stored as the test-set
# and balanced-training-set identifiers (still strings at this point).
tag_fields = clf_res.Tag.str.split('_')
clf_res.insert(0, 'TestSet', tag_fields.str[8])
clf_res.insert(0, 'BalancedTrainSet', tag_fields.str[10])

# Get the best feature selection run per test set and balanced dataset
# (the run with the highest validation F1 in each group).
# idxmax() returns index *labels*, so rows are selected with label-based .loc;
# positional .iloc only gives the right rows when the index happens to be 0..n-1.
# Calling idxmax on the selected column also avoids groupby.apply with a lambda.
best_runs = clf_res.groupby(['TestSet', 'BalancedTrainSet'])['F1_val'].idxmax()
clf_res_best = clf_res.loc[best_runs.to_numpy()]
clf_res_best.to_csv('/home/seguraab/ara-kinase-prediction/output_clf/ara_m_kfold_ht_2259_imp_feats_fs/RESULTS_xgboost_best.csv', index=False)
# Note on clf_res_best: the test F1 of the best runs is really bad even though F1_val looks good.

# Get the average performance across different feature subsets per test set and balanced dataset
# The identifiers are cast to int so that select_dtypes(np.number) keeps them as grouping keys.
clf_res['TestSet'] = clf_res['TestSet'].astype(int)
clf_res['BalancedTrainSet'] = clf_res['BalancedTrainSet'].astype(int)
clf_res_avg = clf_res.select_dtypes(np.number).groupby(['TestSet', 'BalancedTrainSet']).mean()
clf_res_avg.to_csv('/home/seguraab/ara-kinase-prediction/output_clf/ara_m_kfold_ht_2259_imp_feats_fs/RESULTS_xgboost_avg.csv')

#### Feature selection curves
No model combines a test-set F1 above 0.5 with a reasonable validation F1: the only run with `F1_test > 0.5` (row 387) has a validation F1 of just 0.33.
```python
>>> clf_res.loc[clf_res.F1_test > 0.5, 'F1_val']
# 387    0.330136
```

In [None]:
import matplotlib.pyplot as plt

# Feature-selection curves: one PDF per test set, each with a 5 x 3 grid of panels
# (one panel per balanced training set) showing validation and test F1 against
# the number of features.
for test_set in range(10):
    fig, axes = plt.subplots(5, 3, figsize=(8.5, 14.17), sharex=True, sharey=True) # to make the plots square
    in_test_set = clf_res['TestSet'] == test_set

    # axes.flat walks the grid row by row, so panel k sits at row k // 3, column k % 3.
    for balanced_train_set, panel in enumerate(axes.flat):
        in_train_set = clf_res['BalancedTrainSet'] == balanced_train_set
        curve = clf_res[in_test_set & in_train_set].sort_values('NumFeatures')

        for metric, colour in (('F1_val', 'red'), ('F1_test', 'blue')):
            panel.plot(curve['NumFeatures'], curve[metric], label=metric, color=colour)

        panel.set_title(f'Balanced Train Set {balanced_train_set}')
        panel.set_xlabel('Number of Features')
        panel.set_ylabel('F1 score')
        panel.legend()

    fig.suptitle(f'Test Set {test_set}')
    fig.tight_layout()
    fig.savefig(f'/home/seguraab/ara-kinase-prediction/output_clf/ara_m_kfold_ht_2259_imp_feats_fs/0_figure_test_{test_set}_feature_selection.pdf')
    plt.close(fig)  # release the figure before drawing the next test set