In [1]:
import pprint
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)
pd.set_option('display.width', 100)
pd.set_option("display.precision", 5)
plt.style.use('fivethirtyeight')
plt.rcParams.update({'font.size': 16, 'font.family': 'sans'})
%matplotlib

Using matplotlib backend: Qt5Agg


In [3]:
filepath = '../models/model_info.csv'
model_results = pd.read_csv(filepath).copy()

In [4]:
model_results.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3041 entries, 0 to 3040
Data columns (total 45 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   record_id                      3040 non-null   object 
 1   record_type                    3040 non-null   object 
 2   CV_fit_time                    3040 non-null   float64
 3   CV_score_time                  3040 non-null   float64
 4   refit_time                     2788 non-null   float64
 5   CV_accuracy                    3040 non-null   float64
 6   CV_balanced_accuracy           3040 non-null   float64
 7   CV_F1_score                    3040 non-null   float64
 8   CV_precision                   3040 non-null   float64
 9   CV_recall                      3040 non-null   float64
 10  CV_roc_auc                     3040 non-null   float64
 11  CV_R2_score                    0 non-null      float64
 12  CV_mse                         0 non-null      f

In [5]:
show_results = model_results.query("record_type == 'test' & target == 'T2_CLS_ufc_>0'")

In [6]:
show_results = show_results.loc[:, ['record_id', 'data', 'model_type', 'Test_accuracy',
                 'Test_balanced_accuracy', 'Test_f1_score',
                 'Test_precision', 'Test_recall', 'Test_hamming_loss',
                 'Test_jaccard_score', 'Test_log_loss',
                 'hyperparameters']]

In [7]:
show_results = show_results.sort_values(by=['data', 'Test_accuracy'],
                                        ascending=False, ignore_index=True).drop_duplicates(ignore_index=True,
                                                                                           subset='record_id')
show_results

Unnamed: 0,record_id,data,model_type,Test_accuracy,Test_balanced_accuracy,Test_f1_score,Test_precision,Test_recall,Test_hamming_loss,Test_jaccard_score,Test_log_loss,hyperparameters
0,mDrY3it80D,text,Forest Cls,0.58,0.57906,0.57914,0.57978,0.58,0.42,0.408,0.95742,"{'random_state': 7, 'n_jobs': -1, 'n_estimator..."
1,CGagm8pxSz,text,Log Reg,0.5795,0.57761,0.57754,0.57917,0.5795,0.4205,0.40688,0.98442,"{'tol': 0.0001, 'solver': 'saga', 'random_stat..."
2,N8y1RUyA6Y,text,XGB Cls,0.573,0.57144,0.57098,0.57284,0.573,0.427,0.40039,0.98316,"{'subsample': 0.8, 'reg_lambda': 0.75, 'random..."
3,u3ilbEnNJ9,text,Log Reg,0.5655,0.56399,0.5636,0.56519,0.5655,0.4345,0.39316,1.00467,"{'tol': 0.0001, 'solver': 'lbfgs', 'random_sta..."
4,lzMwQGcsL9,text,HGB Cls,0.5615,0.55992,0.56052,0.5608,0.5615,0.4385,0.38996,1.03528,"{'random_state': 7, 'min_samples_leaf': 20, 'm..."
5,f1529Q2N3x,text,Log Reg,0.561,0.55753,0.55854,0.55948,0.561,0.439,0.38881,1.04893,"{'tol': 0.0001, 'solver': 'saga', 'random_stat..."
6,CY4Ab4y8hT,text,Forest Cls,0.55,0.5,0.52308,0.52,0.55,0.45,0.37273,9.05846,"{'random_state': 7, 'n_estimators': 10, 'min_s..."
7,QplKozhffy,text,Log Reg,0.55,0.5472,0.54672,0.54887,0.55,0.45,0.37762,1.08071,"{'tol': 0.0001, 'solver': 'lbfgs', 'random_sta..."
8,hmhj7EY39X,non_text,XGB Cls,0.713,0.71274,0.71293,0.71298,0.713,0.287,0.55397,0.53677,"{'subsample': 0.8, 'reg_lambda': 0.75, 'random..."
9,Fc3xOG2zYk,non_text,Forest Cls,0.7055,0.70466,0.70539,0.70536,0.7055,0.2945,0.54506,0.54031,"{'random_state': 7, 'n_jobs': -1, 'n_estimator..."


In [8]:
show_results = show_results[show_results['model_type'] == 'Forest Cls']
hyper_dict = show_results.loc[0:, ['hyperparameters']].to_dict()
for k, v in hyper_dict['hyperparameters'].items():
    print (f'Index: {k}')
    pprint.pprint(v)
    print()


Index: 0
("{'random_state': 7, 'n_jobs': -1, 'n_estimators': 50, 'min_samples_split': "
 "2, 'min_samples_leaf': 5, 'max_samples': None, 'max_leaf_nodes': 100, "
 "'max_features': 20, 'max_depth': None, 'criterion': 'gini', 'class_weight': "
 'None}')

Index: 6
("{'random_state': 7, 'n_estimators': 10, 'min_samples_split': 10, "
 "'min_samples_leaf': 5, 'max_samples': None, 'max_leaf_nodes': 10, "
 "'max_features': 10, 'max_depth': None, 'criterion': 'gini', 'class_weight': "
 'None}')

Index: 9
("{'random_state': 7, 'n_jobs': -1, 'n_estimators': 100, 'min_samples_split': "
 "2, 'min_samples_leaf': 1, 'max_samples': None, 'max_leaf_nodes': None, "
 "'max_features': 'sqrt', 'max_depth': None, 'criterion': 'gini', "
 "'class_weight': None}")

Index: 13
("{'random_state': 7, 'n_jobs': -1, 'n_estimators': 50, 'min_samples_split': "
 "2, 'min_samples_leaf': 5, 'max_samples': None, 'max_leaf_nodes': 100, "
 "'max_features': 20, 'max_depth': None, 'criterion': 'gini', 'class_weight': "
 'None