In [2]:
import glob
import pandas as pd
import os
import numpy as np
import pickle
from sys import argv

In [3]:
files = glob.glob("results/validation/*")
files = [file for file in files if os.path.getsize(file) > 0]

data = pd.read_csv(files[0], sep=";")
for file in files[1:]:
    tmp = pd.read_csv(file, sep=";")
    data = pd.concat([data, tmp], axis=0)

In [4]:
data.shape

(14064, 13)

In [5]:
data.head()

Unnamed: 0,part,dataset,method,cls,n_estimators,learning_rate,subsample,max_depth,colsample_bytree,min_child_weight,nr_events,metric,score
0,0,production,single_laststate,xgboost,158,0.010445,0.319582,9,0.479149,1,1,auc,0.749628
1,0,production,single_laststate,xgboost,373,0.058747,0.617477,8,0.467443,2,1,auc,0.65224
2,0,production,single_laststate,xgboost,849,0.046847,0.314837,5,0.488857,3,1,auc,0.673032
3,0,production,single_laststate,xgboost,893,0.031759,0.551795,5,0.474981,1,1,auc,0.662409
4,0,production,single_laststate,xgboost,680,0.048269,0.691477,8,0.458188,1,1,auc,0.632234


In [6]:
data.metric.value_counts()

auc    14064
Name: metric, dtype: int64

In [7]:
# select best params according to auc only
data = data[data.metric=="auc"]


# fix cases where score is unknown
data["score"][pd.isnull(data["score"])] = 0

if data["score"].dtype != np.float64:
    data["score"][data["score"] == "None"] = 0
    
data["score"] = data["score"].astype(float)
data.fillna(0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [8]:
# extract columns that refer to parameters
params_cols = [col for col in data.columns if col not in ['cls', 'dataset', 'method', 'metric', 'nr_events', 'part', 'score']]

# aggregate data over all CV folds
data_agg = data.groupby(["cls", "dataset", "method", "metric", "nr_events"] + params_cols, as_index=False)["score"].mean()
data_agg_over_all_prefixes = data.groupby(["cls", "dataset", "method", "metric"] + params_cols, as_index=False)["score"].mean()


# select the best params
data_best = data_agg.sort_values("score", ascending=False).groupby(["cls", "dataset", "method", "metric", "nr_events"], as_index=False).first()
data_best_over_all_prefixes = data_agg_over_all_prefixes.sort_values("score", ascending=False).groupby(["cls", "dataset", "method", "metric"], as_index=False).first()


In [9]:
best_params = {}

# all except prefix length based
for row in data_best_over_all_prefixes[~data_best_over_all_prefixes.method.str.contains("prefix")][["dataset", "method", "cls"] + params_cols].values:
    
    if row[0] not in best_params:
        best_params[row[0]] = {}
    if row[1] not in best_params[row[0]]:
        best_params[row[0]][row[1]] = {}
    if row[2] not in best_params[row[0]][row[1]]:
        best_params[row[0]][row[1]][row[2]] = {}
        
    for i, param in enumerate(params_cols):
        value = row[3+i]
        if param in ["n_estimators", "max_depth", "min_child_weight"]:
            value = int(value)
        else:
            value = float(value)
            
        best_params[row[0]][row[1]][row[2]][param] = value

In [10]:
# only prefix length based
for row in data_best[data_best.method.str.contains("prefix")][["dataset", "method", "cls", "nr_events"] + params_cols].values:
    
    if row[0] not in best_params:
        best_params[row[0]] = {}
    if row[1] not in best_params[row[0]]:
        best_params[row[0]][row[1]] = {}
    if row[2] not in best_params[row[0]][row[1]]:
        best_params[row[0]][row[1]][row[2]] = {}
    if row[3] not in best_params[row[0]][row[1]][row[2]]:
        best_params[row[0]][row[1]][row[2]][row[3]] = {}
        
    for i, param in enumerate(params_cols):
        value = row[4+i]
        if param in ["n_estimators", "max_depth", "min_child_weight"]:
            value = int(value)
        else:
            value = float(value)
            
        best_params[row[0]][row[1]][row[2]][row[3]][param] = value
        

In [11]:
outfile = "optimal_params.pickle"

# write to file
with open(outfile, "wb") as fout:
    pickle.dump(best_params, fout)