# HW3 Analysis

## Import and load data

In [6]:
import ml_pipeline as pp
import pandas as pd
import datetime as dt
import numpy as np
from sklearn import metrics
file = './data/projects_2012_2013.csv'
df = pp.load_csv(file)

## Data transformations

### Helper functions for hw3 specific data

#### Convert date columns to datetime

In [9]:
df.date_posted = pp.col_datetime(df, 'date_posted')

In [10]:
df.datefullyfunded = pp.col_datetime(df,'datefullyfunded')

#### Create labels

In [11]:
df = pp.create_label(df, pred_time=60)

## Selecting features and cleaning

In [12]:
feature_cols=['school_metro','school_charter', 'school_magnet', 'primary_focus_subject', 'primary_focus_area', 'resource_type', 'poverty_level', 'grade_level', 'total_price_including_optional_support', 'students_reached', 'eligible_double_your_impact_match', 'date_posted', 'label']
sel = df[feature_cols].copy()
sel.head()

Unnamed: 0,school_metro,school_charter,school_magnet,primary_focus_subject,primary_focus_area,resource_type,poverty_level,grade_level,total_price_including_optional_support,students_reached,eligible_double_your_impact_match,date_posted,label
0,urban,f,f,Mathematics,Math & Science,Supplies,highest poverty,Grades PreK-2,1498.61,31.0,f,2013-04-14,1
1,urban,f,f,Civics & Government,History & Civics,Books,highest poverty,Grades 3-5,282.47,28.0,t,2012-04-07,1
2,urban,f,f,Literacy,Literacy & Language,Technology,high poverty,Grades 3-5,1012.38,56.0,f,2012-01-30,0
3,urban,f,t,Literacy,Literacy & Language,Books,high poverty,Grades PreK-2,175.33,23.0,f,2012-10-11,1
4,suburban,f,f,Literacy,Literacy & Language,Technology,high poverty,Grades PreK-2,3591.11,150.0,f,2013-01-08,0


#### Identify feature columns with null values

In [13]:
for x in pp.na_col(df):
    if x in feature_cols:
        print(x)

school_metro
primary_focus_subject
primary_focus_area
resource_type
grade_level
students_reached


#### Impute missing categorical variables with the most frequent, which is a common way to handle missing categorical data without more information. 


In [14]:
cat_cols = ['school_metro','primary_focus_subject','primary_focus_area','resource_type','grade_level']
for x in cat_cols:
    sel = pp.na_fill_col(sel, x , pp.most_freq)

#### Impute missing numerical variable (students_reached) with the median value because there are outliers affecting the mean. 

In [15]:
sel.students_reached.quantile([0.1, 0.25, 0.5, 0.75, 0.9,0.98,1])

0.10       18.0
0.25       23.0
0.50       30.0
0.75      100.0
0.90      200.0
0.98      700.0
1.00    12143.0
Name: students_reached, dtype: float64

In [16]:
sel = pp.na_fill_col(sel, 'students_reached', np.nanmedian)

#### Check that there are no more missing values in feature columns

In [17]:
for x in pp.na_col(sel):
    if x in feature_cols:
        print(x)

#### Discretize numeric features and then get all dummy variables.

In [18]:
# discretize numeric features
bucketdict= {'total_price_including_optional_support': 4, 'students_reached':4}
df_discr = pp.feat_mult_disc(sel, bucketdict, qt=True)

df_discr.total_price_including_optional_support_binned.unique()
df_discr.students_reached_binned.unique()

[(30.0, 100.0], (23.0, 30.0], (0.999, 23.0], (100.0, 12143.0]]
Categories (4, interval[float64]): [(0.999, 23.0] < (23.0, 30.0] < (30.0, 100.0] < (100.0, 12143.0]]

In [19]:
col_to_binary = list(df_discr.columns)
col_to_binary.remove('label')
col_to_binary.remove('date_posted')

In [20]:
col_to_binary

['school_metro',
 'school_charter',
 'school_magnet',
 'primary_focus_subject',
 'primary_focus_area',
 'resource_type',
 'poverty_level',
 'grade_level',
 'eligible_double_your_impact_match',
 'total_price_including_optional_support_binned',
 'students_reached_binned']

In [21]:
# turn variables into dummies
df_final = pp.feat_binary(df_discr, col_to_binary)
df_final.head()

Unnamed: 0,date_posted,label,school_metro_rural,school_metro_suburban,school_metro_urban,school_charter_f,school_charter_t,school_magnet_f,school_magnet_t,primary_focus_subject_Applied Sciences,...,eligible_double_your_impact_match_f,eligible_double_your_impact_match_t,"total_price_including_optional_support_binned_(91.999, 345.81]","total_price_including_optional_support_binned_(345.81, 510.5]","total_price_including_optional_support_binned_(510.5, 752.96]","total_price_including_optional_support_binned_(752.96, 164382.84]","students_reached_binned_(0.999, 23.0]","students_reached_binned_(23.0, 30.0]","students_reached_binned_(30.0, 100.0]","students_reached_binned_(100.0, 12143.0]"
0,2013-04-14,1,0,0,1,1,0,1,0,0,...,1,0,0,0,0,1,0,0,1,0
1,2012-04-07,1,0,0,1,1,0,1,0,0,...,0,1,1,0,0,0,0,1,0,0
2,2012-01-30,0,0,0,1,1,0,1,0,0,...,1,0,0,0,0,1,0,0,1,0
3,2012-10-11,1,0,0,1,1,0,0,1,0,...,1,0,1,0,0,0,1,0,0,0
4,2013-01-08,0,0,1,0,1,0,1,0,0,...,1,0,0,0,0,1,0,0,0,1


## Run variations of models: 
### Decision trees, KNN, Logistic Regression, Linear SVM, Random forests, Bagging, Boosting

In [108]:
windows = [dt.datetime(2012,1,1), dt.datetime(2012,7,1), dt.datetime(2013,1,1), dt.datetime(2013,7,1), dt.datetime(2014,1,1)]
pred_time = 60 #days
label_col = 'label'
split_col = 'date_posted'
feature_cols= list(df_final.columns)
feature_cols.remove('label')
feature_cols.remove('date_posted')
seed=12345

In [109]:
models = [
    {'type': 'Dtree', 'clf': pp.dtree_score, 'criteria': ['entropy', 'gini'], 'depth': [10,20,30],'min_leaf': [100, 300,500], 'seed': seed},
    {'type': 'LR', 'clf': lr_score, 'p': ['l1','l2'], 'c': [0.1, 1.0, 10.0, 100.0], 'solver': ['liblinear'], 'seed': seed},
    {'type': 'SVM', 'clf': linsvc_score, 'p': ['l2'], 'c': [0.1, 1.0, 10.0, 100.0], 'seed': seed},
    {'type': 'Bagging_dtree', 'clf': bagging_score, 'n': [10, 50, 100], 'base':[None], 'seed':seed},
    {'type': 'ADABoost_dtree', 'clf': adaboost_score, 'n': [10, 50, 100], 'base':[None], 'seed':seed},
    {'type': 'Random Forest', 'clf': rforest_score, 'n': [10, 50, 100], 'criterion': ['entropy', 'gini'], 'seed': seed},
    {'type': 'KNN', 'clf': knn_score, 'n': [5], 'weights': ['uniform','distance'], 'distance_metric':['minkowski'],'p': [1,2]}
]

#models = [{'type': 'Random Forest', 'clf': rforest_score, 'n': [10, 50, 100], 'criterion': ['entropy', 'gini'], 'seed': seed}]
thresholds = [1, 2, 5, 10, 20,30, 50]


In [None]:
resdf = pp.run_models(models, thresholds, windows, df_final, feature_cols, label_col, split_col, pred_time, pred_unit = 'day', filename = './data/finalrun.csv')
resdf.head()

In [110]:
results = []

for i in range(1, len(windows)-1):
    train_start = windows[0]
    train_end = windows[i]
    test_end = windows[i+1]
    
    #split data
    x_train,y_train,x_test,y_test = pp.single_train_test_set(df_final, 
                                                          feature_cols, 
                                                          label_col, 
                                                          split_col, 
                                                          train_start,
                                                          train_end, 
                                                          test_end, 
                                                          pred_time=pred_time)
    
    
    baseline = sum(y_test)/len(y_test)
    #run models
    for clf in models:
        modeltype = clf['type']
        func = clf['clf']
        printinfo = 'model: {}, run: {}'.format(modeltype, i)
        print(printinfo)
        if modeltype == 'Dtree':
            seed = clf['seed']
            for c in clf['criteria']:
                for d in clf['depth']:
                    for l in clf['min_leaf']:
                        #run model
                        info = 'criteria: {}, depth: {}, min_leaf: {}, seed: {}'.format(c, d, l, seed)
                        print(info)
                        scores = func(x_train, y_train, x_test, criteria = c, depth = d, min_leaf = l, seed=seed)
                        for pct_pop in thresholds:
                            acc, prec, rec, f1, auc = all_metrics(y_test, scores, pct_pop)
                            tmp = {'baseline': baseline, 'train_set_num': i, 'train_start': train_start, 'test_start': train_end,'type': modeltype, 
                                   'details': info, 'threshold_pct': pct_pop, 'precision': prec, 'recall': rec, 'auc': auc}
                            results.append(tmp)
                            
        elif modeltype == 'LR':
            seed = clf['seed']
            for p in clf['p']:
                for c in clf['c']:
                    for s in clf['solver']:
                        #print(p)
                        info = 'penalty: {}, c: {}, solver: {}, seed: {}'.format(p, c, s, seed)
                        print(info)
                        scores = func(x_train, y_train, x_test, p = p, c = c, solver = s, seed=seed)
                        for pct_pop in thresholds:
                            acc, prec, rec, f1, auc = all_metrics(y_test, scores, pct_pop)
                            tmp = {'baseline': baseline,'train_set_num': i, 'train_start': train_start, 'test_start': train_end,'type': modeltype, 
                                   'details': info, 'threshold_pct': pct_pop, 'precision': prec, 'recall': rec, 'auc': auc}
                            results.append(tmp)
                        
        elif modeltype == 'SVM':
            seed = clf['seed']
            for p in clf['p']:
                for c in clf['c']:
                    info = 'penalty: {}, c: {}, seed: {}'.format(p, c, seed)
                    print(info)
                    scores = func(x_train, y_train, x_test, p =p, c=c, seed=seed)
                    for pct_pop in thresholds:
                        acc, prec, rec, f1, auc = all_metrics(y_test, scores, pct_pop)
                        tmp = {'baseline': baseline,'train_set_num': i, 'train_start': train_start, 'test_start': train_end,'type': modeltype, 
                               'details': info, 'threshold_pct': pct_pop, 'precision': prec, 'recall': rec, 'auc': auc}
                        results.append(tmp)
                            
        elif modeltype == 'KNN':
            for n in clf['n']:
                for w in clf['weights']:
                    for d in clf['distance_metric']:
                        for p in clf['p']:
                            info = 'n: {}, weights: {}, distance: {}, p: {}'.format(n, w, d, p)
                            print(info)
                            scores = func(x_train, y_train, x_test, n = n, weights = w, distance_metric = d, p=p)
                            #print(list(scores))
                            for pct_pop in thresholds:
                                acc, prec, rec, f1, auc = all_metrics(y_test, scores, pct_pop)
                                tmp = {'baseline': baseline,'train_set_num': i, 'train_start': train_start, 'test_start': train_end,'type': modeltype, 
                                       'details': info, 'threshold_pct': pct_pop, 'precision': prec, 'recall': rec, 'auc': auc}
                                results.append(tmp)
        
        elif modeltype == 'Bagging_dtree':
            seed = clf['seed']
            for n in clf['n']:
                for b in clf['base']:
                    info = 'n: {}, base: {}'.format(n,b)
                    scores = func(x_train, y_train, x_test, n = n, base = b, seed=seed)
                    for pct_pop in thresholds:
                        acc, prec, rec, f1, auc = all_metrics(y_test, scores, pct_pop)
                        tmp = {'baseline': baseline,'train_set_num': i, 'train_start': train_start, 'test_start': train_end,'type': modeltype, 
                               'details': info, 'threshold_pct': pct_pop, 'precision': prec, 'recall': rec, 'auc': auc}
                        results.append(tmp)
        
        elif modeltype == 'ADABoost_dtree':
            seed = clf['seed']
            for n in clf['n']:
                for b in clf['base']:
                    info = 'n: {}, base: {}'.format(n,b)
                    scores = func(x_train, y_train, x_test, n = n, base = b, seed=seed)
                    for pct_pop in thresholds:
                        acc, prec, rec, f1, auc = all_metrics(y_test, scores, pct_pop)
                        tmp = {'baseline': baseline,'train_set_num': i, 'train_start': train_start, 'test_start': train_end,'type': modeltype, 
                               'details': info, 'threshold_pct': pct_pop, 'precision': prec, 'recall': rec, 'auc': auc}
                        results.append(tmp)
            
            
        elif modeltype == 'Random Forest':
            seed = clf['seed']
            for n in clf['n']:
                for c in clf['criterion']:
                    info = 'n: {}, criterion: {}'.format(n,c)
                    scores = func(x_train, y_train, x_test, n = n, criterion = c, seed=seed)
                    for pct_pop in thresholds:
                        acc, prec, rec, f1, auc = all_metrics(y_test, scores, pct_pop)
                        tmp = {'baseline': baseline,'train_set_num': i, 'train_start': train_start, 'test_start': train_end,'type': modeltype, 
                               'details': info, 'threshold_pct': pct_pop, 'precision': prec, 'recall': rec, 'auc': auc}
                        results.append(tmp)
                    
                    
resdf=pd.DataFrame(results, columns = ['type', 'details', 'baseline', 'threshold_pct', 'precision', 'recall', 'auc','train_set_num', 'train_start', 'test_start'])    

model: Dtree, run: 1
criteria: entropy, depth: 10, min_leaf: 100, seed: 12345
criteria: entropy, depth: 10, min_leaf: 300, seed: 12345
criteria: entropy, depth: 10, min_leaf: 500, seed: 12345
criteria: entropy, depth: 20, min_leaf: 100, seed: 12345
criteria: entropy, depth: 20, min_leaf: 300, seed: 12345
criteria: entropy, depth: 20, min_leaf: 500, seed: 12345
criteria: entropy, depth: 30, min_leaf: 100, seed: 12345
criteria: entropy, depth: 30, min_leaf: 300, seed: 12345
criteria: entropy, depth: 30, min_leaf: 500, seed: 12345
criteria: gini, depth: 10, min_leaf: 100, seed: 12345
criteria: gini, depth: 10, min_leaf: 300, seed: 12345
criteria: gini, depth: 10, min_leaf: 500, seed: 12345
criteria: gini, depth: 20, min_leaf: 100, seed: 12345
criteria: gini, depth: 20, min_leaf: 300, seed: 12345
criteria: gini, depth: 20, min_leaf: 500, seed: 12345
criteria: gini, depth: 30, min_leaf: 100, seed: 12345
criteria: gini, depth: 30, min_leaf: 300, seed: 12345
criteria: gini, depth: 30, min_lea



penalty: l2, c: 100.0, seed: 12345




model: Bagging_dtree, run: 1
model: ADABoost_dtree, run: 1
model: Random Forest, run: 1
model: KNN, run: 1
n: 5, weights: uniform, distance: minkowski, p: 1
n: 5, weights: uniform, distance: minkowski, p: 2
n: 5, weights: distance, distance: minkowski, p: 1
n: 5, weights: distance, distance: minkowski, p: 2
model: Dtree, run: 2
criteria: entropy, depth: 10, min_leaf: 100, seed: 12345
criteria: entropy, depth: 10, min_leaf: 300, seed: 12345
criteria: entropy, depth: 10, min_leaf: 500, seed: 12345
criteria: entropy, depth: 20, min_leaf: 100, seed: 12345
criteria: entropy, depth: 20, min_leaf: 300, seed: 12345
criteria: entropy, depth: 20, min_leaf: 500, seed: 12345
criteria: entropy, depth: 30, min_leaf: 100, seed: 12345
criteria: entropy, depth: 30, min_leaf: 300, seed: 12345
criteria: entropy, depth: 30, min_leaf: 500, seed: 12345
criteria: gini, depth: 10, min_leaf: 100, seed: 12345
criteria: gini, depth: 10, min_leaf: 300, seed: 12345
criteria: gini, depth: 10, min_leaf: 500, seed: 1



penalty: l2, c: 100.0, seed: 12345




model: Bagging_dtree, run: 2
model: ADABoost_dtree, run: 2
model: Random Forest, run: 2
model: KNN, run: 2
n: 5, weights: uniform, distance: minkowski, p: 1
n: 5, weights: uniform, distance: minkowski, p: 2
n: 5, weights: distance, distance: minkowski, p: 1
n: 5, weights: distance, distance: minkowski, p: 2
model: Dtree, run: 3
criteria: entropy, depth: 10, min_leaf: 100, seed: 12345
criteria: entropy, depth: 10, min_leaf: 300, seed: 12345
criteria: entropy, depth: 10, min_leaf: 500, seed: 12345
criteria: entropy, depth: 20, min_leaf: 100, seed: 12345
criteria: entropy, depth: 20, min_leaf: 300, seed: 12345
criteria: entropy, depth: 20, min_leaf: 500, seed: 12345
criteria: entropy, depth: 30, min_leaf: 100, seed: 12345
criteria: entropy, depth: 30, min_leaf: 300, seed: 12345
criteria: entropy, depth: 30, min_leaf: 500, seed: 12345
criteria: gini, depth: 10, min_leaf: 100, seed: 12345
criteria: gini, depth: 10, min_leaf: 300, seed: 12345
criteria: gini, depth: 10, min_leaf: 500, seed: 1



penalty: l2, c: 100.0, seed: 12345




model: Bagging_dtree, run: 3
model: ADABoost_dtree, run: 3
model: Random Forest, run: 3
model: KNN, run: 3
n: 5, weights: uniform, distance: minkowski, p: 1
n: 5, weights: uniform, distance: minkowski, p: 2
n: 5, weights: distance, distance: minkowski, p: 1
n: 5, weights: distance, distance: minkowski, p: 2


#### Testing code

In [30]:
#### Use temporal validation to split into training/test sets



x_train1,y_train1,x_test1,y_test1 = pp.single_train_test_set(df_final, 
                                                          feature_cols, 
                                                          label_col, 
                                                          split_col, 
                                                          windows[0],
                                                          windows[1], 
                                                          windows[2], 
                                                          pred_time=60)

x_train2, y_train2, x_test2, y_test2 = pp.single_train_test_set(df_final, 
                                                          feature_cols, 
                                                          label_col, 
                                                          split_col, 
                                                          windows[0],
                                                          windows[2], 
                                                          windows[3], 
                                                          pred_time=60)

x_train3, y_train3, x_test3, y_test3 = pp.single_train_test_set(df_final, 
                                                          feature_cols, 
                                                          label_col, 
                                                          split_col, 
                                                          windows[0],
                                                          windows[3], 
                                                          windows[4], 
                                                          pred_time=60)

In [140]:
import matplotlib.pyplot as plt
x = lr_score(x_train1, y_train1, x_test1, p = 'l1', c = 10.0, solver = 'liblinear', seed=12345)
precision_at_threshold(y_test1, x, 0)
plot_pct_pop(y_test1, x)


24492

In [137]:
def plot_pct_pop(y_test, pred_scores):
    '''
    This function plots precision and recall on two axes with percent of population as the x-axis.
    
    y_test: test set labels
    pred_scores: predicted scores
    
    return: None
    '''
    pct_pop = np.array([1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 99])
    prec = []
    rec = []
    for each in pct_pop:
        #quantile = 100 - each
        #t = np.percentile(pred_scores, quantile)
        a, p, r, f1, auc = all_metrics(y_test, pred_scores, each)
        #p = precision_at_threshold(y_test, pred_scores, t)
        #r = recall_at_threshold(y_test, pred_scores, t)
        prec.append(p)
        rec.append(r)
    
    #pct_pop = 1-thresholds
    fig, ax1 = plt.subplots()

    color = 'tab:blue'
    ax1.set_xlabel('percent of population')
    ax1.set_ylabel('precision', color=color)
    ax1.plot(pct_pop, prec, color=color)
    ax1.tick_params(axis='y', labelcolor=color)
    plt.yticks(np.arange(0, 1.2, step=0.2))
    ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis

    color = 'tab:red'
    ax2.set_ylabel('recall', color=color)  # we already handled the x-label with ax1
    ax2.plot(pct_pop, rec, color=color)
    ax2.tick_params(axis='y', labelcolor=color)
    plt.yticks(np.arange(0, 1.2, step=0.2))
    fig.tight_layout()  # otherwise the right y-label is slightly clipped
    plt.show()

In [7]:
resdf = pp.load_csv('./data/finalrun.csv')

## Comparing model results

#### Identify models that do better on precision

In [20]:
best_prec = resdf.sort_values('precision', ascending=False)
best_prec.groupby(['train_set_num','threshold_pct']).first()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 0,type,details,baseline,precision,recall,auc,train_start,test_start
train_set_num,threshold_pct,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,1,0,Dtree,"criteria: entropy, depth: 10, min_leaf: 100, s...",0.743083,0.948485,0.01278,0.505386,2012-01-01,2012-07-01
1,2,1,Dtree,"criteria: entropy, depth: 10, min_leaf: 100, s...",0.743083,0.849772,0.022865,0.505587,2012-01-01,2012-07-01
1,5,2,Dtree,"criteria: entropy, depth: 10, min_leaf: 100, s...",0.743083,0.778519,0.052384,0.504641,2012-01-01,2012-07-01
1,10,24,Dtree,"criteria: entropy, depth: 20, min_leaf: 100, s...",0.743083,0.755461,0.101666,0.503242,2012-01-01,2012-07-01
1,20,46,Dtree,"criteria: entropy, depth: 30, min_leaf: 100, s...",0.743083,0.749242,0.201658,0.503226,2012-01-01,2012-07-01
1,30,271,Random Forest,"n: 50, criterion: entropy",0.743083,0.747067,0.301609,0.503131,2012-01-01,2012-07-01
1,50,258,Random Forest,"n: 10, criterion: entropy",0.743083,0.742172,0.499388,0.498808,2012-01-01,2012-07-01
2,1,476,LR,"penalty: l2, c: 0.1, solver: liblinear, seed: ...",0.684111,0.963134,0.014074,0.506454,2012-01-01,2013-01-01
2,2,330,Dtree,"criteria: entropy, depth: 10, min_leaf: 300, s...",0.684111,0.817972,0.023906,0.506192,2012-01-01,2013-01-01
2,5,478,LR,"penalty: l2, c: 0.1, solver: liblinear, seed: ...",0.684111,0.737327,0.053872,0.506154,2012-01-01,2013-01-01


#### Identify models that do better on recall

In [21]:
best_rec = resdf.sort_values('recall', ascending=False)
best_rec.groupby(['train_set_num','threshold_pct']).first()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 0,type,details,baseline,precision,recall,auc,train_start,test_start
train_set_num,threshold_pct,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,1,0,Dtree,"criteria: entropy, depth: 10, min_leaf: 100, s...",0.743083,0.948485,0.01278,0.505386,2012-01-01,2012-07-01
1,2,1,Dtree,"criteria: entropy, depth: 10, min_leaf: 100, s...",0.743083,0.849772,0.022865,0.505587,2012-01-01,2012-07-01
1,5,2,Dtree,"criteria: entropy, depth: 10, min_leaf: 100, s...",0.743083,0.778519,0.052384,0.504641,2012-01-01,2012-07-01
1,10,45,Dtree,"criteria: entropy, depth: 30, min_leaf: 100, s...",0.743083,0.755461,0.101666,0.503242,2012-01-01,2012-07-01
1,20,25,Dtree,"criteria: entropy, depth: 20, min_leaf: 100, s...",0.743083,0.749242,0.201658,0.503226,2012-01-01,2012-07-01
1,30,271,Random Forest,"n: 50, criterion: entropy",0.743083,0.747067,0.301609,0.503131,2012-01-01,2012-07-01
1,50,258,Random Forest,"n: 10, criterion: entropy",0.743083,0.742172,0.499388,0.498808,2012-01-01,2012-07-01
2,1,476,LR,"penalty: l2, c: 0.1, solver: liblinear, seed: ...",0.684111,0.963134,0.014074,0.506454,2012-01-01,2013-01-01
2,2,393,Dtree,"criteria: gini, depth: 10, min_leaf: 300, seed...",0.684111,0.817972,0.023906,0.506192,2012-01-01,2013-01-01
2,5,478,LR,"penalty: l2, c: 0.1, solver: liblinear, seed: ...",0.684111,0.737327,0.053872,0.506154,2012-01-01,2013-01-01


#### Identify models that do better on AUC ROC

In [22]:
best_auc = resdf.sort_values('auc', ascending=False)
best_auc.groupby(['train_set_num','threshold_pct']).first()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 0,type,details,baseline,precision,recall,auc,train_start,test_start
train_set_num,threshold_pct,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,1,0,Dtree,"criteria: entropy, depth: 10, min_leaf: 100, s...",0.743083,0.948485,0.01278,0.505386,2012-01-01,2012-07-01
1,2,1,Dtree,"criteria: entropy, depth: 10, min_leaf: 100, s...",0.743083,0.849772,0.022865,0.505587,2012-01-01,2012-07-01
1,5,2,Dtree,"criteria: entropy, depth: 10, min_leaf: 100, s...",0.743083,0.778519,0.052384,0.504641,2012-01-01,2012-07-01
1,10,24,Dtree,"criteria: entropy, depth: 20, min_leaf: 100, s...",0.743083,0.755461,0.101666,0.503242,2012-01-01,2012-07-01
1,20,4,Dtree,"criteria: entropy, depth: 10, min_leaf: 100, s...",0.743083,0.749242,0.201658,0.503226,2012-01-01,2012-07-01
1,30,271,Random Forest,"n: 50, criterion: entropy",0.743083,0.747067,0.301609,0.503131,2012-01-01,2012-07-01
1,50,258,Random Forest,"n: 10, criterion: entropy",0.743083,0.742172,0.499388,0.498808,2012-01-01,2012-07-01
2,1,476,LR,"penalty: l2, c: 0.1, solver: liblinear, seed: ...",0.684111,0.963134,0.014074,0.506454,2012-01-01,2013-01-01
2,2,393,Dtree,"criteria: gini, depth: 10, min_leaf: 300, seed...",0.684111,0.817972,0.023906,0.506192,2012-01-01,2013-01-01
2,5,478,LR,"penalty: l2, c: 0.1, solver: liblinear, seed: ...",0.684111,0.737327,0.053872,0.506154,2012-01-01,2013-01-01


#### Examine how results change over time

#### Choose model for the 5% threshold (target percent fo population)