In [3]:
# -*- coding: utf-8 -*-
"""
Created on Wed Jan 18 11:55:32 2017

Script for full tests, decision tree (pruned)

"""

import sklearn.model_selection as ms
import pandas as pd
from helpers import basicResults,dtclf_pruned,makeTimingCurve
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import numpy as np

def DTpruningVSnodes(clf,alphas,trgX,trgY,dataset):
    '''Dump table of pruning alpha vs. # of internal nodes'''
    out = {}
    for a in alphas:
        clf.set_params(**{'DT__alpha':a})
        clf.fit(trgX,trgY)
        out[a]=clf.steps[-1][-1].numNodes()
        print(dataset,a)
    out = pd.Series(out)
    out.index.name='alpha'
    out.name = 'Number of Internal Nodes'
    out.to_csv('./output/DT_{}_nodecounts.csv'.format(dataset))
    
    return



    

# Load Data 

cancer = pd.read_csv('./data/breast_cancer.csv')  
cancer = pd.get_dummies(cancer, columns = ['class'], prefix = 'class')
cancer['class'] = cancer['class_2.0']
cancer.drop(['class_2.0', 'class_4.0'], axis = 1, inplace = True)
cancer = cancer.astype(np.int64)

cancerX = cancer.drop('class',1).copy().values
cancerY = cancer['class'].copy().values


drug = pd.read_csv('./data/drug_consumption.data.txt')      
col_names = ['id', 'age', 'gender', 'edu', 'country', 'ethnicity', 'nscore', 'escore', 'oscore', 'ascore', 'cscore', 'impulsive', 'ss', 'alc', 'amphet', 'amyl', 'benzo', 'caff', 'canna', 'choc', 'coke', 'crack', 'ecstasy', 'heroin', 'keta', 'legal', 'lsd', 'meth', 'shroom', 'nico', 'semer', 'vsa']
drug.columns = col_names
drug = drug.drop(['id', 'alc', 'amyl', 'benzo', 'caff', 'canna', 'choc', 'coke', 'crack', 'ecstasy', 'heroin', 'keta', 'legal', 'lsd', 'meth', 'shroom', 'nico', 'semer', 'vsa'], axis = 1)
drug = pd.get_dummies(drug, columns = ['age', 'gender', 'country', 'ethnicity'], prefix = ['age', 'gender', 'country', 'ethnicity'])
drug['amphet'] = np.where(drug['amphet'] == 'CL0', 'Never Used Amphetamines', 'Used Amphetamines')

drugX = drug.drop('amphet',1).copy().values
drugY = drug['amphet'].copy().values

drug_trgX, drug_tstX, drug_trgY, drug_tstY = ms.train_test_split(drugX, drugY, test_size=0.3, random_state=0,stratify=drugY)     
cancer_trgX, cancer_tstX, cancer_trgY, cancer_tstY = ms.train_test_split(cancerX, cancerY, test_size=0.3, random_state=0,stratify=cancerY)     

# Search for good alphas
alphas = [-1,-1e-3,-(1e-3)*10**-0.5, -1e-2, -(1e-2)*10**-0.5,-1e-1,-(1e-1)*10**-0.5, 0, (1e-1)*10**-0.5,1e-1,(1e-2)*10**-0.5,1e-2,(1e-3)*10**-0.5,1e-3]

pipeD = Pipeline([('Scale',StandardScaler()),                 
                 ('DT',dtclf_pruned(random_state=55))])
pipeC = Pipeline([('Scale',StandardScaler()),                 
                 ('DT',dtclf_pruned(random_state=55))])

params = {'DT__criterion':['gini','entropy'],'DT__alpha':alphas,'DT__class_weight':['balanced']}

cancer_clf = basicResults(pipeC,cancer_trgX,cancer_trgY,cancer_tstX,cancer_tstY,params,'DT','cancer')        
drug_clf = basicResults(pipeD,drug_trgX,drug_trgY,drug_tstX,drug_tstY,params,'DT','drug')        

#cancer_final_params = {'DT__alpha': -0.00031622776601683794, 'DT__class_weight': 'balanced', 'DT__criterion': 'entropy'}
#drug_final_params = {'class_weight': 'balanced', 'alpha': 0.0031622776601683794, 'criterion': 'entropy'}

cancer_final_params = cancer_clf.best_params_
drug_final_params = drug_clf.best_params_

pipeC.set_params(**cancer_final_params)
makeTimingCurve(cancerX,cancerY,pipeC,'DT','cancer')
pipeD.set_params(**drug_final_params)
makeTimingCurve(drugX,drugY,pipeD,'DT','drug')

DTpruningVSnodes(pipeC,alphas,cancer_trgX,cancer_trgY,'cancer')
DTpruningVSnodes(pipeD,alphas,drug_trgX,drug_trgY,'drug')

Fitting 5 folds for each of 28 candidates, totalling 140 fits
[CV] DT__alpha=-1, DT__class_weight=balanced, DT__criterion=gini .....
[CV]  DT__alpha=-1, DT__class_weight=balanced, DT__criterion=gini, score=0.8455313159746661, total=   0.0s
[CV] DT__alpha=-1, DT__class_weight=balanced, DT__criterion=gini .....
[CV]  DT__alpha=-1, DT__class_weight=balanced, DT__criterion=gini, score=0.9035890218156231, total=   0.0s
[CV] DT__alpha=-1, DT__class_weight=balanced, DT__criterion=gini .....
[CV]  DT__alpha=-1, DT__class_weight=balanced, DT__criterion=gini, score=0.9208304011259677, total=   0.0s
[CV] DT__alpha=-1, DT__class_weight=balanced, DT__criterion=gini .....
[CV]  DT__alpha=-1, DT__class_weight=balanced, DT__criterion=gini, score=0.9066091954022988, total=   0.0s
[CV] DT__alpha=-1, DT__class_weight=balanced, DT__criterion=gini .....
[CV]  DT__alpha=-1, DT__class_weight=balanced, DT__criterion=gini, score=0.8973214285714284, total=   0.0s
[CV] DT__alpha=-1, DT__class_weight=balanced, DT

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.0s remaining:    0.0s



[CV] DT__alpha=-0.01, DT__class_weight=balanced, DT__criterion=gini ..
[CV]  DT__alpha=-0.01, DT__class_weight=balanced, DT__criterion=gini, score=0.9725545390570022, total=   0.0s
[CV] DT__alpha=-0.01, DT__class_weight=balanced, DT__criterion=gini ..
[CV]  DT__alpha=-0.01, DT__class_weight=balanced, DT__criterion=gini, score=0.9553131597466572, total=   0.0s
[CV] DT__alpha=-0.01, DT__class_weight=balanced, DT__criterion=gini ..
[CV]  DT__alpha=-0.01, DT__class_weight=balanced, DT__criterion=gini, score=0.9066091954022988, total=   0.0s
[CV] DT__alpha=-0.01, DT__class_weight=balanced, DT__criterion=gini ..
[CV]  DT__alpha=-0.01, DT__class_weight=balanced, DT__criterion=gini, score=0.9330357142857142, total=   0.0s
[CV] DT__alpha=-0.01, DT__class_weight=balanced, DT__criterion=entropy 
[CV]  DT__alpha=-0.01, DT__class_weight=balanced, DT__criterion=entropy, score=0.8486980999296274, total=   0.0s
[CV] DT__alpha=-0.01, DT__class_weight=balanced, DT__criterion=entropy 
[CV]  DT__alpha=-0



[CV]  DT__alpha=-0.1, DT__class_weight=balanced, DT__criterion=gini, score=0.9208304011259677, total=   0.0s
[CV] DT__alpha=-0.1, DT__class_weight=balanced, DT__criterion=gini ...
[CV]  DT__alpha=-0.1, DT__class_weight=balanced, DT__criterion=gini, score=0.9066091954022988, total=   0.0s
[CV] DT__alpha=-0.1, DT__class_weight=balanced, DT__criterion=gini ...
[CV]  DT__alpha=-0.1, DT__class_weight=balanced, DT__criterion=gini, score=0.8973214285714284, total=   0.0s
[CV] DT__alpha=-0.1, DT__class_weight=balanced, DT__criterion=entropy 
[CV]  DT__alpha=-0.1, DT__class_weight=balanced, DT__criterion=entropy, score=0.8659394792399722, total=   0.0s
[CV] DT__alpha=-0.1, DT__class_weight=balanced, DT__criterion=entropy 
[CV]  DT__alpha=-0.1, DT__class_weight=balanced, DT__criterion=entropy, score=0.962350457424349, total=   0.0s
[CV] DT__alpha=-0.1, DT__class_weight=balanced, DT__criterion=entropy 
[CV]  DT__alpha=-0.1, DT__class_weight=balanced, DT__criterion=entropy, score=0.920830401125967



[CV]  DT__alpha=0, DT__class_weight=balanced, DT__criterion=entropy, score=0.9521463757916961, total=   0.0s
[CV] DT__alpha=0, DT__class_weight=balanced, DT__criterion=entropy ...
[CV]  DT__alpha=0, DT__class_weight=balanced, DT__criterion=entropy, score=0.9451090781140044, total=   0.0s
[CV] DT__alpha=0, DT__class_weight=balanced, DT__criterion=entropy ...
[CV]  DT__alpha=0, DT__class_weight=balanced, DT__criterion=entropy, score=0.9238505747126436, total=   0.0s
[CV] DT__alpha=0, DT__class_weight=balanced, DT__criterion=entropy ...
[CV]  DT__alpha=0, DT__class_weight=balanced, DT__criterion=entropy, score=0.9583333333333333, total=   0.0s
[CV] DT__alpha=0.0316227766016838, DT__class_weight=balanced, DT__criterion=gini 
[CV]  DT__alpha=0.0316227766016838, DT__class_weight=balanced, DT__criterion=gini, score=0.8423645320197048, total=   0.0s
[CV] DT__alpha=0.0316227766016838, DT__class_weight=balanced, DT__criterion=gini 
[CV]  DT__alpha=0.0316227766016838, DT__class_weight=balanced, D



[CV]  DT__alpha=0.0031622776601683794, DT__class_weight=balanced, DT__criterion=gini, score=0.8557353976073191, total=   0.0s
[CV] DT__alpha=0.0031622776601683794, DT__class_weight=balanced, DT__criterion=gini 
[CV]  DT__alpha=0.0031622776601683794, DT__class_weight=balanced, DT__criterion=gini, score=0.989795918367347, total=   0.0s
[CV] DT__alpha=0.0031622776601683794, DT__class_weight=balanced, DT__criterion=gini 
[CV]  DT__alpha=0.0031622776601683794, DT__class_weight=balanced, DT__criterion=gini, score=0.9451090781140044, total=   0.0s
[CV] DT__alpha=0.0031622776601683794, DT__class_weight=balanced, DT__criterion=gini 
[CV]  DT__alpha=0.0031622776601683794, DT__class_weight=balanced, DT__criterion=gini, score=0.9238505747126436, total=   0.0s
[CV] DT__alpha=0.0031622776601683794, DT__class_weight=balanced, DT__criterion=gini 
[CV]  DT__alpha=0.0031622776601683794, DT__class_weight=balanced, DT__criterion=gini, score=0.9508928571428571, total=   0.0s
[CV] DT__alpha=0.00316227766016



[CV]  DT__alpha=0.00031622776601683794, DT__class_weight=balanced, DT__criterion=entropy, score=0.8416608022519356, total=   0.0s
[CV] DT__alpha=0.00031622776601683794, DT__class_weight=balanced, DT__criterion=entropy 
[CV]  DT__alpha=0.00031622776601683794, DT__class_weight=balanced, DT__criterion=entropy, score=0.9521463757916961, total=   0.0s
[CV] DT__alpha=0.00031622776601683794, DT__class_weight=balanced, DT__criterion=entropy 
[CV]  DT__alpha=0.00031622776601683794, DT__class_weight=balanced, DT__criterion=entropy, score=0.9451090781140044, total=   0.0s
[CV] DT__alpha=0.00031622776601683794, DT__class_weight=balanced, DT__criterion=entropy 
[CV]  DT__alpha=0.00031622776601683794, DT__class_weight=balanced, DT__criterion=entropy, score=0.9238505747126436, total=   0.0s
[CV] DT__alpha=0.00031622776601683794, DT__class_weight=balanced, DT__criterion=entropy 
[CV]  DT__alpha=0.00031622776601683794, DT__class_weight=balanced, DT__criterion=entropy, score=0.9583333333333333, total=  

[Parallel(n_jobs=1)]: Done 140 out of 140 | elapsed:    1.2s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.1s remaining:    0.0s


[CV] ....................... , score=0.8494018296973964, total=   0.0s
[CV]  ................................................................
[CV] ....................... , score=0.9387755102040816, total=   0.0s
[CV]  ................................................................
[CV] ....................... , score=0.9387755102040816, total=   0.0s
[CV]  ................................................................
[CV] ....................... , score=0.8768472906403944, total=   0.0s
[CV]  ................................................................
[CV] ....................... , score=0.9387755102040816, total=   0.0s
[CV]  ................................................................
[CV] ....................... , score=0.9387755102040816, total=   0.0s
[CV]  ................................................................
[CV] ....................... , score=0.9795918367346937, total=   0.0s
[CV]  ................................................................
[CV] .




[CV]  ................................................................
[CV] ....................... , score=0.8229166666666662, total=   0.0s
[CV]  ................................................................
[CV] ....................... , score=0.8839285714285715, total=   0.0s
[CV]  ................................................................
[CV] ....................... , score=0.9270833333333333, total=   0.0s
[CV]  ................................................................
[CV] ....................... , score=0.9508928571428571, total=   0.0s
[CV]  ................................................................
[CV] ....................... , score=0.9508928571428571, total=   0.0s
[CV]  ................................................................
[CV] ....................... , score=0.9508928571428571, total=   0.0s
[CV]  ................................................................
[CV] .................................. , score=0.96875, total=   0.0s
Fitti

[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.1s remaining:    0.0s


[CV]  DT__alpha=-1, DT__class_weight=balanced, DT__criterion=entropy, score=0.5922466808437267, total=   0.0s
[CV] DT__alpha=-1, DT__class_weight=balanced, DT__criterion=entropy ..
[CV]  DT__alpha=-1, DT__class_weight=balanced, DT__criterion=entropy, score=0.5689982182884075, total=   0.0s
[CV] DT__alpha=-1, DT__class_weight=balanced, DT__criterion=entropy ..
[CV]  DT__alpha=-1, DT__class_weight=balanced, DT__criterion=entropy, score=0.5908960285073855, total=   0.0s
[CV] DT__alpha=-1, DT__class_weight=balanced, DT__criterion=entropy ..
[CV]  DT__alpha=-1, DT__class_weight=balanced, DT__criterion=entropy, score=0.6138258452987494, total=   0.0s
[CV] DT__alpha=-1, DT__class_weight=balanced, DT__criterion=entropy ..
[CV]  DT__alpha=-1, DT__class_weight=balanced, DT__criterion=entropy, score=0.5665528022232514, total=   0.0s
[CV] DT__alpha=-0.001, DT__class_weight=balanced, DT__criterion=gini .


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.2s remaining:    0.0s


[CV]  DT__alpha=-0.001, DT__class_weight=balanced, DT__criterion=gini, score=0.6260417265360078, total=   0.2s
[CV] DT__alpha=-0.001, DT__class_weight=balanced, DT__criterion=gini .
[CV]  DT__alpha=-0.001, DT__class_weight=balanced, DT__criterion=gini, score=0.5703488706247486, total=   0.2s
[CV] DT__alpha=-0.001, DT__class_weight=balanced, DT__criterion=gini .
[CV]  DT__alpha=-0.001, DT__class_weight=balanced, DT__criterion=gini, score=0.5906086556698661, total=   0.2s
[CV] DT__alpha=-0.001, DT__class_weight=balanced, DT__criterion=gini .
[CV]  DT__alpha=-0.001, DT__class_weight=balanced, DT__criterion=gini, score=0.6088177396943029, total=   0.2s
[CV] DT__alpha=-0.001, DT__class_weight=balanced, DT__criterion=gini .
[CV]  DT__alpha=-0.001, DT__class_weight=balanced, DT__criterion=gini, score=0.6175023158869847, total=   0.2s
[CV] DT__alpha=-0.001, DT__class_weight=balanced, DT__criterion=entropy 
[CV]  DT__alpha=-0.001, DT__class_weight=balanced, DT__criterion=entropy, score=0.597907

[CV]  DT__alpha=-0.1, DT__class_weight=balanced, DT__criterion=gini, score=0.5709236162997874, total=   0.2s
[CV] DT__alpha=-0.1, DT__class_weight=balanced, DT__criterion=gini ...
[CV]  DT__alpha=-0.1, DT__class_weight=balanced, DT__criterion=gini, score=0.6004515979620194, total=   0.2s
[CV] DT__alpha=-0.1, DT__class_weight=balanced, DT__criterion=gini ...
[CV]  DT__alpha=-0.1, DT__class_weight=balanced, DT__criterion=gini, score=0.5781032885595182, total=   0.2s
[CV] DT__alpha=-0.1, DT__class_weight=balanced, DT__criterion=entropy 
[CV]  DT__alpha=-0.1, DT__class_weight=balanced, DT__criterion=entropy, score=0.5922466808437267, total=   0.2s
[CV] DT__alpha=-0.1, DT__class_weight=balanced, DT__criterion=entropy 
[CV]  DT__alpha=-0.1, DT__class_weight=balanced, DT__criterion=entropy, score=0.5689982182884075, total=   0.1s
[CV] DT__alpha=-0.1, DT__class_weight=balanced, DT__criterion=entropy 
[CV]  DT__alpha=-0.1, DT__class_weight=balanced, DT__criterion=entropy, score=0.59089602850738

[CV]  DT__alpha=0.1, DT__class_weight=balanced, DT__criterion=entropy, score=0.6373642163342721, total=   0.2s
[CV] DT__alpha=0.1, DT__class_weight=balanced, DT__criterion=entropy .
[CV]  DT__alpha=0.1, DT__class_weight=balanced, DT__criterion=entropy, score=0.6171044312891546, total=   0.1s
[CV] DT__alpha=0.1, DT__class_weight=balanced, DT__criterion=entropy .
[CV]  DT__alpha=0.1, DT__class_weight=balanced, DT__criterion=entropy, score=0.6458429828624364, total=   0.2s
[CV] DT__alpha=0.1, DT__class_weight=balanced, DT__criterion=entropy .
[CV]  DT__alpha=0.1, DT__class_weight=balanced, DT__criterion=entropy, score=0.6492299675775821, total=   0.2s
[CV] DT__alpha=0.0031622776601683794, DT__class_weight=balanced, DT__criterion=gini 
[CV]  DT__alpha=0.0031622776601683794, DT__class_weight=balanced, DT__criterion=gini, score=0.5977067647565952, total=   0.2s
[CV] DT__alpha=0.0031622776601683794, DT__class_weight=balanced, DT__criterion=gini 
[CV]  DT__alpha=0.0031622776601683794, DT__clas

[CV]  DT__alpha=0.001, DT__class_weight=balanced, DT__criterion=entropy, score=0.5778427512737377, total=   0.1s
[CV] DT__alpha=0.001, DT__class_weight=balanced, DT__criterion=entropy 
[CV]  DT__alpha=0.001, DT__class_weight=balanced, DT__criterion=entropy, score=0.6101493747105142, total=   0.1s


[Parallel(n_jobs=1)]: Done 140 out of 140 | elapsed:   21.5s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.1s remaining:    0.0s


[learning_curve] Training set sizes: [ 50 100 131 263 395 527 659 790 922]
[CV]  ................................................................
[CV] ....................... , score=0.5413529513190414, total=   0.0s
[CV]  ................................................................
[CV] ....................... , score=0.5413529513190414, total=   0.0s
[CV]  ................................................................
[CV] ....................... , score=0.5823610552330594, total=   0.0s
[CV]  ................................................................
[CV] ....................... , score=0.6415023851945514, total=   0.0s
[CV]  ................................................................
[CV] ....................... , score=0.6208690154606588, total=   0.0s
[CV]  ................................................................
[CV] ....................... , score=0.5893729524685327, total=   0.0s
[CV]  ................................................................
[C

[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.3s remaining:    0.0s


[CV] ........................ , score=0.602132306454394, total=   0.1s
[CV]  ................................................................
[CV] ...................................... , score=0.5, total=   0.0s
[CV]  ................................................................
[CV] ....................... , score=0.6311282257601012, total=   0.0s
[CV]  ................................................................
[CV] ....................... , score=0.5838841312719122, total=   0.0s
[CV]  ................................................................
[CV] ....................... , score=0.5874475544571526, total=   0.0s
[CV]  ................................................................
[CV] ....................... , score=0.5883958848209668, total=   0.0s
[CV]  ................................................................
[CV] ....................... , score=0.5684234726133687, total=   0.0s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.5s remaining:    0.0s


[CV] ....................... , score=0.5958101040289673, total=   0.1s
[CV]  ................................................................
[CV] ....................... , score=0.6555261796654981, total=   0.1s
[CV]  ................................................................
[CV] ....................... , score=0.6561009253405369, total=   0.1s
[CV]  ................................................................
[CV] ...................................... , score=0.5, total=   0.0s
[CV]  ................................................................
[CV] ........................ , score=0.614029541927697, total=   0.0s
[CV]  ................................................................
[CV] ....................... , score=0.6148054485889993, total=   0.0s
[CV]  ................................................................
[CV] ....................... , score=0.6055807805046267, total=   0.0s
[CV]  ................................................................
[CV] .

[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed:    2.5s finished


DT drug 0.1
DT drug 0.2
DT drug 0.3
DT drug 0.4
DT drug 0.5
DT drug 0.6
DT drug 0.7
DT drug 0.8
DT drug 0.9
cancer -1
cancer -0.001
cancer -0.00031622776601683794
cancer -0.01
cancer -0.0031622776601683794
cancer -0.1
cancer -0.0316227766016838
cancer 0
cancer 0.0316227766016838
cancer 0.1
cancer 0.0031622776601683794




cancer 0.01
cancer 0.00031622776601683794
cancer 0.001
drug -1
drug -0.001
drug -0.00031622776601683794
drug -0.01
drug -0.0031622776601683794
drug -0.1
drug -0.0316227766016838
drug 0
drug 0.0316227766016838
drug 0.1
drug 0.0031622776601683794
drug 0.01
drug 0.00031622776601683794
drug 0.001


In [18]:
contra = pd.read_csv('./data/cmc.data.txt', header = None)
contra.columns = ['wifes_age', 'wifes_edu', 'husbs_edu', 'num_children_born', 'wifes_religion', 'wife_employed', 'husbs_occup', 'SOL_index', 'media_expose', 'contra_method']
contra.head()

Unnamed: 0,wifes_age,wifes_edu,husbs_edu,num_children_born,wifes_religion,wife_employed,husbs_occup,SOL_index,media_expose,contra_method
0,24,2,3,3,1,1,2,3,0,1
1,45,1,3,10,1,1,3,4,0,1
2,43,2,3,7,1,1,3,4,0,1
3,42,3,2,9,1,1,3,3,0,1
4,36,3,3,8,1,1,3,2,0,1


In [3]:
import sklearn.model_selection as ms
import pandas as pd
from helpers import basicResults,dtclf_pruned,makeTimingCurve
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import numpy as np

adult = pd.read_csv('./data/cmc.data.txt')      
col_names = ['wifes_age', 'wifes_edu', 'husbs_edu', 'num_children_born', 'wifes_religion', 'wife_employed', 'husbs_occup', 'SOL_index', 'media_expose', 'contra_method']
adult.columns = col_names
adult[col_names] = adult[col_names].astype(np.float64)
adult = pd.get_dummies(adult, columns = ['husbs_occup'], prefix = 'husbs_occup')
adult['contra_method'] = np.where(adult['contra_method'] == 1, 'no birth control', 'birth control')
print(adult['contra_method'].head())

0    1.0
1    1.0
2    1.0
3    1.0
4    1.0
Name: contra_method, dtype: float64


In [6]:
import sklearn.model_selection as ms
import pandas as pd
from helpers import basicResults,dtclf_pruned,makeTimingCurve
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import numpy as np

adult = pd.read_csv('./data/drug_consumption.data.txt')      
col_names = ['id', 'age', 'gender', 'edu', 'country', 'ethnicity', 'nscore', 'escore', 'oscore', 'ascore', 'cscore', 'impulsive', 'ss', 'alc', 'amphet', 'amyl', 'benzo', 'caff', 'canna', 'choc', 'coke', 'crack', 'ecstasy', 'heroin', 'keta', 'legal', 'lsd', 'meth', 'shroom', 'nico', 'semer', 'vsa']
adult.columns = col_names
adult = adult.drop(['id', 'alc', 'amyl', 'benzo', 'caff', 'canna', 'choc', 'coke', 'crack', 'ecstasy', 'heroin', 'keta', 'legal', 'lsd', 'meth', 'shroom', 'nico', 'semer', 'vsa'], axis = 1)
adult = pd.get_dummies(adult, columns = ['age', 'gender', 'country', 'ethnicity'], prefix = ['age', 'gender', 'country', 'ethnicity'])
adult['amphet'] = np.where(adult['amphet'] == 'CL0', 'no birth control', 'birth control')
print(adult.head())

       edu   nscore   escore   oscore   ascore   cscore  impulsive       ss  \
0  1.98437 -0.67825  1.93886  1.43533  0.76096 -0.14277   -0.71126 -0.21575   
1 -0.05921 -0.46725  0.80523 -0.84732 -1.62090 -1.01450   -1.37983  0.40148   
2  1.16365 -0.14882 -0.80615 -0.01928  0.59042  0.58489   -1.37983 -1.18084   
3  1.98437  0.73545 -1.63340 -0.45174 -0.30172  1.30612   -0.21712 -0.21575   
4 -1.22751 -0.67825 -0.30033 -1.55521  2.03972  1.63088   -1.37983 -1.54858   

             amphet  age_-0.9519700000000001        ...          \
0     birth control                        0        ...           
1  no birth control                        0        ...           
2  no birth control                        1        ...           
3     birth control                        0        ...           
4  no birth control                        0        ...           

   country_0.21128000000000002  country_0.24923  country_0.9608200000000001  \
0                            0             