In [2]:
# -*- coding: utf-8 -*-
"""
Created on Wed Jan 18 11:55:32 2017

Script for full tests, decision tree (pruned)

"""

import sklearn.model_selection as ms
import pandas as pd
from helpers import basicResults,dtclf_pruned,makeTimingCurve
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import numpy as np

def DTpruningVSnodes(clf,alphas,trgX,trgY,dataset):
    '''Dump table of pruning alpha vs. # of internal nodes'''
    out = {}
    for a in alphas:
        clf.set_params(**{'DT__alpha':a})
        clf.fit(trgX,trgY)
        out[a]=clf.steps[-1][-1].numNodes()
        print(dataset,a)
    out = pd.Series(out)
    out.index.name='alpha'
    out.name = 'Number of Internal Nodes'
    out.to_csv('./output/DT_{}_nodecounts.csv'.format(dataset))
    
    return



    

# Load Data       
cancer = pd.read_csv('./data/breast_cancer.csv')  
cancer = pd.get_dummies(cancer, columns = ['class'], prefix = 'class')
cancer['class'] = cancer['class_2.0']
cancer.drop(['class_2.0', 'class_4.0'], axis = 1, inplace = True)
cancer = cancer.astype(np.int64)

cancerX = cancer.drop('class',1).copy().values
cancerY = cancer['class'].copy().values


contra = pd.read_csv('./data/cmc.data.txt')      
col_names_contra = ['wifes_age', 'wifes_edu', 'husbs_edu', 'num_children_born', 'wifes_religion', 'wife_employed', 'husbs_occup', 'SOL_index', 'media_expose', 'contra_method']
contra.columns = col_names_contra
contra[col_names_contra] = contra[col_names_contra].astype(np.int64)
contra = pd.get_dummies(contra, columns = ['husbs_occup'], prefix = 'husbs_occup')

contraX = contra.drop('contra_method',1).copy().values
contraY = contra['contra_method'].copy().values



contra_trgX, contra_tstX, contra_trgY, contra_tstY = ms.train_test_split(contraX, contraY, test_size=0.3, random_state=0,stratify=contraY)     
cancer_trgX, cancer_tstX, cancer_trgY, cancer_tstY = ms.train_test_split(cancerX, cancerY, test_size=0.3, random_state=0,stratify=cancerY)     

# Search for good alphas
alphas = [-1,-1e-3,-(1e-3)*10**-0.5, -1e-2, -(1e-2)*10**-0.5,-1e-1,-(1e-1)*10**-0.5, 0, (1e-1)*10**-0.5,1e-1,(1e-2)*10**-0.5,1e-2,(1e-3)*10**-0.5,1e-3]


pipeO = Pipeline([('Scale',StandardScaler()),                 
                 ('DT',dtclf_pruned(random_state=55))])
pipeC = Pipeline([('Scale',StandardScaler()),                 
                 ('DT',dtclf_pruned(random_state=55))])

params = {'DT__criterion':['gini','entropy'],'DT__alpha':alphas,'DT__class_weight':['balanced']}

cancer_clf = basicResults(pipeC,cancer_trgX,cancer_trgY,cancer_tstX,cancer_tstY,params,'DT','cancer')        
contra_clf = basicResults(pipeO,contra_trgX,contra_trgY,contra_tstX,contra_tstY,params,'DT','contra')        


#cancer_final_params = {'DT__alpha': -0.00031622776601683794, 'DT__class_weight': 'balanced', 'DT__criterion': 'entropy'}
#contra_final_params = {'class_weight': 'balanced', 'alpha': 0.0031622776601683794, 'criterion': 'entropy'}

cancer_final_params = cancer_clf.best_params_
contra_final_params = contra_clf.best_params_

pipeC.set_params(**cancer_final_params)
makeTimingCurve(cancerX,cancerY,pipeC,'DT','cancer')
pipeO.set_params(**contra_final_params)
makeTimingCurve(contraX,contraY,pipeO,'DT','contra')


DTpruningVSnodes(pipeC,alphas,cancer_trgX,cancer_trgY,'cancer')
DTpruningVSnodes(pipeO,alphas,contra_trgX,contra_trgY,'contra')

Fitting 5 folds for each of 28 candidates, totalling 140 fits
[CV] DT__alpha=-1, DT__class_weight=balanced, DT__criterion=gini .....
[CV]  DT__alpha=-1, DT__class_weight=balanced, DT__criterion=gini, score=0.8455313159746661, total=   0.0s
[CV] DT__alpha=-1, DT__class_weight=balanced, DT__criterion=gini .....
[CV]  DT__alpha=-1, DT__class_weight=balanced, DT__criterion=gini, score=0.9035890218156231, total=   0.0s
[CV] DT__alpha=-1, DT__class_weight=balanced, DT__criterion=gini .....
[CV]  DT__alpha=-1, DT__class_weight=balanced, DT__criterion=gini, score=0.9208304011259677, total=   0.0s
[CV] DT__alpha=-1, DT__class_weight=balanced, DT__criterion=gini .....
[CV]  DT__alpha=-1, DT__class_weight=balanced, DT__criterion=gini, score=0.9066091954022988, total=   0.0s
[CV] DT__alpha=-1, DT__class_weight=balanced, DT__criterion=gini .....
[CV]  DT__alpha=-1, DT__class_weight=balanced, DT__criterion=gini, score=0.8973214285714284, total=   0.0s
[CV] DT__alpha=-1, DT__class_weight=balanced, DT

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.0s remaining:    0.0s


[CV]  DT__alpha=-0.01, DT__class_weight=balanced, DT__criterion=entropy, score=0.962350457424349, total=   0.0s
[CV] DT__alpha=-0.01, DT__class_weight=balanced, DT__criterion=entropy 
[CV]  DT__alpha=-0.01, DT__class_weight=balanced, DT__criterion=entropy, score=0.9553131597466572, total=   0.0s
[CV] DT__alpha=-0.01, DT__class_weight=balanced, DT__criterion=entropy 
[CV]  DT__alpha=-0.01, DT__class_weight=balanced, DT__criterion=entropy, score=0.9238505747126436, total=   0.0s
[CV] DT__alpha=-0.01, DT__class_weight=balanced, DT__criterion=entropy 
[CV]  DT__alpha=-0.01, DT__class_weight=balanced, DT__criterion=entropy, score=0.9583333333333333, total=   0.0s
[CV] DT__alpha=-0.0031622776601683794, DT__class_weight=balanced, DT__criterion=gini 
[CV]  DT__alpha=-0.0031622776601683794, DT__class_weight=balanced, DT__criterion=gini, score=0.8455313159746661, total=   0.0s
[CV] DT__alpha=-0.0031622776601683794, DT__class_weight=balanced, DT__criterion=gini 
[CV]  DT__alpha=-0.003162277660168




[CV]  DT__alpha=-0.0316227766016838, DT__class_weight=balanced, DT__criterion=entropy, score=0.9208304011259677, total=   0.0s
[CV] DT__alpha=-0.0316227766016838, DT__class_weight=balanced, DT__criterion=entropy 
[CV]  DT__alpha=-0.0316227766016838, DT__class_weight=balanced, DT__criterion=entropy, score=0.9066091954022988, total=   0.0s
[CV] DT__alpha=-0.0316227766016838, DT__class_weight=balanced, DT__criterion=entropy 
[CV]  DT__alpha=-0.0316227766016838, DT__class_weight=balanced, DT__criterion=entropy, score=0.9508928571428571, total=   0.0s
[CV] DT__alpha=0, DT__class_weight=balanced, DT__criterion=gini ......
[CV]  DT__alpha=0, DT__class_weight=balanced, DT__criterion=gini, score=0.8557353976073191, total=   0.0s
[CV] DT__alpha=0, DT__class_weight=balanced, DT__criterion=gini ......
[CV]  DT__alpha=0, DT__class_weight=balanced, DT__criterion=gini, score=0.989795918367347, total=   0.0s
[CV] DT__alpha=0, DT__class_weight=balanced, DT__criterion=gini ......
[CV]  DT__alpha=0, DT_



[CV]  DT__alpha=0.1, DT__class_weight=balanced, DT__criterion=entropy, score=0.9098419540229885, total=   0.0s
[CV] DT__alpha=0.1, DT__class_weight=balanced, DT__criterion=entropy .
[CV]  DT__alpha=0.1, DT__class_weight=balanced, DT__criterion=entropy, score=0.919642857142857, total=   0.0s
[CV] DT__alpha=0.0031622776601683794, DT__class_weight=balanced, DT__criterion=gini 
[CV]  DT__alpha=0.0031622776601683794, DT__class_weight=balanced, DT__criterion=gini, score=0.8557353976073191, total=   0.0s
[CV] DT__alpha=0.0031622776601683794, DT__class_weight=balanced, DT__criterion=gini 
[CV]  DT__alpha=0.0031622776601683794, DT__class_weight=balanced, DT__criterion=gini, score=0.989795918367347, total=   0.0s
[CV] DT__alpha=0.0031622776601683794, DT__class_weight=balanced, DT__criterion=gini 
[CV]  DT__alpha=0.0031622776601683794, DT__class_weight=balanced, DT__criterion=gini, score=0.9451090781140044, total=   0.0s
[CV] DT__alpha=0.0031622776601683794, DT__class_weight=balanced, DT__criteri



[CV]  DT__alpha=0.001, DT__class_weight=balanced, DT__criterion=gini, score=0.8557353976073191, total=   0.0s
[CV] DT__alpha=0.001, DT__class_weight=balanced, DT__criterion=gini ..
[CV]  DT__alpha=0.001, DT__class_weight=balanced, DT__criterion=gini, score=0.989795918367347, total=   0.0s
[CV] DT__alpha=0.001, DT__class_weight=balanced, DT__criterion=gini ..
[CV]  DT__alpha=0.001, DT__class_weight=balanced, DT__criterion=gini, score=0.9451090781140044, total=   0.0s
[CV] DT__alpha=0.001, DT__class_weight=balanced, DT__criterion=gini ..
[CV]  DT__alpha=0.001, DT__class_weight=balanced, DT__criterion=gini, score=0.9238505747126436, total=   0.0s
[CV] DT__alpha=0.001, DT__class_weight=balanced, DT__criterion=gini ..
[CV]  DT__alpha=0.001, DT__class_weight=balanced, DT__criterion=gini, score=0.9508928571428571, total=   0.0s
[CV] DT__alpha=0.001, DT__class_weight=balanced, DT__criterion=entropy 
[CV]  DT__alpha=0.001, DT__class_weight=balanced, DT__criterion=entropy, score=0.84166080225193

[Parallel(n_jobs=1)]: Done 140 out of 140 | elapsed:    0.9s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.0s remaining:    0.0s


[CV] ....................... , score=0.9451090781140044, total=   0.0s
[CV]  ................................................................
[CV] ....................... , score=0.9278676988036596, total=   0.0s
[CV]  ................................................................
[CV] ....................... , score=0.9451090781140044, total=   0.0s
[CV]  ................................................................
[CV] ....................... , score=0.9098419540229885, total=   0.0s
[CV]  ................................................................
[CV] ....................... , score=0.9098419540229885, total=   0.0s
[CV]  ................................................................
[CV] ....................... , score=0.8480603448275862, total=   0.0s
[CV]  ................................................................
[CV] ....................... , score=0.9410919540229883, total=   0.0s
[CV]  ................................................................
[CV] .

[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.1s remaining:    0.0s


[CV]  DT__alpha=-1, DT__class_weight=balanced, DT__criterion=entropy, score=0.583397534668721, total=   0.0s
[CV] DT__alpha=-1, DT__class_weight=balanced, DT__criterion=entropy ..
[CV]  DT__alpha=-1, DT__class_weight=balanced, DT__criterion=entropy, score=0.5594183359013868, total=   0.0s
[CV] DT__alpha=-1, DT__class_weight=balanced, DT__criterion=entropy ..
[CV]  DT__alpha=-1, DT__class_weight=balanced, DT__criterion=entropy, score=0.6169394116501071, total=   0.0s
[CV] DT__alpha=-0.001, DT__class_weight=balanced, DT__criterion=gini .
[CV]  DT__alpha=-0.001, DT__class_weight=balanced, DT__criterion=gini, score=0.6231856378915201, total=   0.1s
[CV] DT__alpha=-0.001, DT__class_weight=balanced, DT__criterion=gini .


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.1s remaining:    0.0s


[CV]  DT__alpha=-0.001, DT__class_weight=balanced, DT__criterion=gini, score=0.6313559322033898, total=   0.1s
[CV] DT__alpha=-0.001, DT__class_weight=balanced, DT__criterion=gini .
[CV]  DT__alpha=-0.001, DT__class_weight=balanced, DT__criterion=gini, score=0.6511941448382125, total=   0.1s
[CV] DT__alpha=-0.001, DT__class_weight=balanced, DT__criterion=gini .
[CV]  DT__alpha=-0.001, DT__class_weight=balanced, DT__criterion=gini, score=0.5807010785824345, total=   0.1s
[CV] DT__alpha=-0.001, DT__class_weight=balanced, DT__criterion=gini .
[CV]  DT__alpha=-0.001, DT__class_weight=balanced, DT__criterion=gini, score=0.6605299045392558, total=   0.1s
[CV] DT__alpha=-0.001, DT__class_weight=balanced, DT__criterion=entropy 
[CV]  DT__alpha=-0.001, DT__class_weight=balanced, DT__criterion=entropy, score=0.6177425515660808, total=   0.1s
[CV] DT__alpha=-0.001, DT__class_weight=balanced, DT__criterion=entropy 
[CV]  DT__alpha=-0.001, DT__class_weight=balanced, DT__criterion=entropy, score=0.6

[CV]  DT__alpha=-0.1, DT__class_weight=balanced, DT__criterion=gini, score=0.5750192604006163, total=   0.1s
[CV] DT__alpha=-0.1, DT__class_weight=balanced, DT__criterion=gini ...
[CV]  DT__alpha=-0.1, DT__class_weight=balanced, DT__criterion=gini, score=0.6154295733489187, total=   0.1s
[CV] DT__alpha=-0.1, DT__class_weight=balanced, DT__criterion=entropy 
[CV]  DT__alpha=-0.1, DT__class_weight=balanced, DT__criterion=entropy, score=0.5718105423987777, total=   0.1s
[CV] DT__alpha=-0.1, DT__class_weight=balanced, DT__criterion=entropy 
[CV]  DT__alpha=-0.1, DT__class_weight=balanced, DT__criterion=entropy, score=0.6002503852080123, total=   0.1s
[CV] DT__alpha=-0.1, DT__class_weight=balanced, DT__criterion=entropy 
[CV]  DT__alpha=-0.1, DT__class_weight=balanced, DT__criterion=entropy, score=0.583397534668721, total=   0.1s
[CV] DT__alpha=-0.1, DT__class_weight=balanced, DT__criterion=entropy 
[CV]  DT__alpha=-0.1, DT__class_weight=balanced, DT__criterion=entropy, score=0.559418335901

[CV]  DT__alpha=0.1, DT__class_weight=balanced, DT__criterion=entropy, score=0.5965909090909091, total=   0.1s
[CV] DT__alpha=0.1, DT__class_weight=balanced, DT__criterion=entropy .
[CV]  DT__alpha=0.1, DT__class_weight=balanced, DT__criterion=entropy, score=0.5753081664098614, total=   0.1s
[CV] DT__alpha=0.1, DT__class_weight=balanced, DT__criterion=entropy .
[CV]  DT__alpha=0.1, DT__class_weight=balanced, DT__criterion=entropy, score=0.5919540229885057, total=   0.1s
[CV] DT__alpha=0.0031622776601683794, DT__class_weight=balanced, DT__criterion=gini 
[CV]  DT__alpha=0.0031622776601683794, DT__class_weight=balanced, DT__criterion=gini, score=0.6617647058823529, total=   0.1s
[CV] DT__alpha=0.0031622776601683794, DT__class_weight=balanced, DT__criterion=gini 
[CV]  DT__alpha=0.0031622776601683794, DT__class_weight=balanced, DT__criterion=gini, score=0.6340523882896764, total=   0.1s
[CV] DT__alpha=0.0031622776601683794, DT__class_weight=balanced, DT__criterion=gini 
[CV]  DT__alpha=0.

[CV]  DT__alpha=0.001, DT__class_weight=balanced, DT__criterion=entropy, score=0.6677868692772257, total=   0.1s
[learning_curve] Training set sizes: [ 50 100 103 206 309 412 515 618 721]
[CV]  ................................................................
[CV] ....................... , score=0.5172364400305578, total=   0.0s
[CV]  ................................................................
[CV] ....................... , score=0.5513750954927427, total=   0.0s
[CV]  ................................................................
[CV] ....................... , score=0.5513750954927427, total=   0.0s
[CV]  ................................................................
[CV] ....................... , score=0.6543640183346066, total=   0.0s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done 140 out of 140 | elapsed:   14.8s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.0s remaining:    0.0s


[CV] ....................... , score=0.6847307104660046, total=   0.0s
[CV]  ................................................................
[CV] ....................... , score=0.6709320091673033, total=   0.0s
[CV]  ................................................................
[CV] ........................ , score=0.631398013750955, total=   0.0s
[CV]  ................................................................
[CV] ....................... , score=0.6548414820473644, total=   0.1s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.2s remaining:    0.0s


[CV] ....................... , score=0.6602845683728036, total=   0.1s
[CV]  ................................................................
[CV] ...................................... , score=0.5, total=   0.0s
[CV]  ................................................................
[CV] ....................... , score=0.5388097072419107, total=   0.0s
[CV]  ................................................................
[CV] ....................... , score=0.5679892141756547, total=   0.0s
[CV]  ................................................................
[CV] ....................... , score=0.6171032357473036, total=   0.0s
[CV]  ................................................................
[CV] ....................... , score=0.6030431432973806, total=   0.0s
[CV]  ................................................................
[CV] ....................... , score=0.6424306625577811, total=   0.0s
[CV]  ................................................................
[CV] .

[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.3s remaining:    0.0s


[CV] ....................... , score=0.6437788906009245, total=   0.1s
[CV]  ................................................................
[CV] ....................... , score=0.6210516178736518, total=   0.1s
[CV]  ................................................................
[CV] ...................................... , score=0.5, total=   0.0s
[CV]  ................................................................
[CV] ....................... , score=0.5731895223420647, total=   0.0s
[CV]  ................................................................
[CV] ....................... , score=0.6007318952234206, total=   0.0s
[CV]  ................................................................
[CV] ....................... , score=0.6187403697996918, total=   0.0s
[CV]  ................................................................
[CV] ....................... , score=0.6541795069337442, total=   0.0s
[CV]  ................................................................
[CV] .

[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed:    1.6s finished


DT drug 0.1
DT drug 0.2
DT drug 0.3
DT drug 0.4
DT drug 0.5
DT drug 0.6
DT drug 0.7
DT drug 0.8
DT drug 0.9
cancer -1
cancer -0.001
cancer -0.00031622776601683794
cancer -0.01
cancer -0.0031622776601683794
cancer -0.1
cancer -0.0316227766016838
cancer 0
cancer 0.0316227766016838
cancer 0.1
cancer 0.0031622776601683794
cancer 0.01




cancer 0.00031622776601683794
cancer 0.001
drug -1
drug -0.001
drug -0.00031622776601683794
drug -0.01
drug -0.0031622776601683794
drug -0.1
drug -0.0316227766016838
drug 0
drug 0.0316227766016838
drug 0.1
drug 0.0031622776601683794
drug 0.01
drug 0.00031622776601683794
drug 0.001


In [18]:
contra = pd.read_csv('./data/cmc.data.txt', header = None)
contra.columns = ['wifes_age', 'wifes_edu', 'husbs_edu', 'num_children_born', 'wifes_religion', 'wife_employed', 'husbs_occup', 'SOL_index', 'media_expose', 'contra_method']
contra.head()

Unnamed: 0,wifes_age,wifes_edu,husbs_edu,num_children_born,wifes_religion,wife_employed,husbs_occup,SOL_index,media_expose,contra_method
0,24,2,3,3,1,1,2,3,0,1
1,45,1,3,10,1,1,3,4,0,1
2,43,2,3,7,1,1,3,4,0,1
3,42,3,2,9,1,1,3,3,0,1
4,36,3,3,8,1,1,3,2,0,1


In [3]:
import sklearn.model_selection as ms
import pandas as pd
from helpers import basicResults,dtclf_pruned,makeTimingCurve
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import numpy as np

adult = pd.read_csv('./data/cmc.data.txt')      
col_names = ['wifes_age', 'wifes_edu', 'husbs_edu', 'num_children_born', 'wifes_religion', 'wife_employed', 'husbs_occup', 'SOL_index', 'media_expose', 'contra_method']
adult.columns = col_names
adult[col_names] = adult[col_names].astype(np.float64)
adult = pd.get_dummies(adult, columns = ['husbs_occup'], prefix = 'husbs_occup')
adult['contra_method'] = np.where(adult['contra_method'] == 1, 'no birth control', 'birth control')
print(adult['contra_method'].head())

0    1.0
1    1.0
2    1.0
3    1.0
4    1.0
Name: contra_method, dtype: float64
