In [17]:
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import  accuracy_score, confusion_matrix
from sklearn.tree import export_text
from sklearn.tree import export_graphviz

Fit a decision tree to predict the binary outcome `baby_low_weight` using `mom_weight`, `premature_labor`, `hypertension`, `irritability`
- Print and interpret the model coefficients
- Does the model make good predictions of `baby_weight`?
- Include `smoked` and `race` to the model, is the new model better than the old one?
- Use all other features in the data (except `baby_weight`). Which features are most useful in this task?

In [4]:
data=pd.read_excel('data/birth_weight.xlsx') #Hosmer & Lemeshow low birth weight data

In [5]:
data.columns=['id', 'baby_low_weight', 'mom_age', 'mom_weight', 'race', 'smoked', 'premature_labor', 'hyptertension', 'irritability', 'doc_visit', 'baby_weight']

In [6]:
data.head()

Unnamed: 0,id,baby_low_weight,mom_age,mom_weight,race,smoked,premature_labor,hyptertension,irritability,doc_visit,baby_weight
0,85,0,19,182,black,nonsmoker,0,0,1,0,2523
1,86,0,33,155,other,nonsmoker,0,0,0,3,2551
2,87,0,20,105,white,smoker,0,0,0,1,2557
3,88,0,21,108,white,smoker,0,0,1,2,2594
4,89,0,18,107,white,smoker,0,0,1,0,2600


In [7]:
data=pd.get_dummies(data)

In [8]:
data.head()

Unnamed: 0,id,baby_low_weight,mom_age,mom_weight,premature_labor,hyptertension,irritability,doc_visit,baby_weight,race_black,race_other,race_white,smoked_nonsmoker,smoked_smoker
0,85,0,19,182,0,0,1,0,2523,1,0,0,1,0
1,86,0,33,155,0,0,0,3,2551,0,1,0,1,0
2,87,0,20,105,0,0,0,1,2557,0,0,1,0,1
3,88,0,21,108,0,0,1,2,2594,0,0,1,0,1
4,89,0,18,107,0,0,1,0,2600,0,0,1,0,1


In [9]:
X = data[['mom_age', 'mom_weight', 'premature_labor',
       'hyptertension', 'irritability', 'doc_visit', 
       'race_black', 'race_other', 'race_white', 'smoked_nonsmoker',
       'smoked_smoker']]
y = data['baby_low_weight']

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5)

In [20]:
dtree = tree.DecisionTreeClassifier().fit(X_train, y_train)

In [21]:
yfit = dtree.predict(X_test)

In [22]:
accuracy_score(y_test, yfit)

0.5789473684210527

In [23]:
confusion_matrix(y_test, yfit)

array([[48, 17],
       [23,  7]])

In [24]:
print(export_text(dtree, feature_names=list(X.columns)))

|--- mom_weight <= 111.00
|   |--- mom_age <= 17.50
|   |   |--- class: 1
|   |--- mom_age >  17.50
|   |   |--- mom_age <= 19.50
|   |   |   |--- class: 0
|   |   |--- mom_age >  19.50
|   |   |   |--- mom_weight <= 106.50
|   |   |   |   |--- race_white <= 0.50
|   |   |   |   |   |--- doc_visit <= 0.50
|   |   |   |   |   |   |--- irritability <= 0.50
|   |   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |   |--- irritability >  0.50
|   |   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |--- doc_visit >  0.50
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |--- race_white >  0.50
|   |   |   |   |   |--- mom_age <= 26.50
|   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |--- mom_age >  26.50
|   |   |   |   |   |   |--- doc_visit <= 1.50
|   |   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |   |--- doc_visit >  1.50
|   |   |   |   |   |   |   |--- mom_age <= 29.00
|   |   |   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |   |   |--- m

In [25]:
# let us prune the tree and focus on most important features

In [26]:
dtree.feature_importances_

array([0.35176717, 0.25589786, 0.        , 0.06240046, 0.12057171,
       0.07494905, 0.        , 0.        , 0.03426345, 0.06648983,
       0.03366048])

In [34]:
dtree = tree.DecisionTreeClassifier(max_features=4, max_leaf_nodes=10).fit(X_train, y_train)

In [35]:
print(export_text(dtree, feature_names=list(X.columns)))


|--- mom_age <= 15.50
|   |--- class: 1
|--- mom_age >  15.50
|   |--- smoked_nonsmoker <= 0.50
|   |   |--- mom_weight <= 185.50
|   |   |   |--- mom_age <= 21.50
|   |   |   |   |--- race_black <= 0.50
|   |   |   |   |   |--- class: 0
|   |   |   |   |--- race_black >  0.50
|   |   |   |   |   |--- class: 1
|   |   |   |--- mom_age >  21.50
|   |   |   |   |--- class: 0
|   |   |--- mom_weight >  185.50
|   |   |   |--- mom_weight <= 212.50
|   |   |   |   |--- class: 1
|   |   |   |--- mom_weight >  212.50
|   |   |   |   |--- class: 0
|   |--- smoked_nonsmoker >  0.50
|   |   |--- mom_weight <= 106.50
|   |   |   |--- race_white <= 0.50
|   |   |   |   |--- mom_age <= 20.50
|   |   |   |   |   |--- class: 0
|   |   |   |   |--- mom_age >  20.50
|   |   |   |   |   |--- class: 1
|   |   |   |--- race_white >  0.50
|   |   |   |   |--- class: 0
|   |   |--- mom_weight >  106.50
|   |   |   |--- class: 0



In [36]:
yfit = dtree.predict(X_test)

In [37]:
accuracy_score(y_test, yfit)

0.7052631578947368

In [38]:
confusion_matrix(y_test, yfit)

array([[61,  4],
       [24,  6]])