# Classification and Regression Tree (CART)

## Titanic Dataset
- Kaggle knowledge competition: https://www.kaggle.com/c/titanic

In [None]:
# read in the data
import pandas as pd
url = 'data/titanic.csv'
titanic = pd.read_csv(url)

# encode female as 0 and male as 1
titanic['Sex'] = titanic['Sex'].map({'female': 0, 'male': 1})

# fill in the missing values for age with the median age
# (assign the result back rather than calling fillna(inplace=True) on the
# attribute-accessed column: that chained in-place form can operate on a
# temporary object and is deprecated in recent pandas releases)
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].median())

# create a DataFrame of dummy variables for Embarked; drop_first=True omits
# the first level so that it acts as the baseline category
embarked_dummies = pd.get_dummies(titanic['Embarked'], prefix='Embarked', drop_first=True)

# concatenate the original DataFrame and the dummy DataFrame column-wise
titanic = pd.concat([titanic, embarked_dummies], axis=1)

# display the first rows of the updated DataFrame
titanic.head()

In [None]:
# select the predictor columns (X) and the target column (y)
feature_cols = [
    'Pclass',
    'Sex',
    'Age',
    'Embarked_Q',
    'Embarked_S',
]
y = titanic['Survived']
X = titanic[feature_cols]

In [None]:
# train a classification tree, limited to depth 3, on the full dataset
# (no train/test split is made here)
from sklearn.tree import DecisionTreeClassifier
treeclf = DecisionTreeClassifier(
    random_state=1,  # fixed seed so repeated runs build the same tree
    max_depth=3,
)
treeclf.fit(X=X, y=y)

## CART 그림 도출

- **GraphViz** 프로그램 다운로드 후 설치: http://www.graphviz.org/
- 환경변수(PATH)에 GraphViz 실행 파일 경로를 추가 (예: `C:\Program Files (x86)\Graphviz2.38\bin`)
- **pydotplus** 패키지 설치: `pip install pydotplus`

In [None]:
# create a Graphviz file
# NOTE: sklearn.externals.six was removed from scikit-learn; the standard
# library's io.StringIO provides the same in-memory text buffer.
from io import StringIO
from sklearn.tree import export_graphviz
import pydotplus
from IPython.display import Image

# export_graphviz writes the DOT description of the fitted tree into the buffer
dot_data = StringIO()
export_graphviz(treeclf, out_file=dot_data)

# parse the DOT text and render it to disk
# (rendering requires the Graphviz executables to be reachable via PATH)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_pdf("titanic_tree.pdf")
graph.write_png("titanic_tree.png")