In [None]:
import numpy as np
import pandas as pd

In [None]:
# Read the passenger table from train.csv in the working directory
Titanic = pd.read_csv(filepath_or_buffer="train.csv")

In [None]:
# First five rows
Titanic.head(5)

In [None]:
# Last row only
Titanic.tail(1)

In [None]:
# Column labels, first as a pandas Index and then as a plain NumPy array
print(Titanic.columns, Titanic.columns.values, sep="\n")

In [None]:
# First five fares
print(Titanic['Fare'].head(5))

In [None]:
# An empty index array below means no 'Survived' value is missing
np.where(Titanic["Survived"].isna())

In [None]:
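# Wrong approach: Index.drop only returns the column labels without 'SibSp';
# the DataFrame's data is untouched, and assigning the result would replace the frame with that Index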
# Titanic = Titanic.columns.drop('SibSp')

In [None]:
# Do this instead: DataFrame.drop returns a new frame without the column.
# `columns=` is used because the positional axis argument (`drop('SibSp', 1)`)
# is rejected with a TypeError by pandas >= 2.0.
Titanic = Titanic.drop(columns='SibSp')

In [None]:
# First five rows after the column was removed
Titanic.head(5)

In [None]:
# Delete multiple columns at a time by passing a list of labels.
# `columns=` is used because the positional axis argument is rejected with a
# TypeError by pandas >= 2.0.
Titanic = Titanic.drop(columns=['Parch', 'Cabin'])

In [None]:
# First ten rows
Titanic.head(10)

In [None]:
# Boolean mask of missing ages (True = missing), first ten rows only
Titanic["Age"].isna().head(10)

In [None]:
# Delete those rows, ~ is boolean negation
# Titanic = Titanic[~Titanic["Age"].isnull()]

In [None]:
# Replace nans with age average -- first attempt: np.average does not skip
# NaN, so the result is nan whenever an age is missing
np.average(Titanic.Age)

In [None]:
# np.nanmean ignores the NaN entries when averaging
np.nanmean(Titanic.Age)

In [None]:
# Fill every missing age with the NaN-ignoring mean of the known ages
mean = np.nanmean(Titanic.Age)
Titanic['Age'] = Titanic.Age.fillna(value=mean)

In [None]:
# First ten rows, now with the age gaps filled
Titanic.head(10)

In [None]:
# Keep the 'Survived' column under a shorter name for the summaries below
survived = Titanic.Survived

In [None]:
# Same message twice: once with %-formatting, once with str.format
print('There were %d survivors' % survived.sum())
print('There were {} survivors'.format(survived.sum()))

In [None]:
# Positions of missing fares (an empty array means none are missing)
np.where(Titanic['Fare'].isna())

In [None]:
# Average fare, formatted to two decimals
print(
    "The average price of a ticket is ${0:.2f}".format(
        np.average(Titanic.Fare)
    )
)

In [None]:
# Highest fare, formatted to two decimals
print("The most expensive ticket is ${0:.2f}".format(Titanic['Fare'].max()))

In [None]:
# Highest age, rounded to a whole number by the format spec
print("The oldest person is {0:0.0f} years old".format(Titanic['Age'].max()))

In [None]:
# Format-spec demo: two digits after the decimal point
print(str.format("{:.2f}", 189.5466))

In [None]:
# Format-spec demo: minimum width 4, no decimals
print(str.format("{:4.0f}", 189.5466))

In [None]:
# Ages of the first ten survivors (rows where Survived == 1)
Titanic.loc[Titanic['Survived'] == 1, 'Age'].head(10)

In [None]:
# Highest age among rows where Survived == 1
oldest_survivor = Titanic.loc[Titanic['Survived'] == 1, 'Age'].max()

In [None]:
# Report the oldest survivor's age as a whole number
print(
    "The oldest survivor is {:0.0f} years old".format(
        oldest_survivor
    )
)

# Now we try the decision tree

In [None]:
import time

from graphviz import Digraph
from graphviz import Source
from sklearn import tree
from sklearn.model_selection import train_test_split
from tqdm import tqdm_notebook

In [None]:
# Reload train.csv into a separate frame for the modelling cells
data = pd.read_csv(filepath_or_buffer="train.csv")

# Clean the data

In [None]:
# Encode the categorical columns as numbers and fill in missing ages.
# A missing port of embarkation gets its own code (0).
data['Embarked'] = data.Embarked.map({np.nan:0, 'Q':1, 'S':2, 'C':3})
data['Sex'] = data.Sex.map({'male':0, 'female':1})
# Use this frame's own mean age (Series.mean skips NaN) instead of reaching
# into the separate `Titanic` frame, so the cell does not depend on the
# state the exploration cells happened to leave behind.
data['Age'] = data.Age.fillna(data.Age.mean())
# Drop the identifier / free-text columns. `columns=` is used because the
# positional axis argument is rejected with a TypeError by pandas >= 2.0.
data = data.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])
data[:10]

In [None]:
# Cast every remaining column to float
data = data.astype(dtype=float)

In [None]:
# Hold out 20% of the rows for testing. `columns=` is used because the
# positional axis argument (`drop('Survived', 1)`) is rejected with a
# TypeError by pandas >= 2.0.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns='Survived'), data['Survived'], test_size=.2
)

In [None]:
# Decision tree constructed with no arguments, i.e. scikit-learn's defaults
classifier = tree.DecisionTreeClassifier()

In [None]:
# Try every combination of max_depth (2..9) and min_samples_leaf
# (1, 11, ..., 101; printed as "Leaves") and report the fit+score time and
# the hold-out accuracy for each.
#
# The feature/label frames are loop-invariant, so they are built once here.
# `columns=` is used because the positional axis argument
# (`drop('Survived', 1)`) is rejected with a TypeError by pandas >= 2.0.
features = data.drop(columns='Survived')
labels = data['Survived']

for depth in tqdm_notebook(range(2,10)):
    for leaves in range(1, 102, 10):
        # NOTE(review): a fresh random 80/20 split is drawn for every
        # combination, so accuracy differences include split-to-split noise.
        X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=.2)
        classifier = tree.DecisionTreeClassifier(max_depth=depth, min_samples_leaf=leaves)
        print('Depth: {}, Leaves: {}'.format(depth, leaves))
        start = time.time()
        classifier.fit(X_train, y_train)
        acc = classifier.score(X_test, y_test)
        # Elapsed wall-clock time covers both fitting and scoring
        stop = time.time() - start
        print('Time: ',stop)
        print('Accuracy: ', acc)

In [None]:
# Fit an unconstrained tree on the most recent training split
classifier = tree.DecisionTreeClassifier()
classifier.fit(X=X_train, y=y_train)

# Graph using graphviz

#### In the terminal, run `dot -Tpdf <name>.dot -o <name>.pdf` to convert the exported .dot file to a PDF

In [None]:
# This works
graph = tree.export_graphviz(classifier, out_file='tree.dot', max_depth=3)

# Everything below was an attempt to graph inline, but it doesn't work.

In [None]:
# A freshly constructed Digraph has no nodes or edges
dot = Digraph()
# `view` is a boolean flag; the string 'True' only worked because any
# non-empty string is truthy (so 'False' would also have opened the viewer).
dot.render('tree2.dot', view=True) # This command actually saves an empty dot file

In [None]:
# Show the DOT source text currently held by `dot`
dot.source

In [None]:
# With out_file=None the DOT description comes back as a return value
# instead of being written to disk
graph = tree.export_graphviz(
    classifier,
    max_depth=3,
    out_file=None,
)
print(graph)

In [None]:
# If this was a graphviz object it would work; check what type
# `graph` actually is
type(graph)

In [None]:
# Wrap the existing DOT text in a graphviz.Source object so it can be
# rendered. Digraph(graph) was wrong here: Digraph's first argument is the
# graph *name*, so the DOT text was never used as the graph body.
dot = Source(graph)

In [None]:
# Render `dot` and open the result in the system's default viewer
dot.view()