# Decision Tree

4) Implemente uma árvore de decisão para classificar o conjunto de dados Titanic. Use a
métrica ROC-AUC para avaliar a performance do modelo.

In [1]:
# Bibliotecas necessárias:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split

In [3]:
# Load the Titanic passenger table from the CSV file (decoded as UTF-8).
# The path is relative, so it is resolved against the kernel's working directory.
titanic_dataset = pd.read_csv(
    filepath_or_buffer='../../data/titanic.csv',
    encoding='utf-8',
)

In [4]:
# Preview the first ten rows of the freshly loaded table.
titanic_dataset.head(n=10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
5,897,0,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,7538,9.225,,S
6,898,1,3,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.6292,,Q
7,899,0,2,"Caldwell, Mr. Albert Francis",male,26.0,1,1,248738,29.0,,S
8,900,1,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,2657,7.2292,,C
9,901,0,3,"Davies, Mr. John Samuel",male,21.0,2,0,A/4 48871,24.15,,S


In [None]:
# Tratamento dos dados:

In [None]:
# Remoção das linhas com valores nulos nas colunas Age e Fare:

In [5]:
# Discard every passenger row whose 'Age' value is missing.
titanic_dataset = titanic_dataset.dropna(axis='index', how='any', subset=['Age'])

In [6]:
# Discard every passenger row whose 'Fare' value is missing.
titanic_dataset = titanic_dataset.dropna(axis='index', how='any', subset=['Fare'])

In [8]:
# Encode 'Sex' numerically: 'female' becomes 0 and 'male' becomes 1.
# Values outside the mapping are left untouched by replace(); infer_objects()
# then asks pandas to pick a concrete dtype for the resulting column.
titanic_dataset['Sex'] = (
    titanic_dataset['Sex']
    .replace({'female': 0, 'male': 1})
    .infer_objects()
)

In [None]:
# Remoção das colunas Ticket, Cabin, Embarked, PassengerId e Name:

In [9]:
# Remove the 'PassengerId' column so it is not part of the table used for modelling.
titanic_dataset = titanic_dataset.drop(columns=['PassengerId'])

In [10]:
# Remove the 'Cabin' column so it is not part of the table used for modelling.
titanic_dataset = titanic_dataset.drop(columns=['Cabin'])

In [11]:
# Remove the 'Embarked' column so it is not part of the table used for modelling.
titanic_dataset = titanic_dataset.drop(columns=['Embarked'])

In [12]:
# Remove the 'Ticket' column so it is not part of the table used for modelling.
titanic_dataset = titanic_dataset.drop(columns=['Ticket'])

In [13]:
# Remove the 'Name' column so it is not part of the table used for modelling.
titanic_dataset = titanic_dataset.drop(columns=['Name'])

In [15]:
# Preview the first ten rows once the cleaning and column removal are done.
titanic_dataset.head(n=10)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,1,34.5,0,0,7.8292
1,1,3,0,47.0,1,0,7.0
2,0,2,1,62.0,0,0,9.6875
3,0,3,1,27.0,0,0,8.6625
4,1,3,0,22.0,1,1,12.2875
5,0,3,1,14.0,0,0,9.225
6,1,3,0,30.0,0,0,7.6292
7,0,2,1,26.0,1,1,29.0
8,1,3,0,18.0,0,0,7.2292
9,0,3,1,21.0,2,0,24.15


In [16]:
# Split the table into model inputs (every column except 'Survived')
# and the target column ('Survived').
features = titanic_dataset.drop(columns=['Survived'])
classe = titanic_dataset['Survived']

In [17]:
# Hold out 30% of the rows for validation; the rest is used for training.
# random_state pins the shuffle so the split (and every metric computed from it)
# is identical on each Restart & Run All. stratify keeps the proportion of each
# 'Survived' class the same in both partitions, so the validation set always
# contains both classes, which ROC AUC requires.
features_treino, features_validacao, classe_treino, classe_validacao = train_test_split(
    features,
    classe,
    test_size=0.30,
    random_state=42,
    stratify=classe,
)

In [18]:
# Build a decision tree and fit it on the training partition.
# random_state is fixed because scikit-learn randomly permutes the candidate
# features at every split, so an unseeded tree may differ between runs even on
# identical training data.
modelo_DT = DecisionTreeClassifier(random_state=42)
# Left as the last expression so the notebook displays the fitted estimator.
modelo_DT.fit(features_treino, classe_treino)

In [19]:
# Score the validation rows. predict_proba returns one column per class; only
# column 1 is kept, i.e. the probability assigned to the second entry of
# modelo_DT.classes_ (presumably Survived == 1 — confirm against classes_).
classe_predicao = modelo_DT.predict_proba(X=features_validacao)[:, 1]

In [20]:
# Area under the ROC curve for the validation labels versus the predicted scores.
# NOTE(review): the saved output of this cell reports 1.0000, and in the ten rows
# previewed right after loading, 'Survived' is 1 exactly for the female
# passengers. The labels may be derived from 'Sex' — confirm the label source
# before trusting a perfect score.
roc_auc = roc_auc_score(y_true=classe_validacao, y_score=classe_predicao)
print(f'ROC AUC Score: {roc_auc:.4f}')

ROC AUC Score: 1.0000


In [None]:
# Plot the ROC curve of the validation predictions against the chance diagonal.
fpr, tpr, thresholds = roc_curve(classe_validacao, classe_predicao)

# Explicit figure/axes handles instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(fpr, tpr, color='blue', label='ROC Curve (AUC = {:.4f})'.format(roc_auc))
# Dashed reference line from (0, 0) to (1, 1).
ax.plot([0, 1], [0, 1], color='red', linestyle='--')
ax.set_xlim([0.0, 1.0])
# A little headroom above 1.0 so the top of the curve is not clipped by the frame.
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('Falso Positivo')
ax.set_ylabel('Verdadeiro Positivo')
ax.set_title('Receiver Operating Characteristic')
ax.legend(loc='lower right')
ax.grid()
plt.show()