# Setup

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules (including
# the project helpers `chart` and `utils` imported below) are re-imported before
# each cell runs and edits to them take effect without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from chart import scoring_confusion_matrix
from utils import prepare_titanic_data

In [3]:
# Single random seed shared by the train/test split and the random forest below,
# so a fresh "Restart & Run All" reproduces the same split and the same model.
SEED = 0

# Data

In [4]:
# Load the raw Titanic training data. The path is relative, so the notebook must be
# run from the directory that contains the `data/` folder.
df = pd.read_csv("data/titanic_train.csv")
# Preview the first rows to sanity-check the columns that were read.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# Turn the raw passenger table into a modelling table using the project helper.
# Columns of the result (see the preview below): Survived, Pclass, is_female,
# sibsp_parch, Fare, age_imputed.
# NOTE(review): the exact feature engineering lives in utils.prepare_titanic_data
# and is not visible here -- confirm there before relying on column semantics.
df_ml = df.pipe(prepare_titanic_data)
df_ml.head()

Unnamed: 0,Survived,Pclass,is_female,sibsp_parch,Fare,age_imputed
0,0,3,0,1,7.25,22.0
1,1,1,1,1,71.2833,38.0
2,1,3,1,0,7.925,26.0
3,1,1,1,1,53.1,35.0
4,0,3,0,0,8.05,35.0


In [6]:
# The label is the "Survived" column; every other prepared column is a feature.
y = df_ml["Survived"]
X = df_ml.drop(columns="Survived")

# Hold out a test set. Only the shuffle seed is specified, so the split proportion
# is whatever scikit-learn uses by default, and reruns give the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=SEED,
)

# Model

In [7]:
# Train a random forest on the training split. Apart from the fixed seed, every
# hyperparameter is left at the library default.
clf = RandomForestClassifier(random_state=SEED).fit(X_train, y_train)

# Predicted class probabilities for the held-out rows (2-D: rows x classes).
y_pred_proba = clf.predict_proba(X_test)

In [8]:
# Pair each held-out label with the model score taken from column 1 of
# predict_proba, keeping y_test's original row index.
# NOTE(review): column 1 is presumably the "survived" (label 1) class -- confirm
# against clf.classes_.
df_scores = y_test.to_frame(name="y_true").assign(y_score=y_pred_proba[:, 1])
df_scores.head()

Unnamed: 0,y_true,y_score
495,0,0.03
648,0,0.001
278,0,0.03
31,1,0.92
255,1,0.49


# Scoring Confusion Matrix

In [9]:
# Render the project's scoring confusion matrix from the scores table, passing the
# score column name first and the true-label column name second.
# NOTE(review): the chart's binning/threshold behaviour is defined in
# chart.scoring_confusion_matrix and is not visible here -- see that module.
scoring_confusion_matrix(df_scores, "y_score", "y_true")

---