In [7]:
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

%matplotlib inline

plt.rcParams['figure.figsize'] = (4, 4)
plt.rcParams['figure.dpi'] = 150
plt.rcParams['lines.linewidth'] = 3
sns.set()

In [8]:
df = sns.load_dataset("titanic")
df = df[["age", "fare", "pclass", "sex", "survived"]]
df = df.dropna()
df['sex'] = df['sex'].replace("male", 0)
df['sex'] = df['sex'].replace("female", 1)
np.random.seed(23)
df_train, df_test = np.split(df, [600])

In [10]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

decision_tree = DecisionTreeClassifier()
parameters = {'min_impurity_decrease': [0, 0.01, 0.02, 0.05, 0.1]}

cv_model_finder = GridSearchCV(decision_tree, parameters, cv=10)
cv_model_finder.fit(df_train[["age", "fare", "pclass", "sex"]], df_train["survived"])

GridSearchCV(cv=10, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=None,
                                              splitter='best'),
             iid='deprecated', n_jobs=None,
             param_grid={'min_impurity_decrease': [0, 0.01, 0.02, 0.05, 0.1]}

Model is 4D so we can't visualize it. However, we can assess its accuracy.

In [13]:
from sklearn.metrics import accuracy_score
accuracy_score(df_train["survived"], 
               cv_model_finder.best_estimator_.predict(df_train[["age", "fare", "pclass", "sex"]]))

0.7916666666666666

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

In [16]:
tiny_c_model_fit_intercept_false = Pipeline([        
    ('scale', StandardScaler()),        
    ('poly', PolynomialFeatures(degree=2)),
    ('model', LogisticRegression(fit_intercept = False, C = 1e-8))
])   
tiny_c_model_fit_intercept_false.fit(df_train[["fare", "age", "pclass",  "sex"]], 
                                df_train["survived"]);

In [18]:
tiny_c_model_fit_intercept_true = Pipeline([        
    ('scale', StandardScaler()),        
    ('poly', PolynomialFeatures(degree=2)),
    ('model', LogisticRegression(fit_intercept = True, C = 1e-8))
])   
tiny_c_model_fit_intercept_true.fit(df_train[["fare", "age", "pclass",  "sex"]], 
                                df_train["survived"]);

In [20]:
from sklearn.metrics import accuracy_score
accuracy_score(df_train["survived"], 
               tiny_c_model_fit_intercept_false.predict(df_train[["age", "fare", "pclass", "sex"]]))

0.6883333333333334

In [21]:
from sklearn.metrics import accuracy_score
accuracy_score(df_train["survived"], 
               tiny_c_model_fit_intercept_true.predict(df_train[["age", "fare", "pclass", "sex"]]))

0.5933333333333334

In [None]:
from sklearn.model_selection import GridSearchCV
parameters = {'min_impurity_decrease': [0, 0.01, 0.02, 0.05, 0.1]}
mindwg_decision_tree = tree.DecisionTreeClassifier()
cv_model_finder = GridSearchCV(mindwg_decision_tree, parameters, cv=10)
cv_model_finder.fit(train_iris_data[["sepal_length", "sepal_width"]], train_iris_data["species"])

In [None]:
p1a_df = df[["survived", "age"]].copy()
p1a_df = p1a_df.dropna() # drop entries with missing data
p1a_df = p1a_df.sort_values("age") # sort data by age, which will make it easier to plot the data nicely

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
cm = confusion_matrix(training_reality, training_predictions)

In [None]:
sns.heatmap(cm, annot=True, fmt = "d", cmap = "Blues", annot_kws={"size": 16})
plt.ylabel('Predicted')
plt.xlabel('True')

In [None]:
cm_test = confusion_matrix(test_reality, test_predictions)

In [None]:
sns.heatmap(cm_test, annot=True, fmt = "d", cmap = "Blues", annot_kws={"size": 16})
plt.ylabel('Predicted')
plt.xlabel('True')

In [None]:
cm_test = confusion_matrix(test_reality, test_predictions)

In [None]:
# precision recall

In [None]:
training_predictions = p3c_model.predict_proba(p3_train[["fare", "age", "pclass",  "sex"]])[:, 1]

In [None]:
def compute_TPR_FPR(y_obs, y_hat):
    TP = sum((y_obs == y_hat) & (y_obs == 1))
    FN = sum((y_obs != y_hat) & (y_obs == 1))
    TPR = TP / (TP + FN)
    
    TN = sum((y_obs == y_hat) & (y_obs == 0))
    FP = sum((y_obs != y_hat) & (y_obs == 0))
    FPR = FP / (FP + TN)
       
    return TPR, FPR

In [None]:
def compute_model_TPR_FPR(ps, y_obs, threshold):    
    y_hat = ps > threshold    
    return compute_TPR_FPR(y_obs, y_hat)

In [None]:
compute_model_TPR_FPR(training_predictions, p3_train["survived"], 0.5)

In [None]:
def print_model_TPR_FPR(ps, y_obs, threshold):
    TPR, FPR = compute_model_TPR_FPR(ps, y_obs, threshold)
    print(f'TPR: {TPR * 100:.1f}%, FPR: {FPR * 100:.1f}%')

In [None]:
print_model_TPR_FPR(training_predictions, p3_train["survived"], 0.5)

In [None]:
thresholds = np.linspace(0, 1, 101)
TPRs_best_model = []
FPRs_best_model = []
for threshold in thresholds:
    TPR, FPR = compute_model_TPR_FPR(training_predictions, p3_train["survived"], threshold)
    TPRs_best_model.append(TPR)
    FPRs_best_model.append(FPR)

plt.plot(FPRs_best_model, TPRs_best_model)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Training Data');

In [None]:
test_predictions = p3c_model.predict_proba(p3_test[["fare", "age", "pclass",  "sex"]])[:, 1]

In [None]:
thresholds = np.linspace(0, 1, 101)
TPRs_best_model = []
FPRs_best_model = []
for threshold in thresholds:
    TPR, FPR = compute_model_TPR_FPR(test_predictions, p3_test["survived"], threshold)
    TPRs_best_model.append(TPR)
    FPRs_best_model.append(FPR)

plt.plot(FPRs_best_model, TPRs_best_model)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Training Data');