# Import libraries and Dataset

In [1]:
import pandas as pd
import numpy as np
import os
import pickle
import datasets
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import accuracy_score

We can select a single row to run our tests. For example lets look at 292nd row.

In [20]:
titanic = datasets.get_titanic()
titanic.iloc[291]

Survived    1.000000
Pclass     -1.321707
Sex        -1.191964
Age         0.537055
SibSp      -0.460589
Parch      -0.441409
Ticket     -0.759871
Fare        0.689423
Embarked   -0.604770
Name: 61, dtype: float64

# Invariant Tests
First, we check for invariance, by keeping everything constant and changing one irrelevent feature at a time. We should not expect survival probability to change due to the ticket number, or port of embarkation. If it runs without error then it means all the models pass this test.

In [29]:
print("Invariant Testing:\n")
for i in os.listdir("./models/titanic/sklearn/"):
    model = pickle.load(open("./models/titanic/sklearn/"+i, 'rb'))
    print(model)
    X = datasets.get_titanic().iloc[291]
    y = X["Survived"]
    X = X[1:]
    p2_prob = model.predict(np.array(X).reshape(1, -1))[0]  # 1.0
    X['Embarked'] = 2.47593535 
    p2_embarked_prob = model.predict(np.array(X).reshape(1, -1))[0]  # 1.0 
    assert p2_prob == p2_embarked_prob
    # Change ticket number
    X['Ticket'] = 1.86005416
    p2_ticket_prob = model.predict(np.array(X).reshape(1, -1))[0]  # 1.0
    assert p2_prob == p2_ticket_prob
print("Invariant Test Successful!")

Invariant Testing:

GradientBoostingClassifier(learning_rate=0.01, max_depth=2, max_features='auto',
                           random_state=0)
LogisticRegression(C=0.01, max_iter=10, random_state=0, tol=0.01)
MLPClassifier(batch_size=128, max_iter=500, random_state=0)
OneVsRestClassifier(estimator=LinearSVC(), n_jobs=1)
GaussianNB(var_smoothing=1e-07)
LinearSVC(C=0.0001, max_iter=10, random_state=0, tol=0.01)
DecisionTreeClassifier(max_depth=2, random_state=0)
RandomForestClassifier(max_depth=10, random_state=0)
Invariant Test Successful!


# Directional Expectation Tests
Now, we check for the relevent features for directional expectations. We should expect:
   1. Females having higher survival probability (than males)
   2. Higher passenger class having higher survival probability (than lower classes)
   3. Higher fare having higher survival probability (than lower fare)

In [50]:
print("Directional Expectaiton Testing:\n")
for i in os.listdir("./models/titanic/sklearn/"):
    model = pickle.load(open("./models/titanic/sklearn/"+i, 'rb'))
    print("Testing model:", model)
    X = datasets.get_titanic().iloc[291]
    y = X["Survived"]
    X = X[1:]
    p2_prob = model.predict(np.array(X).reshape(1, -1))[0]  # 1.0
    X['Sex'] = 0.83739228  #Change gender
    p2_male_prob = model.predict(np.array(X).reshape(1, -1))[0]  # 0.56
    assert p2_prob > p2_male_prob, 'Changing gender from female to male should decrease survival probability.'
    X['Pclass'] = 0.95828974 # Change class
    p2_class_prob = model.predict(np.array(X).reshape(1, -1))[0]  # 0.0
    assert p2_prob > p2_class_prob, 'Changing class from 1 to 3 should decrease survival probability.'
    X['Fare'] = -0.575978 # # Lower fare
    p2_fare_prob = model.predict(np.array(X).reshape(1, -1))[0]  # 0.85
    assert p2_prob > p2_fare_prob, 'Reducing fare should decrease survival probability.'
print("Directional expectation test succesful")

Directional Expectaiton Testing:

Testing model: GradientBoostingClassifier(learning_rate=0.01, max_depth=2, max_features='auto',
                           random_state=0)
Testing model: LogisticRegression(C=0.01, max_iter=10, random_state=0, tol=0.01)
Testing model: MLPClassifier(batch_size=128, max_iter=500, random_state=0)
Testing model: OneVsRestClassifier(estimator=LinearSVC(), n_jobs=1)
Testing model: GaussianNB(var_smoothing=1e-07)
Testing model: LinearSVC(C=0.0001, max_iter=10, random_state=0, tol=0.01)


AssertionError: Changing gender from female to male should decrease survival probability.

In this case the random forest fails the test.
# Model evaluation to ensure satisfactory performance
At last, we evaluate our model to ensure that the performance does not degrade. Here, we assess model performance in terms of accuracy and it should not be less than 80%.

In [37]:
print("model evaluation to ensure satisfactory performance:\n")
for i in os.listdir("./models/titanic/sklearn/"):
    model = pickle.load(open("./models/titanic/sklearn/"+i, 'rb'))
    print(model)
    X = datasets.get_titanic().iloc[291]
    y = X["Survived"]
    X = X[1:]
    p2_prob = model.predict(np.array(X).reshape(1, -1))[0]  # 1.0
    acc_test = accuracy_score([y], [np.round(p2_prob)])
    assert acc_test > 0.80, 'Accuracy on test should be > 0.82'
print("Performance testing succesful!")

model evaluation to ensure satisfactory performance:

GradientBoostingClassifier(learning_rate=0.01, max_depth=2, max_features='auto',
                           random_state=0)
LogisticRegression(C=0.01, max_iter=10, random_state=0, tol=0.01)
MLPClassifier(batch_size=128, max_iter=500, random_state=0)
OneVsRestClassifier(estimator=LinearSVC(), n_jobs=1)
GaussianNB(var_smoothing=1e-07)
LinearSVC(C=0.0001, max_iter=10, random_state=0, tol=0.01)
DecisionTreeClassifier(max_depth=2, random_state=0)
RandomForestClassifier(max_depth=10, random_state=0)
Performance testing succesful!


We can also try to change the line number and test with another entry.