In [None]:
import os
import sys
import pystan
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
plt.style.use("ggplot")

In [None]:
## Set up constants and file paths

# Directory that is added to the module search path; presumably it holds the
# helper modules imported right after this block.
utility_path = 'utilities'
# Insert at the front only when missing, so re-running this cell does not
# pile up duplicate entries in sys.path.
if utility_path not in sys.path:
    sys.path.insert(0, utility_path)

import my_utilities
import psis
import stan_utility

In [None]:
# data
# Read the passenger table indexed by passenger name, then discard the columns
# that are not used as predictors further down.
data = (
    pd.read_csv("../data/titanic.txt", index_col="name")
    .drop(["row.names", "home.dest", "room", "ticket", "boat"], axis=1)
)
data.head()

In [None]:
# One-hot encode the categorical columns, drop every row that still contains a
# NaN, then rescale "age" into the [0, 1] interval.
# NOTE(review): get_dummies runs before dropna, so a missing categorical value
# presumably ends up as an all-zero dummy group instead of removing the row —
# confirm this is intended.
data_binarized = pd.get_dummies(data).dropna(how="any", axis=0)
# NOTE(review): min-max scaling is unaffected (up to rounding) by the
# standardisation applied first, so the inner scale() call looks redundant.
data_binarized["age"] = preprocessing.minmax_scale(
    preprocessing.scale(
        np.array(data_binarized["age"])
    )
)
data_binarized.head()

In [None]:
# create arrays for a stan model
# Response vector: the "survived" column.
y = np.array(data_binarized["survived"])
# Predictor matrix. The column order below fixes which coefficient belongs to
# which feature in everything that follows.
# NOTE(review): all three "pclass_*" and all three "embarked_*" dummies are
# included; if the Stan model also has an intercept these columns are collinear
# with it — confirm this is intended.
X = np.array(data_binarized[["age",
                             "pclass_1st",
                             "pclass_2nd",
                             "pclass_3rd",
                             "embarked_Cherbourg",
                             "embarked_Queenstown",
                             "embarked_Southampton",
                             "sex_female"]], dtype=np.dtype(float))
# Hold out a third of the rows; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
# Parenthesised single-argument print: valid (and identical in output) on both
# Python 2 and Python 3, unlike the bare print statement.
print("{0} samples in training set \n{1} samples in test set".format(y_train.size, y_test.size))

## Pooled

In [None]:
# Compile the Stan program for the pooled logistic regression through the
# project helper module.
# NOTE(review): compile_model is presumably a (possibly caching) wrapper around
# pystan's model compilation — confirm in the stan_utility module.
model = stan_utility.compile_model('logistic_regression_pooled.stan')

In [None]:
# Data passed to the Stan model: n rows and d columns of the training design
# matrix, the matrix itself, the training labels, and two "p_beta_*" settings.
# NOTE(review): p_beta_df / p_beta_scale presumably parameterise the prior on
# beta — confirm against the .stan file.
model_data = dict(n=X_train.shape[0],
                  d=X_train.shape[1],
                  X=X_train,
                  y=y_train,
                  p_beta_df=4,
                  p_beta_scale=1)
# Fixed seed for reproducible draws; max_treedepth is raised via the control dict.
fit = model.sampling(data=model_data, seed=1, control=dict(max_treedepth=15))
# Draws keyed by parameter name (used below as samples["beta"], samples["log_lik"]).
samples = fit.extract(permuted=True)
# print(fit) works on both Python 2 and Python 3; the bare statement form is a
# SyntaxError on Python 3.
print(fit)

In [None]:
# plot betas: one histogram of the posterior draws per coefficient,
# laid out on an n-by-m grid of panels
n, m = 2, 4
fig, axs = plt.subplots(n, m, figsize=(17, 10))
# Flatten the grid so panel i lines up with column i of samples["beta"].
axs = axs.ravel()
for i, ax in enumerate(axs):
    ax.hist(samples["beta"][:, i], bins=100)
    ax.set_title("beta {0}".format(i))


In [None]:
# LOO CV
# psisloo's result is unpacked into the summary value shown in the title (loo),
# a second return value kept as loos, and the k values plotted below (ks).
loo, loos, ks = psis.psisloo(samples["log_lik"])
# 1-based index of each training observation for the x axis.
datapoints = np.arange(X_train.shape[0]) + 1
plt.plot(datapoints, ks, 'o')
# Horizontal reference line at k = 0.7 (presumably the usual reliability
# threshold for these k diagnostics — confirm).
plt.plot(datapoints, [0.7] * datapoints.size)
plt.title("PSIS-LOO({0}) k values".format(round(loo, 2)))
plt.show()

In [None]:
def logistic(x, beta, alpha):
    """Logistic (sigmoid) of the linear predictor ``alpha + x . beta``.

    Parameters
    ----------
    x : array-like
        Feature vector, or a matrix with one observation per row (``np.dot``
        then yields one linear predictor per row).
    beta : array-like
        Coefficient vector, combined with ``x`` through ``np.dot``.
    alpha : float
        Intercept added to the dot product.

    Returns
    -------
    The value ``1 / (1 + exp(-(alpha + x . beta)))``: a scalar for a single
    feature vector, an array for a matrix of observations.
    """
    z = alpha + np.dot(x, beta)
    # exp(-|z|) always lies in (0, 1], so it cannot overflow; evaluating
    # exp(-z) directly overflows (and warns) once z is strongly negative.
    e = np.exp(-np.abs(z))
    # sigma(z) = 1 / (1 + e^-z) for z >= 0 and e^z / (1 + e^z) for z < 0;
    # with e = e^-|z| both branches share the denominator 1 + e.
    return np.where(z >= 0, 1.0, e) / (1.0 + e)

def check_accuracy(data, target, beta, alpha):
    """Fraction of observations whose thresholded prediction equals the label.

    Every row of ``data`` is scored with ``logistic``; a probability strictly
    greater than 0.5 is classified as 1, anything else as 0, and the
    predictions are compared with ``target``.

    Parameters
    ----------
    data : array-like
        One observation per row.
    target : array-like
        One label per row of ``data``.
    beta, alpha
        Coefficients and intercept, forwarded unchanged to ``logistic``.

    Returns
    -------
    The mean of the per-row match indicators (NaN when ``data`` has no rows).

    Raises
    ------
    ValueError
        If ``data`` and ``target`` do not contain the same number of entries.
    """
    features = np.asarray(data)
    labels = np.asarray(target).reshape(-1)
    if len(features) != len(labels):
        raise ValueError(
            "data has {0} rows but target has {1} labels".format(len(features), len(labels))
        )
    if len(features) == 0:
        # Nothing to score: an accuracy over zero rows is undefined.
        return np.nan

    # Score all rows in one call instead of looping row by row in Python.
    probabilities = logistic(features, beta, alpha)
    predictions = (probabilities > 0.5).astype(int)
    return np.mean(predictions == labels)

# Summary table of the fit; column 0 is read as the posterior mean.
mean_list = fit.summary()["summary"]
# NOTE(review): this relies on the summary rows being ordered as the intercept
# first, followed by one row per coefficient — confirm against the parameter
# order in the .stan file.
# The slice width follows the number of predictors instead of a hard-coded 9.
beta = mean_list[1:1 + X_train.shape[1], 0]
alpha = mean_list[0, 0]


# Format-based print calls give the same text on Python 2 and Python 3
# (the comma-separated print statement is a SyntaxError on Python 3).
print('Accuracy train:  {0}'.format(check_accuracy(X_train, y_train, beta, alpha)))
print('Accuracy test:  {0}'.format(check_accuracy(X_test, y_test, beta, alpha)))

# Appendix

## my_utilities.py