In [1]:
import pickle
import numpy as np
import scipy.stats
from preprocess import preprocess
from EM import EM
from plot import plot

In [22]:
# ---- Preprocessing settings (all four are forwarded to preprocess()) ----
# bin_size is also handed to plot(); presumably 18 hours expressed in
# minutes -- TODO confirm the unit against preprocess().
bin_size = 18 * 60
cutoff = 10
missing_pct = 0.3
c_zero = True

# ---- EM settings ----
# num_past_effects, training_pct and single_effect go to the EM constructor;
# max_iter is the argument given to EM.run_EM(). single_effect additionally
# selects which coefficient printout em_individual() produces.
num_past_effects = 3
training_pct = 0.8
single_effect = False
max_iter = 500

In [23]:
# Load the raw patient data. The context manager closes the file handle as
# soon as unpickling finishes (the bare open() call used before never did).
# SECURITY: pickle.load can execute arbitrary code -- only read this file if it
# comes from a trusted source.
# encoding='latin1': presumably the file was pickled under Python 2 -- confirm.
with open('../Data/unimputed_inr_patient_data.pkl', 'rb') as data_file:
    data = pickle.load(data_file, encoding='latin1')

In [24]:
# Turn the raw data into the three population-level arrays used by the rest
# of the notebook, applying the preprocessing settings defined above.
y_pop, X_pop, c_pop = preprocess(
    data,
    cutoff,
    bin_size,
    missing_pct=missing_pct,
    c_zero=c_zero,
)

In [25]:
# Shape of the preprocessed outcome array; axis 0 is indexed by patient
# (see get_data).
y_pop.shape

(487, 215)

### Results Analysis
* Single vs. multi effects
    * With a single effect, the prediction trajectory of the individual-level model appears flatter, which may contribute to a lower MSE
* In EM for the individual-level model, the lowest MSE occurs at different iterations for different individuals, but most often right after the first iteration. The total MSE increases with further iterations, starting from iteration 1 (?!)
    * This also happens sometimes in the simulation when it is run with only one sample. The plot seems to suggest that more iterations sometimes lead to spikes in the prediction that don't correspond to the actual trajectory (the coefficients don't match up either), thus increasing the MSE
    * This happens more drastically when we have more missingness (in the simulation)
    * This could simply be because the model is learning bad coefficients, so it just gets worse as the iterations continue: with more missingness, we have fewer equations in the linear system, so the solution is worse (?)
* For the population-level model, the MSE after the first iteration is also pretty close to the best MSE. In the run that produced this result, the MSE rises once the iterations begin, then starts decreasing fairly soon
    * But at least in the simulation, MSE generally decreases with iterations or fluctuates around the lowest value

In [8]:
def get_data(patient):
    """Return one patient's data as arrays with a leading axis of length 1.

    Reads the notebook-level arrays y_pop, X_pop and c_pop.

    Parameters
    ----------
    patient : int
        Index along the first axis of y_pop / X_pop.

    Returns
    -------
    tuple
        (y, X, c) where y is the patient's row of y_pop reshaped to
        (1, y_pop.shape[1]), X is the patient's slice of X_pop reshaped to
        (1, X_pop.shape[1], X_pop.shape[2]), and c is an all-zero array of
        shape (1, c_pop.shape[1]).
    """
    single_y = y_pop[patient, :].reshape(1, y_pop.shape[1])

    x_target_shape = (1, X_pop.shape[1], X_pop.shape[2])
    single_X = X_pop[patient, :, :].reshape(*x_target_shape)

    # The patient's own row of c_pop is deliberately not used here: c is
    # replaced by zeros of the matching width. To use the real values, take
    # c_pop[patient, :].reshape(1, c_pop.shape[1]) instead.
    blank_c = np.zeros((1, c_pop.shape[1]))

    return single_y, single_X, blank_c

In [9]:
# Per-patient prediction MSEs; em_individual() appends one value per call.
mse = list()

In [1]:
# patient is the patient index
def em_individual(patient):
    """Fit the EM model to a single patient, report the results and plot them.

    Uses the notebook-level settings num_past_effects, training_pct,
    single_effect, max_iter and bin_size, obtains the patient's arrays from
    get_data(), and appends the resulting MSE to the notebook-level list
    `mse`.

    Parameters
    ----------
    patient : int
        Patient index, passed straight to get_data().

    Returns
    -------
    The value of em.get_MSE() for this patient (the same value that is
    printed and appended to `mse`).
    """
    print('Patient {}'.format(patient))
    y, X, c = get_data(patient)
    # NOTE(review): the meaning of the positional 0 passed to EM is not
    # visible from this notebook -- confirm against EM's signature.
    em = EM(y, X, c, num_past_effects, 0, train_pct=training_pct, single_effect=single_effect)
    em.run_EM(max_iter)

    # Evaluate the MSE once so the printed, stored and returned values are
    # guaranteed to be the same number.
    patient_mse = em.get_MSE()
    print('Prediction MSE: {}'.format(patient_mse))
    mse.append(patient_mse)

    if single_effect:
        print('Coefficient A: {}'.format(em.A))
    else:
        # Labels are matched to the columns of em.A and the entries of em.b by
        # position; this assumes the ordering used when the data was
        # preprocessed -- verify against preprocess().
        treatment_types = ['nsaid', 'transfusion_plasma', 'transfusion_platelet', 'anticoagulant', 'aspirin']
        for i, treatment in enumerate(treatment_types):
            print('Coefficient for {}: {}'.format(treatment, em.A[:, i]))
        static_types = ['chronic kidney failure', 'sickle cell', 'age']
        for j, static in enumerate(static_types):
            print('Coefficient for {}: {}'.format(static, em.b[j]))

    # NOTE(review): the 0 passed to plot is presumably the index of the single
    # sample inside `em` -- confirm against plot().
    plot(em, 0, bin_size)
    return patient_mse

In [11]:
%%time
%%capture
for patient in range(y_pop.shape[0]):
    em_individual(patient)

CPU times: user 14.3 s, sys: 15.3 ms, total: 14.3 s
Wall time: 14.3 s


In [12]:
# Average of the per-patient prediction MSEs collected in `mse`.
sum(mse)/len(mse)

0.7708313333085737