In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
import pickle
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

# Directory that is listed with os.listdir further down; every entry found
# there is handed to predict_df.
data_path = '../data/age_binned/'
# Directory from which 'model_checkpoint' and 'model_y_scaler.pkl' are loaded.
model_path = '../../../modeling/v5_2/model/'


In [2]:
# Load the saved Keras model and the pickled scaler whose inverse_transform is
# applied to the model outputs in predict_df.
model = tf.keras.models.load_model(os.path.join(model_path, 'model_checkpoint'))

# NOTE: pickle.load can execute arbitrary code embedded in the file, so only
# point model_path at artefacts from a trusted source.
# The context manager closes the file handle, which the bare open() call
# previously left dangling.
with open(os.path.join(model_path, 'model_y_scaler.pkl'), 'rb') as scaler_file:
    scalery = pickle.load(scaler_file)

def extract_features(data, value_range=(-4, 4), n_bins=100):
    """Turn a 1-D sample into a min-max scaled density histogram.

    Parameters
    ----------
    data : array-like
        Sample values. Values outside `value_range` are not counted by the
        histogram.
    value_range : tuple of (low, high), default (-4, 4)
        Outer edges of the histogram.
    n_bins : int, default 100
        Number of equal-width bins between the two edges.

    Returns
    -------
    numpy.ndarray
        Array of shape (n_bins, 1) holding the density histogram rescaled so
        that its smallest bin is 0 and its largest bin is 1.

    Raises
    ------
    ValueError
        If the histogram cannot be rescaled: no finite sample falls inside
        `value_range`, or every bin has the same height. Previously this case
        silently produced an all-NaN feature vector.
    """
    edges = np.linspace(value_range[0], value_range[1], n_bins + 1)

    # With no in-range samples density=True divides 0 by 0 and warns; the
    # warning is silenced because the case is reported explicitly below.
    with np.errstate(divide='ignore', invalid='ignore'):
        density = np.histogram(data, edges, density=True)[0]

    lowest = density.min()
    span = density.max() - lowest
    if not (np.isfinite(span) and span > 0):
        raise ValueError(
            'cannot min-max scale the histogram: no finite samples inside '
            'value_range, or all bins have equal height'
        )

    features = (density - lowest) / span

    return features.reshape(-1, 1)


In [3]:
def predict_df(path, nbootstrap=200, seed=None):
    """Bootstrap the model prediction for the `value` column of one CSV file.

    The first replicate uses the observed sample unchanged; every later
    replicate resamples it with replacement. Each replicate is standardized,
    converted to histogram features with `extract_features`, passed through
    `model`, mapped back with `scalery.inverse_transform`, and finally
    un-standardized with that replicate's own mean and standard deviation.

    Parameters
    ----------
    path : str
        File readable by `pd.read_csv` that contains a `value` column.
    nbootstrap : int, default 200
        Total number of replicates, including the un-resampled first one.
    seed : int or None, default None
        If given, resampling draws from `np.random.default_rng(seed)` so the
        result is repeatable. If None, the global `np.random` state is used,
        which is the previous behaviour.

    Returns
    -------
    tuple
        (mean, 2.5 % quantile, 97.5 % quantile) of the per-replicate
        predictions.

    Raises
    ------
    ValueError
        If `nbootstrap` is smaller than 1, the file holds no rows, or a
        replicate has a zero or non-finite standard deviation (which would
        otherwise turn into NaN predictions through a division by zero).
    """
    if nbootstrap < 1:
        raise ValueError('nbootstrap must be at least 1')

    # Read once; the array is reused by every replicate.
    values = pd.read_csv(path).value.values
    if len(values) == 0:
        raise ValueError('no rows found in ' + str(path))

    # Both the np.random module and a Generator expose choice(a, size, replace).
    sampler = np.random if seed is None else np.random.default_rng(seed)

    ps = []
    for i in range(nbootstrap):

        if i == 0:
            # replicate 0 is the observed sample itself
            data = values
        else:
            data = sampler.choice(values, len(values), replace=True)

        # standardize
        mean = data.mean()
        std = data.std()
        if not std > 0:
            # also catches NaN, e.g. from missing values in the column
            raise ValueError(
                'cannot standardize: standard deviation of the sample is '
                'zero or not finite'
            )
        data = (data - mean)/std

        # feature extraction
        features = extract_features(data)

        # predict; np.newaxis adds the leading batch axis for a single sample
        p = model.predict(features[np.newaxis,...], verbose=0)
        p = scalery.inverse_transform(p)[0]
        # Second-to-last model output. The original comment calls this the
        # 0.95 quantile. NOTE(review): the notebook saves the results under a
        # file name containing "99p" -- confirm which quantile this index is.
        p = p[-2]
        # convert back to original scale
        p = p * std + mean
        ps.append(p)

    ps = np.array(ps)

    return ps.mean(axis=0), np.quantile(ps, 0.025, axis=0), np.quantile(ps, 0.975, axis=0)
    

In [4]:
# Run the bootstrap prediction for every CSV file in data_path and collect the
# (mean, lower, upper) tuples keyed by file name.
predictions = {}
for file_name in sorted(os.listdir(data_path)):
    # os.listdir returns every directory entry; skip anything that is not a
    # CSV (e.g. .ipynb_checkpoints or .DS_Store) so it is never handed to
    # pd.read_csv inside predict_df.
    if not file_name.endswith('.csv'):
        continue
    print(file_name)
    predictions[file_name] = predict_df(os.path.join(data_path, file_name))
    

data_18.csv
data_25.csv
data_30.csv
data_35.csv
data_40.csv
data_45.csv
data_50.csv
data_55.csv
data_60.csv
data_65.csv
data_70.csv
data_75.csv
data_80.csv
data_85.csv


In [5]:
# Pickle the predictions dict (file name -> tuple returned by predict_df) into
# the current working directory.
# NOTE(review): the file name says "99p", while the comment on the selected
# model output in predict_df says 0.95 quantile -- confirm which one is stored.
with open('./predictions_age_99p.pkl', 'wb') as f:
    pickle.dump(predictions, f)
