# GPAR

## 1) Install GPAR

In [None]:
import sys
!{sys.executable} -m pip install gpar

## 2) Import Libraries

In [None]:
import pandas as pd
import numpy as np
import scipy as sp
import math
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
import gpar

In [None]:
from scipy.special import iv
from scipy.optimize import fsolve

from gpar.regression import GPARRegressor
from scipy.stats import vonmises
from scipy import stats
from scipy.stats import entropy
from scipy import interpolate

## 3) Define Functions

#### The functions below evaluate the von Mises pdf and fit its parameters

In [None]:
def vonmises_density(x,mu,kappa):
    """
    Evaluate the von Mises probability density at every point of a series.

    The density is exp(kappa*cos(x - mu)) / (2*pi*I0(kappa)), where I0 is the
    modified Bessel function of the first kind of order 0. It is computed with
    the exponentially scaled Bessel function so that large concentrations do
    not overflow into inf/inf = nan.

    Input :
        x : a 1D array-like of size L, the angles (in radians) to evaluate
        mu : a scalar or a 1D numpy.array of size n, the mean of the von Mises distributions
        kappa : a scalar or a 1D numpy.array of size n, the dispersion of the von Mises distributions
    Output :
        a numpy array of shape (L,) when the parameters are scalars, or
        (L x n) when they are arrays of size n. Row i holds the density of
        x[i] under every (mu, kappa) pair.
    """
    # Function-scope import: the notebook's import cell only brings in `iv`.
    from scipy.special import ive

    x = np.asarray(x, dtype=float)
    mu = np.asarray(mu, dtype=float)
    kappa = np.asarray(kappa, dtype=float)

    # Give x one trailing axis per parameter dimension so that each angle is
    # broadcast against the full parameter array (rows = angles).
    param_ndim = max(mu.ndim, kappa.ndim)
    angles = x.reshape(x.shape + (1,) * param_ndim)

    # ive(0, k) == iv(0, k) * exp(-|k|); subtracting |k| in the exponent
    # applies the same scaling to the numerator, so the ratio is unchanged
    # but both factors stay finite.
    exponent = kappa * np.cos(angles - mu) - np.abs(kappa)
    return np.exp(exponent) / (2 * np.pi * ive(0, kappa))

def vonmises_pdfit(series):
    """
    Estimate the mean direction and concentration of a von Mises sample.

    mu is the direction of the mean resultant vector. kappa solves
    1 - I1(kappa)/I0(kappa) = 1 - R, where R is the mean resultant length,
    using fsolve started from kappa = 0.

    Input :
        series : a 1D array-like of angles in radians
    Output :
        the estimators of the parameters mu and kappa of a von Mises
        distribution, in a list [mu, kappa]
    """
    # Function-scope import: the notebook's import cell only brings in `iv`.
    from scipy.special import ive

    angles = np.asarray(series, dtype=float)
    s0 = np.mean(np.sin(angles))
    c0 = np.mean(np.cos(angles))
    mu = np.arctan2(s0,c0)
    var = 1-np.sqrt(s0**2+c0**2)

    def residual(kappa):
        # ive(1,k)/ive(0,k) equals iv(1,k)/iv(0,k) because the exponential
        # scaling cancels, but it does not overflow to inf/inf for large k
        # (tightly concentrated samples).
        return 1-ive(1,kappa)/ive(0,kappa)-var

    kappa = fsolve(residual, 0.0)[0]
    return([mu,kappa])

#### Function below computes KL divergence

In [None]:
# Here is the function to define KL divergence
# Note: the samplepdf and observationpdf is the probability density function, not the sample count.
def kldiver(samplepdf, observationpdf):
    """
    Print the entropy of each distribution and both KL divergences.

    Both arguments are sequences of density/probability values on a common
    grid (not sample counts); scipy's `entropy` normalises them to sum to 1.

    Returns a tuple (KL(sample || observation), KL(observation || sample)).
    """
    print("\nIndividual Entropy\n")
    for distribution in (samplepdf, observationpdf):
        print(entropy(distribution))

    print("\nPairwise Kullback Leibler divergence\n")
    divergences = (
        entropy(samplepdf, qk=observationpdf),
        entropy(observationpdf, qk=samplepdf),
    )
    for value in divergences:
        print(value)
    return divergences

#### The function below performs inverse-transform sampling

In [None]:
def invcdf(vals, pdf, num_sam):
    """
    Draw samples from a tabulated density by inverse-transform sampling.

    Input :
        vals : 1D array-like, the grid points at which `pdf` is tabulated
        pdf : 1D array-like of the same length, (unnormalised) density values
        num_sam : number of samples to draw
    Output :
        a numpy array of `num_sam` samples, linearly interpolated on `vals`

    Uniform draws come from the global numpy random state (np.random.rand).
    Draws that fall outside the tabulated CDF range are mapped to the first
    or last grid point of `vals`, so samples never leave the grid's domain.
    """
    vals = np.asarray(vals)
    # Normalise the tabulated density to a discrete probability vector.
    weights = np.asarray(pdf, dtype=float)
    weights = weights/np.sum(weights)
    cdf = np.cumsum(weights)
    # Inverse CDF: probability -> grid value. The fill values were previously
    # hard-coded to (-pi, pi), which is only right for a grid spanning
    # exactly that interval; use the grid's own end points instead.
    inv_cdf = interpolate.interp1d(cdf, vals, bounds_error=False,
                                   fill_value=(vals[0], vals[-1]))
    # Uniform(0, 1) draws pushed through the inverse CDF.
    r = np.random.rand(num_sam)
    return inv_cdf(r)

#### The function below computes the MSE and MAE

In [None]:
# Here is the function to define the MSE and MAE
def countdiff(bins, sample, observation):
    """
    Compare the histograms of a model sample and the observed data.

    Both inputs are binned on the same `bins` equal-width bins spanning
    [-pi, pi]; the MSE and MAE are the mean squared / absolute difference
    between the per-bin counts. Both values are printed and returned.

    Input :
        bins : int, number of histogram bins
        sample : 1D array-like of model samples (angles in radians)
        observation : 1D array-like of observed angles
    Output :
        (mse, mae)
    """
    if len(observation) != len(sample):
        # Counts are not rescaled, so unequal sizes bias the comparison;
        # this stays a warning rather than an error.
        print("Please generate the same length data")
    # Bin the observations that were passed in. The previous version read
    # the global `listall` here and silently ignored `observation`.
    count,division = np.histogram(observation,range=(-math.pi,math.pi),bins=bins)
    # Bin the model sample on the very same edges. np.histogram uses
    # half-open bins (last bin closed), so values lying exactly on an edge
    # such as -pi or pi are counted instead of being dropped.
    modelCount,_ = np.histogram(sample,bins=division)
    diff = count - modelCount
    n_bins = len(count)
    mse = np.sum(np.square(diff))/n_bins
    mae = np.sum(np.abs(diff))/n_bins
    print('The MSE is ' + str(mse))
    print('The MAE is ' + str(mae))
    return mse, mae

#### The function below computes the MSE

In [None]:
# Here is the function to define the MSE
def countdiffmse(bins, sample, observation):
    """
    Mean squared difference between the per-bin counts of two samples.

    Both inputs are binned on the same `bins` equal-width bins spanning
    [-pi, pi].

    Input :
        bins : int, number of histogram bins
        sample : 1D array-like of model samples (angles in radians)
        observation : 1D array-like of observed angles
    Output :
        the MSE of the bin counts
    """
    # Bin the observations that were passed in. The previous version read
    # the global `listall` here and silently ignored `observation`.
    count,division = np.histogram(observation,range=(-math.pi,math.pi),bins=bins)
    # Same edges for the model sample; np.histogram keeps values that lie
    # exactly on a bin edge, which the strict `<` comparisons used before
    # discarded.
    modelCount,_ = np.histogram(sample,bins=division)
    mse = np.sum(np.square(count - modelCount))/len(count)
    return mse

#### The function below computes the MAE

In [None]:
# Here is the function to define the MAE
def countdiffmae(bins, sample, observation):
    """
    Mean absolute difference between the per-bin counts of two samples.

    Both inputs are binned on the same `bins` equal-width bins spanning
    [-pi, pi].

    Input :
        bins : int, number of histogram bins
        sample : 1D array-like of model samples (angles in radians)
        observation : 1D array-like of observed angles
    Output :
        the MAE of the bin counts
    """
    # Bin the observations that were passed in. The previous version read
    # the global `listall` here and silently ignored `observation`.
    count,division = np.histogram(observation,range=(-math.pi,math.pi),bins=bins)
    # Same edges for the model sample; np.histogram keeps values that lie
    # exactly on a bin edge, which the strict `<` comparisons used before
    # discarded.
    modelCount,_ = np.histogram(sample,bins=division)
    mae = np.sum(np.abs(count - modelCount))/len(count)
    return mae

#### The function below computes the average of the KL divergence in both directions

In [None]:
# Here is the function to define KL divergence
# Note: the samplepdf and observationpdf is the probability density function, not the sample count.

def kldiver2(samplepdf, observationpdf):
    """
    Return the average of the two directed KL divergences.

    Both arguments are sequences of density/probability values on a common
    grid (not sample counts); scipy's `entropy` normalises them to sum to 1.
    Nothing is printed, unlike `kldiver`.
    """
    forward = entropy(samplepdf, qk=observationpdf)
    backward = entropy(observationpdf, qk=samplepdf)
    both_directions = forward + backward
    return both_directions/2

# Data

#### Load Data

In [None]:
# Read the event table and remove the two "Unnamed" columns
# (presumably index columns left over from earlier CSV exports).
df = pd.read_csv("/home/idies/workspace/Storage/Genius/TRF/FinalData/Data_June17.csv")
df = df.drop(columns=["Unnamed: 0", "Unnamed: 0.1"])
df.head()

#### Convert Time String to make sure it will be in format (HH:MM:SS)

In [None]:
# Time strings shorter than 8 characters (e.g. "7:05:00") get a single
# leading "0" so the fixed-position slicing used later lines up.
start = [stamp if len(stamp) >= 8 else '0' + stamp for stamp in df['start_time']]
end = [stamp if len(stamp) >= 8 else '0' + stamp for stamp in df['end_time']]
df['start_time'] = start
df['end_time'] = end

#### Convert each time string to a fractional hour value between 0 and 24

In [None]:
# "HH:MM:SS" -> HH + MM/60 + SS/3600, using fixed character positions.
convert_start = [
    int(stamp[0:2]) + int(stamp[3:5])/60 + int(stamp[6:])/3600
    for stamp in df["start_time"]
]
convert_end = [
    int(stamp[0:2]) + int(stamp[3:5])/60 + int(stamp[6:])/3600
    for stamp in df["end_time"]
]
df["starthour"] = convert_start
df["endhour"] = convert_end

#### Map the hour values onto angles in $[-\pi, \pi]$

In [None]:
# Sleep events only. An hour h is mapped to the angle h*pi/12 - pi.
df3 = df.loc[df["name.s"] == "Sleep"]
# Sleep onset comes from the start time, wake-up from the end time.
sleep = df3["starthour"] * math.pi /12 - math.pi
wake = df3["endhour"] * math.pi /12 - math.pi

In [None]:
# Food events only; one series of start-time angles per meal-size category.
df2 = df.loc[df["name.s"] == "Food"]
d, lm, ls, m, sm, ss = [
    df2.loc[df2["mealsize.s"] == size, "starthour"] * math.pi /12 - math.pi
    for size in ("drinkOnly", "largeMeal", "largeSnack",
                 "mediumMeal", "smallMeal", "smallSnack")
]

#### Fit a von Mises distribution to each event type

In [None]:
# [mu, kappa] estimates for wake-up and sleep-onset angles.
wake_val = vonmises_pdfit(wake)
sleep_val = vonmises_pdfit(sleep)
for fitted in (wake_val, sleep_val):
    print(fitted)

In [None]:
# [mu, kappa] estimates per meal-size category, in the order
# drinkOnly, largeMeal, largeSnack, mediumMeal, smallMeal, smallSnack.
food1, food2, food3, food4, food5, food6 = [
    vonmises_pdfit(times) for times in (d, lm, ls, m, sm, ss)
]
for fitted in (food1, food2, food3, food4, food5, food6):
    print(fitted)

### Run GPAR

In [None]:
# Build eight chained von Mises density curves on a grid over [-pi, pi],
# add Gaussian noise, subsample, then fit GPAR and independent GPs to the
# noisy curves and plot both against the noise-free curves.
# NOTE: the order of the random draws (vonmises.rvs, np.random.randn) matters
# for reproducibility; do not reorder statements in this cell.
n = 1000
x = np.linspace(-math.pi, math.pi, n)
noise = 0.05
# Number of synthetic draws per stage: the smallest category size, capped at 200.
fmin = np.min([len(wake), len(sleep), len(ss), len(sm), len(m), len(ls), len(lm),len(d),200])

# Each stage: fit (mu, kappa) on real data pooled with draws from earlier
# stages, evaluate the density on the grid (f_k), then draw fmin samples (a_k)
# that feed the following stages. vonmises.rvs takes kappa first, then loc=mu.
# Wake
f1 = vonmises_density(x,wake_val[0],wake_val[1])
a1 = vonmises.rvs(wake_val[1],wake_val[0], size =fmin )
# Food: drinkOnly pooled with the wake draws
food_val_a1 = vonmises_pdfit(list(d) +list(a1))
f2 = vonmises_density(x,food_val_a1[0],food_val_a1[1])
a2 = vonmises.rvs(food_val_a1[1],food_val_a1[0], size = fmin)

# smallMeal pooled with the a2 and a1 draws
food_val_a2 = vonmises_pdfit(list(sm) +list(a2) + list(a1))
f3 = vonmises_density(x,food_val_a2[0],food_val_a2[1])
a3 = vonmises.rvs(food_val_a2[1],food_val_a2[0], size = fmin)

# smallSnack pooled with the a2 and a3 draws
food_val_a3 = vonmises_pdfit(list(ss) +list(a2) + list(a3))
f4 = vonmises_density(x,food_val_a3[0],food_val_a3[1])
a4 = vonmises.rvs(food_val_a3[1],food_val_a3[0], size = fmin)

# Pick which earlier draws feed the mediumMeal stage: a4 when the fitted
# smallMeal mean (food5) is later than the smallSnack mean (food6), else a3.
if food5[0] > food6[0]:
    mval = list(a4)
else:
    mval = list(a3)

# mediumMeal pooled with a2 and the draws selected above
food_val_a4 = vonmises_pdfit(list(m)+list(a2) + mval)
f5 = vonmises_density(x,food_val_a4[0],food_val_a4[1])
a5 = vonmises.rvs(food_val_a4[1],food_val_a4[0], size = fmin)

# largeSnack pooled with the a2 and a5 draws
food_val_a5 = vonmises_pdfit(list(ls)+list(a2) + list(a5))
f6 = vonmises_density(x,food_val_a5[0],food_val_a5[1])
a6 = vonmises.rvs(food_val_a5[1],food_val_a5[0], size = fmin)

# largeMeal pooled with the a2 and a5 draws
food_val_a6 = vonmises_pdfit(list(lm)+list(a2) + list(a5))
f7 = vonmises_density(x,food_val_a6[0],food_val_a6[1])
a7 = vonmises.rvs(food_val_a6[1],food_val_a6[0], size = fmin)

# Sleep: real sleep-onset angles pooled with the a7 and a6 draws
sleep_val_b = vonmises_pdfit(list(sleep) + list(a7) + list(a6))
f8 = vonmises_density(x,sleep_val_b[0],sleep_val_b[1])

# Stack the eight curves and transpose: one row per grid point, one column per curve.
f = np.stack((f1, f2, f3, f4, f5, f6, f7, f8), axis=0).T

# Add noise and subsample (every 8th grid point becomes an observation).
y = f + noise * np.random.randn(n, 8)
x_obs, y_obs = x[::8], y[::8]

# Fit and predict GPAR.
model = GPARRegressor(scale=0.1,
                      linear=False, #linear_scale=10.,
                      nonlinear=True, nonlinear_scale=1,
                      noise=0.05,
                      impute=True, replace=True, normalise_y=False)
model.fit(x_obs, y_obs)
means, lowers, uppers = \
    model.predict(x, num_samples=200, credible_bounds=True, latent=True)

# Fit and predict independent GPs: set markov=0.
igp = GPARRegressor(scale=0.1,
                    linear=False, #linear_scale=10.,
                    nonlinear=True, nonlinear_scale=1,
                    noise=0.05, markov=0, normalise_y=False)
igp.fit(x_obs, y_obs)
igp_means, igp_lowers, igp_uppers = \
    igp.predict(x, num_samples=200, credible_bounds=True, latent=True)

# Plot the result: one subplot per output with the observations, the
# noise-free curve, and both model means with their credible bands.
plt.figure(figsize=(12, 12))

for i in range(8):
    ax = plt.subplot(8, 1, i + 1)
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.yaxis.set_ticks_position('left')
    ax.xaxis.set_ticks_position('bottom')
    plt.scatter(x_obs, y_obs[:, i], label='Observations', c='black', s=15)
    plt.plot(x, f[:, i], label='Truth', c='tab:orange')
    plt.plot(x, means[:, i], label='GPAR', c='tab:blue')
    plt.fill_between(x, lowers[:, i], uppers[:, i],
                     facecolor='tab:blue', alpha=.25)
    plt.plot(x, igp_means[:, i], label='IGP', c='tab:green')
    plt.fill_between(x, igp_lowers[:, i], igp_uppers[:, i],
                     facecolor='tab:green', alpha=.25)
    plt.xlabel('$t$')
    plt.ylabel('$y_{}$'.format(i + 1))
    # Draw the legend once, on the third subplot only.
    if i == 2:
        leg = plt.legend(facecolor='#eeeeee')
        leg.get_frame().set_linewidth(0)

plt.show()

#### Make one gigantic function

In [None]:
# Equal-weight mixture of the eight predicted curves: at every grid point,
# add up one eighth of each column of igp_means.
comb = [sum(row[k]/8 for k in range(8)) for row in igp_means]

# Pool every observed angle (wake, sleep and all meal sizes) into one list.
listall = [
    angle
    for series in (wake, sleep, d, ss, sm, m, ls, lm)
    for angle in series
]

In [None]:
# Overlay the combined model curve on the pooled observations: a kernel
# density estimate and a density-normalised histogram of the data.
# NOTE(review): `comb` is built from `igp_means` (the markov=0 fit); confirm
# that the 'GPAR' legend label is the intended one.
# Fixed the misspelled legend label ("desnsity" -> "density").
sns.kdeplot(listall,label='Actual data based kernel density estimation')
plt.plot(x, comb, label = 'GPAR')
plt.hist(listall,density=True,label='Actual Data')
plt.ylabel('Probability')
plt.xlabel('Data')
plt.legend()

#### Compute KL

In [None]:
# Evaluate a Gaussian kernel density estimate of the pooled observations on
# the same grid as the model curve, then report entropies and KL divergences.
# `stats.gaussian_kde` is the public SciPy entry point; the former
# `stats.kde.gaussian_kde` path goes through a deprecated private namespace.
# NOTE(review): `comb` is a sum of GP posterior means and may contain
# negative values, which `entropy` cannot treat as probabilities; confirm.
actual_density = stats.gaussian_kde(listall)(x)
kldiver(actual_density, comb)

#### Compute MAE and MSE

In [None]:
# Repeat 1000 times: draw as many samples from the combined model curve as
# there are observations, then score the sample against the data with the
# 48-bin histogram MSE and MAE.
papermse, papermae = [], []
for draw in range(1000):
    samples = invcdf(x,comb, len(listall))
    papermse.append(countdiffmse(48,samples, listall))
    papermae.append(countdiffmae(48,samples, listall))

In [None]:
# Mean and central 95% interval (2.5th / 97.5th percentile) of both metrics.
meanmse = np.mean(papermse)
lowersmse = np.percentile(papermse, 2.5)
uppersmse = np.percentile(papermse, 100 - 2.5)

meanmae = np.mean(papermae)
lowersmae = np.percentile(papermae, 2.5)
uppersmae = np.percentile(papermae, 100 - 2.5)

print(f"MSE mean is {meanmse}")
print(f"MSE lower is {lowersmse}")
print(f"MSE upper is {uppersmse}")

print(f"MAE mean is {meanmae}")
print(f"MAE lower is {lowersmae}")
print(f"MAE upper is {uppersmae}")

# Run with each individual
- Because each person has a different number of meals, all of their food data are combined into a single simplified curve.

### Split the full data set into one subset per individual

In [None]:
# Split the big DataFrame into a list with one sub-DataFrame per user id.
a = [frame for user_id, frame in df.groupby('userid.s')]

In [None]:
# For every user with enough data, rebuild the chained von Mises curves from
# that user's events, fit independent GPs to the noisy curves, average the
# eight predicted curves, and record two symmetric KL divergences:
#   kl  : user curve vs. the population curve `comb`
#   kl2 : user curve vs. a Gaussian KDE of the user's own observations
# NOTE: the order of the random draws matters for reproducibility; the
# statement order inside the loop is deliberately left untouched.
user = []
kl = []
kl2=[]
for i in range(0,len(a)):
    print(i)
    # Get Sleep and wake up time
    df3a = a[i][(a[i]["name.s"] == "Sleep")]
    # get wake time and sleep time
    wakea = df3a["endhour"] * math.pi /12 - math.pi
    sleepa = df3a["starthour"] * math.pi /12 - math.pi
    # get food data
    df2a = a[i][(a[i]["name.s"] == "Food")]
    fooda = df2a["starthour"] * math.pi /12 - math.pi
    da = df2a[(df2a["mealsize.s"] == "drinkOnly")]["starthour"] * math.pi /12 - math.pi
    lma = df2a[(df2a["mealsize.s"] == "largeMeal")]["starthour"] * math.pi /12 - math.pi
    lsa = df2a[(df2a["mealsize.s"] == "largeSnack")]["starthour"] * math.pi /12 - math.pi
    ma = df2a[(df2a["mealsize.s"] == "mediumMeal")]["starthour"] * math.pi /12 - math.pi
    sma = df2a[(df2a["mealsize.s"] == "smallMeal")]["starthour"] * math.pi /12 - math.pi
    ssa = df2a[(df2a["mealsize.s"] == "smallSnack")]["starthour"] * math.pi /12 - math.pi
    
    # The von Mises fit does not work well with fewer than 5 observations,
    # so users lacking 5 events in any category are skipped.
    if (len(wakea) >= 5 and len(da) >= 5 and len(lma) >= 5 and len(lsa) >= 5 and len(ma) >= 5 and len(sma) >= 5 and len(ssa) >= 5):
        # Get User
        user.append(np.unique(list(a[i]['userid.s']))[0])
        
        food1a = vonmises_pdfit(da)
        food2a = vonmises_pdfit(lma)
        food3a = vonmises_pdfit(lsa)
        food4a = vonmises_pdfit(ma)
        food5a = vonmises_pdfit(sma)
        food6a = vonmises_pdfit(ssa)
        
        n = 1000
        x = np.linspace(-math.pi, math.pi, n)
        noise = 0.05
        # Synthetic draws per stage: smallest category size, capped at 200.
        fmin = np.min([len(wakea), len(sleepa), len(ssa), len(sma), len(ma), len(lsa), len(lma),len(da),200])

        # Each stage fits (mu, kappa) on real data pooled with draws from
        # earlier stages, evaluates the density on the grid and draws fmin
        # samples for the following stages (rvs takes kappa, then loc=mu).
        # Wake
        wake_vala = vonmises_pdfit(wakea)
        f1a = vonmises_density(x,wake_vala[0],wake_vala[1])
        a1a = vonmises.rvs(wake_vala[1],wake_vala[0], size =fmin)
        # Food

        food_val_a1a = vonmises_pdfit(list(da) +list(a1a))
        f2a = vonmises_density(x,food_val_a1a[0],food_val_a1a[1])
        a2a = vonmises.rvs(food_val_a1a[1],food_val_a1a[0], size = fmin)

        food_val_a2a = vonmises_pdfit(list(sma) +list(a2a) + list(a1a))
        f3a = vonmises_density(x,food_val_a2a[0],food_val_a2a[1])
        a3a = vonmises.rvs(food_val_a2a[1],food_val_a2a[0], size = fmin)

        food_val_a3a = vonmises_pdfit(list(ssa) +list(a2a) + list(a3a))
        f4a = vonmises_density(x,food_val_a3a[0],food_val_a3a[1])
        a4a = vonmises.rvs(food_val_a3a[1],food_val_a3a[0], size = fmin)

        # Feed the mediumMeal stage with a4a when the fitted smallMeal mean
        # is later than the smallSnack mean, otherwise with a3a.
        if food5a[0] > food6a[0]:
            mval = list(a4a)
        else:
            mval = list(a3a)

        food_val_a4a = vonmises_pdfit(list(ma)+list(a2a) + mval)
        f5a = vonmises_density(x,food_val_a4a[0],food_val_a4a[1])
        a5a = vonmises.rvs(food_val_a4a[1],food_val_a4a[0], size = fmin)

        food_val_a5a = vonmises_pdfit(list(lsa)+list(a2a) + list(a5a))
        f6a = vonmises_density(x,food_val_a5a[0],food_val_a5a[1])
        a6a = vonmises.rvs(food_val_a5a[1],food_val_a5a[0], size = fmin)

        food_val_a6a = vonmises_pdfit(list(lma)+list(a2a) + list(a5a))
        f7a = vonmises_density(x,food_val_a6a[0],food_val_a6a[1])
        a7a = vonmises.rvs(food_val_a6a[1],food_val_a6a[0], size = fmin)

        # Sleep
        sleep_val_ba = vonmises_pdfit(list(sleepa) + list(a7a) + list(a6a))
        f8a = vonmises_density(x,sleep_val_ba[0],sleep_val_ba[1])
        
        # One row per grid point, one column per curve.
        fa = np.stack((f1a, f2a, f3a, f4a, f5a, f6a, f7a, f8a), axis=0).T
        
        # Add noise and subsample.
        ya = fa + noise * np.random.randn(n, 8)
        x_obs, y_obsa = x[::8], ya[::8]

        # Fit and predict independent GPs: set markov=0.
        igpa = GPARRegressor(scale=0.1,
                            linear=False, #linear_scale=10.,
                            nonlinear=True, nonlinear_scale=1,
                            noise=0.05, markov=0, normalise_y=False)
        igpa.fit(x_obs, y_obsa)
        igp_meansa, igp_lowersa, igp_uppersa = \
            igpa.predict(x, num_samples=200, credible_bounds=True, latent=True)

        # Equal-weight average of the eight predicted curves.
        comba = []
        for j in range(0, len(igp_meansa)):
            comba.append(igp_meansa[:,0][j]/8 + igp_meansa[:,1][j]/8 + igp_meansa[:,2][j]/8 + igp_meansa[:,3][j]/8 + igp_meansa[:,4][j]/8 + igp_meansa[:,5][j]/8 + igp_meansa[:,6][j]/8 + igp_meansa[:,7][j]/8)
        # get KL divergence values against the population curve
        kl.append(kldiver2(comba, comb))
        
        listalla = list(wakea) + list(sleepa) + list(da) + list(ssa) + list(sma) + list(ma) + list(lsa) + list(lma)
        # generate the probability density function based on actual data;
        # stats.gaussian_kde is the public API (stats.kde is a deprecated
        # private namespace).
        actual_densitya = stats.gaussian_kde(listalla)
        actual_densitya = actual_densitya(x)
        kl2.append(kldiver2(comba, actual_densitya))

In [None]:
# Pair each retained user with their KL divergence against the population
# curve, both as parallel lists and as a user -> KL mapping.
kls = list(kl)
users = [user[idx] for idx in range(len(kls))]
final = dict(zip(users, kls))

In [None]:
# Display the user -> KL divergence mapping as the cell output.
final

In [None]:
# Two-column table (user, kl) of the population-level divergences, saved to pop.csv.
dfkl = pd.DataFrame(users,columns=['user']).assign(kl=kls)
dfkl.to_csv("pop.csv")

In [None]:
# Pair each retained user with their KL divergence against the KDE of their
# own observations, both as parallel lists and as a user -> KL mapping.
kls2 = list(kl2)
users = [user[idx] for idx in range(len(kls2))]
final = dict(zip(users, kls2))

In [None]:
# Two-column table (user, kl) of the individual-level divergences, saved to ind.csv.
dfkl2 = pd.DataFrame(users,columns=['user']).assign(kl=kls2)
dfkl2.to_csv("ind.csv")