# SLDS

## 1) Install Required Packages

In [None]:
import sys
!{sys.executable} -m pip install scipy==1.2.1 ssm tqdm==4.46.0

## 2) Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import ssm
import seaborn as sns
import scipy
from scipy.stats import entropy
import time

# Data

In [None]:
# Load the raw event log from the workspace storage.
df = pd.read_csv("/home/idies/workspace/Storage/Genius/TRF/a(SSM_Primary)/Search param/Data_June17.csv")

# Rows with a missing meal size are relabelled "sleep"
# (presumably these are the sleep records -- confirm against the data dictionary).
df["mealsize.s"] = df["mealsize.s"].replace(np.nan, "sleep")

# Drop rows whose meal size is the string "0", then renumber the rows from 0
# so that later positional lookups line up with the index labels.
keep_rows = df["mealsize.s"] != "0"
df = df[keep_rows].reset_index(drop=True)

In [None]:
# Preview the first rows of the cleaned frame (notebook cell output).
df.head()

### add noise

In [None]:
# add noise
def addnoise(df, N):
    """Return the event columns of ``df`` plus Gaussian-jittered start/end hours.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "userid.s", "mealsize.s", "starthour" and
        "endhour". It is not modified.
    N : float
        Standard deviation of the zero-mean normal noise (passed as ``scale``
        to ``np.random.normal``).

    Returns
    -------
    pandas.DataFrame
        A new frame with the four columns above plus "hs_addnoise"
        (starthour + noise) and "he_addnoise" (endhour + noise). The index of
        ``df`` is preserved.
    """
    # Work on an explicit copy: adding columns to a slice of ``df`` triggers
    # pandas' chained-assignment warning and is not guaranteed to stick.
    Df = df[["userid.s", "mealsize.s", "starthour", "endhour"]].copy()

    # One (start, end) noise pair per row, filled row by row, so a seeded
    # global RNG is consumed in the order start_0, end_0, start_1, end_1, ...
    noise = np.random.normal(0, N, size=(len(Df), 2))

    # Positional arithmetic on the underlying arrays: works for any index,
    # not only a 0..n-1 RangeIndex.
    Df["hs_addnoise"] = Df["starthour"].values + noise[:, 0]
    Df["he_addnoise"] = Df["endhour"].values + noise[:, 1]

    return Df

### convert to counts and set time bins

In [None]:
# Event labels that get their own count column in setTB (a "wake" column is
# derived from the end times of the "sleep" rows).
Label = ['drinkOnly', 'smallSnack', 'largeSnack', 'smallMeal', 'mediumMeal', 'largeMeal', 'sleep']


def setTB(bins, dd):
    """Bin noisy event times into per-label count columns over the range 0-24.

    Parameters
    ----------
    bins : int or sequence
        Passed as ``bins`` to ``np.histogram`` together with ``range=(0, 24)``.
    dd : pandas.DataFrame
        Must contain "mealsize.s", "hs_addnoise" and "he_addnoise".

    Returns
    -------
    pandas.DataFrame
        One row per bin with columns "time" (left bin edge), "wake", and one
        column per entry of ``Label``. Each label column counts the
        "hs_addnoise" values of that label's rows; "wake" counts the
        "he_addnoise" values of the "sleep" rows. Labels absent from ``dd``
        get all-zero integer columns, so every count column has an integer
        dtype. Values of "mealsize.s" that are not in ``Label`` are ignored.
        Times outside [0, 24] are not counted (``np.histogram`` drops values
        outside ``range``).
    """
    hour_range = (0, 24)

    # Compute the edges up front so that "time" is defined even when ``dd`` is
    # empty or contains only "sleep" rows.
    edges = np.histogram_bin_edges(np.array([]), bins=bins, range=hour_range)
    n_bins = len(edges) - 1

    columns = ["time", "wake"] + list(Label)
    table = {name: np.zeros(n_bins, dtype=int) for name in columns}
    table["time"] = edges[:-1]

    present = set(dd["mealsize.s"].unique())
    for label in Label:
        if label not in present:
            continue
        rows = dd[dd["mealsize.s"] == label]
        table[label] = np.histogram(
            np.array(rows["hs_addnoise"]), range=hour_range, bins=bins
        )[0]
        if label == "sleep":
            # The end of a sleep episode is counted as a "wake" event.
            table["wake"] = np.histogram(
                np.array(rows["he_addnoise"]), range=hour_range, bins=bins
            )[0]

    return pd.DataFrame(table, columns=columns)

### Fit model with different latent states

In [None]:
# Column order used when a binned table is turned into a count matrix:
# "wake" first, then the meal-size labels, then "sleep".
List=['wake', 'drinkOnly', 'smallSnack', 'largeSnack', 'smallMeal', 'mediumMeal', 'largeMeal', 'sleep']

In [None]:
## train, test : dataframe
def testsample(train, test, num_ls, dim_ls, num_iters=15):
    """Fit a Poisson-emission SLDS on ``train`` and sample emissions for ``test``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Binned count tables containing the columns 'wake', 'drinkOnly',
        'smallSnack', 'largeSnack', 'smallMeal', 'mediumMeal', 'largeMeal'
        and 'sleep'. Values are cast to int.
    num_ls : int
        Number of discrete states of the SLDS.
    dim_ls : int
        Number of continuous latent dimensions.
    num_iters : int, optional
        Number of Laplace-EM iterations (default 15, the previously
        hard-coded value).

    Returns
    -------
    testsamples
        Output of ``slds.emissions.sample`` for the inferred states
        (presumably an array shaped like the test count matrix -- confirm
        against the ssm documentation).
    t_cost : float
        Elapsed seconds for conversion, fitting, decoding and sampling.
    """
    count_cols = ['wake', 'drinkOnly', 'smallSnack', 'largeSnack',
                  'smallMeal', 'mediumMeal', 'largeMeal', 'sleep']

    # perf_counter is monotonic, so the measured duration cannot be distorted
    # by system clock adjustments.
    time_start = time.perf_counter()

    trainx = np.array(train[count_cols], dtype=int)
    testx = np.array(test[count_cols], dtype=int)

    # Observed dimension follows the data (8 with the columns above).
    emissions_dim = trainx.shape[1]
    slds = ssm.SLDS(emissions_dim, num_ls, dim_ls, emissions="poisson")

    # Fit the model using Laplace-EM with a structured variational posterior.
    _, q_lem = slds.fit(trainx, method="laplace_em",
                        variational_posterior="structured_meanfield",
                        num_iters=num_iters, alpha=0.0)

    # Continuous states inferred for the *training* sequence.
    q_lem_x = q_lem.mean_continuous_states[0]

    # NOTE(review): the training continuous states are paired with the test
    # observations here. Every call site in this notebook passes the same
    # frame as train and test, where this is consistent; for a genuinely
    # different test set the posterior would need to be inferred on testx.
    q_lem_z = slds.most_likely_states(q_lem_x, testx)

    # The model has no exogenous inputs: pass a zero-width input matrix.
    IInput = np.zeros((testx.shape[0], 0))

    # Generate emission samples given the decoded latent states.
    testsamples = slds.emissions.sample(q_lem_z, q_lem_x, input=IInput, tag=None)

    t_cost = time.perf_counter() - time_start

    return testsamples, t_cost

## KL

In [None]:
def KL(testdata, tsample, times=None):
    """Symmetrised KL divergence between two time-of-day event densities.

    Each count matrix is expanded into a list of event times (every bin time
    repeated by its count, over all columns), a Gaussian KDE is fitted to each
    list, both KDEs are evaluated on the bin times, and the mean of the two
    directed KL divergences is returned.

    Parameters
    ----------
    testdata, tsample : array-like, shape (n_bins, n_labels)
        Event counts per time bin and label. Values are cast to int.
    times : array-like of length n_bins, optional
        Time associated with each bin. Defaults to the global
        ``pop2["time"]``, which is what this function always used before the
        parameter existed.

    Returns
    -------
    float
        ``(KL(p || q) + KL(q || p)) / 2`` where p and q are the two densities
        evaluated on ``times`` (``scipy.stats.entropy`` normalises them).

    Notes
    -----
    ``gaussian_kde`` needs at least two distinct event times per matrix and
    raises otherwise.
    """
    if times is None:
        times = pop2["time"]
    grid = np.asarray(times, dtype=float)

    def _event_times(counts):
        # np.repeat only accepts integer repeat counts; cast so that a
        # float-typed count matrix does not raise.
        counts = np.asarray(counts).astype(int)
        # Column by column (label by label), repeat each bin time by its count.
        return np.concatenate([np.repeat(grid, column) for column in counts.T])

    # scipy.stats.gaussian_kde is the public name; scipy.stats.kde is a
    # deprecated private namespace.
    actual_density = scipy.stats.gaussian_kde(_event_times(testdata))(grid)
    sample_density = scipy.stats.gaussian_kde(_event_times(tsample))(grid)

    kl = (entropy(actual_density, qk=sample_density)
          + entropy(sample_density, qk=actual_density)) / 2

    return kl


- population_sample

In [None]:
# Population-level baseline: jitter all events, bin them into 60 time bins,
# fit an SLDS with 4 discrete states and 2 latent dimensions on the binned
# counts, and compare the model's samples with the observed counts.
pop = addnoise(df, 0.1)  # population data after adding noise
pop2 = setTB(60, pop)
sam_pop, pop_time = testsample(pop2, pop2, 4, 2)
# Cast to int like the per-user count matrices: KL expands counts with
# np.repeat, which rejects float repeat counts.
popnp = np.array(pop2[List], dtype=int)
KL_pop_pop = KL(popnp, sam_pop)

In [None]:
# Symmetrised KL between the observed population counts and the model's samples.
print(KL_pop_pop)

In [None]:
# Seconds spent by testsample on the population fit.
print(pop_time)

- KL between individuals and population

In [None]:
# Group the noise-augmented events by user. Despite its name, df2 is a
# pandas GroupBy object, not a DataFrame.
df2 = pop.groupby("userid.s")  

In [None]:
# Distinct user ids, in order of first appearance in df.
Userid=df["userid.s"].unique()

In [None]:
# Per-user evaluation. For each selected user: bin that user's events, fit a
# separate SLDS on them, then record
#   * the KL between the user's observed counts and the user's model samples,
#   * the KL between the user's model samples and the population model samples,
#   * the time spent fitting.
# NOTE: only the first two users are processed (debug-sized run).
selected_users = Userid[:2]        #####################!!

KL_ind_ind, KL_ind_pop, time_ind = [], [], []

for i in selected_users:

    # Binned count table for this user only.
    user2_i = setTB(60, df2.get_group(i))

    # Fit on, and sample for, the same table.
    sam_user_i, timeind = testsample(user2_i, user2_i, 4, 2)
    time_ind.append(timeind)

    # Observed counts vs. this user's own samples.
    KL_ind_ind.append(KL(np.array(user2_i[List], dtype=int), sam_user_i))

    # This user's samples vs. the population samples.
    KL_ind_pop.append(KL(sam_user_i, sam_pop))


# Collect everything into one results table, one row per processed user.
KLdf = pd.DataFrame(
    {
        "userid.s": selected_users,
        "kl_ind_ind": KL_ind_ind,
        "kl_ind_pop": KL_ind_pop,
        "time_cost": time_ind,
    },
    columns=["userid.s", "kl_ind_ind", "kl_ind_pop", "time_cost"],
)

In [None]:
# Display the per-user results table (notebook cell output).
KLdf

In [None]:
# Write the results table to the working directory.
# NOTE(review): the file name says users 1-100, but the loop above is limited
# to Userid[:2] -- confirm the slice and the name agree before a full run.
KLdf.to_csv("kl_slds_individuals1-100.csv")   #############!!!!