In [1]:
%run LMLFM.ipynb

import numpy as np
import pandas as pd
from scipy import stats as st
from torch.utils.data import Dataset, DataLoader

# Generate simulated data

In [2]:
%run GeneratingSyntheticData.ipynb

In [3]:
# Problem dimensions: Theta_i has n rows, Theta_o has m rows, both have p columns.
n,m,p = 40,40,5000
# One of "both", "longitudinal" or "cluster"; selects which random-effect
# family is kept further down.
correlationType = "both"
seed = 1

# Seeds NumPy's global RNG. The scipy `rvs` calls below are not given a
# `random_state`, so they draw from this same global stream.
np.random.seed(seed)  # this is fixed
non_zero = 10 # only keep 10 non-zero effects

# Laplace scales in (0, 1]. Written as 1 - U(0, 1) instead of uniform(1, 0):
# NumPy documents low > high as officially undefined, while this form yields
# the same numbers (low + (high - low) * u == 1 - u) with defined behaviour.
bi = 1 - np.random.uniform(0,1,size=non_zero)
bo = 1 - np.random.uniform(0,1,size=non_zero)

# bi: the non-zero scales occupy the first `non_zero` columns.
bi = np.append(bi,np.zeros(p - non_zero))
# bo: the non-zero scales are shifted right by 5 columns (truncated to p if needed).
if p < 15:
    bo = np.concatenate([np.zeros(5),bo])[:p]
else:
    bo = np.concatenate([np.zeros(5),bo,np.zeros(p - 15)])

# Fixed-effect coefficients: 8 non-zero values followed by zeros (requires p >= 8).
fixedEff = [1,2,3,-1,-2,-3,7,10]
fixedEff = np.concatenate([fixedEff,np.zeros([p - len(fixedEff)])])

# get theta
# Only columns with a positive scale receive Laplace draws; every other column
# stays zero. Visiting just those columns (in ascending order) consumes the RNG
# in exactly the same order as scanning all p columns would.
Theta_i = np.zeros([n,p])
for k in np.flatnonzero(bi > 0):
    Theta_i[:,k] = st.laplace.rvs(loc = 0,scale = bi[k],size=n)
Theta_o = np.zeros([m,p])
for k in np.flatnonzero(bo > 0):
    Theta_o[:,k] = st.laplace.rvs(loc = 0,scale = bo[k],size=m)

# Strings must be compared with `==`; `is` tests object identity, which only
# matches by accident of interning and triggers a SyntaxWarning on Python 3.8+.
if correlationType == 'longitudinal':
    # keep only the Theta_i family
    bo = np.zeros_like(bo)
    Theta_o = np.zeros_like(Theta_o)
elif correlationType == 'cluster':
    # keep only the Theta_o family
    bi = np.zeros_like(bi)
    Theta_i = np.zeros_like(Theta_i)
    
# Build the full simulated data set from the random and fixed effects, then
# split twice with the same settings: allData -> (fullTrain, test) and
# fullTrain -> (train, valid).
# NOTE(review): `density=0.7` presumably is the share kept in the first
# returned part -- confirm against `generate` in GeneratingSyntheticData.ipynb.
allData = getData_theta(Theta_i, Theta_o, fixedEff, seed, correlationType)
fullTrain, test = generate(allData, density=0.7, seed=seed)
train, valid = generate(fullTrain, density=0.7, seed=seed)

total ratings to generate: 1600


# Wrap the data in PyTorch DataLoaders

In [4]:
from collections import defaultdict
class LongitudinalData(Dataset):
    """Dataset whose items are whole groups of rows rather than single rows.

    The first ``len(mapI)`` positions each return every row sharing one
    ``iid``; the remaining positions each return every row sharing one
    ``oid``. Groups are ordered by sorted id, so when the ids are the
    contiguous integers ``0..k-1`` position and id coincide.

    Parameters
    ----------
    data : pandas.DataFrame-like
        Must provide the columns ``'X'`` (one feature vector per row),
        ``'y'``, ``'iid'`` and ``'oid'``.

    Attributes
    ----------
    X, y, iids, oids : array
        Column values of ``data``; ``X`` is stacked into a single array.
    mapI, mapO : defaultdict(list)
        Map each ``iid`` / ``oid`` to the row positions that carry it.
    """

    def __init__(self, data):
        self.X = np.array(list(data['X'].values))
        self.y = data['y'].values
        self.iids = data['iid'].values
        self.oids = data['oid'].values

        self.mapI = defaultdict(list)
        self.mapO = defaultdict(list)
        # .tolist() yields plain Python scalars, so ids returned as 'target'
        # are ordinary ints (not NumPy scalars) when the id columns are integer.
        for ind, (i, o) in enumerate(zip(self.iids.tolist(), self.oids.tolist())):
            self.mapI[i].append(ind)
            self.mapO[o].append(ind)

        # Freeze the set of groups now. Lookups below go through these lists,
        # so reading an item can never insert a key into the defaultdicts and
        # silently change len() or shift which position maps to which group.
        self._keysI = sorted(self.mapI)
        self._keysO = sorted(self.mapO)

    def __getitem__(self, idx):
        """Return the group at position ``idx`` as a dict.

        Keys: ``'X'`` and ``'y'`` (rows of the group), ``'target'`` (the id
        shared by the group), ``'indexes'`` (the *other* id of each row:
        ``oid`` for an ``iid`` group and vice versa) and ``'I'`` (True for an
        ``iid`` group, False for an ``oid`` group).

        Raises
        ------
        IndexError
            If ``idx`` is outside ``[0, len(self))``.
        """
        count_i = len(self._keysI)
        if not 0 <= idx < count_i + len(self._keysO):
            raise IndexError("index %s is out of range for %d groups"
                             % (idx, count_i + len(self._keysO)))

        if idx < count_i:
            target = self._keysI[idx]
            rows, other_ids, is_i = self.mapI[target], self.oids, True
        else:
            target = self._keysO[idx - count_i]
            rows, other_ids, is_i = self.mapO[target], self.iids, False

        # Explicit integer dtype keeps the indexing valid for any group size.
        rows = np.asarray(rows, dtype=np.intp)
        return {
            'X': self.X[rows, :],
            'y': self.y[rows],
            'target': target,
            'indexes': other_ids[rows],
            'I': is_i,
        }

    def __len__(self):
        """Number of ``iid`` groups plus number of ``oid`` groups."""
        return len(self._keysI) + len(self._keysO)

In [5]:
# Wrap every split in a LongitudinalData dataset.
train_ds, valid_ds, full_train_ds, test_ds = (
    LongitudinalData(split) for split in (train, valid, fullTrain, test)
)

# batch_size=1 throughout: each dataset item is already a whole group of rows.
# Splits used for fitting are shuffled ...
train_loader, full_train_loader = (
    DataLoader(ds, batch_size=1, shuffle=True)
    for ds in (train_ds, full_train_ds)
)
# ... while the evaluation splits keep a fixed order.
valid_loader, test_loader = (
    DataLoader(ds, batch_size=1, shuffle=False)
    for ds in (valid_ds, test_ds)
)

In [6]:
# Run configuration. `torch`, `LMLFM` and `train_lmlfm` are expected to come
# from the `%run LMLFM.ipynb` cell at the top of the notebook.
tau = 1.5
dtype = torch.float
device = 'cpu'

# alpha is tau divided by the variance of the responses in `train`.
lmlfm = LMLFM(n, m, p, alpha=tau / train['y'].var(), device=device)

# Fit on the full training split; the second return value is discarded.
# NOTE(review): the positional 30 is presumably the number of epochs -- confirm
# against `train_lmlfm`.
lmlfm, _ = train_lmlfm(lmlfm, full_train_loader, 30)

epoch 0 finished! loss (-): 26151.115234375
epoch 5 finished! loss (-): 17248.77734375
epoch 10 finished! loss (-): 17135.712890625
epoch 15 finished! loss (-): 17178.95703125
epoch 20 finished! loss (-): 16844.9296875
epoch 25 finished! loss (-): 16844.0625
training finished! loss (-): 16844.017578125


# Evaluation

In [7]:
# Score the fitted model on the held-out test loader.
# NOTE(review): the three return values are presumably R^2, the observed
# responses and the predictions -- confirm against `test_lmlfm`.
rsq, y, y_hat = test_lmlfm(lmlfm, test_loader)
# Bare last expression so the notebook displays the score.
rsq

0.8842997550113606

In [8]:
# Compare the fitted model with the true fixed effects; `fp_fn` prints its own
# "f.p. / f.n." summary line.
# NOTE(review): fp / fn are presumably false-positive / false-negative counts
# of the recovered non-zero coefficients -- confirm against `fp_fn`.
fp, fn = fp_fn(lmlfm, fixedEff)

f.p. => 0, f.n. => 1
