In [1]:
import copy
import time

import numpy as np
import pandas as pd

from lme import LME

  # NOTE(review): these two statements look stray. Neither `yaml` nor `f` is
  # defined in the visible cells, the leading indent is not valid at the top
  # level of a cell, and `data` is reassigned from a CSV file in the next cell.
  # Confirm whether they can be dropped.
  # If they are kept: `yaml.load` without an explicit Loader is unsafe on
  # untrusted input (prefer `yaml.safe_load`), and the second call reads from
  # `f` after `f.read()` has presumably already consumed it.
  data = yaml.load(f.read()) or {}
  defaults = yaml.load(f)


#### Read data

In [2]:
# Load the TB prevalence table, order the rows by location, age group, sex and
# year, and then drop the first column by position.
data = (
    pd.read_csv('./data/tb_prevalence.csv')
    .sort_values(by=['location_id', 'age_group_id', 'sex_id', 'year_id'])
    .iloc[:, 1:]
)

In [3]:
# Row and column counts of the table right after loading.
data.shape

(251160, 54)

In [4]:
# Load the standard-deviation table; its `value` column is renamed to `std`
# in the following cell.
data_std = pd.read_csv('./data/20190927_tb_latent_std.csv')

In [5]:
# Rename `value` to `std` so the column stays distinguishable once it is merged
# into `data`, which has its own `value` column. The result is rebound to the
# same name instead of using `inplace=True`, so the step is a plain expression
# with no hidden mutation.
data_std = data_std.rename(columns={'value': 'std'})
data_std.head()

Unnamed: 0,location_id,year_id,age_group_id,sex_id,std
0,1,1990,2,1,0.019341
1,1,1990,2,2,0.018662
2,1,1990,3,1,0.0191
3,1,1990,3,2,0.01843
4,1,1990,4,1,0.017106


In [6]:
# Load the second standard-deviation table; its `value` column is renamed to
# `std_log` in the following cell.
data_std_log = pd.read_csv('./data/20191002_tb_latent_std_log.csv')

In [7]:
# Rename `value` to `std_log` so it does not collide with the `value` column of
# `data` when merged. The result is rebound rather than mutated with
# `inplace=True`.
data_std_log = data_std_log.rename(columns={'value': 'std_log'})

In [8]:
# Preview the first rows of the `std_log` table.
data_std_log.head()

Unnamed: 0,location_id,year_id,age_group_id,sex_id,std_log
0,1,1990,2,1,0.148213
1,1,1990,2,2,0.147902
2,1,1990,3,1,0.146461
3,1,1990,3,2,0.14614
4,1,1990,4,1,0.130146


In [9]:
# Attach the `std` column to every row of `data`. `how='left'` keeps all rows
# of `data`; `validate='many_to_one'` makes pandas raise a MergeError if a key
# combination appears more than once in `data_std`, which would otherwise
# silently add rows and break the row layout the model relies on.
data = data.merge(
    data_std,
    how='left',
    on=['age_group_id', 'location_id', 'sex_id', 'year_id'],
    validate='many_to_one',
)

In [10]:
# Attach the `std_log` column to every row of `data`. `how='left'` keeps all
# rows of `data`; `validate='many_to_one'` makes pandas raise a MergeError if a
# key combination appears more than once in `data_std_log`, instead of
# silently adding rows.
data = data.merge(
    data_std_log,
    how='left',
    on=['age_group_id', 'location_id', 'sex_id', 'year_id'],
    validate='many_to_one',
)
data.head()

Unnamed: 0,age_group_id,location_id,sex_id,year_id,value,haq,intercept,2_X_1,2_X_2,3_X_1,...,30_X_2,31_X_1,31_X_2,32_X_1,32_X_2,235_X_1,235_X_2,lme_fit,std,std_log
0,2,6,1,1990,-1.816082,43.258941,1,1,0,0,...,0,0,0,0,0,0,0,-2.285194,0.019605,0.139473
1,2,6,1,1991,-1.828331,44.078531,1,1,0,0,...,0,0,0,0,0,0,0,-2.308028,0.019377,0.139203
2,2,6,1,1992,-1.838447,44.913657,1,1,0,0,...,0,0,0,0,0,0,0,-2.331294,0.019193,0.139013
3,2,6,1,1993,-1.846512,45.764612,1,1,0,0,...,0,0,0,0,0,0,0,-2.355002,0.019054,0.138922
4,2,6,1,1994,-1.8523,46.631696,1,1,0,0,...,0,0,0,0,0,0,0,-2.379159,0.018964,0.13893


In [11]:
# Shape after both merges; the row count should match the table as loaded.
data.shape

(251160, 56)

In [12]:
# Response vector: one entry per row of `data`.
Y = data['value'].values

# HAQ covariate taken from a single age/sex slice (age_group_id 2, sex_id 1).
# NOTE(review): presumably HAQ does not vary by age or sex, so one slice is
# enough - confirm.
haq = data.loc[(data['age_group_id'] == 2) & (data['sex_id'] == 1), 'haq'].values
# Optional centring / scaling of HAQ, currently disabled:
#haq = haq - np.mean(haq)
#haq = haq/np.linalg.norm(haq)

# Hard-coded grid dimensions passed to the model below (the literal 2 used
# there is the third dimension).
n_locs = 195
n_ages = 23
T = 28

# Fail early if the hard-coded dimensions do not match the data actually
# loaded, rather than handing mis-sized arrays to the model.
assert Y.size == n_locs * n_ages * 2 * T, (
    f"expected {n_locs * n_ages * 2 * T} observations, got {Y.size}"
)
assert haq.size == n_locs * T, (
    f"expected {n_locs * T} HAQ values, got {haq.size}"
)

In [13]:
# cond_num = set()
# for i in range(n_locs):
#     X = np.insert(haq[i*T:(i+1)*T].reshape((-1,1)),0, 1,axis=1)
#     cond_num.add(np.linalg.cond(np.tile(X,(2,1))))

#### Build model

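$$ y = \beta\,\text{HAQ} + \beta_I\,\text{age-sex} + \pi^{(0)}_{l,a} + \text{HAQ}\,\pi^{(1)}_{l,a} + \varepsilon $$

Here $\pi^{(0)}_{l,a}$ is the random intercept and $\pi^{(1)}_{l,a}$ the random HAQ slope for each location–age group $(l, a)$, and $\varepsilon$ is the residual noise.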

In [14]:
# Build the mixed-effects model on a 4-dimensional grid of sizes
# [n_locs, n_ages, 2, T].
# NOTE(review): each boolean list appears to flag, per grid dimension, whether
# the term varies along that dimension - confirm against the LME documentation.
model = LME(
    [n_locs, n_ages, 2, T],
    2,
    Y,
    {'haq': (haq, [True, False, False, True])},
    indicators={'ind_age-sex': [False, True, True, False]},
    global_effects_names={'haq': [-float('inf'), float('inf')]},
    global_intercept=False,
    random_effects={
        'intercept': [True, True, False, False],
        'haq': [True, True, False, False],
    },
)

- Fit 1: no standard deviations are passed to the optimizer (no `S` argument)

In [15]:
# Baseline fit: no `S` argument is passed to the optimizer.
# Elapsed time is measured with perf_counter(), a monotonic clock intended for
# timing intervals; time.time() can jump if the system clock is adjusted.
# NOTE(review): inner_tol (1e-5) is looser than inner_acceptable_tol (1e-6)
# here and differs from the 1e-6 used by the later fits - confirm this is
# intended.
t0 = time.perf_counter()
model.optimize(
    inner_max_iter=500,
    inner_tol=1e-5,
    inner_print_level=5,
    inner_acceptable_tol=1e-6,
)
print('elapsed', time.perf_counter() - t0)

n_groups 4485
k_beta 47
k_gamma 2
total number of fixed effects variables 50
elapsed 52.84145188331604


In [16]:
# beta_soln after the fit without std (47 entries, matching the k_beta printed
# by the optimizer).
model.beta_soln

array([-0.02883241, -1.17945073, -1.23191079, -1.17868918, -1.23109453,
       -1.16868225, -1.22065977, -1.1209782 , -1.17084824, -0.93757558,
       -0.98880391, -0.613157  , -0.67144832, -0.30898629, -0.38138614,
        0.01952141, -0.07531363,  0.26362256,  0.15831193,  0.4276619 ,
        0.32279684,  0.61645792,  0.50283113,  0.71235058,  0.59083687,
        0.76388235,  0.65205128,  0.83749297,  0.74088716,  0.84243491,
        0.74834667,  0.84053199,  0.73777845,  0.89099125,  0.77496315,
        0.68987559,  0.56396912,  0.40199244,  0.2762733 ,  0.26424123,
        0.14957838,  0.12353305,  0.02075873, -0.02263863, -0.11202183,
       -0.17772732, -0.25105099])

In [17]:
# gamma_soln after the fit without std (two entries, matching k_gamma).
model.gamma_soln

array([5.87628745e-01, 1.29159109e-04])

In [18]:
# delta_soln after the fit without std.
model.delta_soln

array([0.00205217])

In [19]:
# Snapshot the fitted values of the fit without std before the same model
# object is optimized again in later cells.
yfit1 = copy.deepcopy(model.yfit)

- Fit 2: use the standard deviations from before the transformation (`std` column) as `S`

In [20]:
# Re-fit the same model object, now passing the `std` column as `S`.
# NOTE(review): the model is re-optimized in place; whether the optimizer
# starts from the previous solution is not visible here - confirm, since it
# affects how the three fits compare.
S = data['std'].values
# perf_counter() is a monotonic clock intended for timing intervals.
t0 = time.perf_counter()
model.optimize(
    inner_max_iter=500,
    inner_tol=1e-6,
    inner_print_level=5,
    inner_acceptable_tol=1e-6,
    S=S,
)
print('elapsed', time.perf_counter() - t0)

n_groups 4485
k_beta 47
k_gamma 2
total number of fixed effects variables 49
elapsed 1.1454298496246338


In [21]:
# beta_soln after the fit that passed the `std` column as S.
model.beta_soln

array([ 0.00019423, -0.02557646, -0.06165879, -0.02562522, -0.061315  ,
       -0.02280433, -0.06112016, -0.02357069, -0.06940491, -0.01822478,
       -0.0853705 , -0.01327616, -0.09874069, -0.01920095, -0.10573783,
       -0.02658774, -0.10475293, -0.03541001, -0.10089694, -0.03959379,
       -0.10040237, -0.03377626, -0.10611691, -0.03443884, -0.1076334 ,
       -0.04397339, -0.09603879, -0.05534305, -0.08136005, -0.06018654,
       -0.07957636, -0.05729241, -0.08474294, -0.04888209, -0.09018413,
       -0.02582553, -0.1067293 , -0.00293378, -0.12699336,  0.01656852,
       -0.12869593,  0.01314079, -0.11447481, -0.00277457, -0.1052694 ,
       -0.03934762, -0.09416683])

In [22]:
# gamma_soln after the fit that passed the `std` column as S.
model.gamma_soln

array([0.12089918, 0.13539431])

In [23]:
# Snapshot the fitted values of the `std` fit before the model is optimized
# again.
yfit2 = copy.deepcopy(model.yfit)

In [24]:
# Relative difference (norm of the difference divided by the norm of yfit1)
# between the fit without std and the fit using `std`.
print(np.linalg.norm(yfit1 - yfit2)/np.linalg.norm(yfit1))

0.013789514621908465


- Fit 3: use the log-transformed standard deviations (`std_log` column) as `S`

In [25]:
# Re-fit the same model object, now passing the `std_log` column as `S`.
# NOTE(review): the model is re-optimized in place; whether the optimizer
# starts from the previous solution is not visible here - confirm, since it
# affects how the three fits compare.
S = data['std_log'].values
# perf_counter() is a monotonic clock intended for timing intervals.
t0 = time.perf_counter()
model.optimize(
    inner_max_iter=500,
    inner_tol=1e-6,
    inner_print_level=5,
    inner_acceptable_tol=1e-6,
    S=S,
)
print('elapsed', time.perf_counter() - t0)

n_groups 4485
k_beta 47
k_gamma 2
total number of fixed effects variables 49
elapsed 3.509734869003296


In [26]:
# beta_soln after the fit that passed the `std_log` column as S.
model.beta_soln

array([-0.02898112, -0.0732834 , -0.13156889, -0.0692    , -0.1273399 ,
       -0.02867547, -0.08418134, -0.11122161, -0.1662157 , -0.11470971,
       -0.16836117, -0.04091219, -0.10240147,  0.01111177, -0.06341677,
        0.084607  , -0.01480835,  0.12413208,  0.01278147,  0.13971095,
        0.03088963,  0.17856048,  0.05921938,  0.19097109,  0.06600025,
        0.20521256,  0.08621872,  0.21467437,  0.10820046,  0.21286262,
        0.11106609,  0.21430666,  0.10477343,  0.2299274 ,  0.10477687,
        0.21877308,  0.07825269,  0.17509014,  0.03713   ,  0.19392104,
        0.07188358,  0.19324205,  0.09294768,  0.14614003,  0.0505963 ,
        0.00849824, -0.06489276])

In [27]:
# gamma_soln after the fit that passed the `std_log` column as S.
model.gamma_soln

array([1.39415508e-01, 2.44430682e-05])

In [28]:
# Snapshot the fitted values of the `std_log` fit.
yfit3 = copy.deepcopy(model.yfit)

In [29]:
# Relative difference (norm of the difference divided by the norm of yfit1)
# between the fit without std and the fit using `std_log`.
print(np.linalg.norm(yfit1 - yfit3)/np.linalg.norm(yfit1))

0.018468479568165563
