## System Setup

In [11]:
# Find CPU info
!grep -m1 'model name' /proc/cpuinfo | awk -F": " '{print $2}'
!grep 'cpu cores' /proc/cpuinfo | awk -F": " '{a[cores]+=$2}END{print "CPU cores: " a[cores]}'

Intel(R) Xeon(R) CPU @ 2.20GHz
CPU cores: 72


In [12]:
# Find Ram Info
!grep MemTotal /proc/meminfo | awk '{printf "%.1fGB RAM", $2 / 1024 / 1024}'

83.5GB RAM

In [13]:
!python3 --version

Python 3.10.12


In [14]:
!nvidia-smi

Tue Aug 27 07:21:33 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   31C    P0              50W / 400W |    689MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [15]:
# !pip install git+https://github.com/julien-hec/pyBKTR.git
!pip install pyBKTR



## Library Setup

In [16]:
from pyBKTR.bktr import BKTRRegressor
from pyBKTR.examples.bixi import BixiData
from pyBKTR.kernels import KernelMatern, KernelParameter, KernelSE
from pyBKTR.tensor_ops import TSR
from pyBKTR.utils import simulate_spatiotemporal_data

# Following two librairies are for results manipulation
import numpy as np
import pandas as pd

## Imputation Analysis (Section 5.2)

In [21]:
%%capture

df_res_arr = []
TSR.set_params('float32', 'cuda', 1)
BURN_IN_ITER = 500
SAMPLING_ITER = 500

for len_scale in [3, 6]:
    for miss_perc in [0.1, 0.5, 0.9]:
        for i in range(1, 11):
            spatial_kernel = KernelMatern(
                smoothness_factor = 5,
                lengthscale = KernelParameter(value = len_scale)
            )
            temporal_kernel = KernelSE(
                lengthscale = KernelParameter(value = len_scale)
            )

            simu_data = simulate_spatiotemporal_data(
                nb_locations=100,
                nb_time_points=150,
                nb_spatial_dimensions=2,
                spatial_scale=10,
                time_scale=10,
                spatial_covariates_means=[0, 2, 4],
                temporal_covariates_means=[1, 3],
                spatial_kernel=spatial_kernel,
                temporal_kernel=temporal_kernel,
                noise_variance_scale=1
            )

            data_df = simu_data['data_df'].copy()
            spatial_positions_df = simu_data['spatial_positions_df']
            temporal_positions_df = simu_data['temporal_positions_df']
            index_choices_tsr = TSR.tensor(list(range(len(data_df))))
            nb_miss_index = round(miss_perc * len(data_df))
            na_index = TSR.rand_choice(
                index_choices_tsr, nb_miss_index, use_replace=False
            ).cpu().numpy().astype(int)
            data_df.iloc[na_index, 0] = pd.NA

            bktr_regressor = BKTRRegressor(
                data_df = data_df,
                rank_decomp = 10,
                burn_in_iter = BURN_IN_ITER,
                sampling_iter = SAMPLING_ITER,
                spatial_kernel = KernelMatern(smoothness_factor = 5),
                spatial_positions_df = simu_data['spatial_positions_df'],
                temporal_kernel = KernelSE(),
                temporal_positions_df = simu_data['temporal_positions_df'],
                has_geo_coords = False
            )
            bktr_regressor.mcmc_sampling()

            y_err = (
                bktr_regressor.imputed_y_estimates.iloc[
                    na_index
                ][['y']].to_numpy()
                - simu_data['data_df'].iloc[na_index][['y']].to_numpy()
            )
            beta_err = (
                np.abs(bktr_regressor.beta_estimates.to_numpy()
                - simu_data['beta_df'].to_numpy())
            )
            y_rmse = float(np.sqrt(np.mean(y_err**2)))
            y_mae = float(np.mean(abs(y_err)))
            beta_rmse = float(np.sqrt(np.mean(beta_err**2)))
            beta_mae = float(np.mean(abs(beta_err)))

            df_res_arr.append([
                len_scale,
                miss_perc,
                i,
                beta_mae,
                beta_rmse,
                y_mae,
                y_rmse,
                bktr_regressor.result_logger.total_elapsed_time
            ])



In [22]:
print('## Iterations dataframe ##')
df = pd.DataFrame(df_res_arr, columns=[
    'Lengthscale', 'Missing', 'Iter', 'B_MAE',
    'B_RMSE', 'Y_MAE', 'Y_RMSE', 'Time'
])
print(df)



## Iterations dataframe ##
    Lengthscale  Missing  Iter     B_MAE    B_RMSE     Y_MAE    Y_RMSE  \
0             3      0.1     1  1.433314  2.569366  0.855584  1.076056   
1             3      0.1     2  0.845891  1.273455  0.878563  1.105888   
2             3      0.1     3  0.574145  0.926837  0.849937  1.067444   
3             3      0.1     4  0.547044  0.868825  0.841883  1.052478   
4             3      0.1     5  1.308256  2.200260  0.940188  1.182618   
5             3      0.1     6  1.949870  4.980751  0.859076  1.080340   
6             3      0.1     7  1.016235  1.692931  0.926721  1.169011   
7             3      0.1     8  0.651573  1.033412  0.880711  1.118243   
8             3      0.1     9  1.028118  1.616776  0.849865  1.064260   
9             3      0.1    10  0.543136  0.898671  0.826377  1.055736   
10            3      0.5     1  0.671935  1.107424  0.908295  1.145334   
11            3      0.5     2  1.496861  2.799132  0.895344  1.126215   
12         

In [23]:
print('## Aggregated dataframe ##')
agg_df = df.groupby(['Lengthscale', 'Missing'])[[
    'B_MAE', 'B_RMSE', 'Y_MAE', 'Y_RMSE', 'Time'
]].agg(['mean', 'std']).reset_index()
print(agg_df)

## Aggregated dataframe ##
  Lengthscale Missing     B_MAE              B_RMSE               Y_MAE  \
                           mean       std      mean       std      mean   
0           3     0.1  0.989758  0.462534  1.806129  1.257014  0.870891   
1           3     0.5  0.811188  0.248060  1.314685  0.529623  0.909432   
2           3     0.9  0.674252  0.140432  0.991699  0.258465  1.143228   
3           6     0.1  0.204515  0.077878  0.329732  0.129214  0.826318   
4           6     0.5  0.193037  0.021355  0.304975  0.044958  0.819493   
5           6     0.9  0.241455  0.042016  0.367298  0.079752  0.882668   

               Y_RMSE                 Time            
        std      mean       std       mean       std  
0  0.036746  1.097207  0.046419  55.499747  0.956223  
1  0.011611  1.148435  0.015825  55.222292  0.766036  
2  0.065219  1.480921  0.098310  52.616762  0.913133  
3  0.014179  1.034502  0.016359  52.760185  0.675133  
4  0.008321  1.029116  0.010531  50.592661

In [26]:
def format_var_row(row, var_symbol):
    mae_key = f'{var_symbol}_MAE'
    rmse_key = f'{var_symbol}_RMSE'
    return (
        f'{row[mae_key]["mean"]:.2f}±{row[mae_key]["std"]:.2f}'
        f'/{row[rmse_key]["mean"]:.2f}±{row[rmse_key]["std"]:.2f}'
    )

fmt_df = agg_df.loc[:, ['Lengthscale', 'Missing']]
fmt_df['B_res'] = agg_df[['B_MAE', 'B_RMSE']].apply(format_var_row, var_symbol='B', axis=1)
fmt_df['Y_res'] = agg_df[['Y_MAE', 'Y_RMSE']].apply(format_var_row, var_symbol='Y', axis=1)
fmt_df

Unnamed: 0,Lengthscale,Missing,B_res,Y_res
,,,,
0.0,3.0,0.1,0.99±0.46/1.81±1.26,0.87±0.04/1.10±0.05
1.0,3.0,0.5,0.81±0.25/1.31±0.53,0.91±0.01/1.15±0.02
2.0,3.0,0.9,0.67±0.14/0.99±0.26,1.14±0.07/1.48±0.10
3.0,6.0,0.1,0.20±0.08/0.33±0.13,0.83±0.01/1.03±0.02
4.0,6.0,0.5,0.19±0.02/0.30±0.04,0.82±0.01/1.03±0.01
5.0,6.0,0.9,0.24±0.04/0.37±0.08,0.88±0.02/1.11±0.02
