## System Setup

In [1]:
# Find CPU info
# Print the value of the first 'model name' line found in /proc/cpuinfo.
!grep -m1 'model name' /proc/cpuinfo | awk -F": " '{print $2}'
# Add up the value of every 'cpu cores' line and print the total.
# NOTE(review): `cores` is never assigned in the awk program, so a[cores] acts
# as a single accumulator. If /proc/cpuinfo repeats the 'cpu cores' line once
# per logical processor (as Linux usually does), the printed total is
# cores-per-package multiplied by the number of entries, not the machine's
# core count -- confirm against `nproc` or `lscpu` before quoting this number.
!grep 'cpu cores' /proc/cpuinfo | awk -F": " '{a[cores]+=$2}END{print "CPU cores: " a[cores]}'

Intel(R) Xeon(R) CPU @ 2.20GHz
CPU cores: 72


In [2]:
# Find RAM info
# Take the numeric field of the MemTotal line and divide it by 1024 twice
# before printing it with one decimal and a "GB RAM" suffix.
# NOTE(review): presumably MemTotal is reported in kB, which makes the printed
# figure GiB rather than decimal GB -- confirm if the distinction matters.
!grep MemTotal /proc/meminfo | awk '{printf "%.1fGB RAM", $2 / 1024 / 1024}'

83.5GB RAM

In [3]:
# Report the version of the `python3` executable found on the shell PATH.
!python3 --version

Python 3.10.12


In [4]:
# Show the GPU model, driver / CUDA versions and current memory usage as
# reported by nvidia-smi.
!nvidia-smi

Tue Aug 27 07:21:43 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0              43W / 400W |      2MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [5]:
# !pip install git+https://github.com/julien-hec/pyBKTR.git
!pip install pyBKTR



## Library Setup

In [6]:
from pyBKTR.bktr import BKTRRegressor
from pyBKTR.examples.bixi import BixiData
from pyBKTR.kernels import KernelMatern, KernelParameter, KernelSE
from pyBKTR.tensor_ops import TSR
from pyBKTR.utils import simulate_spatiotemporal_data

# The following two libraries are used for results manipulation
import numpy as np
import pandas as pd

## Interpolation Analysis (Section 5.3)

In [7]:
%%capture
# The %%capture cell magic above captures (silences) the output of this cell.

# NOTE(review): the three arguments are presumably (float dtype, compute
# device, random seed) -- confirm against pyBKTR's TSR.set_params.
TSR.set_params('float32', 'cuda', 1)

# MCMC lengths forwarded to every BKTRRegressor below
# (burn_in_iter / sampling_iter).
BURN_IN_ITER = 500
SAMPLING_ITER = 500

# Number of locations / time points withheld from the observed data in each
# run (obs_nb_locs = nb_locs - nb_aside_locs, same for times).
nb_aside_locs = 4
nb_aside_times = 6

# One entry per run:
# [ds_type, len_scale, i, beta MAE, beta RMSE, y MAE, y RMSE].
df_res_arr = []

# 2 dataset sizes x 2 lengthscales x 10 repetitions = 40 runs.
for ds_type in ['Smaller', 'Larger']:
    for len_scale in [3, 6]:
        for i in range(1, 11):
            # Kernels handed to the simulator; both receive len_scale as the
            # value of their lengthscale parameter.
            matern_lengthscale = KernelParameter(value = len_scale)
            se_lengthscale = KernelParameter(value = len_scale)
            spatial_kernel = KernelMatern(
                lengthscale = matern_lengthscale, smoothness_factor = 5
            )
            temporal_kernel = KernelSE(lengthscale = se_lengthscale)

            # Dataset dimensions and covariate means depend only on ds_type:
            # 'Smaller' -> 20 locations x 30 times, 'Larger' -> 100 x 150.
            nb_locs = 20 if ds_type == 'Smaller' else 100
            nb_times = 30 if ds_type == 'Smaller' else 150
            spa_cov_means = [0, 2] if ds_type == 'Smaller' else [0, 2, 4]
            tem_cov_means = [1] if ds_type == 'Smaller' else [1, 3]

            # Simulate a full dataset; the returned mapping is read below
            # through its 'data_df', 'spatial_positions_df',
            # 'temporal_positions_df' and 'beta_df' entries.
            simu_data = simulate_spatiotemporal_data(
                nb_locations=nb_locs,
                nb_time_points=nb_times,
                nb_spatial_dimensions=2,
                spatial_scale=10,
                time_scale=10,
                spatial_covariates_means=spa_cov_means,
                temporal_covariates_means=tem_cov_means,
                spatial_kernel=spatial_kernel,
                temporal_kernel=temporal_kernel,
                noise_variance_scale=1
            )

            data_df = simu_data['data_df'].copy()
            spatial_positions_df = simu_data['spatial_positions_df']
            temporal_positions_df = simu_data['temporal_positions_df']

            # Sizes of the observed (training) subset.
            obs_nb_locs = nb_locs - nb_aside_locs
            obs_nb_times = nb_times - nb_aside_times

            # data_df is indexed by (location, time): level 0 holds the
            # location labels, level 1 the time labels.
            all_locs = data_df.index.get_level_values(0).unique().to_list()
            all_times = data_df.index.get_level_values(1).unique().to_list()

            # Draw obs_nb_locs values out of 1..len(all_locs); they are used
            # as 1-based positions into all_locs (hence the `- 1` below).
            # NOTE(review): presumably drawn without replacement, otherwise
            # new_locs would hold more than nb_aside_locs entries -- confirm
            # against TSR.rand_choice.
            locs_indx_sample = list(TSR.rand_choice(
                TSR.tensor(range(1, len(all_locs) + 1)),
                obs_nb_locs
            ).cpu().numpy())
            # The comprehension's `i` is scoped to the comprehension
            # (Python 3), so the repetition counter `i` of the enclosing loop
            # is left untouched.
            obs_locs = [all_locs[int(i) - 1] for i in locs_indx_sample]
            # Everything not observed is held out. The order of a set
            # difference is undefined, but new_locs is only used for drops
            # and membership tests.
            new_locs = list(set(all_locs) - set(obs_locs))

            # Same sampling scheme for the time points.
            times_indx_sample = list(TSR.rand_choice(TSR.tensor(
                range(1, len(all_times) + 1)),
                obs_nb_times
            ).cpu().numpy())
            obs_times = [all_times[int(i) - 1] for i in times_indx_sample]
            new_times = list(set(all_times) - set(obs_times))

            # Observed data: remove every row belonging to a held-out
            # location or a held-out time, and the matching position rows.
            obs_data_df = data_df.drop(index=new_locs, level='location')
            obs_data_df = obs_data_df.drop(index=new_times, level='time')
            obs_spatial_pos_df = spatial_positions_df.drop(index=new_locs,)
            obs_temporal_pos_df = temporal_positions_df.drop(index=new_times,)

            # Held-out data: rows whose location OR time was withheld (the
            # union, not only the rows where both are new). The reference
            # coefficients in 'beta_df' are filtered with the same rule.
            new_data_df = data_df[
                (data_df.index.get_level_values(0).isin(new_locs)) |
                (data_df.index.get_level_values(1).isin(new_times))
            ].copy()
            new_beta_data_df = simu_data['beta_df'][
                simu_data['beta_df'].index.get_level_values(0).isin(new_locs) |
                simu_data['beta_df'].index.get_level_values(1).isin(new_times)
            ].copy()
            new_spatial_pos_df = spatial_positions_df[
                spatial_positions_df.index.isin(new_locs)
            ].copy()
            new_temporal_pos_df = temporal_positions_df[
                temporal_positions_df.index.isin(new_times)
            ].copy()


            # Fit on the observed subset only. The regressor gets fresh
            # kernels: a Matern kernel with the same smoothness_factor as the
            # simulator but no lengthscale argument, and a default KernelSE.
            bktr_regressor = BKTRRegressor(
                data_df = obs_data_df,
                rank_decomp = 10,
                burn_in_iter = BURN_IN_ITER,
                sampling_iter = SAMPLING_ITER,
                spatial_kernel = KernelMatern(smoothness_factor = 5),
                spatial_positions_df = obs_spatial_pos_df,
                temporal_kernel = KernelSE(),
                temporal_positions_df = obs_temporal_pos_df,
                has_geo_coords = False
            )
            bktr_regressor.mcmc_sampling()

            # Predict the response and the coefficients for the held-out rows.
            preds_y_df, preds_beta_df = bktr_regressor.predict(
                new_data_df,
                new_spatial_pos_df,
                new_temporal_pos_df
            )

            # Sort predictions and references by index so that the positional
            # (NumPy) subtractions below compare matching rows.
            # NOTE(review): assumes each prediction frame holds exactly the
            # same index entries, and the beta frames the same column order,
            # as its reference frame -- TODO confirm.
            preds_y_df.sort_index(inplace=True)
            new_data_df.sort_index(inplace=True)
            preds_beta_df.sort_index(inplace=True)
            new_beta_data_df.sort_index(inplace=True)
            preds_y_err = (
                new_data_df['y'].to_numpy() - preds_y_df['y'].to_numpy()
            )
            preds_beta_err = (
                new_beta_data_df.to_numpy() - preds_beta_df.to_numpy()
            )
            # Store the run identifiers followed by the mean absolute error
            # and root mean squared error of the coefficients, then of y.
            df_res_arr.append([
                ds_type,
                len_scale,
                i,
                np.mean(np.abs(preds_beta_err)),
                np.sqrt(np.mean(np.square(preds_beta_err))),
                np.mean(np.abs(preds_y_err)),
                np.sqrt(np.mean(np.square(preds_y_err))),
            ])



In [8]:
# Turn the per-run records collected in df_res_arr into a labelled table:
# three identifying columns followed by the four error metrics.
run_id_columns = ['Dataset_Type', 'Lengthscale', 'Iter']
run_metric_columns = ['B_MAE', 'B_RMSE', 'Y_MAE', 'Y_RMSE']

print('## Iterations dataframe ##')
df = pd.DataFrame(df_res_arr, columns=run_id_columns + run_metric_columns)
print(df)

## Iterations dataframe ##
   Dataset_Type  Lengthscale  Iter     B_MAE    B_RMSE     Y_MAE    Y_RMSE
0       Smaller            3     1  0.660119  0.871570  1.229162  1.566454
1       Smaller            3     2  0.765964  0.984350  1.462015  1.954927
2       Smaller            3     3  0.646867  0.907108  1.387590  1.908763
3       Smaller            3     4  0.624262  0.937684  1.262828  1.596692
4       Smaller            3     5  0.483263  0.658382  1.106988  1.379000
5       Smaller            3     6  1.162499  1.619392  2.187072  3.313260
6       Smaller            3     7  0.695293  0.935071  1.581736  2.136059
7       Smaller            3     8  0.635039  0.822060  1.382198  1.855471
8       Smaller            3     9  0.955343  1.330783  1.547371  2.048242
9       Smaller            3    10  0.693688  0.969387  1.142089  1.444003
10      Smaller            6     1  0.265941  0.338934  0.977999  1.243594
11      Smaller            6     2  0.433679  0.631588  1.107326  1.40791

In [9]:
# Summarise the repetitions: mean and standard deviation of every error metric
# for each (dataset type, lengthscale) pair. reset_index() turns the two group
# keys back into regular columns.
print('## Aggregated dataframe ##')
group_keys = ['Dataset_Type', 'Lengthscale']
error_metrics = ['B_MAE', 'B_RMSE', 'Y_MAE', 'Y_RMSE']
runs_by_setting = df.groupby(group_keys)
agg_df = runs_by_setting[error_metrics].agg(['mean', 'std']).reset_index()
print(agg_df)

## Aggregated dataframe ##
  Dataset_Type Lengthscale     B_MAE              B_RMSE               Y_MAE  \
                                mean       std      mean       std      mean   
0       Larger           3  1.056731  0.477765  1.712462  0.832991  2.511188   
1       Larger           6  0.248190  0.078143  0.409954  0.143800  0.984085   
2      Smaller           3  0.732234  0.192779  1.003579  0.274004  1.428905   
3      Smaller           6  0.420434  0.103926  0.590677  0.171168  1.081319   

               Y_RMSE            
        std      mean       std  
0  1.209082  4.218502  2.383817  
1  0.261005  1.255063  0.353573  
2  0.311014  1.920287  0.553633  
3  0.170755  1.407780  0.273482  


In [15]:
def format_var_row(row, var_symbol):
    """Render the MAE and RMSE statistics of one variable as a single string.

    Args:
        row: Mapping-like object (e.g. a row of the aggregated frame) where
            ``row['<var_symbol>_MAE']`` and ``row['<var_symbol>_RMSE']`` each
            expose a ``'mean'`` and a ``'std'`` entry.
        var_symbol: Prefix of the metric names, e.g. ``'B'`` or ``'Y'``.

    Returns:
        A string shaped ``'<MAE mean>±<MAE std>/<RMSE mean>±<RMSE std>'`` with
        every number formatted to two decimals.
    """
    mae_stats = row[f'{var_symbol}_MAE']
    mae_text = f'{mae_stats["mean"]:.2f}±{mae_stats["std"]:.2f}'

    rmse_stats = row[f'{var_symbol}_RMSE']
    rmse_text = f'{rmse_stats["mean"]:.2f}±{rmse_stats["std"]:.2f}'

    return f'{mae_text}/{rmse_text}'

# Start the report table from the two grouping columns. The explicit .copy()
# makes fmt_df an independent frame, so the column assignments below are plain
# assignments on its own data rather than writes onto a slice of agg_df (the
# pattern that can trigger pandas' SettingWithCopyWarning).
fmt_df = agg_df.loc[:, ['Dataset_Type', 'Lengthscale']].copy()
# Collapse the mean/std statistics of each row into one formatted string per
# variable: coefficients ('B') and response ('Y').
fmt_df['B_res'] = agg_df[['B_MAE', 'B_RMSE']].apply(format_var_row, var_symbol='B', axis=1)
fmt_df['Y_res'] = agg_df[['Y_MAE', 'Y_RMSE']].apply(format_var_row, var_symbol='Y', axis=1)
# Last expression of the cell, so the sorted table is what gets displayed:
# Dataset_Type descending ('Smaller' before 'Larger'), Lengthscale ascending.
fmt_df.sort_values(by=['Dataset_Type', 'Lengthscale'], ascending=[False, True])

Unnamed: 0,Dataset_Type,Lengthscale,B_res,Y_res
,,,,
2.0,Smaller,3.0,0.73±0.19/1.00±0.27,1.43±0.31/1.92±0.55
3.0,Smaller,6.0,0.42±0.10/0.59±0.17,1.08±0.17/1.41±0.27
0.0,Larger,3.0,1.06±0.48/1.71±0.83,2.51±1.21/4.22±2.38
1.0,Larger,6.0,0.25±0.08/0.41±0.14,0.98±0.26/1.26±0.35
