## System Setup

In [1]:
# Find CPU info
!grep -m1 'model name' /proc/cpuinfo | awk -F": " '{print $2}'
!grep 'cpu cores' /proc/cpuinfo | awk -F": " '{a[cores]+=$2}END{print "CPU cores: " a[cores]}'

Intel(R) Xeon(R) CPU @ 2.00GHz
CPU cores: 32


In [2]:
# Find Ram Info
!grep MemTotal /proc/meminfo | awk '{printf "%.1fGB RAM", $2 / 1024 / 1024}'

51.0GB RAM

In [3]:
!python3 --version

Python 3.10.12


In [4]:
# Show the NVIDIA driver / GPU status table for this runtime.
!nvidia-smi

Wed Oct 25 03:26:31 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0    33W / 300W |      0MiB / 16384MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [5]:
# !pip install git+https://github.com/julien-hec/pyBKTR.git
!pip install pyBKTR



## Library Setup

In [6]:
from pyBKTR.bktr import BKTRRegressor
from pyBKTR.examples.bixi import BixiData
from pyBKTR.kernels import KernelMatern, KernelParameter, KernelSE
from pyBKTR.tensor_ops import TSR
from pyBKTR.utils import simulate_spatiotemporal_data

# The following two libraries are used for results manipulation
import numpy as np
import pandas as pd

## Bixi Data Intro Example

In [7]:
# Configure the tensor backend: float32 values on the CUDA device. The third
# argument is presumably the random seed -- TODO confirm against TSR.set_params.
TSR.set_params('float32', 'cuda', 1)

# Example dataset shipped in pyBKTR.examples.bixi.
bixi_data = BixiData()

# Regressor on the example data with 500 burn-in and 500 sampling iterations.
bktr_regressor = BKTRRegressor(
    data_df=bixi_data.data_df,
    spatial_positions_df=bixi_data.spatial_positions_df,
    temporal_positions_df=bixi_data.temporal_positions_df,
    burn_in_iter=500,
    sampling_iter=500,
)
bktr_regressor.mcmc_sampling()

print(bktr_regressor.summary)

* Iter 1     | Elapsed     0.34s | MAE  0.1027 | RMSE  0.1438 *
* Iter 2     | Elapsed     0.12s | MAE  0.0675 | RMSE  0.0912 *
* Iter 3     | Elapsed     0.14s | MAE  0.0587 | RMSE  0.0794 *
* Iter 4     | Elapsed     0.09s | MAE  0.0564 | RMSE  0.0761 *
* Iter 5     | Elapsed     0.19s | MAE  0.0557 | RMSE  0.0752 *
* Iter 6     | Elapsed     0.22s | MAE  0.0554 | RMSE  0.0747 *
* Iter 7     | Elapsed     0.18s | MAE  0.0553 | RMSE  0.0745 *
* Iter 8     | Elapsed     0.16s | MAE  0.0553 | RMSE  0.0745 *
* Iter 9     | Elapsed     0.16s | MAE  0.0552 | RMSE  0.0744 *
* Iter 10    | Elapsed     0.19s | MAE  0.0550 | RMSE  0.0742 *
* Iter 11    | Elapsed     0.33s | MAE  0.0550 | RMSE  0.0740 *
* Iter 12    | Elapsed     0.29s | MAE  0.0547 | RMSE  0.0737 *
* Iter 13    | Elapsed     0.29s | MAE  0.0546 | RMSE  0.0737 *
* Iter 14    | Elapsed     0.27s | MAE  0.0545 | RMSE  0.0735 *
* Iter 15    | Elapsed     0.13s | MAE  0.0545 | RMSE  0.0734 *
* Iter 16    | Elapsed     0.24s | MAE  

In [8]:
# Plotting helper, imported right where it is used.
from pyBKTR.plots import plot_y_estimates

# Pass the regressor that was created and sampled in the previous cell
# (presumably plots its estimated vs. observed y values -- see pyBKTR.plots).
plot_y_estimates(bktr_regressor)

## Imputation Analysis (Section 5.2)

In [9]:
%%capture

df_res_arr = []
TSR.set_params('float32', 'cuda', 2023)
BURN_IN_ITER = 500
SAMPLING_ITER = 500

for len_scale in [3, 6]:
    for miss_perc in [0.1, 0.5, 0.9]:
        for i in range(1, 11):
            spatial_kernel = KernelMatern(
                smoothness_factor = 5,
                lengthscale = KernelParameter(value = len_scale)
            )
            temporal_kernel = KernelSE(
                lengthscale = KernelParameter(value = len_scale)
            )

            simu_data = simulate_spatiotemporal_data(
                nb_locations=100,
                nb_time_points=150,
                nb_spatial_dimensions=2,
                spatial_scale=10,
                time_scale=10,
                spatial_covariates_means=[0, 2, 4],
                temporal_covariates_means=[1, 3],
                spatial_kernel=spatial_kernel,
                temporal_kernel=temporal_kernel,
                noise_variance_scale=1
            )

            data_df = simu_data['data_df'].copy()
            spatial_positions_df = simu_data['spatial_positions_df']
            temporal_positions_df = simu_data['temporal_positions_df']
            index_choices_tsr = TSR.tensor(list(range(len(data_df))))
            nb_miss_index = round(miss_perc * len(data_df))
            na_index = TSR.rand_choice(
                index_choices_tsr, nb_miss_index, use_replace=False
            ).cpu().numpy().astype(int)
            data_df.iloc[na_index, 0] = pd.NA

            bktr_regressor = BKTRRegressor(
                data_df = data_df,
                rank_decomp = 10,
                burn_in_iter = BURN_IN_ITER,
                sampling_iter = SAMPLING_ITER,
                spatial_kernel = KernelMatern(smoothness_factor = 5),
                spatial_positions_df = simu_data['spatial_positions_df'],
                temporal_kernel = KernelSE(),
                temporal_positions_df = simu_data['temporal_positions_df'],
                has_geo_coords = False
            )
            bktr_regressor.mcmc_sampling()

            y_err = (
                bktr_regressor.imputed_y_estimates.iloc[
                    na_index
                ][['y']].to_numpy()
                - simu_data['data_df'].iloc[na_index][['y']].to_numpy()
            )
            beta_err = (
                np.abs(bktr_regressor.beta_estimates.to_numpy()
                - simu_data['beta_df'].to_numpy())
            )
            y_rmse = float(np.sqrt(np.mean(y_err**2)))
            y_mae = float(np.mean(abs(y_err)))
            beta_rmse = float(np.sqrt(np.mean(beta_err**2)))
            beta_mae = float(np.mean(abs(beta_err)))

            df_res_arr.append([
                len_scale,
                miss_perc,
                i,
                beta_mae,
                beta_rmse,
                y_mae,
                y_rmse,
                bktr_regressor.result_logger.total_elapsed_time
            ])



In [10]:
# One row per simulation run of the imputation benchmark.
print('## Iterations dataframe ##')
df = pd.DataFrame(
    df_res_arr,
    columns=[
        'Lengthscale', 'Missing', 'Iter',
        'B_MAE', 'B_RMSE', 'Y_MAE', 'Y_RMSE', 'Time',
    ],
)
print(df)

# Mean and standard deviation of every metric per (lengthscale, missing) pair.
print('## Aggregated dataframe ##')
agg_df = (
    df
    .groupby(['Lengthscale', 'Missing'])[
        ['B_MAE', 'B_RMSE', 'Y_MAE', 'Y_RMSE', 'Time']
    ]
    .agg(['mean', 'std'])
    .reset_index()
)
print(agg_df)


## Iterations dataframe ##
    Lengthscale  Missing  Iter     B_MAE    B_RMSE     Y_MAE    Y_RMSE  \
0             3      0.1     1  0.456836  0.646745  0.899862  1.139481   
1             3      0.1     2  1.803280  3.278733  0.902540  1.121975   
2             3      0.1     3  1.087550  1.876726  0.895373  1.126965   
3             3      0.1     4  0.756219  1.218265  0.847705  1.067894   
4             3      0.1     5  1.070130  1.796503  0.882446  1.110933   
5             3      0.1     6  1.040516  1.580049  0.853942  1.086386   
6             3      0.1     7  0.712554  1.032774  0.906656  1.124438   
7             3      0.1     8  0.577121  0.953384  0.875244  1.093974   
8             3      0.1     9  0.468574  0.753467  0.861979  1.066301   
9             3      0.1    10  0.471016  0.773561  0.873121  1.093671   
10            3      0.5     1  1.215339  1.953418  0.963654  1.221397   
11            3      0.5     2  0.863352  1.359332  0.923672  1.171887   
12         

## Interpolation Analysis (Section 5.3)

In [11]:
%%capture

TSR.set_params('float32', 'cuda', 3)

BURN_IN_ITER = 500
SAMPLING_ITER = 500

nb_aside_locs = 4
nb_aside_times = 6

df_res_arr = []

for ds_type in ['Smaller', 'Larger']:
    for len_scale in [3, 6]:
        for i in range(1, 11):
            matern_lengthscale = KernelParameter(value = len_scale)
            se_lengthscale = KernelParameter(value = len_scale)
            spatial_kernel = KernelMatern(
                lengthscale = matern_lengthscale, smoothness_factor = 5
            )
            temporal_kernel = KernelSE(lengthscale = se_lengthscale)

            nb_locs = 20 if ds_type == 'Smaller' else 100
            nb_times = 30 if ds_type == 'Smaller' else 150
            spa_cov_means = [0, 2] if ds_type == 'Smaller' else [0, 2, 4]
            tem_cov_means = [1] if ds_type == 'Smaller' else [1, 3]

            simu_data = simulate_spatiotemporal_data(
                nb_locations=nb_locs,
                nb_time_points=nb_times,
                nb_spatial_dimensions=2,
                spatial_scale=10,
                time_scale=10,
                spatial_covariates_means=spa_cov_means,
                temporal_covariates_means=tem_cov_means,
                spatial_kernel=spatial_kernel,
                temporal_kernel=temporal_kernel,
                noise_variance_scale=1
            )

            data_df = simu_data['data_df'].copy()
            spatial_positions_df = simu_data['spatial_positions_df']
            temporal_positions_df = simu_data['temporal_positions_df']

            obs_nb_locs = nb_locs - nb_aside_locs
            obs_nb_times = nb_times - nb_aside_times

            all_locs = data_df.index.get_level_values(0).unique().to_list()
            all_times = data_df.index.get_level_values(1).unique().to_list()

            locs_indx_sample = list(TSR.rand_choice(
                TSR.tensor(range(1, len(all_locs) + 1)),
                obs_nb_locs
            ).cpu().numpy())
            obs_locs = [all_locs[int(i) - 1] for i in locs_indx_sample]
            new_locs = list(set(all_locs) - set(obs_locs))

            times_indx_sample = list(TSR.rand_choice(TSR.tensor(
                range(1, len(all_times) + 1)),
                obs_nb_times
            ).cpu().numpy())
            obs_times = [all_times[int(i) - 1] for i in times_indx_sample]
            new_times = list(set(all_times) - set(obs_times))

            obs_data_df = data_df.drop(index=new_locs, level='location')
            obs_data_df = obs_data_df.drop(index=new_times, level='time')
            obs_spatial_pos_df = spatial_positions_df.drop(index=new_locs,)
            obs_temporal_pos_df = temporal_positions_df.drop(index=new_times,)

            new_data_df = data_df[
                (data_df.index.get_level_values(0).isin(new_locs)) |
                (data_df.index.get_level_values(1).isin(new_times))
            ].copy()
            new_beta_data_df = simu_data['beta_df'][
                simu_data['beta_df'].index.get_level_values(0).isin(new_locs) |
                simu_data['beta_df'].index.get_level_values(1).isin(new_times)
            ].copy()
            new_spatial_pos_df = spatial_positions_df[
                spatial_positions_df.index.isin(new_locs)
            ].copy()
            new_temporal_pos_df = temporal_positions_df[
                temporal_positions_df.index.isin(new_times)
            ].copy()


            bktr_regressor = BKTRRegressor(
                data_df = obs_data_df,
                rank_decomp = 10,
                burn_in_iter = BURN_IN_ITER,
                sampling_iter = SAMPLING_ITER,
                spatial_kernel = KernelMatern(smoothness_factor = 5),
                spatial_positions_df = obs_spatial_pos_df,
                temporal_kernel = KernelSE(),
                temporal_positions_df = obs_temporal_pos_df,
                has_geo_coords = False
            )
            bktr_regressor.mcmc_sampling()

            preds_y_df, preds_beta_df = bktr_regressor.predict(
                new_data_df,
                new_spatial_pos_df,
                new_temporal_pos_df
            )

            preds_y_df.sort_index(inplace=True)
            new_data_df.sort_index(inplace=True)
            preds_beta_df.sort_index(inplace=True)
            new_beta_data_df.sort_index(inplace=True)
            preds_y_err = (
                new_data_df['y'].to_numpy() - preds_y_df['y'].to_numpy()
            )
            preds_beta_err = (
                new_beta_data_df.to_numpy() - preds_beta_df.to_numpy()
            )
            df_res_arr.append([
                ds_type,
                len_scale,
                i,
                np.mean(np.abs(preds_beta_err)),
                np.sqrt(np.mean(np.square(preds_beta_err))),
                np.mean(np.abs(preds_y_err)),
                np.sqrt(np.mean(np.square(preds_y_err))),
            ])



In [12]:
# One row per simulation run of the interpolation benchmark.
print('## Iterations dataframe ##')
df = pd.DataFrame(
    df_res_arr,
    columns=[
        'Dataset_Type', 'Lengthscale', 'Iter',
        'B_MAE', 'B_RMSE', 'Y_MAE', 'Y_RMSE',
    ],
)
print(df)

# Mean and standard deviation of every metric per (dataset type, lengthscale).
print('## Aggregated dataframe ##')
agg_df = (
    df
    .groupby(['Dataset_Type', 'Lengthscale'])[
        ['B_MAE', 'B_RMSE', 'Y_MAE', 'Y_RMSE']
    ]
    .agg(['mean', 'std'])
    .reset_index()
)
print(agg_df)

## Iterations dataframe ##
   Dataset_Type  Lengthscale  Iter     B_MAE    B_RMSE     Y_MAE     Y_RMSE
0       Smaller            3     1  0.868864  1.183598  1.551411   2.058035
1       Smaller            3     2  0.948494  1.441087  1.847889   2.608730
2       Smaller            3     3  0.483438  0.622200  1.436089   1.831888
3       Smaller            3     4  0.749204  1.002860  1.274208   1.650741
4       Smaller            3     5  0.995754  1.360431  1.269009   1.657968
5       Smaller            3     6  0.848008  1.207954  1.389270   1.963238
6       Smaller            3     7  0.639127  0.817665  1.137642   1.445038
7       Smaller            3     8  0.430833  0.589409  1.234845   1.664905
8       Smaller            3     9  0.550359  0.761297  1.737363   2.262836
9       Smaller            3    10  0.953655  1.332466  1.997401   2.755547
10      Smaller            6     1  0.427266  0.604590  1.025018   1.304990
11      Smaller            6     2  0.426282  0.551310  1.079

## Mercator Projection (Appendix E)

In [13]:
import plotly.express as px

# Build a regressor on the example data only to get at its coordinate
# projector; no sampling is run in this cell.
bixi_data = BixiData()

bktr_regressor = BKTRRegressor(
    data_df=bixi_data.data_df,
    spatial_positions_df=bixi_data.spatial_positions_df,
    temporal_positions_df=bixi_data.temporal_positions_df,
)

geo_projector = bktr_regressor.geo_coords_projector

print('# Initial dataframe (longitude, latitude) #')
print(geo_projector.ini_df.head())
print()
print('# Mercator Projection dataframe ([-5, 5] scaled) #')
print(geo_projector.scaled_ini_df.head())


# Initial dataframe (longitude, latitude) #
                                                     latitude  longitude
location                                                                
10002 - Métro Charlevoix (Centre / Charlevoix)      45.478228 -73.569651
4000 - Jeanne-d'Arc / Ontario                       45.549598 -73.541874
4001 - Graham / Brookfield                          45.520075 -73.629776
4002 - Graham / Wicksteed                           45.516937 -73.640483
5006 - Collège Édouard-Montpetit (de Gentilly /...  45.537226 -73.495067

# Mercator Projection dataframe ([-5, 5] scaled) #
                                                       lon_x     lat_y
location                                                              
10002 - Métro Charlevoix (Centre / Charlevoix)      1.035426 -1.583683
4000 - Jeanne-d'Arc / Ontario                       2.094213  2.298574
4001 - Graham / Brookfield                         -1.256364  0.692039
4002 - Graham / Wicksteed             

In [14]:
# Both figures share the same width so they can be compared side by side.
FIG_WIDTH = 550

# Scatter of the scaled projected coordinates, with both axes fixed to
# [-5.5, 5.5].
fig_scale = px.scatter(
    bktr_regressor.geo_coords_projector.scaled_ini_df.reset_index(),
    x='lon_x',
    y='lat_y',
    hover_name='location',
    width=FIG_WIDTH,
)
(
    fig_scale
    .update_xaxes(range=[-5.5, 5.5])
    .update_yaxes(range=[-5.5, 5.5])
    .show()
)

# The original latitude / longitude coordinates drawn on a map.
fig_map = px.scatter_mapbox(
    bktr_regressor.geo_coords_projector.ini_df,
    lat='latitude',
    lon='longitude',
    zoom=9.9,
    mapbox_style='carto-positron',
    width=FIG_WIDTH,
)
fig_map.show()
