In [1]:
from datetime import datetime

import torch
from config import *

from data.compress import *
from data.util import count_points_between, crop_q_between

%reload_ext autoreload
%autoreload 2

In [2]:
# Parse the four date boundaries from CONFIG in one pass; every entry uses
# the same DATE_FORMAT, so the CONFIG key doubles as the constant's name.
READ_START_DATE, READ_END_DATE, TRAIN_START_DATE, TRAIN_END_DATE = (
    datetime.strptime(CONFIG[key], DATE_FORMAT)
    for key in (
        'READ_START_DATE',
        'READ_END_DATE',
        'TRAIN_START_DATE',
        'TRAIN_END_DATE',
    )
)

# Value reported as "alpha" by the grouping summary further down.
ALPHA = CONFIG['ALPHA']

Load matrix $Q$

In [3]:
# Read the stored matrix and keep only magnitudes (element-wise absolute value).
mat_q = torch.load(out_path('mat_q_resid.pt')).abs()
mat_q.shape

torch.Size([32064, 472])

Crop $Q$ to the required time period

In [4]:
# Capture the shape before cropping: n_sections is needed for the check below.
n_samples, n_sections = mat_q.shape
# NOTE(review): mat_q is rebound from itself, so re-running this cell without
# re-running the load cell feeds an already-cropped matrix back into
# crop_q_between -- confirm whether that is safe.
mat_q = crop_q_between(mat_q, READ_START_DATE, READ_END_DATE, TRAIN_START_DATE, TRAIN_END_DATE)
# After cropping, the row count must equal the number of points in the training
# period and the column count must be unchanged.
assert mat_q.shape == (count_points_between(TRAIN_START_DATE, TRAIN_END_DATE), n_sections)
mat_q.shape

torch.Size([2880, 472])

Construct the correlation coefficient matrix $R$:
$$
R(i, j)=\frac{\sum_{k=1}^d(z(s_i,t_k)-\tilde{z}(s_i))(z(s_j,t_k)-\tilde{z}(s_j))}{\sqrt{\sum_{k=1}^d(z(s_i,t_k)-\tilde{z}(s_i))^2}\sqrt{\sum_{k=1}^d(z(s_j,t_k)-\tilde{z}(s_j))^2}},
$$
where $$\tilde{z}(s_i)=\frac{1}{d}\sum_{k=1}^{d}z(s_i,t_k)$$ is the mean of $z(s_i,\cdot)$ over the $d$ time points $t_1,\dots,t_d$.

In [16]:
# Build the correlation matrix twice: once with the default call (kept only for
# the visual comparison below) and once with the second argument set to True,
# which additionally returns `nonempty`.
mat_r_orig = build_correlation_matrix(mat_q)
# NOTE(review): in the recorded output the first row of mat_r matches row 1 of
# mat_r_orig and nonempty starts at 1, so mat_r appears to be restricted to the
# `nonempty` sections. If so, indices derived from mat_r refer to positions in
# `nonempty`, not to columns of mat_q -- confirm before indexing mat_q with them.
mat_r, nonempty = build_correlation_matrix(mat_q, True)
display(mat_r_orig)
display(mat_r)
# The full-size matrix is not needed past this point; drop it from the kernel.
del mat_r_orig
nonempty

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  1.0000,  0.0000,  ...,  0.0021, -0.0053, -0.0141],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        ...,
        [ 0.0000,  0.0021,  0.0000,  ...,  1.0000,  0.3693,  0.3200],
        [ 0.0000, -0.0053,  0.0000,  ...,  0.3693,  1.0000,  0.3010],
        [ 0.0000, -0.0141,  0.0000,  ...,  0.3200,  0.3010,  1.0000]])

tensor([[ 1.0000,  0.0512,  0.0026,  ...,  0.0021, -0.0053, -0.0141],
        [ 0.0512,  1.0000,  0.1154,  ...,  0.1707,  0.2455,  0.1391],
        [ 0.0026,  0.1154,  1.0000,  ...,  0.4088,  0.2961,  0.3215],
        ...,
        [ 0.0021,  0.1707,  0.4088,  ...,  1.0000,  0.3693,  0.3200],
        [-0.0053,  0.2455,  0.2961,  ...,  0.3693,  1.0000,  0.3010],
        [-0.0141,  0.1391,  0.3215,  ...,  0.3200,  0.3010,  1.0000]])

tensor([  1,   3,   5,   6,   9,  11,  12,  13,  15,  17,  19,  21,  23,  24,
         25,  26,  27,  29,  30,  32,  33,  34,  37,  38,  39,  40,  41,  43,
         45,  46,  47,  49,  50,  52,  53,  54,  55,  56,  58,  59,  61,  63,
         65,  67,  69,  71,  72,  74,  77,  79,  80,  81,  83,  85,  87,  89,
         90,  91,  92,  98,  99, 101, 102, 103, 105, 106, 109, 110, 111, 112,
        113, 114, 115, 116, 117, 119, 120, 121, 123, 124, 125, 126, 129, 130,
        133, 134, 135, 136, 137, 139, 141, 142, 143, 144, 145, 147, 148, 149,
        150, 151, 153, 155, 156, 157, 159, 160, 161, 163, 165, 166, 167, 168,
        169, 170, 171, 173, 174, 175, 177, 179, 180, 183, 184, 185, 186, 189,
        190, 191, 193, 195, 197, 199, 201, 202, 203, 205, 206, 207, 209, 210,
        211, 212, 215, 217, 219, 220, 221, 223, 225, 226, 227, 229, 230, 231,
        233, 234, 235, 237, 238, 239, 241, 242, 243, 245, 246, 248, 249, 251,
        253, 254, 255, 256, 257, 258, 259, 261, 263, 264, 267, 2

In [6]:
# Split the sections into groups using the configured ALPHA rather than a
# hard-coded literal, so the grouping always agrees with the "Using alpha=..."
# line printed by the report cell and with CONFIG.
groups = split_sections_into_groups(mat_r, ALPHA)
# Show the distinct group sizes and the total number of groups.
{len(g) for g in groups}, len(groups)

({1, 2, 3, 4, 6, 9, 10, 11, 14, 23}, 347)

Analyse grouping

In [7]:
# Summarise the grouping: a header line, then one line per group listing its
# member sections. Groups are numbered from 1 for readability.
_, n_sections = mat_q.shape
print(
    f'Using alpha={ALPHA}, {n_sections} correlated sections were divided '
    f'into {len(groups)} groups:'
)
for i, group in enumerate(groups, start=1):
    print(f'Group {i} - {len(group)} sections: {", ".join(map(str, group))}')


Using alpha=0.54, 472 correlated sections were divided into 347 groups:
Group 1 - 1 sections: 0
Group 2 - 1 sections: 1
Group 3 - 1 sections: 2
Group 4 - 6 sections: 3, 137, 309, 92, 354, 30
Group 5 - 1 sections: 4
Group 6 - 1 sections: 5
Group 7 - 10 sections: 6, 151, 249, 105, 248, 347, 352, 372, 464, 25
Group 8 - 1 sections: 7
Group 9 - 1 sections: 8
Group 10 - 2 sections: 9, 32
Group 11 - 1 sections: 10
Group 12 - 1 sections: 11
Group 13 - 14 sections: 12, 307, 13, 50, 106, 110, 168, 255, 282, 299, 173, 315, 340, 38
Group 14 - 1 sections: 14
Group 15 - 1 sections: 15
Group 16 - 1 sections: 16
Group 17 - 1 sections: 17
Group 18 - 1 sections: 18
Group 19 - 1 sections: 19
Group 20 - 1 sections: 20
Group 21 - 3 sections: 21, 203, 85
Group 22 - 1 sections: 22
Group 23 - 1 sections: 23
Group 24 - 11 sections: 24, 436, 71, 150, 245, 281, 373, 453, 465, 466, 467
Group 25 - 1 sections: 26
Group 26 - 2 sections: 27, 303
Group 27 - 1 sections: 28
Group 28 - 1 sections: 29
Group 29 - 1 section

In [8]:
# Build the compression matrix from Q and the section groups.
mat_c = get_compression_matrix(mat_q, groups)

# Expect one row per row of mat_q and one column per group.
assert mat_c.shape == (mat_q.shape[0], len(groups))
# Persist the result to the output directory for later use.
torch.save(mat_c, out_path('mat_c.pt'))
mat_c.shape

torch.Size([2880, 347])

In [9]:
# Derive the compressed matrix from the compression matrix and Q, then persist it.
# In the recorded run its shape is (number of groups, number of sections).
mat_x = get_compressed_matrix(mat_c, mat_q)
torch.save(mat_x, out_path('mat_x.pt'))
mat_x.shape

torch.Size([347, 472])