In [1]:
from datetime import datetime

import torch
from config import *

from data.compress import *
from data.util import count_points_between, crop_q_between

%reload_ext autoreload
%autoreload 2

In [2]:
READ_START_DATE = datetime.strptime(CONFIG['READ_START_DATE'], DATE_FORMAT)
READ_END_DATE = datetime.strptime(CONFIG['READ_END_DATE'], DATE_FORMAT)
TRAIN_START_DATE = datetime.strptime(CONFIG['TRAIN_START_DATE'], DATE_FORMAT)
TRAIN_END_DATE = datetime.strptime(CONFIG['TRAIN_END_DATE'], DATE_FORMAT)
ALPHA = CONFIG['ALPHA']

Load matrix $Q$

In [3]:
mat_q = torch.load(out_path('mat_q_resid.pt'))
mat_q = torch.abs(mat_q)
mat_q.shape

torch.Size([32064, 472])

Crop $Q$ to the required time period

In [4]:
n_samples, n_sections = mat_q.shape
mat_q = crop_q_between(mat_q, READ_START_DATE, READ_END_DATE, TRAIN_START_DATE, TRAIN_END_DATE)
assert mat_q.shape == (count_points_between(TRAIN_START_DATE, TRAIN_END_DATE), n_sections)
mat_q.shape

torch.Size([2880, 472])

Construct a correlation coefficient matrix
$$
R(i, j)=\frac{\sum_{k=1}^d(z(s_i,t_k)-\tilde{z}(s_i))(z(s_j,t_k)-\tilde{z}(s_j))}{\sqrt{\sum_{k=1}^d(z(s_i,t_k)-\tilde{z}(s_i))^2}\sqrt{\sum_{k=1}^d(z(s_j,t_k)-\tilde{z}(s_j))^2}},
$$
where $$\tilde{z}(s_i)=\frac{1}{d}\sum_{k=1}^dz(s_i,t_k)$$

In [5]:
mat_r, nonempty = build_correlation_matrix(mat_q, True)
mat_r.shape, nonempty

(torch.Size([319, 319]),
 tensor([  1,   3,   5,   6,   9,  11,  12,  13,  15,  17,  19,  21,  23,  24,
          25,  26,  27,  29,  30,  32,  33,  34,  37,  38,  39,  40,  41,  43,
          45,  46,  47,  49,  50,  52,  53,  54,  55,  56,  58,  59,  61,  63,
          65,  67,  69,  71,  72,  74,  77,  79,  80,  81,  83,  85,  87,  90,
          91,  92,  98,  99, 101, 102, 103, 105, 106, 109, 110, 111, 112, 113,
         114, 115, 116, 117, 119, 120, 121, 123, 124, 125, 126, 129, 130, 133,
         134, 135, 136, 137, 139, 141, 142, 143, 144, 145, 147, 148, 149, 150,
         151, 153, 155, 156, 157, 159, 160, 161, 163, 165, 166, 167, 168, 169,
         170, 171, 173, 174, 175, 177, 179, 180, 183, 184, 185, 186, 189, 190,
         191, 193, 195, 197, 199, 201, 202, 203, 205, 206, 207, 209, 210, 211,
         212, 215, 217, 219, 220, 221, 223, 225, 226, 227, 229, 230, 231, 233,
         234, 235, 237, 238, 239, 241, 242, 243, 245, 246, 248, 249, 251, 253,
         254, 255, 256, 257

In [6]:
groups = split_sections_into_groups(mat_r, 0.54)
set([len(g) for g in groups]), len(groups)

({1, 2, 3, 4, 6, 9, 10, 11, 14, 23}, 194)

Analyse grouping

In [7]:
_, n_sections = mat_q.shape
print(f'Using alpha={ALPHA}, {n_sections} correlated sections were divided ' +
      f'into {len(groups)} groups:')
for i, group in enumerate(groups, start=1):
      print(f'Group {i} - {len(group)} sections: {", ".join(str(s) for s in group)}')


Using alpha=0.54, 472 correlated sections were divided into 194 groups:
Group 1 - 1 sections: 0
Group 2 - 6 sections: 1, 87, 204, 57, 238, 18
Group 3 - 1 sections: 2
Group 4 - 10 sections: 3, 98, 165, 63, 164, 232, 236, 248, 311, 14
Group 5 - 2 sections: 4, 19
Group 6 - 1 sections: 5
Group 7 - 14 sections: 6, 203, 7, 32, 64, 66, 110, 169, 186, 198, 114, 209, 226, 23
Group 8 - 1 sections: 8
Group 9 - 1 sections: 9
Group 10 - 1 sections: 10
Group 11 - 3 sections: 11, 133, 53
Group 12 - 1 sections: 12
Group 13 - 11 sections: 13, 291, 45, 97, 162, 185, 249, 303, 312, 313, 314
Group 14 - 1 sections: 15
Group 15 - 2 sections: 16, 201
Group 16 - 1 sections: 17
Group 17 - 23 sections: 20, 126, 171, 122, 316, 136, 25, 70, 71, 127, 174, 269, 288, 306, 308, 85, 298, 302, 235, 80, 176, 305, 116
Group 18 - 1 sections: 21
Group 19 - 1 sections: 22
Group 20 - 1 sections: 24
Group 21 - 6 sections: 26, 67, 115, 251, 301, 84
Group 22 - 1 sections: 27
Group 23 - 9 sections: 28, 89, 154, 219, 259, 131, 12

In [8]:
mat_c = get_compression_matrix(mat_q, groups)

assert mat_c.shape == (mat_q.shape[0], len(groups))
torch.save(mat_c, out_path('mat_c.pt'))
mat_c.shape

torch.Size([2880, 194])

In [9]:
mat_x = get_compressed_matrix(mat_c, mat_q)
torch.save(mat_x, out_path('mat_x.pt'))
mat_x.shape

torch.Size([194, 472])