In [27]:
from datetime import datetime

import torch
from config import *

from data.compress import *
from data.util import count_points_between, crop_q_between_dates

%reload_ext autoreload
%autoreload 2

In [28]:
READ_START_DATE = datetime.strptime(CONFIG['READ_START_DATE'], DATE_FORMAT)
READ_END_DATE = datetime.strptime(CONFIG['READ_END_DATE'], DATE_FORMAT)
TRAIN_START_DATE = datetime.strptime(CONFIG['TRAIN_START_DATE'], DATE_FORMAT)
TRAIN_END_DATE = datetime.strptime(CONFIG['TRAIN_END_DATE'], DATE_FORMAT)
ALPHA = CONFIG['ALPHA']

Load matrix $Q$

In [29]:
mat_q = torch.load(out_path('mat_q_resid.pt'))
mat_q = torch.abs(mat_q)
mat_q.shape

torch.Size([32064, 472])

Crop $Q$ to the required time period

In [30]:
_, n_sections = mat_q.shape
mat_q = crop_q_between_dates(mat_q, READ_START_DATE, READ_END_DATE, TRAIN_START_DATE, TRAIN_END_DATE)
assert mat_q.shape == (count_points_between(TRAIN_START_DATE, TRAIN_END_DATE), n_sections)
mat_q.shape

torch.Size([2880, 472])

Construct a correlation coefficient matrix
$$
R(i, j)=\frac{\sum_{k=1}^d(z(s_i,t_k)-\tilde{z}(s_i))(z(s_j,t_k)-\tilde{z}(s_j))}{\sqrt{\sum_{k=1}^d(z(s_i,t_k)-\tilde{z}(s_i))^2}\sqrt{\sum_{k=1}^d(z(s_j,t_k)-\tilde{z}(s_j))^2}},
$$
where $$\tilde{z}(s_i)=\frac{1}{d}\sum_{k=1}^dz(s_i,t_k)$$

In [31]:
mat_r = build_correlation_matrix(mat_q)
mat_r.shape

torch.Size([472, 472])

In [32]:
groups, n_ungrouped = split_sections_into_groups(mat_r, ALPHA)

Analyse grouping

In [33]:
_, n_sections = mat_q.shape
print(f'Using alpha={ALPHA}, {n_sections} correlated sections were divided ' +
      f'into {len(groups)} groups:')
n_ungrouped = 0
for i, (group, corr_min, corr_max) in enumerate(groups, start=1):
      group_coeffs = mat_r[tuple(group.T), :]
      print(f'Group {i} - {len(group)} sections with correlation coefficients '
          f'{corr_min:.3f} to {corr_max:.3f}')
print(str(n_ungrouped) + " section(s) don't correlate with anything and weren't grouped")


Using alpha=0.9999, 472 correlated sections were divided into 159 groups:
Group 1 - 2 sections with correlation coefficients 1.000 to 1.000
Group 2 - 2 sections with correlation coefficients 0.994 to 0.994
Group 3 - 2 sections with correlation coefficients 0.844 to 0.844
Group 4 - 2 sections with correlation coefficients 0.781 to 0.781
Group 5 - 2 sections with correlation coefficients 0.776 to 0.776
Group 6 - 2 sections with correlation coefficients 0.756 to 0.756
Group 7 - 2 sections with correlation coefficients 0.707 to 0.707
Group 8 - 2 sections with correlation coefficients 0.697 to 0.697
Group 9 - 2 sections with correlation coefficients 0.680 to 0.680
Group 10 - 2 sections with correlation coefficients 0.671 to 0.671
Group 11 - 2 sections with correlation coefficients 0.660 to 0.660
Group 12 - 2 sections with correlation coefficients 0.658 to 0.658
Group 13 - 2 sections with correlation coefficients 0.656 to 0.656
Group 14 - 2 sections with correlation coefficients 0.652 to 0.6

  group_coeffs = mat_r[tuple(group.T), :]


In [35]:
mat_c = get_compression_matrix(mat_q, groups)

assert mat_c.shape == (mat_q.shape[0], len(groups))
torch.save(mat_c, out_path('mat_c.pt'))
mat_c.shape

torch.Size([2880, 159])

In [36]:
mat_x = get_compressed_matrix(mat_c, mat_q)
torch.save(mat_x, out_path('mat_x.pt'))
mat_x.shape

torch.Size([159, 472])