In [None]:
"""Imports all necessary libs"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import sklearn
from Code.setup import *
import datetime as dt
import torch.nn as nn
import copy
from sklearn.model_selection import KFold
from sklearn.model_selection import GroupKFold
from RNN_for_GCPL import setup


# Seed the random number generators before any model is created.
# NOTE(review): `seedEverything` and `DEFAULT_RANDOM_SEED` are not defined in the
# visible cells - presumably they come from `from Code.setup import *`; confirm.
seedEverything(seed=DEFAULT_RANDOM_SEED)
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# `device` is not referenced again in the visible cells.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
"""For compatibility - cd to folder with data and models"""
# Every relative path used below ('data/...', './models/...') is resolved against
# this directory.
# The path is relative to the current working directory, so this cell is not
# idempotent: running it a second time in the same kernel moves again (or fails).
# NOTE(review): `os` is not imported explicitly in the visible cells - presumably
# it is provided by `from Code.setup import *`; confirm.
os.chdir('../../RNN_for_GCPL/')

In [None]:
"""Create new """
# Number of input features per time step; the collate function in this notebook
# stacks the 'E' and 'I' series, which gives two channels.
input_dim = 2
# Passed as the second positional argument of MyGRU; the model-list cell describes
# that position as "number of neurons".
output_dim = 20
num_layers = 2
bidir= True
# Learning rate; not referenced in the visible cells.
lr = 5e-4
# reduction='none' keeps one loss value per element instead of averaging the batch.
criterion = nn.MSELoss(reduction='none')

# Two identically configured models. `init_model` is rebound inside the evaluation
# loop further down; `best_model` is not referenced again in the visible cells.
init_model = MyGRU(input_dim, output_dim, num_layers=num_layers,bidir = bidir)
best_model = MyGRU(input_dim, output_dim, num_layers=num_layers,bidir = bidir)

# 4-fold grouped cross-validation splitter; it is later called with
# groups=info.Pouch, so samples sharing a Pouch value stay in the same fold.
gkf = GroupKFold(4)


In [None]:
# Candidate architectures; the evaluation cell picks one with `models[k]`.
models = []
"""number of neurons, number of layers, bidirectional"""
params = [(20, 2, False),
          (20, 2, True),
          (30, 3, True)]
# Build one MyGRU per parameter tuple, in the same order as `params`.
# The loop variables a, b, c and `model` remain defined as notebook globals afterwards.
for a,b,c in params:
    model = MyGRU(input_dim, a, num_layers=b, bidir=c)
    models.append(model)
# Not iterated in the visible cells.
# NOTE(review): the evaluation cell sets s = 30, which matches the first entry -
# presumably `s` is meant to be one of these values; confirm.
sampling = [30, 60, 120, 180, 300]

In [None]:
# probs = balancing(dataset_main, 3)
# soh, info = statistics(dataset_main+dataset_val)

In [None]:
class BucketSampler(torch.utils.data.Sampler):
    """
    Batch sampler that groups sample indices into buckets of similar length.

    Adapted from a bucket sampler found on the internet. Each iteration yields
    1-D integer tensors of dataset indices; all indices in one tensor belong to
    the same length bucket. In this notebook it is handed to a DataLoader as
    ``batch_sampler``.

    Args:
        lengths: iterable holding one length per sample, in dataset order.
        buckets: tuple ``(bmin, bmax, bstep)``. A sample is kept only when its
            length is strictly greater than ``bmin``; shorter samples are
            silently left out. A kept sample goes to the bucket keyed by
            ``min((length // bstep) * bstep, bmax)``. ``bmax - bmin`` must be
            a multiple of ``bstep``.
        shuffle: when truthy, permute the indices inside every bucket and
            shuffle the order of the batches on each iteration.
        batch_size: maximum number of indices per batch.
        drop_last: when truthy, drop the trailing batch of a bucket if it is
            shorter than the batch before it. A bucket that produces a single
            batch always keeps it, so small buckets are never lost entirely.

    Note:
        ``defaultdict`` and ``random`` must be in scope. The visible notebook
        cells do not import them explicitly (presumably they arrive through
        ``from Code.setup import *`` - verify).
    """
    def __init__(self, lengths, buckets=(50,500,50), shuffle=True, batch_size=32, drop_last=False):

        # Do not forward `lengths` to the base class: recent torch releases treat
        # the `data_source` argument as deprecated and emit a UserWarning for it.
        try:
            super().__init__()
        except TypeError:
            # Older torch releases declare `data_source` as a required positional
            # argument (and ignore its value).
            super().__init__(None)

        self.shuffle = shuffle
        self.batch_size = batch_size
        self.drop_last = drop_last

        assert isinstance(buckets, tuple)
        bmin, bmax, bstep = buckets
        assert (bmax - bmin) % bstep == 0

        # bucket key -> list of dataset indices whose length falls in that bucket
        grouped = defaultdict(list)
        for index, length in enumerate(lengths):
            if length > bmin:
                bucket_size = min((length // bstep) * bstep, bmax)
                grouped[bucket_size].append(index)

        self.buckets = {
            bucket_size: torch.tensor(indices, dtype=torch.int, device='cpu')
            for bucket_size, indices in grouped.items()
            if len(indices) > 0
        }

        # Run one iteration so that self.length is available to __len__().
        # When shuffling is enabled this also performs a first shuffle and
        # therefore consumes random numbers, exactly as the original code did.
        self.__iter__()

    def __iter__(self):
        """Return an iterator over index tensors, one tensor per batch."""
        if self.shuffle:
            # The permuted order is stored back, so successive epochs keep
            # re-permuting the previous order.
            for bucket_size, bucket in list(self.buckets.items()):
                self.buckets[bucket_size] = bucket[torch.randperm(bucket.nelement())]

        batches = []
        for bucket in self.buckets.values():
            chunks = torch.split(bucket, self.batch_size)
            # Only a trailing short batch is dropped, and only when the bucket
            # has at least one other batch.
            if self.drop_last and len(chunks) > 1 and len(chunks[-1]) < len(chunks[-2]):
                chunks = chunks[:-1]
            batches += chunks

        self.length = len(batches)

        if self.shuffle:
            random.shuffle(batches)

        return iter(batches)

    def __len__(self):
        """Number of batches produced by the most recent iteration."""
        return self.length


In [None]:
def collate_batch_named(batch, named=True):
    """Collate dataset items into a zero-padded sequence tensor plus labels.

    Every item must provide the keys 'E', 'I', 'SoH', 'Pouch', 'Cycle' and
    'Filename'. 'E' and 'I' are stacked along the last axis into a float32
    tensor with two channels per time step; the per-item tensors are then
    padded with ``pad_sequence`` using its default layout (time axis first).

    Args:
        batch: iterable of mapping-like dataset items.
        named: when True also return the pouch, cycle and filename lists.

    Returns:
        ``(sequences, labels)`` when ``named`` is False, otherwise
        ``(sequences, labels, pouch, cycle, filenames)``.
    """
    series = []
    meta = {'SoH': [], 'Pouch': [], 'Cycle': [], 'Filename': []}
    for item in batch:
        stacked = np.stack([item['E'], item['I']], axis=-1)
        series.append(torch.tensor(stacked, dtype=torch.float32))
        # All metadata keys are read for every item, whatever `named` is.
        for key, values in meta.items():
            values.append(item[key])

    sequence_pad = nn.utils.rnn.pad_sequence(series)
    labels = torch.tensor(meta['SoH'], dtype=torch.float32)
    if named:
        return sequence_pad, labels, meta['Pouch'], meta['Cycle'], meta['Filename']
    return sequence_pad, labels

In [None]:
def detailed_loss(handler, data_loader, criterion):
    """Run ``handler.best_model`` over ``data_loader`` and tabulate per-sample results.

    data_loader yields 5 elements per batch - data, labels, pouch, cycle and filename.

    Labels and predictions are rescaled with ``x * 50 + 50`` and the loss with
    ``* 2500`` (= 50 ** 2), which puts a squared-error loss in the same rescaled
    units. NOTE(review): presumably this maps a normalised SoH back to percent -
    confirm against the dataset code.

    NOTE(review): the model is called as-is; this function does not switch it to
    eval mode. Whether ``handler.load`` does so is not visible from here.

    Args:
        handler: object exposing a callable ``best_model`` attribute.
        data_loader: iterable of ``(data, labels, pouch, cycle, filename)`` batches.
        criterion: callable ``criterion(preds, labels)`` returning one loss value
            per sample.

    Returns:
        pandas DataFrame with the columns 'Pouch', 'Loss', 'SoH', 'Pred', 'Cycle'
        and 'Filename', one row per sample in the order the loader produced them.
    """
    collected = {'Pouch': [], 'Loss': [], 'SoH': [], 'Pred': [], 'Cycle': [], 'Filename': []}
    with torch.no_grad():
        for data, labels, pouch, cycle, filename in data_loader:
            outputs = handler.best_model(data)
            collected['Pred'].extend(outputs)
            collected['Loss'].append(criterion(outputs, labels))
            collected['Pouch'].extend(pouch)
            collected['Cycle'].extend(cycle)
            collected['SoH'].extend(labels)
            collected['Filename'].extend(filename)

        # Per-batch loss vectors are joined into one flat vector.
        sample_loss = torch.hstack(collected['Loss'])
        true_soh = torch.Tensor(collected['SoH'])*50+50
        predicted_soh = torch.Tensor(collected['Pred'])*50+50
        sample_loss *= 2500
        table = pd.DataFrame({
            'Pouch': collected['Pouch'],
            'Loss': sample_loss,
            'SoH': true_soh,
            'Pred': predicted_soh,
            'Cycle': collected['Cycle'],
            'Filename': collected['Filename'],
        })
        return table

In [None]:
# --- Evaluation configuration -------------------------------------------------
batch_size = 64
model_version = 15
losses = []  # one per-sample results DataFrame per fold, filled by the loop below
criterion = nn.MSELoss(reduction='none')  # keep one loss value per sample
dataset_version = 6
s = 30  # sub-directory of the dataset / model paths
k = 0   # index into `models`; also the last component of `model_path`
n_bins = 30  # number of length buckets requested from the sampler
datapath = f'data/v{dataset_version}/{s}/'
model_path = f'./models/v{model_version}/{s}/{k}'

dataset_main = GCPL_dataset_resampled3(datapath)

soh, info = statistics(dataset_main)

# --- Length buckets -------------------------------------------------------------
# BucketSampler keeps only samples with length > bmin, so the lower bound is placed
# one below the shortest sequence to keep every sample.
min_length = info['Len'].min() - 1
max_length = info['Len'].max()
bin_length = np.ceil((max_length - min_length) / n_bins)
# Push the upper bound up so that (max_length - min_length) is a multiple of
# bin_length, as BucketSampler asserts.
max_length += bin_length - (max_length - min_length) % bin_length
bucket_sampler_main = BucketSampler(
    info['Len'].to_numpy(),
    buckets=(min_length, max_length, bin_length),
    batch_size=batch_size,
    shuffle=False,
)
# `batch_size` is left at its default here because batching is fully delegated to
# the batch sampler.
main_loader = torch.utils.data.DataLoader(
    dataset_main,
    batch_sampler=bucket_sampler_main,
    collate_fn=collate_batch_named,
)

# Dataset index of every row that detailed_loss() will produce. The loader yields
# samples bucket by bucket, NOT in dataset order; with shuffle=False that order is
# identical on every pass, so it can be recorded once up front.
loader_order = torch.cat(list(bucket_sampler_main)).numpy()

# --- Per-fold evaluation ------------------------------------------------------
for i, (train_indices, val_indices) in enumerate(gkf.split(dataset_main, groups=info.Pouch)):
    seedEverything(seed=DEFAULT_RANDOM_SEED)
    path = model_path + f'/{i}'
    print(path)
    init_model = copy.deepcopy(models[k])
    handler = ModelHandler(init_model)
    handler.load(path)
    result = detailed_loss(handler, main_loader, criterion)
    assert len(result) == len(loader_order), "result rows do not match the sampler order"
    # `val_indices` are positions in the dataset, while the rows of `result` are in
    # loader order. Map each row back to its dataset index before labelling it;
    # indexing `result` directly with `val_indices` would tag the wrong samples.
    is_val = np.isin(loader_order, val_indices)
    result['Train/val'] = np.where(is_val, 'Val', 'Train')

    losses.append(result)

In [None]:
# Per-sample results of the fold with index 2.
loss = losses[2]
# Histogram (100 bins) of the rescaled per-sample losses that exceed 30.
loss.loc[loss['Loss'] > 30, 'Loss'].hist(bins=100)

In [None]:
for i, (train_indices, val_indices) in enumerate(gkf.split(dataset_new, groups=info.Pouch)):