In [None]:
"""For google colab workflow - mounts Google Drive and go to it"""
import os
from google.colab import drive

# Mount Google Drive inside the Colab VM.
drive.mount('/content/gdrive')
# Use the absolute mount point instead of a path relative to the current
# directory: the relative form only works while the working directory is still
# /content and therefore fails as soon as this cell is run a second time.
os.chdir('/content/gdrive/MyDrive/Projects/RNN_for_GCPL/Notebooks')

In [None]:
"""Imports all necessary libs"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import sklearn
from Code.setup import *
import datetime as dt
import torch.nn as nn
import copy
from sklearn.model_selection import KFold
from sklearn.model_selection import GroupKFold
from RNN_for_GCPL import setup


# Call the project's seeding helper with its default seed before anything random happens.
seedEverything(seed=DEFAULT_RANDOM_SEED)
# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
"""For compatibility - cd to folder with data and models"""
# NOTE: the target is relative to the current working directory, so the result
# depends on where this cell is run from - running it twice resolves to a
# different directory the second time.
os.chdir('../../RNN_for_GCPL/')

In [None]:
"""Dataset consists of "main" an "validate" parts """
# Root folder of the dataset; its "main" and "validate" sub-folders are loaded separately.
dataset_path = os.path.normpath(r'./data/v4/60/')
dataset_main_path, dataset_val_path = (
    os.path.join(dataset_path, part) for part in ('main', 'validate')
)

dataset_main = GCPL_dataset_resampled3(dataset_main_path)
dataset_val = GCPL_dataset_resampled3(dataset_val_path)

In [None]:
# Load the dataset stored under data/v6/30 and compute its statistics.
dataset_new_path = os.path.normpath(r'./data/v6/30/')
dataset_new = GCPL_dataset_resampled3(dataset_new_path)
soh_new, info_new = statistics(dataset_new)

In [None]:
"""Create new """
# Hyper-parameters of the reference model.
input_dim = 2      # the collate functions stack two channels per time step: 'E' and 'I'
output_dim = 20
num_layers = 2
bidir = True
lr = 5e-4
# reduction='none' keeps one loss value per element; the callers reduce it themselves.
criterion = nn.MSELoss(reduction='none')

# Two separately initialised networks with the same architecture.
gru_kwargs = dict(num_layers=num_layers, bidir=bidir)
init_model = MyGRU(input_dim, output_dim, **gru_kwargs)
best_model = MyGRU(input_dim, output_dim, **gru_kwargs)

# Grouped 4-fold splitter used by the cross-validation cells.
gkf = GroupKFold(n_splits=4)



In [None]:
# (number of neurons, number of layers, bidirectional) of every architecture under study
params = [(20, 2, False),
          (20, 2, True),
          (30, 3, True)]
# One freshly initialised network per entry of `params`, in the same order.
models = [
    MyGRU(input_dim, n_neurons, num_layers=n_layers, bidir=is_bidir)
    for n_neurons, n_layers, is_bidir in params
]
# Values used to build the data/v4/<value>/ dataset folders in the cells below.
sampling = [30, 60, 120, 180, 300]

In [None]:
# Per-sample weights of the main dataset; the cells below copy them, zero the
# validation entries and pass the result to torch.multinomial.
# NOTE(review): the meaning of the second argument (5) is defined by `balancing` - confirm there.
probs = balancing(dataset_main, 5)
# `info` is used below as a table with 'Pouch', 'SoH' and 'Len' columns;
# `soh` is not used by any of the visible cells.
soh, info = statistics(dataset_main)

In [None]:
class BucketSampler(torch.utils.data.Sampler):
    """
    Batch sampler that puts samples of similar length into the same batch.

    Every sample index is filed under the bucket key
    ``min((length // bstep) * bstep, bmax)`` and a batch only ever contains
    indices from a single bucket, which keeps the padding needed per batch
    small.  Iterating yields 1-D integer tensors of positions into ``lengths``,
    so the object is meant to be passed to ``DataLoader(batch_sampler=...)``.

    Args:
        lengths: sequence length of every sample, in dataset order.
        buckets: ``(bmin, bmax, bstep)`` tuple.  Samples shorter than ``bmin``
            are skipped (``bmin`` itself is included), all lengths of ``bmax``
            and above share one bucket and ``bstep`` is the width of a bucket.
            ``bmax - bmin`` must be a multiple of ``bstep``.
        shuffle: on every iteration reshuffle the samples inside each bucket
            and the order of the batches.
        batch_size: maximum number of samples per batch.
        drop_last: drop the final, smaller batch of a bucket when that bucket
            yields more than one batch.
    """
    def __init__(self, lengths, buckets=(50,500,50), shuffle=True, batch_size=32, drop_last=False):
        # The base Sampler keeps no state and its `data_source` argument is
        # deprecated, so its initialiser is deliberately not called.
        self.shuffle = shuffle
        self.batch_size = batch_size
        self.drop_last = drop_last

        assert isinstance(buckets, tuple)
        bmin, bmax, bstep = buckets
        assert (bmax - bmin) % bstep == 0

        grouped = defaultdict(list)
        for i, length in enumerate(lengths):
            # `>=` rather than `>`: callers pass the shortest length of their
            # data as bmin, and those samples must not be silently dropped.
            if length >= bmin:
                bucket_size = min((length // bstep) * bstep, bmax)
                grouped[bucket_size].append(i)

        self.buckets = {
            bucket_size: torch.tensor(bucket, dtype=torch.int, device='cpu')
            for bucket_size, bucket in grouped.items()
        }

        # The number of batches depends only on the bucket sizes, so it can be
        # stored without running (and shuffling in) a full iteration.
        self.length = len(self._make_batches())

    def _make_batches(self):
        """Split every bucket, in its current order, into batches of at most batch_size indices."""
        batches = []
        for bucket in self.buckets.values():
            chunks = torch.split(bucket, self.batch_size)
            if self.drop_last and len(chunks) > 1 and len(chunks[-1]) < len(chunks[-2]):
                chunks = chunks[:-1]
            batches += chunks
        return batches

    def __iter__(self):
        if self.shuffle:
            # Reorder the samples inside every bucket.
            for bucket_size in list(self.buckets):
                bucket = self.buckets[bucket_size]
                self.buckets[bucket_size] = bucket[torch.randperm(bucket.nelement())]

        batches = self._make_batches()
        self.length = len(batches)

        if self.shuffle:
            # Mix the batches so that buckets are not visited one after another.
            random.shuffle(batches)

        return iter(batches)

    def __len__(self):
        return self.length



In [None]:
def collate_batch_length(batch):
    """
    Collate samples into one zero-padded batch and report the sample lengths.

    Only the voltage ('E') and current ('I') series of every sample are used;
    they are stacked as two channels and padded up to the longest sample of the
    batch.  The lengths are returned as well so that sequence bucketing can be
    tested.

    Args:
        batch (iter): all samples of one batch, as a list or any other iterable
            of mappings with the keys 'E', 'I' and 'SoH'.

    Returns:
        tuple: padded batch (torch.tensor, time-major as produced by
        ``pad_sequence``), labels (torch.tensor), lengths (torch.tensor,
        float32).
    """
    items = list(batch)

    sequences = [
        torch.tensor(np.stack([item['E'], item['I']], axis=-1), dtype=torch.float32)
        for item in items
    ]
    padded = nn.utils.rnn.pad_sequence(sequences)
    labels = torch.tensor([item['SoH'] for item in items], dtype=torch.float32)
    lengths = torch.tensor([len(item['E']) for item in items], dtype=torch.float32)
    return padded, labels, lengths



In [None]:
# TODO: find out whether cycle length and SoH are correlated, and what that would mean.
# Histogram of the SoH values drawn by the balanced sampling, one histogram per fold.
for k, (train_indices, val_indices) in enumerate(gkf.split(dataset_main, groups=info.Pouch)):
    train_set = torch.utils.data.Subset(dataset_main, train_indices)

    # A weight of zero keeps the validation samples of this fold from being drawn.
    prob_train = probs.copy()
    prob_train[val_indices] = 0

    balanced_train_indices = torch.multinomial(
        torch.tensor(prob_train),
        num_samples=len(train_indices),
        replacement=True,
    )

    sampled_soh = info.loc[balanced_train_indices.numpy(), 'SoH']
    plt.hist(sampled_soh, bins=100, label=k)
plt.legend()

In [None]:
# Histogram of the sequence lengths drawn by the balanced sampling, one histogram per fold.
for k, (train_indices, val_indices) in enumerate(gkf.split(dataset_main, groups=info.Pouch)):
    train_set = torch.utils.data.Subset(dataset_main, train_indices)

    # Validation samples of this fold get weight zero and are never drawn.
    prob_train = probs.copy()
    prob_train[val_indices] = 0
    fold_weights = torch.tensor(prob_train)

    balanced_train_indices = torch.multinomial(fold_weights, len(train_indices), replacement=True)

    sampled_len = info.loc[balanced_train_indices.numpy(), 'Len']
    plt.hist(sampled_len, bins=100, label=k)
plt.legend()

In [None]:
"""This is study technique. """
# Study: train a copy of every architecture in `models` for every value in
# `sampling`, using grouped k-fold cross-validation and length-bucketed batches.
draw_every = 25   # redraw the progress plot every `draw_every` epochs
num_epochs = 75
save_every = 10   # store a checkpoint every `save_every` epochs
batch_size = 64
n_bins = 10       # handed to BucketSampler as the bucket step, i.e. the width of one length bucket


def _bucket_bounds(lengths, step):
    """Return a ``(low, high, step)`` tuple for BucketSampler that covers every value of `lengths`.

    `low` lies one below the shortest length so that no sample sits on the
    lower bound of the sampler's length filter; `high` is rounded up until
    ``high - low`` is a multiple of `step`, which BucketSampler asserts.
    """
    low = int(lengths.min()) - 1
    high = int(lengths.max())
    high += -(high - low) % step
    return low, high, step


for m in sampling:
    dataset_main = GCPL_dataset_resampled3(f'data/v4/{m}/main')
    dataset_val = GCPL_dataset_resampled3(f'data/v4/{m}/validate')
    # Lengths and groups are taken from the dataset that is actually split
    # below, not from whichever dataset happened to be loaded before the loop.
    _, ind = statistics(dataset_main)
    for l, init_model in enumerate(models):
        n_neurons, n_layers, is_bidir = params[l]  # `models` was built from `params` in this order
        path = f'./models/v10/{m}/{l}'
        for k, (train_indices, val_indices) in enumerate(gkf.split(dataset_main, groups=ind.Pouch)):
            seedEverything(seed=DEFAULT_RANDOM_SEED)
            train_set = torch.utils.data.Subset(dataset_main, train_indices)
            val_set = torch.utils.data.Subset(dataset_main, val_indices)

            # Every sampler gets bounds computed from its own lengths, so neither
            # the shortest training nor the shortest validation samples are skipped.
            # The samplers yield positions into the length arrays, which are also
            # the positions inside the matching Subset.
            train_lengths = ind.loc[train_indices, 'Len']
            val_lengths = ind.loc[val_indices, 'Len']
            bucket_sampler_train = BucketSampler(train_lengths.to_numpy(),
                                                 buckets=_bucket_bounds(train_lengths, n_bins),
                                                 batch_size=batch_size)
            bucket_sampler_val = BucketSampler(val_lengths.to_numpy(),
                                               buckets=_bucket_bounds(val_lengths, n_bins),
                                               batch_size=batch_size)
            train_loader = torch.utils.data.DataLoader(train_set, batch_sampler=bucket_sampler_train, collate_fn=collate_batch)
            val_loader = torch.utils.data.DataLoader(val_set, batch_sampler=bucket_sampler_val, collate_fn=collate_batch)

            # Every fold starts from the same initial weights.
            model = copy.deepcopy(init_model)
            model.to(device)
            optimizer = torch.optim.Adam(model.parameters(), lr=lr)
            sheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer)
            # Describe the configuration that is really being trained.
            description = (f'{n_neurons} neurons, {n_layers} layers, bidirectional={is_bidir}, '
                           f'{m}s sampling, {gkf.get_n_splits()}-fold GroupKFold, k-fold No {k}.')
            handler = ModelHandler(model, 1e6, description, path=path, kfold=True)
            pp = ProgressPlotter()
            counter = 0  # epochs since the validation loss last improved (reported only)
            for i in range(num_epochs):
                model.train()
                loss_train = torch.zeros(())
                n_train = 0
                for data, labels in train_loader:
                    optimizer.zero_grad()
                    preds = model(data.to(device))
                    sample_losses = criterion(preds, labels.to(device))
                    # The optimisation target stays the sum over the batch.
                    loss = torch.sum(sample_losses)
                    loss.backward()
                    optimizer.step()
                    loss_train += loss.detach().cpu()
                    n_train += sample_losses.numel()

                with torch.no_grad():
                    # Both losses are reported as a mean over the samples that
                    # were really evaluated, so the two curves are comparable.
                    loss_train = loss_train / max(n_train, 1)
                    pp.add_scalar('loss_train', loss_train.detach().cpu().numpy())
                    model.eval()
                    loss_val = torch.zeros(())
                    n_val = 0
                    for data, labels in val_loader:
                        preds = model(data.to(device))
                        sample_losses = criterion(preds, labels.to(device))
                        loss_val += torch.sum(sample_losses).detach().cpu()
                        n_val += sample_losses.numel()
                    loss_val = loss_val / max(n_val, 1)
                    sheduler.step(loss_val)
                    handler.check_loss(loss_val, draw_every)
                    pp.add_scalar('loss_val', loss_val.detach().cpu().numpy())
                    if loss_val > handler.best_loss:
                        counter += 1
                    else:
                        counter = 0
                    if (i+1) % draw_every == 0:
                        pp.display([['loss_train', 'loss_val']])
                    print(i, counter, loss_train*2500, loss_val*2500)
                if (i+1) % save_every == 0:
                    handler.save(kfold_number=k)

            handler.add_pp(pp)
            handler.display()
            handler.save(kfold_number=k)

            # Per-sample validation loss of the final model of this fold.
            model.eval()
            with torch.no_grad():
                fold_losses = []
                for data, labels in val_loader:
                    preds = model(data.to(device))
                    fold_losses.append(criterion(preds, labels.to(device)))
                fold_losses = torch.hstack(fold_losses)
                print(fold_losses.mean(), fold_losses.std())



In [None]:
"""Tests for zero padding of batches. Here we test mean padding length, mean length of dataset"""
# Per fold: collect, for every training batch, the mean padding, the padded
# length, the mean sample length and the shortest sample length.
missed_length = []
n_bins = 50
# TODO: it turns out this is not the number of bins but the span of lengths that falls into one bin.
batch_size = 64

for k, (train_indices, val_indices) in enumerate(gkf.split(dataset_main, groups=info.Pouch)):

    seedEverything(seed=DEFAULT_RANDOM_SEED)
    train_set = torch.utils.data.Subset(dataset_main, train_indices)

    # Create the balanced training set: validation samples get weight zero,
    # then indices are drawn with replacement.
    prob_train = probs.copy()
    prob_train[val_indices] = 0
    balanced_train_indices = torch.multinomial(
        torch.tensor(prob_train), len(train_indices), replacement=True
    ).numpy()
    balanced_train_set = torch.utils.data.Subset(dataset_main, balanced_train_indices)

    # Length range of the training fold.
    train_lengths = info.loc[train_indices, 'Len']
    min_length = train_lengths.min()
    max_length = train_lengths.max()
    bin_length = np.ceil((max_length - min_length) / n_bins)
    # The bins must fit evenly into the range: (max_length - min_length) % bin_length == 0.
    max_length += bin_length - (max_length - min_length) % bin_length
    print(min_length, max_length)

    # New version of sampling; it needs a different loader configuration because
    # the balancing happens when the dataset is created, not in the sampler.
    bucket_sampler = BucketSampler(
        info.loc[balanced_train_indices, 'Len'].to_numpy(),
        buckets=(min_length, max_length, bin_length),
        batch_size=batch_size,
        shuffle=True,
    )
    train_loader = torch.utils.data.DataLoader(
        balanced_train_set,
        batch_size=1,
        batch_sampler=bucket_sampler,
        collate_fn=collate_batch_length,
    )

    # Bucket sampling without balancing:
    # bucket_sampler = BucketSampler(info.loc[train_indices, 'Len'].to_numpy(),
    #                                buckets = (min_length, max_length, bin_length),
    #                                batch_size=batch_size,
    #                                shuffle=True)
    # train_loader = torch.utils.data.DataLoader(train_set,
    #                                            batch_size =1,
    #                                            batch_sampler=bucket_sampler,
    #                                            collate_fn=collate_batch_length)

    # Older version of sampling:
    # sampler = torch.utils.data.WeightedRandomSampler(probs[train_indices], len(train_indices))
    # train_loader = torch.utils.data.DataLoader(train_set,
    #                                            batch_size =batch_size,
    #                                            sampler=sampler,
    #                                            collate_fn=collate_batch_length)

    val_set = torch.utils.data.Subset(dataset_main, val_indices)
    val_loader = torch.utils.data.DataLoader(val_set, batch_size, shuffle=True, collate_fn=collate_batch_length)

    # One row per batch: [mean padding, padded length, mean length, shortest length].
    miss_ = []
    for data, labels, length in train_loader:
        padded_length = data.shape[0]
        mean_length = length.mean()
        miss_.append([padded_length - mean_length, padded_length, mean_length, length.min()])
    miss_ = torch.tensor(miss_)
    missed_length.append(miss_)
    print(torch.mean(miss_[:, 1] - miss_[:, 3]))
    print(miss_.mean(axis=0))


# WRS - mean length 430, total length 620
# BS10 - mean length 510, total length 515 (bucket 10)
# BS10 + Weighting - mean length 430, total length 440

In [None]:
"""Loss checking??? - требует значительного пересмотра всё, что ниже блока обучения"""
# Collect the best loss stored in every sub-folder of `path` and report mean and std.
loss = []
model = copy.deepcopy(init_model)
path = './models/v127/2'
handler = ModelHandler(model, 1e6, path='./models/v2/', kfold=True)
# Immediate sub-directories of `path`; each one is passed to handler.load().
batches = next(os.walk(path))[1]
loss_ = []
epochs_ = []
for batch in batches:
    batchpath = f'{path}/{batch}'
    handler.load(batchpath)
    loss_.append(handler.best_loss.cpu())
loss.append(loss_)
loss = np.array(loss) * 2500
print(str(np.mean(loss)) + '±' + str(np.std(loss)))
loss

In [None]:
def load(path_):
    """
    Read the stored loss value(s) of saved checkpoint(s).

    Args:
        path_ (str): either a single checkpoint file ending in ``.pt`` or a
            directory of checkpoint files.  The file names inside a directory
            must end in ``_<number>`` before the extension (a trailing ``(n)``
            copy suffix is ignored); the files are read in ascending order of
            that number.

    Returns:
        np.ndarray: the ``'loss'`` entry of every checkpoint that was read -
        one element for a single file, one element per file for a directory.
    """
    def checkpoint_loss(file_path):
        # Tensors are mapped to the CPU while loading.
        checkpoint = torch.load(file_path, map_location=torch.device('cpu'))
        return checkpoint['loss'].numpy()

    def file_number(name):
        # 'model_12.pt' -> 12, 'model_3(1).pt' -> 3
        return int(name.split('_')[-1].split('.')[0].split('(')[0])

    if path_.endswith('.pt'):
        # Open the file exactly as given.  Re-assembling it from '/'-separated
        # parts turned a bare file name such as 'model.pt' into '/model.pt'.
        return np.array([checkpoint_loss(path_)])

    files = sorted(next(os.walk(path_))[2], key=file_number)
    return np.array([checkpoint_loss(os.path.join(path_, name)) for name in files])
# TODO: for some reason names like 3(1) and empty folders have appeared in the model folders -
# this badly breaks the program. Everything superfluous has to be removed from the models folder.

In [None]:
def collate_batch_named(batch, named=True):
    """
    Collate samples into one zero-padded batch, optionally with their identifiers.

    Args:
        batch (iter): samples of one batch; every sample is a mapping with the
            keys 'E', 'I', 'SoH', 'Pouch' and 'Cycle'.
        named (bool): when true, also return the pouch and cycle of every sample.

    Returns:
        tuple: ``(padded batch, labels)`` or, with ``named``,
        ``(padded batch, labels, pouch list, cycle list)``.  The padded batch
        holds the 'E' and 'I' series as two channels, time-major as produced by
        ``pad_sequence``.
    """
    sequences, targets, pouch, cycle = [], [], [], []
    for item in batch:
        signal = np.stack([item['E'], item['I']], axis=-1)
        sequences.append(torch.tensor(signal, dtype=torch.float32))
        targets.append(item['SoH'])
        pouch.append(item['Pouch'])
        cycle.append(item['Cycle'])

    padded = nn.utils.rnn.pad_sequence(sequences)
    labels = torch.tensor(targets, dtype=torch.float32)
    if named:
        return padded, labels, pouch, cycle
    return padded, labels

In [None]:
def detailed_loss(handler, val_loader):
    """
    Run ``handler.best_model`` over every batch of `val_loader` and collect per-sample results.

    Args:
        handler: object that exposes the model to evaluate as ``best_model``.
        val_loader: iterable of ``(data, labels, pouch, cycle)`` batches, e.g. a
            DataLoader that uses ``collate_batch_named``.

    Returns:
        pd.DataFrame: columns 'Pouch', 'Cycle', 'Loss', 'SoH' and 'Pred'.
        'Loss' is the value of the module-level ``criterion`` multiplied by
        2500; 'SoH' and 'Pred' are the labels and predictions mapped through
        ``x * 50 + 50``.
    """
    batch_losses, predictions, targets = [], [], []
    pouches, cycles = [], []
    with torch.no_grad():
        for data, labels, pouch, cycle in val_loader:
            preds = handler.best_model(data)
            predictions.extend(preds)
            batch_losses.append(criterion(preds, labels))
            pouches.extend(pouch)
            cycles.extend(cycle)
            targets.extend(labels)

        loss = torch.hstack(batch_losses) * 2500
        soh = torch.Tensor(targets) * 50 + 50
        pred = torch.Tensor(predictions) * 50 + 50

    columns = {'Pouch': pouches, 'Cycle': cycles, 'Loss': loss, 'SoH': soh, 'Pred': pred}
    return pd.DataFrame(columns)

In [None]:
# Per-pouch mean and std of the loss for every stored model of version
# `model_version`, evaluated on the "main" and "validate" data together.
batch_size = 64
model_version = 14
losses = []
for k in range(3):
    for s in sampling:
        datapath = f'data/v4/{s}/'
        model_path = f'./models/v{model_version}/{s}/{k}'
        dataset_val = GCPL_dataset_resampled3(f'{datapath}validate')
        dataset_rs2 = GCPL_dataset_resampled3(f'{datapath}main')
        loader_options = dict(shuffle=False, collate_fn=collate_batch_named)
        main_loader = torch.utils.data.DataLoader(dataset_rs2, batch_size, **loader_options)
        val_loader = torch.utils.data.DataLoader(dataset_val, batch_size, **loader_options)
        # One sub-directory of `model_path` per stored model.
        for i in next(os.walk(model_path))[1]:
            init_model = copy.deepcopy(models[k])
            path = f'{model_path}/{i}'
            print(path)
            handler = ModelHandler(init_model)
            handler.load(path)
            result = pd.concat([
                detailed_loss(handler, main_loader),
                detailed_loss(handler, val_loader),
            ])
            pouch_loss = result.groupby('Pouch').Loss
            loss = pd.concat([pouch_loss.mean(), pouch_loss.std()], axis=1)
            loss.columns = [f'{s}/{k}/{i}_Mean', f'{s}/{k}/{i}_Std']
            losses.append(loss)
# One row per pouch, two columns (mean, std) per evaluated model.
f_loss = pd.concat(losses, axis=1)



In [None]:
# Show every second column of f_loss, starting with the first one - these are the
# '..._Mean' columns when f_loss was built by the evaluation cell above.
f_loss.iloc[:, 0::2]

In [None]:
# loss[sampling][model number] -> tensor with one value per stored fold:
# sqrt(last stored loss) * 50.
loss = {}
for s in sampling:
    l_ = {}
    for i in range(3):
        path = f'./models/v15/{s}/{i}'
        batches = next(os.walk(path))[1]
        # Last loss value found in every sub-folder of `path`.
        loss_ = [load(f'{path}/{batch}')[-1] for batch in batches]
        loss_ = torch.tensor(loss_)**0.5*50
        l_[i] = loss_
        print(s, i, f'{loss_}')
    loss[s] = l_
# TODO: it looks like models 10-14 have a huge loss on 1 k-fold. Need to find out what the problem is there.
# TODO: add 2 separate functions - the first should only extract the (overall) loss, which does not
#       need the model; the second should do an honest pass of the whole validation set through the
#       model (the set can be chosen freely) - on it the loss of every single element can be studied.
# TODO

In [None]:
# Format every entry of `loss` as 'mean±std' and show the result as a table:
# one column per sampling value, one row per model number.
loss_ = {
    s: {i: f'{loss[s][i].mean():.3f}±{loss[s][i].std():.3f}' for i in range(3)}
    for s in sampling
}

pd.DataFrame(loss_)

In [None]:
# `loss_` is keyed by the sampling values themselves, not by their position in
# `sampling`; indexing with the enumerate counter raised a KeyError.  A bare
# expression inside a loop is not displayed either, so print explicitly.
for s in sampling:
    print(s, loss_[s])

In [None]:
# Displays a one-element list that holds the boolean mask of rows whose 'Pouch' contains 'G'.
# NOTE(review): possibly meant as a row filter, i.e. info_new[info_new.Pouch.str.contains('G')] - confirm.
[info_new.Pouch.str.contains('G')]

In [None]:
# Display the table returned by statistics(dataset_new).
info_new

In [None]:
# NOTE(review): `i` is not used in this cell and the loop of the following cell rebinds it.
i = 10
# Display the table returned by statistics(dataset_new).
info_new

In [None]:
# Plot the voltage and current curves of one randomly chosen cycle from every
# 10-wide SoH band between 10 and 100.
fig_e, ax_e = plt.subplots()
fig_i, ax_i = plt.subplots()
for i in range(10, 100, 10):
    band = info_new[(info_new.SoH > i) & (info_new.SoH <= i + 10)]
    if band.empty:
        # DataFrame.sample() raises on an empty frame; skip bands without cycles.
        continue
    idx = band.sample().index[0]
    cycle = dataset_new[idx]  # fetch the sample once and reuse it for both curves
    # NOTE(review): the constants below presumably undo the dataset's normalisation - confirm.
    label = f'{cycle["Pouch"]}, SoH = {cycle["SoH"]*50+50:.1f}'
    ax_e.plot(cycle['E']*1.45+2.75, label=label)
    ax_i.plot(cycle['I']*150, label=label)
ax_e.set_ylabel('E, V')
ax_i.set_ylabel('I, mA')
# Both figures get a legend; previously only the current plot had one.
ax_e.legend()
ax_i.legend()

In [None]:
# Histogram of the 'Len' column of info_new, scaled by 0.5 and labelled in minutes.
plt.locator_params(nbins=10)
len_minutes = info_new.Len * 0.5
ax = len_minutes.hist(bins=50)
ax.set_xlabel('Len, min')
ax.set_ylabel('n of cycles')