In [11]:
import os
import json
import glob

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import StratifiedKFold


In [12]:
import time
import psutil
from contextlib import contextmanager

def show_memory_usage(name = "unknown"):
    vm = psutil.virtual_memory()
    print(f"[MEMUSE] memory usage (in {name}): {vm.used/1024/1024:.2f}MB ({vm.percent}%)")

@contextmanager
def timer(name: str):
    show_memory_usage(f"before {name}")
    s = time.time()
    yield
    elapsed = time.time() - s
    print(f"[{name}] {elapsed:.3f}sec")
    show_memory_usage(f"after {name}")

In [13]:
os.chdir('/Users/ziyangtan/Documents/applied_ml/2rd_solution')

In [14]:
class Config:
    def __init__(self):
        
        # self.train_path = '../data/data_raw/train_series.parquet'
        # self.train_target_path = '../data/data_raw/train_events.csv'
        # self.test_path = '../data/data_raw/test_series.parquet'
        # self.sample_submission_path = '../data/data_raw/sample_submission.csv'
        # self.preprocess_dir = '../data/data_processed'
        self.steps_per_sec = 0.2
        self.step_for_a_day = 60 * self.steps_per_sec * 60 * 24
        self.step_for_30min = 60 * self.steps_per_sec * 30
        self.step_for_15min = 60 * self.steps_per_sec * 15
        self.step_for_1min = 60 * self.steps_per_sec

    def from_json(self, json_path):
        json_data = json.load(open(json_path))
        for k, v in json_data.items():
            print(k, v)
            setattr(self, k, v)
        return self

setting_file = "SETTINGS.json"
Cfg = Config().from_json(setting_file)
os.makedirs(Cfg.preprocess_dir, exist_ok=True)


IS_DEBUG True
train_path /Users/ziyangtan/Documents/applied_ml/2rd_solution/data/data_raw/train_series.parquet
train_target_path /Users/ziyangtan/Documents/applied_ml/2rd_solution/data/data_raw/train_events.csv
test_path /Users/ziyangtan/Documents/applied_ml/2rd_solution/data/data_raw/test_series.parquet
sample_submission_path /Users/ziyangtan/Documents/applied_ml/2rd_solution/data/data_raw/sample_submission.csv
preprocess_dir data_processed/
weight_dir_1dcnn model/weights/
weight_dir_lgbm model/lgbm_models/
inputs_2nd data_processed/df_second_model.feather
steps_per_sec 0.2
step_for_1min 12.0
step_for_15min 180.0
step_for_30min 360.0
step_for_a_day 17280.0


In [15]:
%load_ext cython

The cython extension is already loaded. To reload it, use:
  %reload_ext cython


In [17]:
%%cython
import numpy as np
cimport numpy as cnp
cimport cython

def cumsum_morethan_zero(cnp.ndarray[cnp.float64_t, ndim=1] x):
    cdef int i, n
    n = x.shape[0]
    cdef cnp.ndarray[cnp.float64_t, ndim=1] y = np.zeros(n)
    cdef cnp.ndarray[cnp.float64_t, ndim=1] y_rev = np.zeros(n)
    y[0] = x[0]
    for i in range(1, n):
        if x[i] == 0:
            y[i] = 0
        else:
            y[i] = y[i-1] + x[i]
    y_rev[-1] = y[-1]
    for i in range(n-2, -1, -1):
        if y_rev[i+1] > y[i]:
            if x[i] == 0:
                y_rev[i] = 0
            else:
                y_rev[i] = y_rev[i+1]
        else:
            y_rev[i] = y[i]
    return y_rev

def easy_convolve(cnp.ndarray[cnp.float64_t, ndim=1] x, int filter_size):
    """
    padding same, kernel is ones
    """
    cdef int i, j, n, p, m
    m = filter_size - 1
    p = m // 2
    n = x.shape[0]
    cdef cnp.ndarray[cnp.float64_t, ndim=1] x_p = np.zeros(n+2*p)
    cdef cnp.ndarray[cnp.float64_t, ndim=1] y = np.zeros(n)
    x_p[p:n+p] = x

    for j in range(filter_size):
        y[0] += x_p[j]

    for i in range(1, n):# filter_size, n+p+p-filter_size+1):
        y[i] = x_p[i+m] + y[i-1] - x_p[i-1]
    return y

def minimum(cnp.ndarray[cnp.float64_t, ndim=1] x, cnp.float64_t maxval):
    cdef int i, n
    n = x.shape[0]
    cdef cnp.ndarray[cnp.float64_t, ndim=1] y = np.zeros(n)
    for i in range(n):
        y[i] = min(x[i], maxval)
    return y

def easy_closing(cnp.ndarray[cnp.float64_t, ndim=1] x, int filter_size):
    """
    closing = dilation -> erosion
    padding same, kernel is ones, x is 0 or 1
    """
    x = easy_convolve(x, filter_size)
    x = minimum(x, 1)
    x = 1 - x
    x = easy_convolve(x, filter_size)
    x = minimum(x, 1)
    x = 1 - x
    return x

def easy_closing_q(cnp.ndarray[cnp.float64_t, ndim=1] x, int filter_size):
    """
    closing = dilation -> erosion
    padding same, kernel is ones, x is 0 or 1
    少し早いけどわかりにくい…。
    """
    cdef int i, j, n, p, m
    m = filter_size - 1
    p = m // 2
    n = x.shape[0]
    cdef cnp.ndarray[cnp.float64_t, ndim=1] x_p = np.zeros(n+2*p)
    cdef cnp.ndarray[cnp.float64_t, ndim=1] y_p = np.zeros(n+2*p)
    cdef cnp.ndarray[cnp.float64_t, ndim=1] y = np.zeros(n)
    cdef cnp.ndarray[cnp.float64_t, ndim=1] z = np.zeros(n)
    
    x_p[p:n+p] = x
    for j in range(filter_size):
        y[0] += x_p[j]
    for i in range(1, n):# filter_size, n+p+p-filter_size+1):
        y[i] = x_p[i+m] + y[i-1] - x_p[i-1]
    for i in range(n):
        y[i] = 1 - min(y[i], 1)
    
    y_p[p:n+p] = y
    for j in range(filter_size):
        z[0] += y_p[j]
    for i in range(1, n):# filter_size, n+p+p-filter_size+1):
        z[i] = y_p[i+m] + z[i-1] - y_p[i-1]
    for i in range(n):
        z[i] = 1 - min(z[i], 1)
    
    return z


def _detect_peak(cnp.ndarray[cnp.float64_t, ndim=1] x, int k):
    cdef int n = x.shape[0]
    cdef cnp.ndarray[cnp.float64_t, ndim=1] max_array = np.zeros(n, dtype=np.float64)
    cdef cnp.ndarray[cnp.int32_t, ndim=1] max_indices = np.zeros(n, dtype=np.int32)
    cdef cnp.ndarray[cnp.float64_t, ndim=1] result = np.zeros(n, dtype=np.float64)
    cdef int i, j, start, end, max_index
    
    # calculate max values in each window
    for i in range(n):
        start = max(0, i - k)
        end = min(n, i + k + 1)
        max_index = start
        for j in range(start, end):
            if x[j] > x[max_index]:
                max_index = j
        max_array[i] = x[max_index]
        max_indices[i] = max_index
    
    # set peak values to 1
    for i in range(n):
        if x[i] == max_array[max_indices[i]]:
            result[i] = 1.0
    
    return max_array

def _detect_peak_r(cnp.ndarray[cnp.float64_t, ndim=1] x, int k):
    cdef int n = x.shape[0]
    cdef cnp.ndarray[cnp.float64_t, ndim=1] max_array = np.zeros(n, dtype=np.float64)
    cdef cnp.ndarray[cnp.int32_t, ndim=1] max_indices = np.zeros(n, dtype=np.int32)
    cdef cnp.ndarray[cnp.float64_t, ndim=1] result = np.zeros(n, dtype=np.float64)
    cdef int i, j, start, end, max_index
    
    # calculate max values in first window
    max_index = 0
    for i in range(k):
        if x[i] > x[max_index]:
            max_index = i
    max_array[k-1] = x[max_index]
    max_indices[k-1] = max_index
    
    # calculate max values in each window
    for i in range(k, n):
        start = i - k
        end = i
        if max_index == start - 1:
            max_index = start
            for j in range(start, end):
                if x[j] > x[max_index]:
                    max_index = j
        else:
            if x[i] > x[max_index]:
                max_index = i
        max_array[i] = x[max_index]
        max_indices[i] = max_index
    
    # set peak values to 1
    for i in range(n):
        if x[i] == max_array[max_indices[i]]:
            result[i] = 1.0
    
    return max_array

@cython.boundscheck(False)
@cython.wraparound(False)
def detect_peak_kmat(cnp.ndarray[cnp.float64_t, ndim=1] x, int k):
    cdef int n = x.shape[0]
    cdef cnp.ndarray[cnp.float64_t, ndim=1] max_array = np.zeros(n, dtype=np.float64)
    cdef cnp.ndarray[cnp.float64_t, ndim=1] result_val = np.zeros(n, dtype=np.float64)
    cdef cnp.ndarray[cnp.float64_t, ndim=1] result = np.zeros(n, dtype=np.float64)
    cdef int i, j, start, end, max_index, half_k

    half_k = k // 2
    for i in range(half_k, n-half_k):
        result[i] = 1
        result_val[i] = x[i]
        for j in range(1, half_k+1):
            if x[i] < x[i-j]:
                result[i] = 0
                result_val[i] = 0
                break
            if x[i] < x[i+j]:
                result[i] = 0
                result_val[i] = 0
                break
    return result, result_val


In [18]:
def sample_train_target_by_id(df_train, df_target, id_no):
    return df_train[df_train['series_id']==id_no], df_target[df_target['series_id']==id_no]

def anglez_enmo_to_az(enmo, anglez):
    # anglezのtanをとる。anglezは-90~90
    a = np.clip(np.tan(anglez / 180 * np.pi), -0.99999, 0.99999)
    # a = np.tan(anglez / 180 * np.pi)
    b = (enmo + 1)**2
    axay_sqsum = b / (1 + a**2)
    axay = np.sqrt(axay_sqsum)
    az = a * axay
    return az, axay

def make_features(df_train_id):
    az, axay = anglez_enmo_to_az(df_train_id["enmo"].values, df_train_id["anglez"].values)
    df_train_id["accz"] = az
    df_train_id["accaxay"] = axay
    return df_train_id

def find_sensor_error(df_train_id, column='anglez'):
    """
    df_train_idは特定ユーザのデータ
    日にちごとに分割したうえで、同じ時間帯のデータの差をとる
    差がゼロになる部分の数をカウントした列を追加する
    """
    def padding(x, left, right, value=0):
        return np.pad(x, (left, right), 'constant', constant_values=(value, value))


    x = df_train_id['daily_step'].values
    y = df_train_id[column].values
    mask = np.ones_like(y)
    tar = df_train_id['target'].values * (df_train_id['target'].values > -1e-7)# 一時的。効果の有無を確認したい
    pad_left = int(x[0])
    pad_right = int(Cfg.step_for_a_day) -1 - int(x[-1]) % int(Cfg.step_for_a_day)
    
    x = padding(x, pad_left, pad_right, value=0)
    y = padding(y, pad_left, pad_right, value=0)
    y_dif = padding(np.abs(y[1:] - y[:-1]), 1, 0, value=0)
    mask = padding(mask, pad_left, pad_right, value=0)
    tar = padding(tar, pad_left, pad_right, value=0)


    # reshape to (num_day, step_for_a_day)
    x = x.reshape(-1, int(Cfg.step_for_a_day))
    y = y.reshape(-1, int(Cfg.step_for_a_day))
    y_dif = y_dif.reshape(-1, int(Cfg.step_for_a_day))
    mask = mask.reshape(-1, int(Cfg.step_for_a_day))
    tar = tar.reshape(-1, int(Cfg.step_for_a_day))
    day_counter = np.cumsum(mask, axis=0) # (num_day, step_for_a_day)

    # メモリ注意。
    delta_matrix = y[np.newaxis, :, :] - y[:, np.newaxis, :] # (num_day, num_day, step_for_a_day)
    mask_matrix = mask[np.newaxis, :, :] * mask[:, np.newaxis, :] # (num_day, num_day, step_for_a_day)
    delta_matrix_full = (delta_matrix==0) * mask_matrix # where delta is zero except for the padding area
    
    delta_matrix = np.sum(delta_matrix_full, axis=0) - 1 # (num_day, step_for_a_day). 同じデータ同士の差は0になるので-1
    mask_matrix = np.sum(mask_matrix, axis=0) # (num_day, step_for_a_day)
    nan_counter = np.cumsum(delta_matrix > 0, axis=0) # (num_day, step_for_a_day) 初回のnan風なものは案外ラベルがついている？
    # print(nan_counter.shape)
    # ↑ミスってるので修正(これだと初日情報つかんでるかも…？これは有効に働いてるのか…？とりあえずdiagの部分だけを使う)
    # nan_counter = np.cumsum(delta_matrix_full, axis=0) # (num_day, step_for_a_day) 初回のnan風なものは案外ラベルがついている？
    # num_day = nan_counter.shape[0]
    # nan_counter = nan_counter[np.arange(num_day), np.arange(num_day), :]# np.diagonal(nan_counter, axis1=0, axis2=1)



    nan_exist_other_day = np.any(delta_matrix > 0, axis=0, keepdims=True) # (1, step_for_a_day)
    nan_exist_other_day = np.tile(nan_exist_other_day, (delta_matrix.shape[0], 1)) # (num_day, step_for_a_day)


    maybe_not_nan = delta_matrix == 0
    valid = mask * maybe_not_nan
    y_valid = y_dif * maybe_not_nan # (num_day, step_for_a_day) ★一時的にy_difを使う。
    y_sum = np.sum(y_valid, axis=0, keepdims=True) # (1, step_for_a_day)
    y_mean = y_sum / (np.sum(valid, axis=0, keepdims=True)+1e-7) # (1, step_for_a_day)
    y_dev = (y_valid - y_mean) * valid # (num_day, step_for_a_day)
    y_std = np.sqrt(np.sum(y_dev**2, axis=0, keepdims=True) / (np.sum(valid, axis=0, keepdims=True)+1e-7)) # (1, step_for_a_day)
    # mean, stdをtileして、yと同じshapeにする
    y_mean = np.tile(y_mean, (y.shape[0], 1))
    y_std = np.tile(y_std, (y.shape[0], 1)) # (num_day, step_for_a_day)

    tar_sum = np.sum(tar, axis=0, keepdims=True) # (1, step_for_a_day)
    tar_other = (tar_sum - tar) / (np.sum(mask, axis=0, keepdims=True) - mask + 1e-7) # (num_day, step_for_a_day)

    # flatten
    delta_matrix = delta_matrix.reshape(-1)
    mask_matrix = mask_matrix.reshape(-1)
    day_counter = day_counter.reshape(-1)
    nan_counter = nan_counter.reshape(-1)
    nan_exist_other_day = nan_exist_other_day.reshape(-1)
    y_mean = y_mean.reshape(-1)
    y_std = y_std.reshape(-1)
    tar_other = tar_other.reshape(-1)

    # left, rightのpadding部分を除く
    start = pad_left
    end = -pad_right if pad_right > 0 else len(delta_matrix)
    delta_matrix = delta_matrix[start:end]
    mask_matrix = mask_matrix[start:end]
    day_counter = day_counter[start:end]
    nan_counter = nan_counter[start:end]
    nan_exist_other_day = nan_exist_other_day[start:end]
    y_mean = y_mean[start:end]
    y_std = y_std[start:end]
    tar_other = tar_other[start:end]

    df_train_id[column+"_numrepeat"] = delta_matrix
    df_train_id[column+"_daycount"] = mask_matrix
    df_train_id[column+"_daycounter"] = day_counter
    df_train_id[column+"_nancounter"] = nan_counter
    df_train_id[column+"_nanexist"] = nan_exist_other_day
    df_train_id[column+"_daymean"] = y_mean
    df_train_id[column+"_daystd"] = y_std
    df_train_id[column+"_tarother"] = tar_other

    # smoothing(膨張収縮処理)で超短期を埋めておく。とりあえず5stepクロージング。    
    df_train_id[column+"_simpleerror"] = easy_closing_q((delta_matrix > 0).astype(np.float64),  5) # 5step
    df_train_id[column+"_simpleerror_span"] = cumsum_morethan_zero(df_train_id[column+"_simpleerror"].values)
    df_train_id[column+"_simpleerror_v2"] = df_train_id[column+"_simpleerror"].astype(int) + (df_train_id[column+"_simpleerror_span"] > Cfg.step_for_30min).astype(int) + (df_train_id[column+"_simpleerror_span"] > (Cfg.step_for_30min*2)).astype(int) # 

    return df_train_id

def make_binary_target(df_train_id, df_target_id, id_no):
    df_train_id['target'] = -1
    # df_target_idを順番に見ていって、onset と wakeupの間を0に、wakeupとonsetの間を1にする
    # ただし、wakeupのあとのonsetがnanになっている場合は、-1のままにする
    current_state = -1
    current_step = 0
    # nanでも直後の1-2時間ぐらいは前の状態が継続していると考えられる
    continue_length = int(Cfg.step_for_30min*4)

    for total_step, event_type, night_no in zip(df_target_id['step'].values, df_target_id['event'].values, df_target_id['night'].values):
        tmp = current_state
        data_exist = not np.isnan(total_step)
        if event_type == 'onset' and data_exist:
            if current_state == 1 or current_state == -1:
                df_train_id.loc[(df_train_id['step'] >= current_step) & (df_train_id['step'] < total_step), 'target'] = 1
            if current_state == -2: # nan before onset. 直前がnanでもevent変化前の1-2時間ぐらいは前の状態が継続していると考えられる
                df_train_id.loc[(df_train_id['step'] >= np.maximum(total_step - continue_length, 0)) & (df_train_id['step'] < total_step), 'target'] = 1
            current_state = 0
            current_step = total_step
        elif event_type == 'wakeup' and data_exist:
            if current_state == 0 or current_state == -1: # 初回wakeupはあるのか？
                df_train_id.loc[(df_train_id['step'] >= current_step) & (df_train_id['step'] < total_step), 'target'] = 0
            if current_state == -2: # nan before wakeup. 直前がnanでもevent変化前の1-2時間ぐらいは前の状態が継続していると考えられる
                df_train_id.loc[(df_train_id['step'] >= np.maximum(total_step - continue_length, 0)) & (df_train_id['step'] < total_step), 'target'] = 0
            current_state = 1
            current_step = total_step
        elif not data_exist: # nan
            # 直後の1-2時間ぐらいは前の状態が継続していると考えられる
            if current_state == 1: # nan after wakeup
                df_train_id.loc[(df_train_id['step'] >= current_step) & (df_train_id['step'] < current_step+continue_length), 'target'] = 1
            elif current_state == 0: # nan after onset
                df_train_id.loc[(df_train_id['step'] >= current_step) & (df_train_id['step'] < current_step+continue_length), 'target'] = 0

            current_state = -2
        # print(tmp, "->", event_type, "->", current_state, data_exist, total_step)
        
    # 最後、終了時点の分も追加
    # if not id_no == 'a596ad0b82aa':# これ以外もだめっぽい…。ないやつは何なんだ。
    #     if current_state == 1:
    #         df_train_id.loc[(df_train_id['step'] >= current_step), 'target'] = 1
    #     elif current_state == 0:
    #         df_train_id.loc[(df_train_id['step'] >= current_step), 'target'] = 0
    # else:
    #     print("-----------------------------------"*10)
    #     print("id a596ad0b82aa has no wakeup event in last N days.")


    # 学習時にレンジ絞るようにstep_for_trainを作ったけど、現状使わないことにしている。
    if current_state == 1: # nan after wakeup
        df_train_id.loc[(df_train_id['step'] >= current_step) & (df_train_id['step'] < current_step+continue_length), 'target'] = 1
        end_step_for_train = current_step+continue_length
    elif current_state == 0: # nan after onset
        df_train_id.loc[(df_train_id['step'] >= current_step) & (df_train_id['step'] < current_step+continue_length), 'target'] = 0
        end_step_for_train = current_step+continue_length
    # if current_state == 1:
    #     df_train_id.loc[(df_train_id['step'] >= current_step), 'target'] = 1
    #     end_step_for_train = current_step+continue_length
    # elif current_state == 0:
    #     df_train_id.loc[(df_train_id['step'] >= current_step), 'target'] = 0
    #     end_step_for_train = current_step+continue_length
    elif current_state == -2:
        end_step_for_train = (night_no - 1) * Cfg.step_for_a_day

    
    return df_train_id, end_step_for_train

def make_multilabel_target(df_train_id, df_target_id, id_no):
    df_train_id['target'] = -1
    df_train_id['target_sw'] = 0
    sw_step_range = [i-1 for i in [1, 12, 36, 60, 90, 120, 150, 180, 240, 300, 360]][::-1]
    sw_labels = [11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1][::-1] # multi labelしてみる 11はど真ん中。使用は未定

    # df_target_idを順番に見ていって、onset と wakeupの間を0に、wakeupとonsetの間を1にする
    # ただし、wakeupのあとのonsetがnanになっている場合は、-1のままにする
    current_state = -1
    current_step = 0
    # nanでも直後の1-2時間ぐらいは前の状態が継続していると考えられる
    continue_length = int(Cfg.step_for_30min*4)
    max_step = df_target_id['step'].values.max()

    for total_step, event_type, night_no in zip(df_target_id['step'].values, df_target_id['event'].values, df_target_id['night'].values):
        tmp = current_state
        data_exist = not np.isnan(total_step)
        if event_type == 'onset' and data_exist:
            if current_state == 1 or current_state == -1:
                df_train_id.loc[(df_train_id['step'] >= current_step) & (df_train_id['step'] < total_step), 'target'] = 1
            if current_state == -2: # nan before onset. 直前がnanでもevent変化前の1-2時間ぐらいは前の状態が継続していると考えられる
                df_train_id.loc[(df_train_id['step'] >= np.maximum(total_step - continue_length, 0)) & (df_train_id['step'] < total_step), 'target'] = 1
            current_state = 0
            current_step = total_step

            # target_swは、状態変化のタイミングでstepが合致する前後のデータを数値に変換する
            for label_range, sw_label in zip(sw_step_range, sw_labels): # 広い範囲から順番に。
                df_train_id.loc[(df_train_id['step'] >= (total_step - label_range)) & (df_train_id['step'] <= (total_step + label_range)), 'target_sw'] = sw_label


        elif event_type == 'wakeup' and data_exist:
            if current_state == 0 or current_state == -1: # 初回wakeupはあるのか？
                df_train_id.loc[(df_train_id['step'] >= current_step) & (df_train_id['step'] < total_step), 'target'] = 0
            if current_state == -2: # nan before wakeup. 直前がnanでもevent変化前の1-2時間ぐらいは前の状態が継続していると考えられる
                df_train_id.loc[(df_train_id['step'] >= np.maximum(total_step - continue_length, 0)) & (df_train_id['step'] < total_step), 'target'] = 0
            current_state = 1
            current_step = total_step

            # target_swは、状態変化のタイミングでstepが合致する前後のデータを数値に変換する
            for label_range, sw_label in zip(sw_step_range, sw_labels): # 広い範囲から順番に。
                df_train_id.loc[(df_train_id['step'] >= (total_step - label_range)) & (df_train_id['step'] <= (total_step + label_range)), 'target_sw'] = sw_label

        elif not data_exist: # nan
            # 直後の1-2時間ぐらいは前の状態が継続していると考えられる
            if current_state == 1: # nan after wakeup
                df_train_id.loc[(df_train_id['step'] >= current_step) & (df_train_id['step'] < current_step+continue_length), 'target'] = 1
            elif current_state == 0: # nan after onset
                df_train_id.loc[(df_train_id['step'] >= current_step) & (df_train_id['step'] < current_step+continue_length), 'target'] = 0

            current_state = -2
        # print(tmp, "->", event_type, "->", current_state, data_exist, total_step)
        

    # 学習時にレンジ絞るようにstep_for_trainを作ったけど、現状使わないことにしている。
    if current_state == 1: # nan after wakeup
        df_train_id.loc[(df_train_id['step'] >= current_step) & (df_train_id['step'] < current_step+continue_length), 'target'] = 1
        end_step_for_train = current_step+continue_length
    elif current_state == 0: # nan after onset
        df_train_id.loc[(df_train_id['step'] >= current_step) & (df_train_id['step'] < current_step+continue_length), 'target'] = 0
        end_step_for_train = current_step+continue_length
    # if current_state == 1:
    #     df_train_id.loc[(df_train_id['step'] >= current_step), 'target'] = 1
    #     end_step_for_train = current_step+continue_length
    # elif current_state == 0:
    #     df_train_id.loc[(df_train_id['step'] >= current_step), 'target'] = 0
    #     end_step_for_train = current_step+continue_length
    elif current_state == -2:
        end_step_for_train = (night_no - 1) * Cfg.step_for_a_day

    
    return df_train_id, end_step_for_train

def prerprocess_inputs_for_mlmodel(df_train_id):
    df_train_id["daily_step"] = (df_train_id["daily_step"] / Cfg.step_for_a_day).astype(np.float32)
    df_train_id["anglez"] = (df_train_id["anglez"] / 90).astype(np.float32)
    df_train_id["anglez_simpleerror"] = df_train_id["anglez_simpleerror"].astype(np.float32)
    # anglez_shimpleerror_spanはlog1pをとっておく
    df_train_id["anglez_simpleerror_span"] = np.clip(np.log1p(df_train_id["anglez_simpleerror_span"]) / 10, 0, 1).astype(np.float32)
    df_train_id["anglez_nanexist"] = df_train_id["anglez_nanexist"].astype(np.float32)
    df_train_id["anglez_daystd"] = np.clip(df_train_id["anglez_daystd"] / 90, 0, 2).astype(np.float32)
    df_train_id["anglez_daymean"] = np.clip(df_train_id["anglez_daymean"] / 90, 0, 2).astype(np.float32)
    df_train_id["anglez_daycounter"] = (df_train_id["anglez_daycounter"] / df_train_id["anglez_daycounter"].max()).astype(np.float32)
    df_train_id["anglez_nancounter"] = (df_train_id["anglez_nancounter"]>2).astype(np.float32)
    # df_train_id["anglez_nancounter"] = np.clip(df_train_id["anglez_nancounter"]-2, -1, 1).astype(np.float32)
    # df_train_id["anglez_tarother"] = np.clip(df_train_id["anglez_tarother"], 0, 1).astype(np.float32)
    max_daystd = np.max(df_train_id["anglez_daystd"].values)
    max_daymean = np.max(df_train_id["anglez_daymean"].values)
    if max_daystd > 10:
        print("max_daystd > 10", max_daystd)
    if max_daymean > 10:
        print("max_daymean > 10", max_daymean)
        

    df_train_id["enmo"] = np.clip(np.log1p(df_train_id["enmo"]), 0, 5).astype(np.float32)
    df_train_id["accz"] = np.clip(df_train_id["accz"], -2, 2).astype(np.float32)
    df_train_id["accaxay"] = np.clip(df_train_id["accaxay"], 0, 3).astype(np.float32)

    df_train_id["step_count"] = (df_train_id["step"]/df_train_id["step"].max()).astype(np.float32)
    df_train_id["dayofweek"] = (df_train_id["dayofweek"] / 7.).astype(np.float32)

    df_train_id["target_sw"] = df_train_id["target_sw"].astype(np.float32)
    df_train_id["target"] = df_train_id["target"].astype(np.float32)
    return df_train_id

def connect_2_arrays(first_array, second_array):
    end_step_0 = first_array[-1,0]
    initial_step_1 = second_array[0,0]
    # print(first_array[-1,0], second_array[0,0])
    if end_step_0 >= initial_step_1:
        offset = int((end_step_0 - initial_step_1) * Cfg.step_for_a_day)
        second_array = second_array[offset:]
    else:
        offset = int((1 - initial_step_1 + end_step_0) * Cfg.step_for_a_day)
        second_array = second_array[offset:]
        # print("offset2nd", offset)
    # print(first_array[-1,0], second_array[0,0], offset)
    return np.concatenate([first_array, second_array], axis=0)

def generate_data(processed_array, id_no, days_to_generate_ratio=0.15, min_split_minutes=30, max_split_minutes=60, save_dir=None, plot=False, second_array=None, seed_no=111):
    if second_array is not None:
        processed_array = connect_2_arrays(processed_array, second_array)
    np.random.seed(seed_no)
    save_dir = save_dir or Cfg.gen_data_dir
    os.makedirs(save_dir, exist_ok=True)
    file_path = os.path.join(save_dir, f"id_{id_no}_feature.npy")
    file_step_path = os.path.join(save_dir, f"id_{id_no}_step.npy")
    file_path_meta = os.path.join(save_dir, f"id_{id_no}_meta.json")
    days_to_generate = max(int((days_to_generate_ratio * processed_array.shape[0]) / int(Cfg.step_for_a_day)), 1)

    def blend_dataset(processed_array):
        step_length = processed_array.shape[0]
        day_length = int(step_length // int(Cfg.step_for_a_day))
        residual_step = int(step_length % int(Cfg.step_for_a_day))
        start =  np.random.randint(0, residual_step) if residual_step > 0 else 0
        end = step_length - residual_step + start
        processed_array = processed_array[start:end].reshape(day_length, int(Cfg.step_for_a_day), 12)

        generated_array = []
        step_to_generate = int(Cfg.step_for_a_day * days_to_generate)
        total_start = 0
        total_end = 0
        while total_start < step_to_generate:
            start = 0
            end = 0
            while end < int(Cfg.step_for_a_day):
                gen_step = np.minimum(np.random.randint(min_split_minutes, max_split_minutes) * int(Cfg.step_for_1min), step_to_generate - total_start)
                end = min(start + gen_step, int(Cfg.step_for_a_day))
                gen_step = end - start
                total_end = total_start + gen_step
                day_idx = np.random.randint(0, day_length)
                generated_array.append(processed_array[day_idx, start:end, :])
                start = end
                total_start = total_end
                # print(end, total_start, step_to_generate)
                # time.sleep(0.1)
        generated_array = np.concatenate(generated_array, axis=0)
        return generated_array

    
    def remake_target(target_binary):
        state_change = (np.pad(target_binary[:-1] - target_binary[1:], (1,0), 'constant', constant_values=(0, 0))).astype(int)
        state_valid = np.pad((target_binary[:-1] >= 0) * (target_binary[1:]>=0), (1,0), 'constant', constant_values=(0, 0))
        state_change = state_change * state_valid
        # plt.plot(state_change)
        # plt.show()
        
        # state changeは-1,0,1の行列。-1が0->1のwakeup, 1が1->0のonset。
        # {"state": "sleep", "start": idx, "duration": next_idx - idx}のような形式で整理する
        states = []
        current_idx = 0
        previous_continue = False
        # min_duration = int(Cfg.step_for_30min)
        min_duration = int(int(Cfg.step_for_30min) * 0.5)
        if target_binary[0] == 0:
            current_state = "sleep"
        elif target_binary[0] == 1:
            current_state = "awake"
        else:
            current_state = "nan"
        while True:
            idx = np.argmax((np.abs(state_change)))
            if state_change[idx] == 0:
                if not previous_continue:
                    states.append({"state": current_state, "start": current_idx, "duration": len(state_change) - current_idx})
                else:
                    states[-1]["duration"] += len(state_change) - states[-1]["start"]
                break
            if state_change[idx] == -1:
                state_change[idx] = 0
                if idx < min_duration and current_idx > 0:
                    if not previous_continue: # 二回連続で短い場合は無視
                        previous_continue = True
                        continue
                state_change = state_change[idx:]
                # if current_state in ["sleep", "nan"]: # 前の状態と合致しない場合(nan)は無視
                if not previous_continue:
                    states.append({"state": "sleep", "start": current_idx, "duration": idx})
                else:
                    states[-1]["duration"] += idx
                previous_continue = False
                current_idx += idx
                current_state = "awake"
                continue
            if state_change[idx] == 1:
                state_change[idx] = 0
                if idx < min_duration and current_idx > 0: # 初回は短くても許容する
                    if not previous_continue:
                        previous_continue = True
                        continue
                state_change = state_change[idx:]
                # if current_state in ["awake", "nan"]:
                if not previous_continue:
                    states.append({"state": "awake", "start": current_idx, "duration": idx})
                else:
                    states[-1]["duration"] += idx
                previous_continue = False
                current_idx += idx
                current_state = "sleep"
                continue
        
        sw_step_range = [i-1 for i in [1, 12, 36, 60, 90, 120, 150, 180, 240, 300, 360]][::-1]
        max_step_range = np.max(sw_step_range)
        sw_labels = [11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1][::-1]
        event_switch =  np.zeros_like(target_binary)
        for event in states[1:]:
            event_idx = event["start"]
            event_switch[event_idx] = 1
            for label_range, sw_label in zip(sw_step_range, sw_labels):
                if min_duration < max_step_range:
                    event_switch[max(event_idx - label_range, 0): min(event_idx + label_range+1, len(event_switch))] = np.maximum(event_switch[max(event_idx - label_range, 0): min(event_idx + label_range+1, len(event_switch))], sw_label)
                else:
                    event_switch[max(event_idx - label_range, 0): min(event_idx + label_range+1, len(event_switch))] = sw_label # np.maximum(event_switch[max(event_idx - label_range, 0): min(event_idx + label_range+1, len(event_switch))], sw_label)
        
        return event_switch

    generated_array = blend_dataset(processed_array)
    event_switch = remake_target(generated_array[:, -1])
    # generated_array[:, -2]を差し換える
    tmp = generated_array[:, -2].copy()
    generated_array[:, -2] = event_switch

    # 描画subplotで4つ
    if plot:
        plt.figure(figsize=(20, 10))
        plt.subplot(4,1,1)
        plt.plot(generated_array[:, 1])
        plt.subplot(4,1,2)
        plt.plot(generated_array[:,-1])
        plt.subplot(4,1,3)
        plt.plot(tmp)
        plt.subplot(4,1,4)
        plt.plot(event_switch)
        plt.show()

        plt.figure(figsize=(20, 10))
        plt.subplot(4,1,1)
        plt.plot(processed_array[:len(generated_array), 1])
        plt.subplot(4,1,2)
        plt.plot(processed_array[:len(generated_array),-1])
        plt.subplot(4,1,3)
        plt.plot(processed_array[:len(generated_array),-2])
        plt.subplot(4,1,4)
        plt.plot(processed_array[:len(generated_array),0])
        plt.show()


    dummy_meta_data = {"end_step_for_train": len(generated_array)}
    dummy_step = np.arange(len(generated_array))
    np.save(file_path, generated_array)
    np.save(file_step_path, dummy_step)
    with open(file_path_meta, 'w') as f:
        json.dump(dummy_meta_data, f, indent=4)

    # return generated_array



def make_dataset(df_train_id, df_target_id, id_no):
    file_path = os.path.join(Cfg.preprocess_dir, f"id_{id_no}_feature.npy")
    file_step_path = os.path.join(Cfg.preprocess_dir, f"id_{id_no}_step.npy")
    file_path_meta = os.path.join(Cfg.preprocess_dir, f"id_{id_no}_meta.json")
    train_columns = ['daily_step', 'anglez', 'anglez_simpleerror', 'anglez_simpleerror_span', 
    'anglez_nanexist', 'anglez_daystd', 'anglez_daymean', "anglez_daycounter", "anglez_nancounter", 
    'enmo', # "accz", "accaxay", # "step_count", "dayofweek", 
    'target_sw', 'target']
    # with timer(f"select_dataset {id_no}"):
    #     df_train_id, df_target_id = sample_train_target_by_id(df_train, df_target, id_no)
    with timer(f"timestamp_to_step {id_no}"):
        # df_train_id = timestamp_to_step(df_train_id)
        # df_target_id = timestamp_to_step(df_target_id)
        df_train_id, df_target_id = timestamp_to_step_single_id(df_train_id, df_target_id)
    with timer(f"make_binary_target {id_no}"):
        # df_train_id, end_step_for_train = make_binary_target(df_train_id, df_target_id, id_no)
        df_train_id, end_step_for_train = make_multilabel_target(df_train_id, df_target_id, id_no)

    with timer(f"find_sensor_error {id_no}"):
        df_train_id = find_sensor_error(df_train_id, column='anglez')
    df_train_id = make_features(df_train_id)
    df_train_id = prerprocess_inputs_for_mlmodel(df_train_id)
    meta_data = {"end_step_for_train": end_step_for_train}

    # numpyに変換して保存する
    np.save(file_path, df_train_id[train_columns].values)
    np.save(file_step_path, df_train_id["step"].values)
    with open(file_path_meta, 'w') as f:
        json.dump(meta_data, f, indent=4)
        
    

def plot_train_target_by_id(df_train, df_target, id_no):
    df_train_s, df_target_s = sample_train_target_by_id(df_train, df_target, id_no)
    df_train_s = timestamp_to_step(df_train_s)
    df_target_s = timestamp_to_step(df_target_s)
    # trainはtimestampを横軸にしてanglezをプロット
    # targetはtimestampを横軸にしてaxivlineで破線をひく
    print(df_train_s.shape, df_target_s.shape)
    df_train_s = find_sensor_error(df_train_s, column='anglez')
    df_train_s = make_binary_target(df_train_s, df_target_s)
    print(df_train_s.shape, df_target_s.shape)


    x = df_train_s['daily_step'].values
    y = df_train_s['anglez'].values
    # n = df_train_s['anglez_simpleerror_v2'].values
    n = df_train_s['target'].values
    s = df_train_s['step'].values
    # x,yをcfg.step_for_a_dayごとのindex長さに分割してプロットする
    start = 0
    end = int(Cfg.step_for_a_day) - int(x[0])
    num_view = 10
    
    while end < len(x):
        fig, ax = plt.subplots(1, 1, figsize=(20, 5))
        
        start_step = s[start]
        end_step = s[end]    
        ax.plot(x[start:end], y[start:end])
        ax.plot(x[start:end], 10*n[start:end])
        

        if num_view==5:
            tmp5 = y[start:end]
        if num_view==1:
            tmp9 = y[start:end]

        start = end
        end += int(Cfg.step_for_a_day) 



        for total_step, event_step, event_type in zip(df_target_s['step'].values, df_target_s['daily_step'].values, df_target_s['event'].values):
            if total_step < start_step:
                continue
            elif total_step > end_step:
                break
            if event_type == 'onset':
                ax.axvline(event_step, color='red', linestyle='--')
            elif event_type == 'wakeup':
                ax.axvline(event_step, color='green', linestyle='--')
        # x rangeは0~Cfg.step_for_a_day
        ax.set_xlim(0, Cfg.step_for_a_day)
        plt.show()
        num_view -= 1
        if num_view == 0:
            break

    return df_train_s, df_target_s


def timestamp_to_step_single_id(df_train_id, df_target_id):
    """
    Convert timestamp to step
    timestepは2018-08-14T22:26:00-0400といった形で与えられる
    これを午前0時開始の経過時間に変換し、5秒ごとのstepに変換する
    idごとに処理する前提として、最初の時間をオフセットして、Cfg.daily_stepで割ってあまりをとる
    """
    # iloc[0]で最初の時間を取得
    offset_date = df_train_id['timestamp'].iloc[0:1].str.split('T', expand=True)[0]
    offset_time = df_train_id['timestamp'].iloc[0:1].str.split('T', expand=True)[1].str.split('-', expand=True)[0]
    offset = pd.to_datetime(offset_date + ' ' + offset_time)
    offset_step = offset.dt.hour * 60 * 12 + offset.dt.minute * 12 + offset.dt.second / 5
    df_train_id["daily_step"] = (df_train_id['step'] + offset_step.values[0]) % Cfg.step_for_a_day
    df_target_id["daily_step"] = (df_target_id['step'] + offset_step.values[0]) % Cfg.step_for_a_day
    df_train_id["dayofweek"] = ((offset.dt.dayofweek.values[0] + (df_train_id['step'] + offset_step.values[0]) // Cfg.step_for_a_day)) % 7
    # print((df['step'].iloc[-1] + offset_step) % Cfg.step_for_a_day)
    return df_train_id, df_target_id

# display(df_train_id.tail())
# timestamp_to_step_single_id(df_train_id)    

## Main
- make fold no
- split data for nn model

In [19]:
df_train = pd.read_parquet(Cfg.train_path)
df_test = pd.read_parquet(Cfg.test_path)
sample_submission = pd.read_csv(Cfg.sample_submission_path)
df_target = pd.read_csv(Cfg.train_target_path)


In [20]:
print("--- make fold no split ---")
df_target = pd.read_csv(Cfg.train_target_path)


max_step = df_train.groupby("series_id")["step"].max().reset_index().rename(columns={"step": "step_max"})
df_train_target = pd.merge(df_target, max_step, on="series_id", how="left")
df_train_target.loc[df_train_target["step"].isnull(), "event"] = "nan"
df_train_target["step"] = df_train_target["step"].fillna(1)
df_train_target = df_train_target[df_train_target["step"] < df_train_target["step_max"]]
df_train_target = df_train_target[df_train_target["step"] > 0]
#target_count = df_train_target[df_train_target["event"].isin(["onset", "wakeup"])].groupby("series_id")["event"].count().reset_index()
target_count = df_train_target.groupby("series_id")["event"].count().reset_index()

display(target_count)

df_train_target = pd.merge(max_step, target_count, on="series_id", how="left")
df_train_target["event"] = df_train_target["event"].fillna(0)
display(df_train_target)
df_train_target["days"] = df_train_target["step_max"] // Cfg.step_for_a_day
df_train_target["ratio"] = df_train_target["event"] / df_train_target["days"]
# plt.figure(figsize=(20, 5))
# plt.scatter(df_train_target["event"], df_train_target["days"])
# plt.show()
# plt.hist(df_train_target["ratio"], bins=100)
# plt.show()

display(df_train_target)

# stratified k-foldを行う 

for seed in [111, 42]:
    df_train_target["fold"] = -1
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    for fold, (_, val_idx) in enumerate(skf.split(df_train_target, (df_train_target["ratio"].values//0.2).astype(int))):
        df_train_target.loc[val_idx, "fold"] = fold
    df_train_target.to_csv(os.path.join(Cfg.preprocess_dir, f"series_id_5fold_seed{seed}.csv"), index=False)

    df_train_target["fold"] = -1
    skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=111)
    for fold, (_, val_idx) in enumerate(skf.split(df_train_target, (df_train_target["ratio"].values//0.2).astype(int))):
        df_train_target.loc[val_idx, "fold"] = fold
    df_train_target.to_csv(os.path.join(Cfg.preprocess_dir, f"series_id_2fold_seed{seed}.csv"), index=False)


--- make fold no split ---


Unnamed: 0,series_id,event
0,038441c925bb,46
1,03d92c9f6f8a,74
2,0402a003dae9,48
3,04f547b8017d,74
4,05e1944c3818,16
...,...,...
272,fa149c3c4bde,48
273,fb223ed2278c,106
274,fbf33b1a2c10,48
275,fcca183903b7,72


Unnamed: 0,series_id,step_max,event
0,038441c925bb,389879,46
1,03d92c9f6f8a,724139,74
2,0402a003dae9,397259,48
3,04f547b8017d,637559,74
4,05e1944c3818,400859,16
...,...,...,...
272,fa149c3c4bde,406799,48
273,fb223ed2278c,918359,106
274,fbf33b1a2c10,421019,48
275,fcca183903b7,620639,72


Unnamed: 0,series_id,step_max,event,days,ratio
0,038441c925bb,389879,46,22.0,2.090909
1,03d92c9f6f8a,724139,74,41.0,1.804878
2,0402a003dae9,397259,48,22.0,2.181818
3,04f547b8017d,637559,74,36.0,2.055556
4,05e1944c3818,400859,16,23.0,0.695652
...,...,...,...,...,...
272,fa149c3c4bde,406799,48,23.0,2.086957
273,fb223ed2278c,918359,106,53.0,2.000000
274,fbf33b1a2c10,421019,48,24.0,2.000000
275,fcca183903b7,620639,72,35.0,2.057143




In [21]:
counter = 0
with timer("make_dataset"): # error at c8053490cec2, 'c908a0ad3e31'
    for id_no, df_train_id in df_train.groupby('series_id'):
        print(id_no)
        df_target_id = df_target[df_target['series_id']==id_no]
        #print(df_train_id['daily_step'].values)
        #break
        make_dataset(df_train_id, df_target_id, id_no)
        # break
        counter += 1

        if Cfg.IS_DEBUG:
            if counter > 40:
                print("debug mode. only 40 dataset. break")
                break

[MEMUSE] memory usage (in before make_dataset): 4620.72MB (79.6%)
038441c925bb
[MEMUSE] memory usage (in before timestamp_to_step 038441c925bb): 4638.31MB (82.8%)
[timestamp_to_step 038441c925bb] 0.048sec
[MEMUSE] memory usage (in after timestamp_to_step 038441c925bb): 4632.80MB (82.8%)
[MEMUSE] memory usage (in before make_binary_target 038441c925bb): 4632.80MB (82.8%)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_target_id["daily_step"] = (df_target_id['step'] + offset_step.values[0]) % Cfg.step_for_a_day


[make_binary_target 038441c925bb] 0.328sec
[MEMUSE] memory usage (in after make_binary_target 038441c925bb): 4668.00MB (82.8%)
[MEMUSE] memory usage (in before find_sensor_error 038441c925bb): 4668.05MB (82.8%)
[find_sensor_error 038441c925bb] 0.061sec
[MEMUSE] memory usage (in after find_sensor_error 038441c925bb): 4673.48MB (82.9%)
03d92c9f6f8a
[MEMUSE] memory usage (in before timestamp_to_step 03d92c9f6f8a): 4662.30MB (82.8%)
[timestamp_to_step 03d92c9f6f8a] 0.016sec
[MEMUSE] memory usage (in after timestamp_to_step 03d92c9f6f8a): 4668.42MB (82.8%)
[MEMUSE] memory usage (in before make_binary_target 03d92c9f6f8a): 4668.44MB (82.8%)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_target_id["daily_step"] = (df_target_id['step'] + offset_step.values[0]) % Cfg.step_for_a_day


[make_binary_target 03d92c9f6f8a] 0.282sec
[MEMUSE] memory usage (in after make_binary_target 03d92c9f6f8a): 4703.64MB (82.9%)
[MEMUSE] memory usage (in before find_sensor_error 03d92c9f6f8a): 4703.64MB (82.9%)
[find_sensor_error 03d92c9f6f8a] 0.163sec
[MEMUSE] memory usage (in after find_sensor_error 03d92c9f6f8a): 4703.64MB (82.9%)
0402a003dae9
[MEMUSE] memory usage (in before timestamp_to_step 0402a003dae9): 4697.02MB (82.8%)
[timestamp_to_step 0402a003dae9] 0.011sec
[MEMUSE] memory usage (in after timestamp_to_step 0402a003dae9): 4703.94MB (82.9%)
[MEMUSE] memory usage (in before make_binary_target 0402a003dae9): 4703.95MB (82.9%)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_target_id["daily_step"] = (df_target_id['step'] + offset_step.values[0]) % Cfg.step_for_a_day


[make_binary_target 0402a003dae9] 0.351sec
[MEMUSE] memory usage (in after make_binary_target 0402a003dae9): 4717.14MB (82.9%)
[MEMUSE] memory usage (in before find_sensor_error 0402a003dae9): 4717.14MB (82.9%)
[find_sensor_error 0402a003dae9] 0.036sec
[MEMUSE] memory usage (in after find_sensor_error 0402a003dae9): 4695.62MB (82.9%)
04f547b8017d
[MEMUSE] memory usage (in before timestamp_to_step 04f547b8017d): 4695.62MB (82.9%)
[timestamp_to_step 04f547b8017d] 0.015sec
[MEMUSE] memory usage (in after timestamp_to_step 04f547b8017d): 4695.62MB (82.9%)
[MEMUSE] memory usage (in before make_binary_target 04f547b8017d): 4695.62MB (82.9%)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_target_id["daily_step"] = (df_target_id['step'] + offset_step.values[0]) % Cfg.step_for_a_day


[make_binary_target 04f547b8017d] 0.571sec
[MEMUSE] memory usage (in after make_binary_target 04f547b8017d): 4695.55MB (82.7%)
[MEMUSE] memory usage (in before find_sensor_error 04f547b8017d): 4695.55MB (82.7%)
[find_sensor_error 04f547b8017d] 0.067sec
[MEMUSE] memory usage (in after find_sensor_error 04f547b8017d): 4689.53MB (82.7%)
05e1944c3818
[MEMUSE] memory usage (in before timestamp_to_step 05e1944c3818): 4685.62MB (82.7%)
[timestamp_to_step 05e1944c3818] 0.010sec
[MEMUSE] memory usage (in after timestamp_to_step 05e1944c3818): 4688.11MB (82.7%)
[MEMUSE] memory usage (in before make_binary_target 05e1944c3818): 4688.11MB (82.7%)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_target_id["daily_step"] = (df_target_id['step'] + offset_step.values[0]) % Cfg.step_for_a_day


[make_binary_target 05e1944c3818] 0.123sec
[MEMUSE] memory usage (in after make_binary_target 05e1944c3818): 4689.52MB (82.7%)
[MEMUSE] memory usage (in before find_sensor_error 05e1944c3818): 4689.52MB (82.7%)
[find_sensor_error 05e1944c3818] 0.047sec
[MEMUSE] memory usage (in after find_sensor_error 05e1944c3818): 4689.52MB (82.7%)
062cae666e2a
[MEMUSE] memory usage (in before timestamp_to_step 062cae666e2a): 4689.52MB (82.7%)
[timestamp_to_step 062cae666e2a] 0.013sec
[MEMUSE] memory usage (in after timestamp_to_step 062cae666e2a): 4689.52MB (82.7%)
[MEMUSE] memory usage (in before make_binary_target 062cae666e2a): 4689.52MB (82.7%)
[make_binary_target 062cae666e2a] 0.057sec
[MEMUSE] memory usage (in after make_binary_target 062cae666e2a): 4689.52MB (82.7%)
[MEMUSE] memory usage (in before find_sensor_error 062cae666e2a): 4689.52MB (82.7%)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_target_id["daily_step"] = (df_target_id['step'] + offset_step.values[0]) % Cfg.step_for_a_day
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_target_id["daily_step"] = (df_target_id['step'] + offset_step.values[0]) % Cfg.step_for_a_day


[find_sensor_error 062cae666e2a] 0.048sec
[MEMUSE] memory usage (in after find_sensor_error 062cae666e2a): 4689.52MB (82.7%)
062dbd4c95e6
[MEMUSE] memory usage (in before timestamp_to_step 062dbd4c95e6): 4689.52MB (82.7%)
[timestamp_to_step 062dbd4c95e6] 0.017sec
[MEMUSE] memory usage (in after timestamp_to_step 062dbd4c95e6): 4689.52MB (82.7%)
[MEMUSE] memory usage (in before make_binary_target 062dbd4c95e6): 4689.52MB (82.7%)
[make_binary_target 062dbd4c95e6] 0.423sec
[MEMUSE] memory usage (in after make_binary_target 062dbd4c95e6): 4689.52MB (82.7%)
[MEMUSE] memory usage (in before find_sensor_error 062dbd4c95e6): 4689.52MB (82.7%)
[find_sensor_error 062dbd4c95e6] 0.133sec
[MEMUSE] memory usage (in after find_sensor_error 062dbd4c95e6): 4438.09MB (81.6%)
08db4255286f
[MEMUSE] memory usage (in before timestamp_to_step 08db4255286f): 4450.27MB (81.6%)
[timestamp_to_step 08db4255286f] 0.010sec
[MEMUSE] memory usage (in after timestamp_to_step 08db4255286f): 4445.69MB (81.6%)
[MEMUSE] m

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_target_id["daily_step"] = (df_target_id['step'] + offset_step.values[0]) % Cfg.step_for_a_day


[make_binary_target 08db4255286f] 0.444sec
[MEMUSE] memory usage (in after make_binary_target 08db4255286f): 4435.05MB (81.5%)
[MEMUSE] memory usage (in before find_sensor_error 08db4255286f): 4435.05MB (81.5%)
[find_sensor_error 08db4255286f] 0.032sec
[MEMUSE] memory usage (in after find_sensor_error 08db4255286f): 4436.81MB (81.5%)
0a96f4993bd7
[MEMUSE] memory usage (in before timestamp_to_step 0a96f4993bd7): 4447.00MB (81.6%)
[timestamp_to_step 0a96f4993bd7] 0.007sec
[MEMUSE] memory usage (in after timestamp_to_step 0a96f4993bd7): 4447.00MB (81.6%)
[MEMUSE] memory usage (in before make_binary_target 0a96f4993bd7): 4447.00MB (81.6%)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_target_id["daily_step"] = (df_target_id['step'] + offset_step.values[0]) % Cfg.step_for_a_day


[make_binary_target 0a96f4993bd7] 0.183sec
[MEMUSE] memory usage (in after make_binary_target 0a96f4993bd7): 4447.00MB (81.6%)
[MEMUSE] memory usage (in before find_sensor_error 0a96f4993bd7): 4447.00MB (81.6%)
[find_sensor_error 0a96f4993bd7] 0.018sec
[MEMUSE] memory usage (in after find_sensor_error 0a96f4993bd7): 4447.00MB (81.6%)
0cd1e3d0ed95
[MEMUSE] memory usage (in before timestamp_to_step 0cd1e3d0ed95): 4447.00MB (81.6%)
[timestamp_to_step 0cd1e3d0ed95] 0.009sec
[MEMUSE] memory usage (in after timestamp_to_step 0cd1e3d0ed95): 4447.00MB (81.6%)
[MEMUSE] memory usage (in before make_binary_target 0cd1e3d0ed95): 4447.00MB (81.6%)
[make_binary_target 0cd1e3d0ed95] 0.097sec
[MEMUSE] memory usage (in after make_binary_target 0cd1e3d0ed95): 4447.00MB (81.6%)
[MEMUSE] memory usage (in before find_sensor_error 0cd1e3d0ed95): 4447.00MB (81.6%)
[find_sensor_error 0cd1e3d0ed95] 0.027sec
[MEMUSE] memory usage (in after find_sensor_error 0cd1e3d0ed95): 4447.00MB (81.6%)
0ce74d6d2106
[MEMUSE]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_target_id["daily_step"] = (df_target_id['step'] + offset_step.values[0]) % Cfg.step_for_a_day
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_target_id["daily_step"] = (df_target_id['step'] + offset_step.values[0]) % Cfg.step_for_a_day


[timestamp_to_step 0ce74d6d2106] 0.013sec
[MEMUSE] memory usage (in after timestamp_to_step 0ce74d6d2106): 4447.00MB (81.6%)
[MEMUSE] memory usage (in before make_binary_target 0ce74d6d2106): 4447.00MB (81.6%)
[make_binary_target 0ce74d6d2106] 0.765sec
[MEMUSE] memory usage (in after make_binary_target 0ce74d6d2106): 4560.50MB (82.2%)
[MEMUSE] memory usage (in before find_sensor_error 0ce74d6d2106): 4560.50MB (82.2%)
[find_sensor_error 0ce74d6d2106] 0.080sec
[MEMUSE] memory usage (in after find_sensor_error 0ce74d6d2106): 4633.58MB (82.8%)
0cfc06c129cc
[MEMUSE] memory usage (in before timestamp_to_step 0cfc06c129cc): 4637.75MB (82.8%)
[timestamp_to_step 0cfc06c129cc] 0.009sec
[MEMUSE] memory usage (in after timestamp_to_step 0cfc06c129cc): 4637.75MB (82.8%)
[MEMUSE] memory usage (in before make_binary_target 0cfc06c129cc): 4637.75MB (82.8%)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_target_id["daily_step"] = (df_target_id['step'] + offset_step.values[0]) % Cfg.step_for_a_day


[make_binary_target 0cfc06c129cc] 0.339sec
[MEMUSE] memory usage (in after make_binary_target 0cfc06c129cc): 4637.75MB (82.8%)
[MEMUSE] memory usage (in before find_sensor_error 0cfc06c129cc): 4637.75MB (82.8%)
[find_sensor_error 0cfc06c129cc] 0.028sec
[MEMUSE] memory usage (in after find_sensor_error 0cfc06c129cc): 4637.75MB (82.8%)
0d0ad1e77851
[MEMUSE] memory usage (in before timestamp_to_step 0d0ad1e77851): 4637.75MB (82.8%)
[timestamp_to_step 0d0ad1e77851] 0.009sec
[MEMUSE] memory usage (in after timestamp_to_step 0d0ad1e77851): 4637.75MB (82.8%)
[MEMUSE] memory usage (in before make_binary_target 0d0ad1e77851): 4637.75MB (82.8%)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_target_id["daily_step"] = (df_target_id['step'] + offset_step.values[0]) % Cfg.step_for_a_day


[make_binary_target 0d0ad1e77851] 0.363sec
[MEMUSE] memory usage (in after make_binary_target 0d0ad1e77851): 4637.75MB (82.8%)
[MEMUSE] memory usage (in before find_sensor_error 0d0ad1e77851): 4637.75MB (82.8%)
[find_sensor_error 0d0ad1e77851] 0.032sec
[MEMUSE] memory usage (in after find_sensor_error 0d0ad1e77851): 4637.75MB (82.8%)
0dee4fda51c3
[MEMUSE] memory usage (in before timestamp_to_step 0dee4fda51c3): 4637.75MB (82.8%)
[timestamp_to_step 0dee4fda51c3] 0.009sec
[MEMUSE] memory usage (in after timestamp_to_step 0dee4fda51c3): 4637.75MB (82.8%)
[MEMUSE] memory usage (in before make_binary_target 0dee4fda51c3): 4637.75MB (82.8%)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_target_id["daily_step"] = (df_target_id['step'] + offset_step.values[0]) % Cfg.step_for_a_day


[make_binary_target 0dee4fda51c3] 0.349sec
[MEMUSE] memory usage (in after make_binary_target 0dee4fda51c3): 4645.52MB (82.8%)
[MEMUSE] memory usage (in before find_sensor_error 0dee4fda51c3): 4645.55MB (82.8%)
[find_sensor_error 0dee4fda51c3] 0.032sec
[MEMUSE] memory usage (in after find_sensor_error 0dee4fda51c3): 4651.27MB (82.8%)
0ec9fc461819
[MEMUSE] memory usage (in before timestamp_to_step 0ec9fc461819): 4639.53MB (82.8%)
[timestamp_to_step 0ec9fc461819] 0.013sec
[MEMUSE] memory usage (in after timestamp_to_step 0ec9fc461819): 4639.25MB (82.8%)
[MEMUSE] memory usage (in before make_binary_target 0ec9fc461819): 4639.25MB (82.8%)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_target_id["daily_step"] = (df_target_id['step'] + offset_step.values[0]) % Cfg.step_for_a_day


[make_binary_target 0ec9fc461819] 0.713sec
[MEMUSE] memory usage (in after make_binary_target 0ec9fc461819): 4639.25MB (82.8%)
[MEMUSE] memory usage (in before find_sensor_error 0ec9fc461819): 4639.25MB (82.8%)
[find_sensor_error 0ec9fc461819] 0.058sec
[MEMUSE] memory usage (in after find_sensor_error 0ec9fc461819): 4639.25MB (82.8%)
0ef7d94fde99
[MEMUSE] memory usage (in before timestamp_to_step 0ef7d94fde99): 4639.25MB (82.8%)
[timestamp_to_step 0ef7d94fde99] 0.009sec
[MEMUSE] memory usage (in after timestamp_to_step 0ef7d94fde99): 4639.25MB (82.8%)
[MEMUSE] memory usage (in before make_binary_target 0ef7d94fde99): 4639.25MB (82.8%)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_target_id["daily_step"] = (df_target_id['step'] + offset_step.values[0]) % Cfg.step_for_a_day


[make_binary_target 0ef7d94fde99] 0.392sec
[MEMUSE] memory usage (in after make_binary_target 0ef7d94fde99): 4698.17MB (82.9%)
[MEMUSE] memory usage (in before find_sensor_error 0ef7d94fde99): 4698.17MB (82.9%)
[find_sensor_error 0ef7d94fde99] 0.037sec
[MEMUSE] memory usage (in after find_sensor_error 0ef7d94fde99): 4706.11MB (82.9%)
0f572d690310
[MEMUSE] memory usage (in before timestamp_to_step 0f572d690310): 4698.50MB (82.9%)
[timestamp_to_step 0f572d690310] 0.011sec
[MEMUSE] memory usage (in after timestamp_to_step 0f572d690310): 4698.05MB (82.9%)
[MEMUSE] memory usage (in before make_binary_target 0f572d690310): 4698.05MB (82.9%)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_target_id["daily_step"] = (df_target_id['step'] + offset_step.values[0]) % Cfg.step_for_a_day


[make_binary_target 0f572d690310] 0.440sec
[MEMUSE] memory usage (in after make_binary_target 0f572d690310): 4703.44MB (82.9%)
[MEMUSE] memory usage (in before find_sensor_error 0f572d690310): 4703.44MB (82.9%)
[find_sensor_error 0f572d690310] 0.062sec
[MEMUSE] memory usage (in after find_sensor_error 0f572d690310): 4703.44MB (82.9%)
0f9e60a8e56d
[MEMUSE] memory usage (in before timestamp_to_step 0f9e60a8e56d): 4703.44MB (82.9%)
[timestamp_to_step 0f9e60a8e56d] 0.006sec
[MEMUSE] memory usage (in after timestamp_to_step 0f9e60a8e56d): 4703.44MB (82.9%)
[MEMUSE] memory usage (in before make_binary_target 0f9e60a8e56d): 4703.44MB (82.9%)
[make_binary_target 0f9e60a8e56d] 0.000sec
[MEMUSE] memory usage (in after make_binary_target 0f9e60a8e56d): 4703.44MB (82.9%)
[MEMUSE] memory usage (in before find_sensor_error 0f9e60a8e56d): 4703.44MB (82.9%)
[find_sensor_error 0f9e60a8e56d] 0.017sec
[MEMUSE] memory usage (in after find_sensor_error 0f9e60a8e56d): 4703.44MB (82.9%)
10469f6765bf
[MEMUSE]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_target_id["daily_step"] = (df_target_id['step'] + offset_step.values[0]) % Cfg.step_for_a_day
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_target_id["daily_step"] = (df_target_id['step'] + offset_step.values[0]) % Cfg.step_for_a_day
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_target_i

[make_binary_target 10469f6765bf] 0.054sec
[MEMUSE] memory usage (in after make_binary_target 10469f6765bf): 4703.44MB (82.9%)
[MEMUSE] memory usage (in before find_sensor_error 10469f6765bf): 4703.44MB (82.9%)
[find_sensor_error 10469f6765bf] 0.042sec
[MEMUSE] memory usage (in after find_sensor_error 10469f6765bf): 4703.44MB (82.9%)
1087d7b0ff2e
[MEMUSE] memory usage (in before timestamp_to_step 1087d7b0ff2e): 4703.44MB (82.9%)
[timestamp_to_step 1087d7b0ff2e] 0.010sec
[MEMUSE] memory usage (in after timestamp_to_step 1087d7b0ff2e): 4707.64MB (82.9%)
[MEMUSE] memory usage (in before make_binary_target 1087d7b0ff2e): 4707.64MB (82.9%)
[make_binary_target 1087d7b0ff2e] 0.442sec
[MEMUSE] memory usage (in after make_binary_target 1087d7b0ff2e): 4702.94MB (82.9%)
[MEMUSE] memory usage (in before find_sensor_error 1087d7b0ff2e): 4702.94MB (82.9%)
[find_sensor_error 1087d7b0ff2e] 0.035sec
[MEMUSE] memory usage (in after find_sensor_error 1087d7b0ff2e): 4702.94MB (82.9%)
10f8bc1f7b07
[MEMUSE]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_target_id["daily_step"] = (df_target_id['step'] + offset_step.values[0]) % Cfg.step_for_a_day


[make_binary_target 10f8bc1f7b07] 0.380sec
[MEMUSE] memory usage (in after make_binary_target 10f8bc1f7b07): 4702.94MB (82.9%)
[MEMUSE] memory usage (in before find_sensor_error 10f8bc1f7b07): 4702.94MB (82.9%)
[find_sensor_error 10f8bc1f7b07] 0.031sec
[MEMUSE] memory usage (in after find_sensor_error 10f8bc1f7b07): 4702.94MB (82.9%)
12d01911d509
[MEMUSE] memory usage (in before timestamp_to_step 12d01911d509): 4702.94MB (82.9%)
[timestamp_to_step 12d01911d509] 0.018sec
[MEMUSE] memory usage (in after timestamp_to_step 12d01911d509): 4702.94MB (82.9%)
[MEMUSE] memory usage (in before make_binary_target 12d01911d509): 4702.94MB (82.9%)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_target_id["daily_step"] = (df_target_id['step'] + offset_step.values[0]) % Cfg.step_for_a_day


[make_binary_target 12d01911d509] 0.759sec
[MEMUSE] memory usage (in after make_binary_target 12d01911d509): 4655.05MB (82.5%)
[MEMUSE] memory usage (in before find_sensor_error 12d01911d509): 4655.05MB (82.5%)
[find_sensor_error 12d01911d509] 0.120sec
[MEMUSE] memory usage (in after find_sensor_error 12d01911d509): 4624.95MB (82.8%)
1319a1935f48
[MEMUSE] memory usage (in before timestamp_to_step 1319a1935f48): 4624.95MB (82.8%)
[timestamp_to_step 1319a1935f48] 0.014sec
[MEMUSE] memory usage (in after timestamp_to_step 1319a1935f48): 4624.95MB (82.8%)
[MEMUSE] memory usage (in before make_binary_target 1319a1935f48): 4624.95MB (82.8%)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_target_id["daily_step"] = (df_target_id['step'] + offset_step.values[0]) % Cfg.step_for_a_day


[make_binary_target 1319a1935f48] 0.821sec
[MEMUSE] memory usage (in after make_binary_target 1319a1935f48): 4618.06MB (82.7%)
[MEMUSE] memory usage (in before find_sensor_error 1319a1935f48): 4618.06MB (82.7%)
[find_sensor_error 1319a1935f48] 0.075sec
[MEMUSE] memory usage (in after find_sensor_error 1319a1935f48): 4602.53MB (82.8%)
137771d19ca2
[MEMUSE] memory usage (in before timestamp_to_step 137771d19ca2): 4612.38MB (82.8%)
[timestamp_to_step 137771d19ca2] 0.010sec
[MEMUSE] memory usage (in after timestamp_to_step 137771d19ca2): 4613.56MB (82.8%)
[MEMUSE] memory usage (in before make_binary_target 137771d19ca2): 4613.56MB (82.8%)
[make_binary_target 137771d19ca2] 0.073sec
[MEMUSE] memory usage (in after make_binary_target 137771d19ca2): 4614.47MB (82.8%)
[MEMUSE] memory usage (in before find_sensor_error 137771d19ca2): 4614.47MB (82.8%)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_target_id["daily_step"] = (df_target_id['step'] + offset_step.values[0]) % Cfg.step_for_a_day
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_target_id["daily_step"] = (df_target_id['step'] + offset_step.values[0]) % Cfg.step_for_a_day


[find_sensor_error 137771d19ca2] 0.039sec
[MEMUSE] memory usage (in after find_sensor_error 137771d19ca2): 4614.47MB (82.8%)
137b99e936ab
[MEMUSE] memory usage (in before timestamp_to_step 137b99e936ab): 4614.47MB (82.8%)
[timestamp_to_step 137b99e936ab] 0.012sec
[MEMUSE] memory usage (in after timestamp_to_step 137b99e936ab): 4614.47MB (82.8%)
[MEMUSE] memory usage (in before make_binary_target 137b99e936ab): 4614.47MB (82.8%)
[make_binary_target 137b99e936ab] 0.457sec
[MEMUSE] memory usage (in after make_binary_target 137b99e936ab): 4614.47MB (82.8%)
[MEMUSE] memory usage (in before find_sensor_error 137b99e936ab): 4614.47MB (82.8%)
[find_sensor_error 137b99e936ab] 0.061sec
[MEMUSE] memory usage (in after find_sensor_error 137b99e936ab): 4614.47MB (82.8%)
13b4d6a01d27
[MEMUSE] memory usage (in before timestamp_to_step 13b4d6a01d27): 4614.47MB (82.8%)
[timestamp_to_step 13b4d6a01d27] 0.010sec
[MEMUSE] memory usage (in after timestamp_to_step 13b4d6a01d27): 4614.47MB (82.8%)
[MEMUSE] m

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_target_id["daily_step"] = (df_target_id['step'] + offset_step.values[0]) % Cfg.step_for_a_day
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_target_id["daily_step"] = (df_target_id['step'] + offset_step.values[0]) % Cfg.step_for_a_day


[find_sensor_error 13b4d6a01d27] 0.040sec
[MEMUSE] memory usage (in after find_sensor_error 13b4d6a01d27): 4614.47MB (82.8%)
148471991ffb
[MEMUSE] memory usage (in before timestamp_to_step 148471991ffb): 4614.47MB (82.8%)
[timestamp_to_step 148471991ffb] 0.013sec
[MEMUSE] memory usage (in after timestamp_to_step 148471991ffb): 4620.70MB (82.8%)
[MEMUSE] memory usage (in before make_binary_target 148471991ffb): 4620.70MB (82.8%)
[make_binary_target 148471991ffb] 0.356sec
[MEMUSE] memory usage (in after make_binary_target 148471991ffb): 4625.45MB (82.8%)
[MEMUSE] memory usage (in before find_sensor_error 148471991ffb): 4625.45MB (82.8%)
[find_sensor_error 148471991ffb] 0.060sec
[MEMUSE] memory usage (in after find_sensor_error 148471991ffb): 4629.56MB (82.8%)
154fe824ed87
[MEMUSE] memory usage (in before timestamp_to_step 154fe824ed87): 4625.25MB (82.7%)
[timestamp_to_step 154fe824ed87] 0.013sec
[MEMUSE] memory usage (in after timestamp_to_step 154fe824ed87): 4625.25MB (82.7%)
[MEMUSE] m

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_target_id["daily_step"] = (df_target_id['step'] + offset_step.values[0]) % Cfg.step_for_a_day


[make_binary_target 154fe824ed87] 0.700sec
[MEMUSE] memory usage (in after make_binary_target 154fe824ed87): 4631.91MB (82.8%)
[MEMUSE] memory usage (in before find_sensor_error 154fe824ed87): 4631.91MB (82.8%)
[find_sensor_error 154fe824ed87] 0.051sec
[MEMUSE] memory usage (in after find_sensor_error 154fe824ed87): 4629.50MB (82.8%)
16fe2798ed0f
[MEMUSE] memory usage (in before timestamp_to_step 16fe2798ed0f): 4632.11MB (82.8%)
[timestamp_to_step 16fe2798ed0f] 0.009sec
[MEMUSE] memory usage (in after timestamp_to_step 16fe2798ed0f): 4634.77MB (82.8%)
[MEMUSE] memory usage (in before make_binary_target 16fe2798ed0f): 4634.77MB (82.8%)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_target_id["daily_step"] = (df_target_id['step'] + offset_step.values[0]) % Cfg.step_for_a_day


[make_binary_target 16fe2798ed0f] 0.263sec
[MEMUSE] memory usage (in after make_binary_target 16fe2798ed0f): 4627.17MB (82.7%)
[MEMUSE] memory usage (in before find_sensor_error 16fe2798ed0f): 4627.17MB (82.7%)
[find_sensor_error 16fe2798ed0f] 0.029sec
[MEMUSE] memory usage (in after find_sensor_error 16fe2798ed0f): 4627.17MB (82.7%)
1716cd4163b2
[MEMUSE] memory usage (in before timestamp_to_step 1716cd4163b2): 4627.17MB (82.7%)
[timestamp_to_step 1716cd4163b2] 0.012sec
[MEMUSE] memory usage (in after timestamp_to_step 1716cd4163b2): 4627.17MB (82.7%)
[MEMUSE] memory usage (in before make_binary_target 1716cd4163b2): 4627.17MB (82.7%)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_target_id["daily_step"] = (df_target_id['step'] + offset_step.values[0]) % Cfg.step_for_a_day


[make_binary_target 1716cd4163b2] 0.514sec
[MEMUSE] memory usage (in after make_binary_target 1716cd4163b2): 4627.17MB (82.7%)
[MEMUSE] memory usage (in before find_sensor_error 1716cd4163b2): 4627.17MB (82.7%)
[find_sensor_error 1716cd4163b2] 0.050sec
[MEMUSE] memory usage (in after find_sensor_error 1716cd4163b2): 4627.17MB (82.7%)
1762ab70ec76
[MEMUSE] memory usage (in before timestamp_to_step 1762ab70ec76): 4613.16MB (82.6%)
[timestamp_to_step 1762ab70ec76] 0.010sec
[MEMUSE] memory usage (in after timestamp_to_step 1762ab70ec76): 4615.16MB (82.6%)
[MEMUSE] memory usage (in before make_binary_target 1762ab70ec76): 4615.16MB (82.6%)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_target_id["daily_step"] = (df_target_id['step'] + offset_step.values[0]) % Cfg.step_for_a_day


[make_binary_target 1762ab70ec76] 0.352sec
[MEMUSE] memory usage (in after make_binary_target 1762ab70ec76): 4615.16MB (82.6%)
[MEMUSE] memory usage (in before find_sensor_error 1762ab70ec76): 4615.16MB (82.6%)
[find_sensor_error 1762ab70ec76] 0.034sec
[MEMUSE] memory usage (in after find_sensor_error 1762ab70ec76): 4615.16MB (82.6%)
188d4b7cd28b
[MEMUSE] memory usage (in before timestamp_to_step 188d4b7cd28b): 4615.16MB (82.6%)
[timestamp_to_step 188d4b7cd28b] 0.009sec
[MEMUSE] memory usage (in after timestamp_to_step 188d4b7cd28b): 4615.16MB (82.6%)
[MEMUSE] memory usage (in before make_binary_target 188d4b7cd28b): 4615.16MB (82.6%)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_target_id["daily_step"] = (df_target_id['step'] + offset_step.values[0]) % Cfg.step_for_a_day


[make_binary_target 188d4b7cd28b] 0.351sec
[MEMUSE] memory usage (in after make_binary_target 188d4b7cd28b): 4615.16MB (82.6%)
[MEMUSE] memory usage (in before find_sensor_error 188d4b7cd28b): 4615.16MB (82.6%)
[find_sensor_error 188d4b7cd28b] 0.031sec
[MEMUSE] memory usage (in after find_sensor_error 188d4b7cd28b): 4615.16MB (82.6%)
18a0ca03431d
[MEMUSE] memory usage (in before timestamp_to_step 18a0ca03431d): 4615.16MB (82.6%)
[timestamp_to_step 18a0ca03431d] 0.014sec
[MEMUSE] memory usage (in after timestamp_to_step 18a0ca03431d): 4615.16MB (82.6%)
[MEMUSE] memory usage (in before make_binary_target 18a0ca03431d): 4615.16MB (82.6%)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_target_id["daily_step"] = (df_target_id['step'] + offset_step.values[0]) % Cfg.step_for_a_day


[make_binary_target 18a0ca03431d] 0.734sec
[MEMUSE] memory usage (in after make_binary_target 18a0ca03431d): 4628.86MB (82.7%)
[MEMUSE] memory usage (in before find_sensor_error 18a0ca03431d): 4628.86MB (82.7%)
[find_sensor_error 18a0ca03431d] 0.074sec
[MEMUSE] memory usage (in after find_sensor_error 18a0ca03431d): 4664.23MB (82.9%)
18b61dd5aae8
[MEMUSE] memory usage (in before timestamp_to_step 18b61dd5aae8): 4680.97MB (83.0%)
[timestamp_to_step 18b61dd5aae8] 0.013sec
[MEMUSE] memory usage (in after timestamp_to_step 18b61dd5aae8): 4680.97MB (83.0%)
[MEMUSE] memory usage (in before make_binary_target 18b61dd5aae8): 4680.97MB (83.0%)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_target_id["daily_step"] = (df_target_id['step'] + offset_step.values[0]) % Cfg.step_for_a_day


[make_binary_target 18b61dd5aae8] 0.689sec
[MEMUSE] memory usage (in after make_binary_target 18b61dd5aae8): 4680.97MB (83.0%)
[MEMUSE] memory usage (in before find_sensor_error 18b61dd5aae8): 4680.97MB (83.0%)
[find_sensor_error 18b61dd5aae8] 0.063sec
[MEMUSE] memory usage (in after find_sensor_error 18b61dd5aae8): 4680.97MB (83.0%)
1955d568d987
[MEMUSE] memory usage (in before timestamp_to_step 1955d568d987): 4680.97MB (83.0%)
[timestamp_to_step 1955d568d987] 0.014sec
[MEMUSE] memory usage (in after timestamp_to_step 1955d568d987): 4680.97MB (83.0%)
[MEMUSE] memory usage (in before make_binary_target 1955d568d987): 4680.97MB (83.0%)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_target_id["daily_step"] = (df_target_id['step'] + offset_step.values[0]) % Cfg.step_for_a_day


[make_binary_target 1955d568d987] 0.675sec
[MEMUSE] memory usage (in after make_binary_target 1955d568d987): 4841.52MB (83.0%)
[MEMUSE] memory usage (in before find_sensor_error 1955d568d987): 4841.52MB (83.0%)
[find_sensor_error 1955d568d987] 0.058sec
[MEMUSE] memory usage (in after find_sensor_error 1955d568d987): 4810.75MB (82.8%)
1b92be89db4c
[MEMUSE] memory usage (in before timestamp_to_step 1b92be89db4c): 4812.52MB (82.8%)
[timestamp_to_step 1b92be89db4c] 0.009sec
[MEMUSE] memory usage (in after timestamp_to_step 1b92be89db4c): 4812.92MB (82.8%)
[MEMUSE] memory usage (in before make_binary_target 1b92be89db4c): 4812.92MB (82.8%)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_target_id["daily_step"] = (df_target_id['step'] + offset_step.values[0]) % Cfg.step_for_a_day


[make_binary_target 1b92be89db4c] 0.350sec
[MEMUSE] memory usage (in after make_binary_target 1b92be89db4c): 4812.92MB (82.8%)
[MEMUSE] memory usage (in before find_sensor_error 1b92be89db4c): 4812.92MB (82.8%)
[find_sensor_error 1b92be89db4c] 0.031sec
[MEMUSE] memory usage (in after find_sensor_error 1b92be89db4c): 4812.92MB (82.8%)
1c7c0bad1263
[MEMUSE] memory usage (in before timestamp_to_step 1c7c0bad1263): 4812.92MB (82.8%)
[timestamp_to_step 1c7c0bad1263] 0.004sec
[MEMUSE] memory usage (in after timestamp_to_step 1c7c0bad1263): 4812.92MB (82.8%)
[MEMUSE] memory usage (in before make_binary_target 1c7c0bad1263): 4812.92MB (82.8%)
[make_binary_target 1c7c0bad1263] 0.046sec
[MEMUSE] memory usage (in after make_binary_target 1c7c0bad1263): 4812.92MB (82.8%)
[MEMUSE] memory usage (in before find_sensor_error 1c7c0bad1263): 4812.92MB (82.8%)
[find_sensor_error 1c7c0bad1263] 0.006sec
[MEMUSE] memory usage (in after find_sensor_error 1c7c0bad1263): 4812.92MB (82.8%)
1d4569cbac0f
[MEMUSE]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_target_id["daily_step"] = (df_target_id['step'] + offset_step.values[0]) % Cfg.step_for_a_day
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_target_id["daily_step"] = (df_target_id['step'] + offset_step.values[0]) % Cfg.step_for_a_day


[make_binary_target 1d4569cbac0f] 0.268sec
[MEMUSE] memory usage (in after make_binary_target 1d4569cbac0f): 4812.92MB (82.8%)
[MEMUSE] memory usage (in before find_sensor_error 1d4569cbac0f): 4812.92MB (82.8%)
[find_sensor_error 1d4569cbac0f] 0.057sec
[MEMUSE] memory usage (in after find_sensor_error 1d4569cbac0f): 4812.92MB (82.8%)
1e6717d93c1d
[MEMUSE] memory usage (in before timestamp_to_step 1e6717d93c1d): 4812.92MB (82.8%)
[timestamp_to_step 1e6717d93c1d] 0.015sec
[MEMUSE] memory usage (in after timestamp_to_step 1e6717d93c1d): 4812.92MB (82.8%)
[MEMUSE] memory usage (in before make_binary_target 1e6717d93c1d): 4812.92MB (82.8%)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_target_id["daily_step"] = (df_target_id['step'] + offset_step.values[0]) % Cfg.step_for_a_day


[make_binary_target 1e6717d93c1d] 0.138sec
[MEMUSE] memory usage (in after make_binary_target 1e6717d93c1d): 4732.70MB (82.3%)
[MEMUSE] memory usage (in before find_sensor_error 1e6717d93c1d): 4732.70MB (82.3%)
[find_sensor_error 1e6717d93c1d] 0.048sec
[MEMUSE] memory usage (in after find_sensor_error 1e6717d93c1d): 4736.52MB (82.3%)
1f96b9668bdf
[MEMUSE] memory usage (in before timestamp_to_step 1f96b9668bdf): 4722.73MB (82.2%)
[timestamp_to_step 1f96b9668bdf] 0.013sec
[MEMUSE] memory usage (in after timestamp_to_step 1f96b9668bdf): 4715.44MB (82.2%)
[MEMUSE] memory usage (in before make_binary_target 1f96b9668bdf): 4715.44MB (82.2%)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_target_id["daily_step"] = (df_target_id['step'] + offset_step.values[0]) % Cfg.step_for_a_day


[make_binary_target 1f96b9668bdf] 0.360sec
[MEMUSE] memory usage (in after make_binary_target 1f96b9668bdf): 4707.80MB (82.2%)
[MEMUSE] memory usage (in before find_sensor_error 1f96b9668bdf): 4707.80MB (82.2%)
[find_sensor_error 1f96b9668bdf] 0.054sec
[MEMUSE] memory usage (in after find_sensor_error 1f96b9668bdf): 4710.25MB (82.1%)
207eded97727
[MEMUSE] memory usage (in before timestamp_to_step 207eded97727): 4704.78MB (82.1%)
[timestamp_to_step 207eded97727] 0.009sec
[MEMUSE] memory usage (in after timestamp_to_step 207eded97727): 4704.78MB (82.1%)
[MEMUSE] memory usage (in before make_binary_target 207eded97727): 4704.78MB (82.1%)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_target_id["daily_step"] = (df_target_id['step'] + offset_step.values[0]) % Cfg.step_for_a_day


[make_binary_target 207eded97727] 0.304sec
[MEMUSE] memory usage (in after make_binary_target 207eded97727): 4704.78MB (82.1%)
[MEMUSE] memory usage (in before find_sensor_error 207eded97727): 4704.78MB (82.1%)
[find_sensor_error 207eded97727] 0.030sec
[MEMUSE] memory usage (in after find_sensor_error 207eded97727): 4704.78MB (82.1%)
debug mode. only 40 dataset. break
[make_dataset] 29.238sec
[MEMUSE] memory usage (in after make_dataset): 4704.78MB (82.1%)


In [22]:

def regenerate_dataset(path, only_feature=False):
    """
    pathにあるnpyファイル名を読み込み、ファイル名とデータ長のdictのリストを返す。
    """
    files = sorted(glob.glob(os.path.join(path, "*feature.npy")))
    files_step = sorted(glob.glob(os.path.join(path, "*step.npy")))
    files_meta = sorted(glob.glob(os.path.join(path, "*meta.json")))
    dataset = []
    for i, (file, file_step, file_meta) in enumerate(zip(files, files_step, files_meta)):
        data = np.load(file)
        meta = json.load(open(file_meta))
        
        #for i in range(5):
        #    plt.plot(data[:7200, i])
        #    plt.show()
        #raise Exception
        
        #if only_feature:
        #    dataset.append({"file": file, "length": data.shape[0], "channel": data.shape[1]})
        #else:
        
        dataset.append({"file": file, "file_step": file_step, "length": data.shape[0], "channel": data.shape[1], "end_step_for_train": min(int(meta["end_step_for_train"]), data.shape[0]), "id_no": i}) #id_for classificaiton
    return dataset



# DATA Generation did not work well

"""
seed_no = 42
num_fold = 2
series_fold = pd.read_csv(f"../data/data_processed/series_id_{num_fold}fold_seed{seed_no}.csv")
id2fold = {row["series_id"]: row["fold"] for i, row in series_fold.iterrows()}

files = sorted(glob.glob(os.path.join("../data/data_processed", "*feature.npy")))
file_series_ids = [os.path.basename(f).split(".")[0].split("_")[1] for f in files]
print(len(id2fold))
print(len(file_series_ids))
file_fold = [id2fold[series_id] for series_id in file_series_ids]
files_train = [f for f, fold in zip(files, file_fold) if fold != 0]
files = [f for f, fold in zip(files, file_fold) if fold != 0] # train by generated data
print(len(files), len(files_train))



for i in range(10):
    np.random.seed(i)
    shuffle_indices = np.arange(len(files))
    np.random.shuffle(shuffle_indices)
    shuffle_files = [files[i] for i in shuffle_indices]
    for f, f2 in zip(files, shuffle_files):
        id_no = f.split("id_")[-1].split("_")[0]
        id_no2 = f2.split("id_")[-1].split("_")[0]
        print(id_no)
        processed_array = np.load(f)
        second_array = np.load(f2)

        generate_data(processed_array, id_no+"W"+id_no2, days_to_generate_ratio=0.2, min_split_minutes=60, max_split_minutes=240, save_dir=f"../data/data_generated_f{num_fold}_s{seed_no}/", plot=False, second_array=second_array, seed_no=i)
"""

'\nseed_no = 42\nnum_fold = 2\nseries_fold = pd.read_csv(f"../data/data_processed/series_id_{num_fold}fold_seed{seed_no}.csv")\nid2fold = {row["series_id"]: row["fold"] for i, row in series_fold.iterrows()}\n\nfiles = sorted(glob.glob(os.path.join("../data/data_processed", "*feature.npy")))\nfile_series_ids = [os.path.basename(f).split(".")[0].split("_")[1] for f in files]\nprint(len(id2fold))\nprint(len(file_series_ids))\nfile_fold = [id2fold[series_id] for series_id in file_series_ids]\nfiles_train = [f for f, fold in zip(files, file_fold) if fold != 0]\nfiles = [f for f, fold in zip(files, file_fold) if fold != 0] # train by generated data\nprint(len(files), len(files_train))\n\n\n\nfor i in range(10):\n    np.random.seed(i)\n    shuffle_indices = np.arange(len(files))\n    np.random.shuffle(shuffle_indices)\n    shuffle_files = [files[i] for i in shuffle_indices]\n    for f, f2 in zip(files, shuffle_files):\n        id_no = f.split("id_")[-1].split("_")[0]\n        id_no2 = f2.spli