In [1]:
import pandas as pd
import numpy as np
import gc
import time
import json
from datetime import datetime
import matplotlib.pyplot as plt
import os
import joblib
import random
import math
from tqdm import tqdm 

from scipy.interpolate import interp1d
from scipy import signal
from scipy.signal import argrelmax

from math import pi, sqrt, exp
import sklearn,sklearn.model_selection
from sklearn.metrics import mean_squared_error
import torch
from torch import nn,Tensor
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, SubsetRandomSampler
from sklearn.metrics import average_precision_score
from timm.scheduler import CosineLRScheduler
plt.style.use("ggplot")

from transformers import get_cosine_schedule_with_warmup
from transformers import AutoTokenizer, AutoConfig
from transformers import AutoModel

from pyarrow.parquet import ParquetFile
import pyarrow as pa 
import ctypes
# Cap PyTorch CPU parallelism at 4 inter-op and 4 intra-op threads.
torch.set_num_interop_threads(4)
torch.set_num_threads(4)

# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'



In [2]:
# Fundamental config
# os.cpu_count() returns None when the CPU count cannot be determined;
# fall back to a single CPU instead of failing on `None // 2`.
WORKERS = (os.cpu_count() or 1) // 2  # number of DataLoader worker processes
N_FOLDS = 5

MAX_LEN = 2880  # number of time steps in one model input window
STRIDE = MAX_LEN // 2  # hop between consecutive windows (half a window)
SEED = 8620

In [3]:
def torch_fix_seed(seed=42):
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random`` module, NumPy's global RNG, and PyTorch
    (``torch.manual_seed`` plus ``torch.cuda.manual_seed``).

    Deterministic cuDNN kernels are NOT requested; cuDNN benchmark mode is
    switched on instead.

    :param seed: integer seed shared by all generators.
    """
    seeders = (
        random.seed,             # Python random
        np.random.seed,          # Numpy
        torch.manual_seed,       # Pytorch
        torch.cuda.manual_seed,  # Pytorch (CUDA)
    )
    for apply_seed in seeders:
        apply_seed(seed)

    torch.backends.cudnn.benchmark = True

# Seed all RNGs once, using the notebook-wide SEED constant.
torch_fix_seed(SEED)

In [4]:
# (height, width) of the feature image handed to the detector;
# the width equals the length of one input window.
IMG_SIZE = (384, MAX_LEN)
HEIGHT, WIDTH = IMG_SIZE

In [5]:
# coding: utf-8
__author__ = 'ZFTurbo: https://kaggle.com/zfturbo'


import warnings
import numpy as np


def prefilter_boxes(boxes, scores, labels, weights, thr):
    """Validate, clip and group the raw predictions of every model by label.

    :param boxes: per-model list of boxes, each box being (x1, y1, x2, y2)
        expected to be normalised to [0, 1]
    :param scores: per-model list of confidence scores
    :param labels: per-model list of labels
    :param weights: per-model weight; each score is multiplied by it
    :param thr: boxes with a score below this threshold are dropped
    :return: dict mapping label -> numpy array of rows
        [label, score * weight, weight, model index, x1, y1, x2, y2],
        sorted by weighted score in descending order
    :raises ValueError: if, for some model, the number of boxes differs from
        the number of scores or labels
    """

    def _clip_unit(value, name):
        # Clamp one coordinate into [0, 1], warning when it was out of range.
        if value < 0:
            warnings.warn('{} < 0 in box. Set it to 0.'.format(name))
            return 0
        if value > 1:
            warnings.warn('{} > 1 in box. Set it to 1. Check that you normalize boxes in [0, 1] range.'.format(name))
            return 1
        return value

    # Create dict with boxes stored by its label
    new_boxes = dict()

    for t in range(len(boxes)):

        # Raise instead of print + exit(): exit() would terminate the whole
        # interpreter / notebook kernel without a traceback.
        if len(boxes[t]) != len(scores[t]):
            raise ValueError('Error. Length of boxes arrays not equal to length of scores array: {} != {}'.format(len(boxes[t]), len(scores[t])))

        if len(boxes[t]) != len(labels[t]):
            raise ValueError('Error. Length of boxes arrays not equal to length of labels array: {} != {}'.format(len(boxes[t]), len(labels[t])))

        for j in range(len(boxes[t])):
            score = scores[t][j]
            if score < thr:
                continue
            label = int(labels[t][j])
            box_part = boxes[t][j]
            x1 = float(box_part[0])
            y1 = float(box_part[1])
            x2 = float(box_part[2])
            y2 = float(box_part[3])

            # Box data checks
            if x2 < x1:
                warnings.warn('X2 < X1 value in box. Swap them.')
                x1, x2 = x2, x1
            if y2 < y1:
                warnings.warn('Y2 < Y1 value in box. Swap them.')
                y1, y2 = y2, y1
            x1 = _clip_unit(x1, 'X1')
            x2 = _clip_unit(x2, 'X2')
            y1 = _clip_unit(y1, 'Y1')
            y2 = _clip_unit(y2, 'Y2')
            if (x2 - x1) * (y2 - y1) == 0.0:
                warnings.warn("Zero area box skipped: {}.".format(box_part))
                continue

            # [label, score, weight, model index, x1, y1, x2, y2]
            b = [int(label), float(score) * weights[t], weights[t], t, x1, y1, x2, y2]
            if label not in new_boxes:
                new_boxes[label] = []
            new_boxes[label].append(b)

    # Sort each list in dict by score and transform it to numpy array
    for k in new_boxes:
        current_boxes = np.array(new_boxes[k])
        new_boxes[k] = current_boxes[current_boxes[:, 1].argsort()[::-1]]

    return new_boxes


def get_weighted_box(boxes, conf_type='avg'):
    """
    Fuse a cluster of boxes into one score-weighted box.
    :param boxes: rows of (label, score, weight, model index, x1, y1, x2, y2)
    :param conf_type: 'max' keeps the highest score of the cluster; 'avg',
        'box_and_model_avg' and 'absent_model_aware_avg' use the mean score
    :return: float32 array (label, score, weight, model index, x1, y1, x2, y2)
        where the coordinates are the score-weighted mean of the cluster, the
        weight is the sum of the member weights and the model index is -1
    """
    fused = np.zeros(8, dtype=np.float32)
    score_sum = 0
    weight_sum = 0
    member_scores = []
    for member in boxes:
        member_score = member[1]
        fused[4:] += (member_score * member[4:])
        score_sum += member_score
        weight_sum += member[2]
        member_scores.append(member_score)

    fused[0] = boxes[0][0]
    if conf_type == 'max':
        fused[1] = np.array(member_scores).max()
    elif conf_type in ('avg', 'box_and_model_avg', 'absent_model_aware_avg'):
        fused[1] = score_sum / len(boxes)
    fused[2] = weight_sum
    fused[3] = -1  # the fused box belongs to no single model
    # Turn the accumulated score-weighted coordinate sums into a weighted mean.
    fused[4:] /= score_sum
    return fused


def find_matching_box_fast(boxes_list, new_box, match_iou):
    """
        Vectorised search for the already-fused box that best overlaps `new_box`.

        `boxes_list` is an (N, 8) array and `new_box` a length-8 row, both laid
        out as (label, score, weight, model index, x1, y1, x2, y2). Only rows
        carrying the same label as `new_box` can match.

        Returns (index, iou) of the best candidate when its IoU is strictly
        greater than `match_iou`, otherwise (-1, match_iou).
    """
    if boxes_list.shape[0] == 0:
        return -1, match_iou

    candidates = boxes_list[:, 4:]
    query = new_box[4:]

    # Intersection rectangle, clamped to zero when the boxes do not overlap.
    left = np.maximum(candidates[:, 0], query[0])
    top = np.maximum(candidates[:, 1], query[1])
    right = np.minimum(candidates[:, 2], query[2])
    bottom = np.minimum(candidates[:, 3], query[3])
    inter_area = np.maximum(right - left, 0) * np.maximum(bottom - top, 0)

    candidate_area = (candidates[:, 2] - candidates[:, 0]) * (candidates[:, 3] - candidates[:, 1])
    query_area = (query[2] - query[0]) * (query[3] - query[1])

    ious = inter_area / (candidate_area + query_area - inter_area)

    # Boxes of a different label can never be merged with the query.
    ious[boxes_list[:, 0] != new_box[0]] = -1

    best_idx = np.argmax(ious)
    best_iou = ious[best_idx]

    if best_iou <= match_iou:
        return -1, match_iou
    return best_idx, best_iou


def weighted_boxes_fusion(
        boxes_list,
        scores_list,
        labels_list,
        weights=None,
        iou_thr=0.55,
        skip_box_thr=0.0,
        conf_type='avg',
        allows_overflow=False
):
    '''
    :param boxes_list: list of boxes predictions from each model, each box is 4 numbers.
    It has 3 dimensions (models_number, model_preds, 4)
    Order of boxes: x1, y1, x2, y2. We expect float normalized coordinates [0; 1]
    :param scores_list: list of scores for each model
    :param labels_list: list of labels for each model
    :param weights: list of weights for each model. Default: None, which means weight == 1 for each model
    :param iou_thr: IoU value for boxes to be a match
    :param skip_box_thr: exclude boxes with score lower than this variable
    :param conf_type: how to calculate confidence in weighted boxes.
        'avg': average value,
        'max': maximum value,
        'box_and_model_avg': box and model wise hybrid weighted average,
        'absent_model_aware_avg': weighted average that takes into account the absent model.
    :param allows_overflow: false if we want confidence score not exceed 1.0

    :return: boxes: boxes coordinates (Order of boxes: x1, y1, x2, y2).
    :return: scores: confidence scores
    :return: labels: boxes labels
    :raises ValueError: if conf_type is not one of the supported values
    '''

    if weights is None:
        weights = np.ones(len(boxes_list))
    if len(weights) != len(boxes_list):
        print('Warning: incorrect number of weights {}. Must be: {}. Set weights equal to 1.'.format(len(weights), len(boxes_list)))
        weights = np.ones(len(boxes_list))
    weights = np.array(weights)

    if conf_type not in ['avg', 'max', 'box_and_model_avg', 'absent_model_aware_avg']:
        # Raise instead of print + exit(): exit() would terminate the whole
        # interpreter / notebook kernel without a traceback.
        raise ValueError('Unknown conf_type: {}. Must be "avg", "max" or "box_and_model_avg", or "absent_model_aware_avg"'.format(conf_type))

    filtered_boxes = prefilter_boxes(boxes_list, scores_list, labels_list, weights, skip_box_thr)
    if len(filtered_boxes) == 0:
        return np.zeros((0, 4)), np.zeros((0,)), np.zeros((0,))

    overall_boxes = []
    for label in filtered_boxes:
        boxes = filtered_boxes[label]
        # new_boxes[i] holds the member boxes of cluster i; weighted_boxes[i]
        # is the current fused box of that cluster.
        new_boxes = []
        weighted_boxes = np.empty((0, 8))

        # Clusterize boxes
        for j in range(0, len(boxes)):
            index, best_iou = find_matching_box_fast(weighted_boxes, boxes[j], iou_thr)

            if index != -1:
                new_boxes[index].append(boxes[j])
                weighted_boxes[index] = get_weighted_box(new_boxes[index], conf_type)
            else:
                new_boxes.append([boxes[j].copy()])
                weighted_boxes = np.vstack((weighted_boxes, boxes[j].copy()))

        # Rescale confidence based on number of models and boxes
        for i in range(len(new_boxes)):
            clustered_boxes = new_boxes[i]
            if conf_type == 'box_and_model_avg':
                clustered_boxes = np.array(clustered_boxes)
                # weighted average for boxes
                weighted_boxes[i, 1] = weighted_boxes[i, 1] * len(clustered_boxes) / weighted_boxes[i, 2]
                # identify unique model index by model index column
                _, idx = np.unique(clustered_boxes[:, 3], return_index=True)
                # rescale by unique model weights
                weighted_boxes[i, 1] = weighted_boxes[i, 1] *  clustered_boxes[idx, 2].sum() / weights.sum()
            elif conf_type == 'absent_model_aware_avg':
                clustered_boxes = np.array(clustered_boxes)
                # get unique model index in the cluster
                models = np.unique(clustered_boxes[:, 3]).astype(int)
                # create a mask to get unused model weights
                mask = np.ones(len(weights), dtype=bool)
                mask[models] = False
                # absent model aware weighted average
                weighted_boxes[i, 1] = weighted_boxes[i, 1] * len(clustered_boxes) / (weighted_boxes[i, 2] + weights[mask].sum())
            elif conf_type == 'max':
                weighted_boxes[i, 1] = weighted_boxes[i, 1] / weights.max()
            elif not allows_overflow:
                weighted_boxes[i, 1] = weighted_boxes[i, 1] * min(len(weights), len(clustered_boxes)) / weights.sum()
            else:
                weighted_boxes[i, 1] = weighted_boxes[i, 1] * len(clustered_boxes) / weights.sum()
        overall_boxes.append(weighted_boxes)
    overall_boxes = np.concatenate(overall_boxes, axis=0)
    # Highest-confidence fused boxes first.
    overall_boxes = overall_boxes[overall_boxes[:, 1].argsort()[::-1]]
    boxes = overall_boxes[:, 4:]
    scores = overall_boxes[:, 1]
    labels = overall_boxes[:, 0]
    return boxes, scores, labels


In [6]:
class PATHS:
    """File locations of the competition data inside the Kaggle input directory."""
    MAIN_DIR = "/kaggle/input/child-mind-institute-detect-sleep-states/"
    # CSV files
    SUBMISSION = f"{MAIN_DIR}sample_submission.csv"
    TRAIN_EVENTS = f"{MAIN_DIR}train_events.csv"
    # Parquet files
    TRAIN_SERIES = f"{MAIN_DIR}train_series.parquet"
    TEST_SERIES = f"{MAIN_DIR}test_series.parquet"
class CFG:
    # Demo-mode switch.
    # NOTE(review): not read anywhere in the visible cells (the reader below is
    # created with an explicit demo_mode=False) - confirm whether it is still needed.
    DEMO_MODE = True
class data_reader:
    """Loader for the competition CSV / parquet files.

    :param demo_mode: when true, only the first 20,000 rows of a file are read.
    """

    def __init__(self, demo_mode):
        # MAPPING FOR DATA LOADING :
        self.names_mapping = {
            "submission" : {"path" : PATHS.SUBMISSION, "is_parquet" : False, "has_timestamp" : False}, 
            "train_events" : {"path" : PATHS.TRAIN_EVENTS, "is_parquet" : False, "has_timestamp" : True},
            "train_series" : {"path" : PATHS.TRAIN_SERIES, "is_parquet" : True, "has_timestamp" : True},
            "test_series" : {"path" : PATHS.TEST_SERIES, "is_parquet" : True, "has_timestamp" : True}
        }
        self.valid_names = ["submission", "train_events", "train_series", "test_series"]
        self.demo_mode = demo_mode
    
    def verify(self, data_name):
        """Check that `data_name` is a known dataset name.

        :raises ValueError: if `data_name` is not listed in `self.valid_names`
        """
        if data_name not in self.valid_names:
            # The original referenced the undefined global `valid_names`
            # (NameError) and would otherwise have fallen through to a KeyError.
            raise ValueError(
                "PLEASE ENTER A VALID DATASET NAME, VALID NAMES ARE : {}".format(self.valid_names)
            )
        return
    
    def cleaning(self, data):
        """Drop the rows whose `timestamp` is missing and report how many were removed.

        :param data: DataFrame with a `timestamp` column
        :return: DataFrame without the rows that have a missing timestamp
        """
        before_cleaning = len(data)
        print("Number of missing timestamps : ", len(data[data["timestamp"].isna()]))
        data = data.dropna(subset=["timestamp"])
        after_cleaning = len(data)
        # An empty input has nothing to remove; avoid dividing by zero.
        removed_pct = 100 * (before_cleaning - after_cleaning) / before_cleaning if before_cleaning else 0.0
        print("Percentage of removed rows : {:.1f}%".format(removed_pct))
#         print(data.isna().any())
#         data = data.bfill()
        return data
    
    @staticmethod
    def reduce_memory_usage(data):
        """Downcast every column to the smallest dtype whose range contains its values.

        Integer columns are narrowed to int8/16/32/64, other non-object columns
        to float16/32/64, and object columns become `category`. The frame is
        modified column by column and also returned.

        :param data: DataFrame to shrink
        :return: the same DataFrame with narrowed dtypes
        """
        start_mem = data.memory_usage().sum() / 1024**2
        print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
        for col in data.columns:
            col_type = data[col].dtype    
            if col_type != object:
                c_min = data[col].min()
                c_max = data[col].max()
                if str(col_type)[:3] == 'int':
                    if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                        data[col] = data[col].astype(np.int8)
                    elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                        data[col] = data[col].astype(np.int16)
                    elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                        data[col] = data[col].astype(np.int32)
                    elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                        data[col] = data[col].astype(np.int64)  
                else:
                    if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                        data[col] = data[col].astype(np.float16)
                    elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                        data[col] = data[col].astype(np.float32)
                    else:
                        data[col] = data[col].astype(np.float64)
            else:
                data[col] = data[col].astype('category')

        end_mem = data.memory_usage().sum() / 1024**2
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        # Guard against a zero-byte frame when reporting the relative saving.
        saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Decreased by {:.1f}%'.format(saved_pct))
        return data
    
    def load_data(self, data_name):
        """Load one of the known datasets as a pandas DataFrame.

        In demo mode only the first 20,000 rows are read. Datasets flagged
        with `has_timestamp` are passed through `cleaning`.

        :param data_name: one of `self.valid_names`
        :return: the loaded DataFrame
        :raises ValueError: if `data_name` is unknown
        """
        self.verify(data_name)
        data_props = self.names_mapping[data_name]
        if data_props["is_parquet"]:
            if self.demo_mode:
                # Read only the first record batch of the parquet file.
                pf = ParquetFile(data_props["path"]) 
                demo_rows = next(pf.iter_batches(batch_size=20_000)) 
                data = pa.Table.from_batches([demo_rows]).to_pandas()
            else:
                data = pd.read_parquet(data_props["path"])
        else:
            if self.demo_mode:
                data = pd.read_csv(data_props["path"], nrows=20_000)
            else:
                data = pd.read_csv(data_props["path"])
                
        gc.collect()
        if data_props["has_timestamp"]:
            print('cleaning')
            data = self.cleaning(data)
            gc.collect()
        #data = self.reduce_memory_usage(data)
        return data


In [7]:
# Load the complete (non-demo) test series and collect the distinct series ids
# that predictions have to be produced for.
reader = data_reader(demo_mode=False)
test_series = reader.load_data(data_name="test_series")
ids = test_series.series_id.unique()
gc.collect()

cleaning
Number of missing timestamps :  0
Percentage of removed rows : 0.0%


0

In [8]:
from torchvision.models.detection.faster_rcnn import *
from torchvision.models.resnet import *
from torchvision.models.detection.backbone_utils import _resnet_fpn_extractor, _validate_trainable_layers

In [9]:
import torchvision
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator
from torchvision.models.efficientnet import EfficientNet_V2_S_Weights, efficientnet_v2_s

In [10]:
from typing import Any, Dict, List, Optional, Tuple
from torchvision.models.detection.image_list import *

In [11]:
def seq_block(in_c, out_c, ksh=4, usf=2, ksw=3):
    """Build one up-sampling block: BN -> LeakyReLU -> ConvTranspose2d -> BN -> LeakyReLU -> Conv2d.

    The transposed convolution uses stride ``(usf, 1)`` with a ``(ksh, ksw)``
    kernel, so only the height is up-sampled; the final convolution is a square
    ``ksw`` kernel with padding ``(ksw - 1) // 2``.

    :param in_c: number of input channels
    :param out_c: number of output channels
    :param ksh: kernel height of the transposed convolution
    :param usf: stride of the transposed convolution along the height
    :param ksw: kernel width of the transposed convolution and size of the final convolution
    :return: the assembled ``nn.Sequential``
    """
    pad_w = (ksw - 1) // 2
    pad_h = (ksh - usf) // 2
    layers = [
        nn.BatchNorm2d(in_c),
        nn.LeakyReLU(0.2),
        nn.ConvTranspose2d(
            in_c,
            out_c,
            kernel_size=(ksh, ksw),
            stride=(usf, 1),
            padding=(pad_h, pad_w),
        ),
        nn.BatchNorm2d(out_c),
        nn.LeakyReLU(0.2),
        nn.Conv2d(out_c, out_c, ksw, padding=pad_w),
    ]
    return nn.Sequential(*layers)
class Feat2Img(nn.Module):
    """Turn a window of per-step features into a 3-channel image for the detector.

    The last feature of the input is an hour index that is embedded; the
    remaining features are concatenated with that embedding, projected by a
    1x1 Conv1d and then up-sampled along a new height axis by eight
    transposed-convolution stages (3 * 2**7 = 384 rows).

    The module replaces the detector's ``transform``, so ``forward`` returns an
    ``ImageList`` plus the untouched targets and ``postprocess`` is a no-op.

    :param in_c: number of continuous input features (hour column excluded)
    :param hr_feat: size of the hour embedding
    :param base: channel multiplier of the up-sampling stages
    :param ksw: kernel width used by the shortcut transposed convolutions
    """

    def __init__(self, in_c=2, hr_feat=32, base=16, ksw=3):
        super(Feat2Img, self).__init__()
        self.hr_emb = nn.Embedding(24, hr_feat)
        self.fc1_hr = nn.Linear(hr_feat, hr_feat)
        self.fc2_hr = nn.Linear(hr_feat, hr_feat)
        
        self.inter_conv = nn.Conv1d(in_c+hr_feat, base*128, kernel_size=1, padding=0)
        
        # Per-stage height up-sampling factors and kernel heights.
        usfs = [3, 2, 2, 2, 2, 2, 2, 2]
        kshs = [3, 4, 4, 4, 4, 4, 4, 4]
        hid_in = [base*128, base*64, base*32, base*16, base*8, base*4, base*2, base]
        hid_out = [base*64, base*32, base*16, base*8, base*4, base*2, base, 3]
        padw = (ksw-1)//2

        self.blks = nn.Sequential(
            *[seq_block(hid_in[i], hid_out[i], kshs[i], usfs[i]) for i in range(len(hid_in))]
            )
        self.shortcuts = nn.Sequential(
            *[nn.ConvTranspose2d(hid_in[i], hid_out[i], kernel_size=(kshs[i],ksw), stride=(usfs[i], 1), padding=((kshs[i]-usfs[i])//2,padw)) for i in range(len(hid_in))]
            )
        self.relu = nn.LeakyReLU(0.2)
    
    def forward(self, x, t=None):
        """Convert a feature batch into an ``ImageList``.

        :param x: tensor whose last dimension holds the continuous features
            followed by the hour index in the final position
        :param t: targets, passed through unchanged
        :return: ``(ImageList, t)``
        """
        # Follow the dtype of the module's own weights instead of hard-coding
        # .half(): identical for an fp16 model, and no silent precision loss
        # when the module is kept in fp32.
        param_dtype = self.inter_conv.weight.dtype
        x, h = x[...,:-1].to(param_dtype), x[...,-1].long()
        e = self.hr_emb(h)
        e = self.relu(e)
        e = self.fc1_hr(e)
        e = self.relu(e)
        e = self.fc2_hr(e)
        x = torch.cat([x, e], dim=-1).permute(0,2,1)
        x = self.inter_conv(x)
        # Add a height axis of size 1 that the stages below up-sample.
        x = x.unsqueeze(2)
        
        # Residual-style stages: shortcut up-sampling plus the full block.
        for s, b in zip(self.shortcuts, self.blks):
            x = s(x) + b(x)

        image_list = ImageList(x, [[HEIGHT, WIDTH]]*x.size(0))
        return image_list, t

    def postprocess(
        self,
        result: List[Dict[str, Tensor]],
        image_shapes: List[Tuple[int, int]],
        original_image_sizes: List[Tuple[int, int]],
    ) -> List[Dict[str, Tensor]]:
        """Return the detections unchanged (no rescaling to an original image size)."""
        return result

In [12]:
import timm
import torchvision
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator
def get_model_convnext(in_c, mn='convnextv2_base.fcmae_ft_in22k_in1k_384', pretrained=False):
    """Build a Faster R-CNN detector on top of a timm ConvNeXt backbone.

    The detector's input transform is replaced by ``Feat2Img`` and the
    backbone head by a 1x1 convolution producing 1280 channels.

    :param in_c: number of continuous input features passed to ``Feat2Img``
    :param mn: timm model name; must contain 'large' or 'base'
    :param pretrained: whether timm should load pretrained backbone weights
    :return: the assembled ``FasterRCNN`` model
    :raises ValueError: if ``mn`` is neither a 'large' nor a 'base' variant
    """
    # Validate the variant before building anything: `assert 0` gave no message,
    # is stripped under -O and was only reached after the model was constructed.
    if 'large' in mn:
        head_in_channels = 1536
    elif 'base' in mn:
        head_in_channels = 1024
    else:
        raise ValueError(
            "Unsupported backbone '{}': the model name must contain 'large' or 'base'.".format(mn)
        )

    backbone = timm.create_model(mn, pretrained=pretrained)

    # FasterRCNN reads the number of backbone output channels from this attribute.
    backbone.out_channels = 1280
    
    anchor_generator = AnchorGenerator(sizes=((32, 64, 128, 256, 512),),
                                     aspect_ratios=((0.5, 1.0, 2.0),))    
        
    # put the pieces together inside a FasterRCNN model
    model = FasterRCNN(backbone,
                    num_classes=4,
                    rpn_anchor_generator=anchor_generator
                      )

    model.transform = Feat2Img(in_c)

    model.backbone.head = nn.Conv2d(head_in_channels, 1280, kernel_size=(1, 1), stride=(1, 1), bias=True)

    return model

In [13]:
# submission
class SleepTestDataset(Dataset):
    """Per-series inference dataset.

    Each item is one whole series: the engineered feature matrix with the hour
    column appended as the last feature, together with the series id.
    Feature engineering is performed lazily in ``__getitem__``.
    """

    def __init__(
        self,
        test_series,
        test_ids
    ):
        artifact_dir = '/kaggle/input/cmi-dss-objectdet-approach/'
        # Normalisation statistics loaded from the stored artefacts.
        self.enmo_mean = np.load(artifact_dir + 'enmo_mean.npy')
        self.enmo_std = np.load(artifact_dir + 'enmo_std.npy')

        self.Xs = self.conv_dfs(test_series, test_ids)
        self.ids = test_ids

        # Names of the feature columns selected in __getitem__.
        self.feat_list = np.load(artifact_dir + 'feature_list.npy')
        self.label_list = ['onset', 'wakeup']

        self.hour_feat = ['hour']

    def conv_dfs(self, series, ids):
        """Split the long series table into one frame per id.

        Each frame keeps ``step``, ``anglez``, ``enmo`` and an ``hour`` column
        derived from the timestamp after conversion to the UTC-04:00 offset.
        """
        per_series = []
        for _, series_id in tqdm(enumerate(ids), total=len(ids)):
            frame = series.loc[(series.series_id == series_id)].copy().reset_index()
            parsed = pd.to_datetime(frame.timestamp, format='%Y-%m-%dT%H:%M:%S%z')
            frame['dt'] = parsed.astype("datetime64[ns, UTC-04:00]")
            frame['hour'] = frame['dt'].dt.hour
            per_series.append(frame[['step', 'anglez', 'enmo', 'hour']])

        return per_series

    def norm_feat_eng(self, X, init=False):
        """Normalise the raw signals and add the engineered feature columns.

        ``X`` is modified in place and a float32 copy with NaNs replaced by 0
        is returned. With ``init=True`` every generated feature name is also
        appended to ``self.feat_list``.
        """
        signals = ('anglez', 'enmo', 'enmo_ln1p')

        def register(*names):
            # Feature names are only recorded while (re)building the feature list.
            if init:
                for feature_name in names:
                    self.feat_list.append(feature_name)

        # Base scaling; enmo_ln1p must be taken from the raw enmo before it is standardised.
        X['anglez'] = X['anglez'] / 90.0
        X['enmo_ln1p'] = np.log1p(X['enmo'])
        X['enmo'] = (X['enmo'] - self.enmo_mean) / (self.enmo_std + 1e-12)

        # Power features.
        power_specs = (
            ('anglez', '2', 2),
            ('enmo_ln1p', '2', 2),
            ('enmo_ln1p', '05', 0.5),
            ('enmo_ln1p', '4', 4),
        )
        for source, suffix, exponent in power_specs:
            X[f'{source}_{suffix}'] = X[source] ** exponent
        register('anglez_2', 'enmo_ln1p_2', 'enmo_ln1p_05', 'enmo_ln1p_4', 'enmo_ln1p')

        # First and second numerical derivatives of every signal.
        running = {name: X[name] for name in signals}
        for order in (1, 2):
            for name in signals:
                running[name] = np.gradient(running[name])
                X[f'{name}_grad_{order}'] = running[name]
            register(*[f'{name}_grad_{order}' for name in signals])

        # Lagged / leading copies; positions shifted past the edge become 0.
        for w in [1, 2, 4, 8, 16, 32]:
            for name in signals:
                pos_col = f'{name}_shift_pos_{w}'
                neg_col = f'{name}_shift_neg_{w}'
                X[pos_col] = X[name].shift(w).fillna(0)
                X[neg_col] = X[name].shift(-w).fillna(0)
                register(pos_col, neg_col)

        # Centred rolling statistics.
        for r in [5, 17, 33, 65, 129]:
            for name in signals:
                window = X[name].rolling(r, center=True)
                mean_col = f'{name}_mean_{r}'
                std_col = f'{name}_std_{r}'
                X[mean_col] = window.mean()
                X[std_col] = window.std()
                register(mean_col, std_col)

        return X.fillna(0).astype(np.float32)

    def __len__(self):
        return len(self.Xs)

    def __getitem__(self, index):
        """Return ``(features, series_id)`` for the series at ``index``."""
        frame = self.norm_feat_eng(self.Xs[index].copy(), init=False)
        feats = frame[self.feat_list].values.astype(np.float32)
        hours = frame[self.hour_feat].values.astype(np.int32)
        # The hour index becomes the last feature column.
        stacked = np.concatenate([feats, hours], axis=1)
        return torch.tensor(stacked), self.ids[index]

# Build the inference dataset: one entry per series id found in the test series.
test_ds = SleepTestDataset(test_series, ids)

100%|██████████| 3/3 [00:00<00:00, 95.04it/s]


In [14]:
# # Debug
# class SleepTestDataset(Dataset):
#     def __init__(
#         self,
#         test_series,
#         test_ids
#     ):
#         self.enmo_mean = np.load('/kaggle/input/cmi-dss-objectdet-approach/enmo_mean.npy')
#         self.enmo_std = np.load('/kaggle/input/cmi-dss-objectdet-approach/enmo_std.npy')

#         self.Xs, self.ids = self.read_csvs(test_series)
        
        
#         self.feat_list = np.load('/kaggle/input/cmi-dss-objectdet-approach/feature_list.npy')
#         self.label_list = ['onset', 'wakeup']
        
#         self.hour_feat = ['hour']
        
            
#     def read_csvs(self, folder):
#         res = []
#         ids = []
#         if type(folder) is str:
#             files = sorted(glob.glob(f'{folder}/*.csv'))
#         else:
#             files = folder
#         for i, f in tqdm(enumerate(files), total=len(files), leave=False):
#             df = pd.read_csv(f)
#             res.append(df)
#             name = f.split('/')[-1].split('.')[0]
#             ids.append(name)
#         return res, ids
    

#     def norm_feat_eng(self, X, init=False):
#         X['anglez'] = X['anglez'] / 90.0
#         X['enmo_ln1p'] = np.log1p(X['enmo'])
#         X['enmo'] = (X['enmo']-self.enmo_mean) / (self.enmo_std+1e-12)

#         X['anglez_2'] = X['anglez'] ** 2 
#         X['enmo_ln1p_2'] = X['enmo_ln1p'] ** 2
#         X['enmo_ln1p_05'] = X['enmo_ln1p'] ** 0.5
#         X['enmo_ln1p_4'] = X['enmo_ln1p'] ** 4

#         if init:
#             self.feat_list.append('anglez_2')
#             self.feat_list.append('enmo_ln1p_2')
#             self.feat_list.append('enmo_ln1p_05')
#             self.feat_list.append('enmo_ln1p_4')
            
#             self.feat_list.append('enmo_ln1p')

#         f = X['anglez']
#         g = X['enmo']
#         h = X['enmo_ln1p']
#         n_grads = 2
#         for i in range(n_grads):
#             f = np.gradient(f)
#             g = np.gradient(g)
#             h = np.gradient(h)
#             X['anglez_grad_' + str(i+1)] = f
#             X['enmo_grad_' + str(i+1)] = g
#             X['enmo_ln1p_grad_' + str(i+1)] = h
#             if init:
#                 self.feat_list.append('anglez_grad_' + str(i+1))
#                 self.feat_list.append('enmo_grad_' + str(i+1))
#                 self.feat_list.append('enmo_ln1p_grad_' + str(i+1))
       
#         for w in [1, 2, 4, 8, 16, 32]:    
#             X['anglez_shift_pos_' + str(w)] = X['anglez'].shift(w).fillna(0)
#             X['anglez_shift_neg_' + str(w)] = X['anglez'].shift(-w).fillna(0)
            
#             X['enmo_shift_pos_' + str(w)] = X['enmo'].shift(w).fillna(0)
#             X['enmo_shift_neg_' + str(w)] = X['enmo'].shift(-w).fillna(0)

#             X['enmo_ln1p_shift_pos_' + str(w)] = X['enmo_ln1p'].shift(w).fillna(0)
#             X['enmo_ln1p_shift_neg_' + str(w)] = X['enmo_ln1p'].shift(-w).fillna(0)            
            
#             if init:
#                 self.feat_list.append('anglez_shift_pos_' + str(w))
#                 self.feat_list.append('anglez_shift_neg_' + str(w))
                
#                 self.feat_list.append('enmo_shift_pos_' + str(w))
#                 self.feat_list.append('enmo_shift_neg_' + str(w))

#                 self.feat_list.append('enmo_ln1p_shift_pos_' + str(w))
#                 self.feat_list.append('enmo_ln1p_shift_neg_' + str(w))
            
#         for r in [5, 17, 33, 65, 129]:
#             tmp_anglez = X['anglez'].rolling(r, center=True)
#             X[f'anglez_mean_{r}'] = tmp_anglez.mean()
#             X[f'anglez_std_{r}'] = tmp_anglez.std()
            
#             tmp_enmo = X['enmo'].rolling(r, center=True)
#             X[f'enmo_mean_{r}'] = tmp_enmo.mean()
#             X[f'enmo_std_{r}'] = tmp_enmo.std()

#             tmp_enmo_ln1p = X['enmo_ln1p'].rolling(r, center=True)
#             X[f'enmo_ln1p_mean_{r}'] = tmp_enmo_ln1p.mean()
#             X[f'enmo_ln1p_std_{r}'] = tmp_enmo_ln1p.std()
            
#             if init:
#                 self.feat_list.append(f'anglez_mean_{r}')
#                 self.feat_list.append(f'anglez_std_{r}')

#                 self.feat_list.append(f'enmo_mean_{r}')
#                 self.feat_list.append(f'enmo_std_{r}')

#                 self.feat_list.append(f'enmo_ln1p_mean_{r}')
#                 self.feat_list.append(f'enmo_ln1p_std_{r}')

#         X = X.fillna(0)
#         return X.astype(np.float32)
    
#     def __len__(self):
#         return len(self.Xs)

#     def __getitem__(self, index):
#         X = self.Xs[index].copy()
#         X = self.norm_feat_eng(X, init=False)
#         x = X[self.feat_list].values.astype(np.float32)     
#         t = X[self.hour_feat].values.astype(np.int32)
#         x = np.concatenate([x, t], axis=1)
#         x = torch.tensor(x)
#         return x, self.ids[index]

# import glob
# csvfiles = sorted(glob.glob('/kaggle/input/detect-sleep-states-dataprepare/train_csvs/*.csv'))[:10]
# test_ds = SleepTestDataset(csvfiles, ids)

In [15]:
len(test_ds.feat_list)

79

In [16]:
test_ds[0][0].shape

torch.Size([150, 80])

In [17]:
# One whole series per batch (batch_size=1), in dataset order and with no
# series dropped.
test_dl = DataLoader(
    test_ds,
    batch_size=1,
    pin_memory=True,
    num_workers=WORKERS,
    shuffle=False,
    drop_last=False
)


In [18]:
def before_padding(x, tgt_len=STRIDE):
    """Prepend `tgt_len` zero-filled rows along the second-to-last dimension of `x`."""
    return F.pad(x, (0, 0, tgt_len, 0))

def after_padding(x, tgt_len=STRIDE):
    """Append `tgt_len` zero-filled rows along the second-to-last dimension of `x`."""
    return F.pad(x, (0, 0, 0, tgt_len))

def padding_(x, tgt_len=MAX_LEN):
    """Zero-pad the end of ``x``'s second-to-last axis up to a multiple of ``tgt_len``.

    Args:
        x: tensor whose second-to-last axis is the one to extend; the last
            axis is never padded.
        tgt_len: window length the padded axis must be divisible by.

    Returns:
        ``x`` itself when the axis length is already a multiple of ``tgt_len``,
        otherwise a new tensor with the missing entries appended as zeros.
    """
    # `(-n) % tgt_len` is the distance to the next multiple and is 0 when the
    # length is already aligned. The previous `tgt_len - n % tgt_len` returned
    # `tgt_len` in that case and appended a whole extra window of zeros.
    res = (-x.size(-2)) % tgt_len
    if res == 0:
        return x
    return F.pad(x, (0, 0, 0, res))

In [19]:
# Models to run at inference time; the inference loop iterates over this list
# and keeps one set of detections per entry.
models = []

In [20]:
# Build the detector, restore its trained weights and register it for inference.
MODEL_PATH = '/kaggle/input/cmi-dss-objdet-convnextv2b-fcmae/fepoch40.pth'

model = get_model_convnext(
    in_c=len(test_ds.feat_list),
    mn='convnextv2_base.fcmae_ft_in22k_in1k_384',
    pretrained=False,
).to(device)

# NOTE(review): torch.load unpickles the file; only point MODEL_PATH at trusted
# checkpoints (consider `weights_only=True` if the installed torch supports it).
state_dict = torch.load(MODEL_PATH, map_location=device)
model.load_state_dict(state_dict)

# Inference only: switch to eval mode and run in half precision, matching the
# `.half()` cast applied to every input window in the inference loop.
model.eval()
model.half()

models.append(model)

In [21]:
# Starts empty; the inference loop adds the predicted events of every series.
submission = pd.DataFrame()

In [22]:
# Sliding-window inference.
# Every series is padded to a multiple of MAX_LEN and scanned with windows that
# start every STRIDE steps. Each model's detections are kept only when their
# centre falls in the part of the window that window is responsible for, the
# survivors are merged with weighted boxes fusion, and the fused boxes become
# submission rows.
EVENT_NAMES = {1: 'onset', 2: 'wakeup'}  # detector label id -> submission event name
sub_frames = []  # one DataFrame per series; concatenated once after the loop

with torch.no_grad():
    with tqdm(test_dl, leave=True) as pbar:
        for X_batch, idname in pbar:
            # One list per model, in the layout weighted_boxes_fusion expects.
            xl = [[] for _ in range(len(models))]  # boxes, normalized to [0, 1]
            sl = [[] for _ in range(len(models))]  # scores
            ll = [[] for _ in range(len(models))]  # labels
            X_batch = X_batch.to(device)
            x_seq_len = X_batch.shape[1]  # true (unpadded) series length

            idname = idname[0]  # the loader uses batch_size=1

            if x_seq_len % MAX_LEN != 0:
                X_batch = padding_(X_batch, MAX_LEN)

            new_x_seq_len = X_batch.shape[1]  # padded length, multiple of MAX_LEN

            for indm, m in enumerate(models):
                for b in range(0, new_x_seq_len, STRIDE):
                    # `b` is the offset of this window inside the padded series.
                    win_center = b + MAX_LEN // 2
                    X_chunk = X_batch[:, b : b + MAX_LEN].half()

                    pred = m(X_chunk)[0]

                    # Move each prediction tensor to the host once per window
                    # instead of once per coordinate. Casting to float32 before
                    # any arithmetic also keeps the box-centre sum out of the
                    # model's output precision (fp16 if the head emits half).
                    pred_boxes = pred['boxes'].cpu().numpy().astype(np.float32)
                    pred_scores = pred['scores'].cpu().numpy().astype(np.float32)
                    pred_labels = pred['labels'].cpu().numpy().astype(np.int32)

                    for box, tmp_score, tmp_label in zip(pred_boxes, pred_scores, pred_labels):
                        # Box centre along the time axis, in series coordinates.
                        abs_step = (box[0] + box[2]) // 2 + b

                        if b == 0:
                            # First window: keep everything up to its upper band edge.
                            keep = (0 <= abs_step <= win_center + STRIDE // 2)
                        elif b == new_x_seq_len - STRIDE:
                            # Last (half-length) window: keep everything inside the
                            # true series length, dropping hits in the padding.
                            keep = (0 <= abs_step <= x_seq_len)
                        else:
                            # Interior windows: keep only the central band.
                            keep = (win_center - STRIDE // 2 <= abs_step <= win_center + STRIDE // 2)

                        if keep and (tmp_label == 1 or tmp_label == 2):
                            # x is shifted into series coordinates; both axes are then
                            # scaled to [0, 1] as required by weighted_boxes_fusion.
                            tmp_box = [
                                (box[0] + b) / new_x_seq_len,
                                box[1] / HEIGHT,
                                (box[2] + b) / new_x_seq_len,
                                box[3] / HEIGHT,
                            ]
                            ll[indm].append(tmp_label)
                            sl[indm].append(tmp_score)
                            xl[indm].append(tmp_box)

            boxes, scores, labels = weighted_boxes_fusion(
                    boxes_list=xl,
                    scores_list=sl,
                    labels_list=ll,
                    weights=None,
                    iou_thr=0.55,
                    skip_box_thr=0.005,
                    conf_type='avg',
                    allows_overflow=False
            )

            # Undo the normalization and reduce every fused box to its centre step.
            step_list = []
            for k in range(len(boxes)):
                boxes[k][0] *= new_x_seq_len
                boxes[k][1] *= HEIGHT
                boxes[k][2] *= new_x_seq_len
                boxes[k][3] *= HEIGHT
                step_list.append((boxes[k][0] + boxes[k][2]) // 2)

            # Only labels 1 and 2 were passed to the fusion step.
            label_list = [EVENT_NAMES[int(lab)] for lab in labels]

            tmp_sub = pd.DataFrame({
                'series_id': [idname] * len(boxes),
                'event': label_list,
                'step': step_list,
                'score': scores,
            })

            if len(test_dl) <= 10:
                display(tmp_sub)
            sub_frames.append(tmp_sub)

            del X_batch
            gc.collect()
            torch.cuda.empty_cache()

# Concatenate once; growing `submission` inside the loop re-copies every
# previously collected row on each iteration.
submission = pd.concat([submission, *sub_frames], axis=0)
    

  0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,series_id,event,step,score


 33%|███▎      | 1/3 [00:13<00:27, 13.73s/it]

Unnamed: 0,series_id,event,step,score
0,03d92c9f6f8a,onset,68.0,0.0672


 67%|██████▋   | 2/3 [00:14<00:05,  5.92s/it]

Unnamed: 0,series_id,event,step,score
0,0402a003dae9,onset,131.0,0.076111


100%|██████████| 3/3 [00:14<00:00,  4.89s/it]


In [23]:
# Order the events per series, number the rows, fill missing scores with the
# mean score, and write the file in the expected column order.
final_cols = ['row_id', 'series_id', 'step', 'event', 'score']

submission = submission.sort_values(['series_id', 'step'])
submission = submission.reset_index(drop=True)
submission = submission.assign(
    row_id=submission.index.astype(int),
    score=submission['score'].fillna(submission['score'].mean()),
)[final_cols]

submission.to_csv('submission.csv', index=False)

In [24]:
# Display the final submission table written to submission.csv.
submission

Unnamed: 0,row_id,series_id,step,event,score
0,0,03d92c9f6f8a,68.0,onset,0.0672
1,1,0402a003dae9,131.0,onset,0.076111
