In [1]:
import pandas as pd
import numpy as np
import gc
import time
import json
from datetime import datetime
import matplotlib.pyplot as plt
import os
import joblib
import random
import math
from tqdm import tqdm 

from scipy.interpolate import interp1d

from math import pi, sqrt, exp
import sklearn,sklearn.model_selection
import torch
from torch import nn,Tensor
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, SubsetRandomSampler
from sklearn.metrics import average_precision_score
from timm.scheduler import CosineLRScheduler
plt.style.use("ggplot")

from pyarrow.parquet import ParquetFile
import pyarrow as pa 
import ctypes

In [2]:
class PATHS:
    """Namespace of input file locations, each built from MAIN_DIR."""

    # Dataset root; the trailing slash is part of the value, so the file
    # names below are appended directly.
    MAIN_DIR = "child-mind-institute-detect-sleep-states/"

    # CSV inputs
    SUBMISSION = f"{MAIN_DIR}sample_submission.csv"
    TRAIN_EVENTS = f"{MAIN_DIR}train_events.csv"

    # Parquet inputs
    TRAIN_SERIES = f"{MAIN_DIR}train_series.parquet"
    TEST_SERIES = f"{MAIN_DIR}test_series.parquet"

In [3]:
# Folder that receives the per-series CSV files written later in the notebook.
out_dir = "train_csvs_objdet"
# exist_ok=True makes re-running this cell a no-op when the folder is present.
os.makedirs(name=out_dir, exist_ok=True)

In [4]:
class data_reader:
    """Load the CSV / parquet inputs listed in ``PATHS`` as pandas DataFrames.

    Parameters
    ----------
    demo_mode : bool
        When true, only the first ``demo_rows`` rows of a file are read.
    demo_rows : int, default 20_000
        Number of rows read per file in demo mode.
    """

    def __init__(self, demo_mode, demo_rows=20_000):
        # MAPPING FOR DATA LOADING :
        # name -> file path, storage format, and whether the frame has a
        # "timestamp" column that should go through `cleaning`.
        self.names_mapping = {
            "submission" : {"path" : PATHS.SUBMISSION, "is_parquet" : False, "has_timestamp" : False},
            "train_events" : {"path" : PATHS.TRAIN_EVENTS, "is_parquet" : False, "has_timestamp" : True},
            "train_series" : {"path" : PATHS.TRAIN_SERIES, "is_parquet" : True, "has_timestamp" : True},
            "test_series" : {"path" : PATHS.TEST_SERIES, "is_parquet" : True, "has_timestamp" : True}
        }
        # Derived from the mapping so the two can never drift apart.
        self.valid_names = list(self.names_mapping)
        self.demo_mode = demo_mode
        self.demo_rows = demo_rows

    def verify(self, data_name):
        """Check that ``data_name`` is a known dataset name.

        Returns None when the name is valid.

        Raises
        ------
        ValueError
            If ``data_name`` is not in ``self.valid_names``.
        """
        if data_name not in self.valid_names:
            raise ValueError(
                "PLEASE ENTER A VALID DATASET NAME, VALID NAMES ARE : {}".format(self.valid_names)
            )

    def cleaning(self, data):
        """Drop rows whose ``timestamp`` is missing and report how many were removed.

        The input frame is not modified; a filtered frame is returned.
        """
        before_cleaning = len(data)
        print("Number of missing timestamps : ", int(data["timestamp"].isna().sum()))
        data = data.dropna(subset=["timestamp"])
        after_cleaning = len(data)
        # Guard the ratio: an empty input would otherwise divide by zero.
        if before_cleaning:
            removed_pct = 100 * (before_cleaning - after_cleaning) / before_cleaning
        else:
            removed_pct = 0.0
        print("Percentage of removed steps : {:.1f}%".format(removed_pct))
        return data

    @staticmethod
    def reduce_memory_usage(data):
        """Downcast the columns of ``data`` to smaller dtypes to save memory.

        * signed integer columns -> smallest of int8/int16/int32/int64 that
          holds the column's min and max;
        * float columns -> float16 or float32 when min and max fit
          (NOTE: float16/float32 keep the range but lose precision);
        * object columns -> ``category``;
        * every other dtype (bool, unsigned, datetime, extension dtypes, ...)
          is left untouched.

        The frame is modified in place and also returned.
        """
        start_mem = data.memory_usage().sum() / 1024**2
        print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
        for col in data.columns:
            col_type = data[col].dtype
            # Extension dtypes (category, nullable ints, ...) are not numpy
            # dtypes; min/max or astype on them may fail, so skip them.
            if not isinstance(col_type, np.dtype):
                continue
            if col_type.kind == "O":
                data[col] = data[col].astype('category')
                continue
            if col_type.kind == "i":
                candidates, limits_of = (np.int8, np.int16, np.int32, np.int64), np.iinfo
            elif col_type.kind == "f":
                candidates, limits_of = (np.float16, np.float32), np.finfo
            else:
                # bool / unsigned / datetime / ...: no safe downcast defined here.
                continue

            c_min = data[col].min()
            c_max = data[col].max()
            # Candidates are ordered smallest first; take the first that fits.
            # If none fits (e.g. NaN/inf extremes) the column keeps its dtype.
            for candidate in candidates:
                limits = limits_of(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    data[col] = data[col].astype(candidate)
                    break

        end_mem = data.memory_usage().sum() / 1024**2
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        decreased_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Decreased by {:.1f}%'.format(decreased_pct))
        return data

    def load_data(self, data_name):
        """Read the dataset registered under ``data_name`` and return it.

        In demo mode only the first ``self.demo_rows`` rows are read. Frames
        flagged ``has_timestamp`` are passed through `cleaning`.

        Raises
        ------
        ValueError
            If ``data_name`` is not a known dataset name (see `verify`).
        """
        self.verify(data_name)
        data_props = self.names_mapping[data_name]
        path = data_props["path"]
        if data_props["is_parquet"]:
            if self.demo_mode:
                # Pull just the first record batch instead of the whole file.
                pf = ParquetFile(path)
                demo_steps = next(pf.iter_batches(batch_size=self.demo_rows))
                data = pa.Table.from_batches([demo_steps]).to_pandas()
            else:
                data = pd.read_parquet(path)
        elif self.demo_mode:
            # `nrows` is the read_csv keyword that limits the rows read.
            data = pd.read_csv(path, nrows=self.demo_rows)
        else:
            data = pd.read_csv(path)

        gc.collect()
        if data_props["has_timestamp"]:
            print('cleaning')
            data = self.cleaning(data)
            gc.collect()
        # Downcasting is intentionally disabled:
        #data = self.reduce_memory_usage(data)
        return data

In [5]:
# Full load (demo_mode off): read the complete training series and events.
# Each call prints the cleaning report for the frame it returns.
reader = data_reader(demo_mode=False)
series = reader.load_data("train_series")
events = reader.load_data("train_events")

cleaning
Number of missing timestamps :  0
Percentage of removed steps : 0.0%
cleaning
Number of missing timestamps :  4923
Percentage of removed steps : 33.9%


In [6]:
# SIGMA = 720 # 12 * 60
# def gauss(n=SIGMA,sigma=SIGMA*0.15):
#     # Gaussian distribution function
#     r = range(-int(n/2),int(n/2)+1)
#     return [1 / (sigma * sqrt(2*pi)) * exp(-float(x)**2/(2*sigma**2)) for x in r]

In [7]:
# Empty accumulators; nothing in this cell fills them.
targets, data = [], []

# Distinct series identifiers found in the loaded series frame.
ids = series["series_id"].unique()

# Starts as an empty frame.
enmo_dfs = pd.DataFrame()

In [12]:
# Write one CSV of events per series id into `out_dir`; the first five
# per-series frames are also displayed as a preview.
for j, viz_id in tqdm(enumerate(ids), total=len(ids)):
    viz_targets = []
    # Rows of `events` belonging to the current series.
    belongs_to_series = events["series_id"] == viz_id
    viz_events = events.loc[belongs_to_series]
    if j < 5:
        display(viz_events)
    # A file is written even when the series has no event rows.
    viz_events.to_csv(f'{out_dir}/{viz_id}.csv', index=False)

  0%|                                                   | 0/277 [00:00<?, ?it/s]

Unnamed: 0,series_id,night,event,step,timestamp
0,038441c925bb,1,onset,4992.0,2018-08-14T22:26:00-0400
1,038441c925bb,1,wakeup,10932.0,2018-08-15T06:41:00-0400
2,038441c925bb,2,onset,20244.0,2018-08-15T19:37:00-0400
3,038441c925bb,2,wakeup,27492.0,2018-08-16T05:41:00-0400
4,038441c925bb,3,onset,39996.0,2018-08-16T23:03:00-0400
5,038441c925bb,3,wakeup,44400.0,2018-08-17T05:10:00-0400
6,038441c925bb,4,onset,57240.0,2018-08-17T23:00:00-0400
7,038441c925bb,4,wakeup,62856.0,2018-08-18T06:48:00-0400
10,038441c925bb,6,onset,91296.0,2018-08-19T22:18:00-0400
11,038441c925bb,6,wakeup,97860.0,2018-08-20T07:25:00-0400


Unnamed: 0,series_id,night,event,step,timestamp
46,03d92c9f6f8a,1,onset,5928.0,2018-05-31T20:14:00-0400
47,03d92c9f6f8a,1,wakeup,13524.0,2018-06-01T06:47:00-0400
48,03d92c9f6f8a,2,onset,23220.0,2018-06-01T20:15:00-0400
49,03d92c9f6f8a,2,wakeup,30276.0,2018-06-02T06:03:00-0400
50,03d92c9f6f8a,3,onset,40668.0,2018-06-02T20:29:00-0400
51,03d92c9f6f8a,3,wakeup,47952.0,2018-06-03T06:36:00-0400
54,03d92c9f6f8a,5,onset,75756.0,2018-06-04T21:13:00-0400
55,03d92c9f6f8a,5,wakeup,82800.0,2018-06-05T07:00:00-0400
66,03d92c9f6f8a,11,onset,178464.0,2018-06-10T19:52:00-0400
67,03d92c9f6f8a,11,wakeup,186564.0,2018-06-11T07:07:00-0400


Unnamed: 0,series_id,night,event,step,timestamp
120,0402a003dae9,1,onset,8364.0,2018-12-19T00:22:00-0500
121,0402a003dae9,1,wakeup,12948.0,2018-12-19T06:44:00-0500
122,0402a003dae9,2,onset,24396.0,2018-12-19T22:38:00-0500
123,0402a003dae9,2,wakeup,29964.0,2018-12-20T06:22:00-0500
130,0402a003dae9,6,onset,94032.0,2018-12-23T23:21:00-0500
131,0402a003dae9,6,wakeup,96180.0,2018-12-24T02:20:00-0500
132,0402a003dae9,7,onset,112992.0,2018-12-25T01:41:00-0500
133,0402a003dae9,7,wakeup,116364.0,2018-12-25T06:22:00-0500
134,0402a003dae9,8,onset,128268.0,2018-12-25T22:54:00-0500
135,0402a003dae9,8,wakeup,134004.0,2018-12-26T06:52:00-0500


Unnamed: 0,series_id,night,event,step,timestamp
170,04f547b8017d,2,onset,23484.0,2018-11-29T20:37:00-0500
171,04f547b8017d,2,wakeup,30804.0,2018-11-30T06:47:00-0500
172,04f547b8017d,3,onset,40092.0,2018-11-30T19:41:00-0500
173,04f547b8017d,3,wakeup,45492.0,2018-12-01T03:11:00-0500
176,04f547b8017d,5,onset,75348.0,2018-12-02T20:39:00-0500
177,04f547b8017d,5,wakeup,82404.0,2018-12-03T06:27:00-0500
178,04f547b8017d,6,onset,92892.0,2018-12-03T21:01:00-0500
179,04f547b8017d,6,wakeup,99672.0,2018-12-04T06:26:00-0500
180,04f547b8017d,7,onset,110244.0,2018-12-04T21:07:00-0500
181,04f547b8017d,7,wakeup,117540.0,2018-12-05T07:15:00-0500


Unnamed: 0,series_id,night,event,step,timestamp
244,05e1944c3818,2,onset,20520.0,2018-11-17T22:30:00-0500
245,05e1944c3818,2,wakeup,27360.0,2018-11-18T08:00:00-0500
246,05e1944c3818,3,onset,38328.0,2018-11-18T23:14:00-0500
247,05e1944c3818,3,wakeup,45408.0,2018-11-19T09:04:00-0500
248,05e1944c3818,4,onset,55332.0,2018-11-19T22:51:00-0500
249,05e1944c3818,4,wakeup,61272.0,2018-11-20T07:06:00-0500
250,05e1944c3818,5,onset,74292.0,2018-11-21T01:11:00-0500
251,05e1944c3818,5,wakeup,78480.0,2018-11-21T07:00:00-0500
252,05e1944c3818,6,onset,90756.0,2018-11-22T00:03:00-0500
253,05e1944c3818,6,wakeup,95760.0,2018-11-22T07:00:00-0500


100%|████████████████████████████████████████| 277/277 [00:00<00:00, 923.55it/s]
