## 필요 데이터 전처리
1.  두 데이터의 수집 속도가 다름(128Hz vs 100Hz). 함께 학습하려면 동기화가 필요
    -   업샘플링(3200Hz, 128과 100의 최소공배수): 데이터 손실 없이 동기화할 수 있지만 연산량이 너무 많아짐
    -   100Hz 다운샘플링: tdcsfog에서 약간의 손실이 있지만 대부분의 데이터는 손실 없이 사용 가능
    
2.  사람이 부여한 라벨에는 오차가 들어갈 수 있으므로, 라벨 경계의 모호함을 표현할 수 있도록 하드 라벨을 스무딩하여 소프트 라벨로 변환
    -   다운샘플링 과정에서 희석되는 FOG 라벨 또한 소프트 라벨링을 통해 유지 가능
3.  절대 이동량, 상대 이동량, 각 방향의 델타, 이동량 델타 추가: 단순한 각 축의 가속도 뿐 아니라 이동량, 변화량(델타)을 변수로 생성
4.  변화량과 이동량을 텐서에 넣기 or 텐서 기준으로 
5.  텐서 변환: 시계열의 특성을 반영하고 딥러닝 모델에 입력하기 위하여 텐서로 변환

##### 파일 로드

In [1]:
import pandas as pd
# Load one sample tdcsfog recording to try the smoothing functions on.
# Forward slashes keep this relative path valid on Linux/macOS as well as on
# Windows; a backslash is a path separator on Windows only.
test_df = pd.read_csv('train/tdcsfog/0a89f859b5.csv')

### 소프트 라벨링
-   라벨링 방식: 연구자들이 환자의 비디오를 보고 FOG의 시작/종료 시점을 근거로 라벨을 부여함
-   경계 불확실: 사람의 판단에는 불가피한 휴먼 에러가 있어, 실제 지점과 주석 지점 사이에 오차가 있을 수 있음
-   소프트 라벨링: 시작/종료 경계 구간만 완만히 스무딩하고 내부는 하드 라벨을 유지
-   목표: 모델이 경계의 불확실성을 학습하여 경계에서의 예측 깜빡임을 줄이고 성능을 높이는 것을 목표로 함

#### 소프트 라벨링 적용

$$
y(t)=
\begin{cases}
0, & t < s-\Delta, \\[4pt]
\displaystyle \tfrac{1}{2}\!\left(1 - \cos\!\frac{\pi\,(t - s + \Delta)}{\Delta}\right), 
& s-\Delta \le t < s, \\[8pt]
1, & s \le t \le e, \\[4pt]
\displaystyle \tfrac{1}{2}\!\left(1 + \cos\!\frac{\pi\,(t - e)}{\Delta}\right), 
& e < t \le e+\Delta, \\[8pt]
0, & t > e+\Delta.
\end{cases}
$$
t: 시간, s: 이벤트 시작 시간, e: 이벤트 종료 시간, Δ: 스무딩 구간 길이

-   구간: 0.3, 0.2, 0.1초(s) 길이의 스무딩 구간을 각각 적용
-   방식: 코사인 곡선을 경계에 겹쳐 적용(cos lapping)


In [2]:
import numpy as np
import ruptures as rpt
from math import ceil

In [3]:
def search_boundary(a):
    """Locate the indices where a label sequence switches value.

    Parameters
    ----------
    a : array-like
        Label sequence (expected to hold 0/1 values).

    Returns
    -------
    tuple[list[int], list[int]]
        ``(starts, ends)``.  ``starts`` lists every index whose value is 1 and
        differs from the previous element; ``ends`` lists every index whose
        value is 0 and differs from the previous element.  A run of ones that
        is already active at index 0 therefore yields no start, and a run
        still active on the last element yields no end.
    """
    values = np.array(a)
    # np.diff is non-zero at position i when values[i + 1] != values[i];
    # adding one gives the index of the first element after the change.
    changed = np.flatnonzero(np.diff(values)) + 1

    starts, ends = [], []
    for idx, new_value in zip(changed.tolist(), values[changed].tolist()):
        if new_value == 1:
            starts.append(idx)
        elif new_value == 0:
            ends.append(idx)
    return starts, ends

In [4]:
def cosine_up(len):
    """Return a rising raised-cosine ramp with ``len`` samples.

    Sample ``k`` (``k = 0 .. len - 1``) equals ``(1 - cos(pi * k / len)) / 2``,
    so the ramp starts at exactly 0 and stops one step short of 1.

    Note: the parameter name shadows the builtin ``len`` inside this function;
    it is kept so that existing keyword calls continue to work.
    """
    # len + 1 evenly spaced phases span [0, pi]; the last one (value 1) is
    # dropped so the ramp holds exactly `len` samples.
    phase = np.linspace(0, np.pi, len + 1, endpoint=True)[:-1]
    return (1 - np.cos(phase)) / 2.0

def cosine_dn(len):
    """Return a falling raised-cosine ramp with ``len`` samples.

    Sample ``k`` (``k = 0 .. len - 1``) equals
    ``(1 + cos(pi * (k + 1) / len)) / 2``, so the ramp starts one step below 1
    and its last sample is 0.

    Note: the parameter name shadows the builtin ``len`` inside this function;
    it is kept so that existing keyword calls continue to work.
    """
    # len + 1 evenly spaced phases span [0, pi]; the first one (value 1) is
    # dropped so the ramp holds exactly `len` samples.
    phase = np.linspace(0, np.pi, len + 1, endpoint=True)[1:]
    return (1 + np.cos(phase)) / 2.0

In [5]:
def smooding(array, hz, interval):
    """Soften the edges of a binary label series with raised-cosine ramps.

    Every run of ones keeps its hard value of 1.  A rising ramp is written
    onto the ``L`` samples just before the run and a falling ramp onto the
    ``L`` samples just after it, where ``L = ceil(interval * hz)``.  Ramps are
    merged with ``np.maximum``, so they never lower an existing value and
    overlapping ramps of neighbouring runs keep the larger value.

    Parameters
    ----------
    array : pandas.Series
        Label column holding 0/1 values.
    hz : float
        Sampling rate in samples per second.
    interval : float
        Ramp length in seconds.

    Returns
    -------
    dict[str, pandas.Series]
        A single entry keyed by the suffix ``"_cos<milliseconds>ms"``.  The
        value is the smoothed series; it shares ``array``'s index and is named
        ``<array.name><suffix>`` (only the suffix when the input is unnamed).
    """
    arr = array.to_numpy(dtype=float)
    n = len(arr)
    sta, end = search_boundary(arr)

    # A run that is already active at index 0 has no detectable start.
    if end and (not sta or end[0] < sta[0]):
        sta = [0] + sta

    # A run still active on the last sample has no detectable end.
    if len(sta) > len(end):
        end = end + [n]

    # Rounding first removes float noise such as 0.07 * 100 == 7.000000000000001,
    # which would otherwise make ceil() return one sample too many.
    L = int(ceil(round(interval * hz, 9)))
    up = cosine_up(L)
    dn = cosine_dn(L)
    sm = arr.copy()

    for s, e in zip(sta, end):
        # Rising ramp on [s - L, s), clipped at the start of the series.  The
        # tail of `up` is used so a clipped ramp still ends right before s.
        a = max(0, s - L)
        seg_len = s - a
        if seg_len > 0:
            sm[a:s] = np.maximum(sm[a:s], up[-seg_len:])

        # Falling ramp on [e, e + L), clipped at the end of the series.
        b2 = min(n, e + L)
        seg_len2 = b2 - e
        if seg_len2 > 0:
            sm[e:b2] = np.maximum(sm[e:b2], dn[:seg_len2])

    # round() instead of int() truncation: 1.005 * 1000 evaluates to
    # 1004.9999999999999 and must still be labelled 1005 ms.
    suff = f"_cos{int(round(interval * 1000))}ms"
    # An unnamed Series has name None; `None + str` would raise TypeError.
    name = suff if array.name is None else f"{array.name}{suff}"
    return {suff: pd.Series(sm, index=array.index, name=name)}
    

In [6]:
def apply_cosine(df, cols, hz=100, interval=0.2):
    """Return a copy of ``df`` extended with cosine-smoothed label columns.

    Each column named in ``cols`` is cast to ``int`` and passed to
    ``smooding`` together with ``hz`` and ``interval``; every series it
    returns is stored in the copy under ``<column><suffix>``.  ``df`` itself
    is not modified.
    """
    result = df.copy()
    for column in cols:
        labels = result[column].astype(int)
        smoothed = smooding(labels, hz, interval)
        for suffix in smoothed:
            result[column + suffix] = smoothed[suffix]
    return result

In [7]:
# Label columns to inspect in the sample recording.
cols = ['StartHesitation','Turn','Walking']
for col in cols:
    # Print the start/end boundary indices found in this label column.
    s, e = search_boundary(test_df[col])
    print(s, e)

[] []
[2678] [3126]
[] []


In [8]:
cols = ['StartHesitation','Turn','Walking']
# Smooth every label column with a single 0.1 s ramp, sampled at 128 Hz.
test_df2 = apply_cosine(test_df, cols,hz=128, interval=0.1)
# Display rows 2650-2679 to inspect the smoothed columns.
test_df2[2650:2680]

Unnamed: 0,Time,AccV,AccML,AccAP,StartHesitation,Turn,Walking,StartHesitation_cos100ms,Turn_cos100ms,Walking_cos100ms
2650,2650,-8.201585,-2.178324,3.907174,0,0,0,0.0,0.0,0.0
2651,2651,-8.381197,-2.336757,3.92836,0,0,0,0.0,0.0,0.0
2652,2652,-8.577747,-2.452735,3.944423,0,0,0,0.0,0.0,0.0
2653,2653,-8.752196,-2.542117,3.983029,0,0,0,0.0,0.0,0.0
2654,2654,-8.8831,-2.556142,4.039213,0,0,0,0.0,0.0,0.0
2655,2655,-8.964478,-2.486284,4.040333,0,0,0,0.0,0.0,0.0
2656,2656,-9.043612,-2.370993,3.987557,0,0,0,0.0,0.0,0.0
2657,2657,-9.065429,-2.276169,3.953594,0,0,0,0.0,0.0,0.0
2658,2658,-9.010197,-2.202784,3.869914,0,0,0,0.0,0.0,0.0
2659,2659,-8.90828,-2.217674,3.795523,0,0,0,0.0,0.0,0.0


In [9]:
# Display rows 3120-3154 to inspect the smoothed columns.
test_df2[3120:3155]

Unnamed: 0,Time,AccV,AccML,AccAP,StartHesitation,Turn,Walking,StartHesitation_cos100ms,Turn_cos100ms,Walking_cos100ms
3120,3120,-8.565183,-1.841417,3.977641,0,1,0,0.0,1.0,0.0
3121,3121,-8.646176,-1.83528,4.02098,0,1,0,0.0,1.0,0.0
3122,3122,-8.639738,-1.815147,4.018696,0,1,0,0.0,1.0,0.0
3123,3123,-8.599998,-1.770967,3.977779,0,1,0,0.0,1.0,0.0
3124,3124,-8.539152,-1.74909,3.962286,0,1,0,0.0,1.0,0.0
3125,3125,-8.498719,-1.738478,3.935146,0,1,0,0.0,1.0,0.0
3126,3126,-8.525236,-1.760298,3.890208,0,0,0,0.0,0.985471,0.0
3127,3127,-8.59684,-1.820324,3.828769,0,0,0,0.0,0.942728,0.0
3128,3128,-8.686045,-1.852037,3.820399,0,0,0,0.0,0.874255,0.0
3129,3129,-8.778447,-1.873362,3.802035,0,0,0,0.0,0.784032,0.0


In [10]:
def smoodings(array, hz, intervals):
    """Soften the edges of a binary label series for several ramp lengths.

    For each ``interval`` a separate smoothed copy of ``array`` is built.
    Every run of ones keeps its hard value of 1; a rising ramp is written onto
    the ``L`` samples just before the run and a falling ramp onto the ``L``
    samples just after it, where ``L = ceil(interval * hz)``.  Ramps are
    merged with ``np.maximum``, so they never lower an existing value and
    overlapping ramps of neighbouring runs keep the larger value.

    Parameters
    ----------
    array : pandas.Series
        Label column holding 0/1 values.
    hz : float
        Sampling rate in samples per second.
    intervals : iterable of float
        Ramp lengths in seconds.

    Returns
    -------
    dict[str, pandas.Series]
        One entry per interval, keyed by the suffix ``"_cos<milliseconds>ms"``.
        Each value shares ``array``'s index and is named
        ``<array.name><suffix>`` (only the suffix when the input is unnamed).
    """
    arr = array.to_numpy(dtype=float)
    n = len(arr)
    sta, end = search_boundary(arr)

    # A run that is already active at index 0 has no detectable start.
    if end and (not sta or end[0] < sta[0]):
        sta = [0] + sta

    # A run still active on the last sample has no detectable end.
    if len(sta) > len(end):
        end = end + [n]

    out = {}

    for interval in intervals:
        # Rounding first removes float noise such as
        # 0.07 * 100 == 7.000000000000001, which would otherwise make ceil()
        # return one sample too many.
        L = int(ceil(round(interval * hz, 9)))
        up = cosine_up(L)
        dn = cosine_dn(L)
        sm = arr.copy()

        for s, e in zip(sta, end):
            # Rising ramp on [s - L, s), clipped at the start of the series.
            # The tail of `up` is used so a clipped ramp still ends right
            # before s.
            a = max(0, s - L)
            seg_len = s - a
            if seg_len > 0:
                sm[a:s] = np.maximum(sm[a:s], up[-seg_len:])

            # Falling ramp on [e, e + L), clipped at the end of the series.
            b2 = min(n, e + L)
            seg_len2 = b2 - e
            if seg_len2 > 0:
                sm[e:b2] = np.maximum(sm[e:b2], dn[:seg_len2])

        # round() instead of int() truncation: 1.005 * 1000 evaluates to
        # 1004.9999999999999 and must still be labelled 1005 ms.
        suff = f"_cos{int(round(interval * 1000))}ms"
        # An unnamed Series has name None; `None + str` would raise TypeError.
        name = suff if array.name is None else f"{array.name}{suff}"
        out[suff] = pd.Series(sm, index=array.index, name=name)

    return out

In [11]:
def apply_cosines(df, cols, hz=100, intervals=None):
    """Return a copy of ``df`` extended with cosine-smoothed label columns.

    Each column named in ``cols`` is cast to ``int`` and passed to
    ``smoodings``; every series it returns is stored in the copy under
    ``<column><suffix>``.  ``df`` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the label columns.
    cols : str or iterable of str
        Label column name(s) to smooth.
    hz : float, default 100
        Sampling rate in samples per second.
    intervals : float or iterable of float, optional
        Ramp length(s) in seconds.  When omitted, ``[0.3, 0.2, 0.1]`` is used.

    Returns
    -------
    pandas.DataFrame
        The extended copy.
    """
    if intervals is None:
        # The declared default used to crash ("'NoneType' object is not
        # iterable"); fall back to the ramp lengths used in this notebook.
        intervals = [0.3, 0.2, 0.1]
    elif isinstance(intervals, (int, float)):
        # Accept a single ramp length as well as a collection of them.
        intervals = [intervals]

    if isinstance(cols, str):
        # A bare string would otherwise be iterated character by character.
        cols = [cols]

    out_df = df.copy()
    for c in cols:
        sm_dict = smoodings(out_df[c].astype(int), hz, intervals)
        for suff, s in sm_dict.items():
            out_df[c + suff] = s

    return out_df

In [12]:
# Ramp lengths in seconds; each one is smoothed separately.
intervals = [0.3, 0.2, 0.1]
test_df3 = apply_cosines(test_df, cols, hz=128, intervals=intervals)
# Display rows 2650-2679 to compare the ramp lengths side by side.
test_df3[2650:2680]

Unnamed: 0,Time,AccV,AccML,AccAP,StartHesitation,Turn,Walking,StartHesitation_cos300ms,StartHesitation_cos200ms,StartHesitation_cos100ms,Turn_cos300ms,Turn_cos200ms,Turn_cos100ms,Walking_cos300ms,Walking_cos200ms,Walking_cos100ms
2650,2650,-8.201585,-2.178324,3.907174,0,0,0,0.0,0.0,0.0,0.183777,0.0,0.0,0.0,0.0,0.0
2651,2651,-8.381197,-2.336757,3.92836,0,0,0,0.0,0.0,0.0,0.215968,0.0,0.0,0.0,0.0,0.0
2652,2652,-8.577747,-2.452735,3.944423,0,0,0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0
2653,2653,-8.752196,-2.542117,3.983029,0,0,0,0.0,0.0,0.0,0.285654,0.003646,0.0,0.0,0.0,0.0
2654,2654,-8.8831,-2.556142,4.039213,0,0,0,0.0,0.0,0.0,0.322698,0.014529,0.0,0.0,0.0,0.0
2655,2655,-8.964478,-2.486284,4.040333,0,0,0,0.0,0.0,0.0,0.360891,0.032492,0.0,0.0,0.0,0.0
2656,2656,-9.043612,-2.370993,3.987557,0,0,0,0.0,0.0,0.0,0.399987,0.057272,0.0,0.0,0.0,0.0
2657,2657,-9.065429,-2.276169,3.953594,0,0,0,0.0,0.0,0.0,0.439732,0.088508,0.0,0.0,0.0,0.0
2658,2658,-9.010197,-2.202784,3.869914,0,0,0,0.0,0.0,0.0,0.479867,0.125745,0.0,0.0,0.0,0.0
2659,2659,-8.90828,-2.217674,3.795523,0,0,0,0.0,0.0,0.0,0.520133,0.168439,0.0,0.0,0.0,0.0


## 폴더 로드 및 스무딩 후 저장
1.  라벨링 데이터 tdcsfog(128Hz), defog(100Hz), notype(100Hz)을 읽습니다.
2.  tdcsfog와 defog는 `StartHesitation`, `Turn`, `Walking`, notype은 `Event`의 라벨 경계에 **Cosine Boundary Smoothing**을 적용합니다.
3.  스무딩한 컬럼을 원본에 더해서 만든 csv는 `cosin_date/{tdcsfog,defog,notype}`에 저장합니다.
4.  스무딩 구간은 초 단위입니다.
5.  defog는 `valid True` 구간을 나누어 학습에 사용할 수 있게 `cosin_date/defog_true`에 저장합니다.



> ```text
> BASE_DIR/
> ├─ train/
> │  ├─ tdcsfog/   # 128 Hz CSV들
> │  ├─ defog/     # 100 Hz CSV들
> │  └─ notype/    # 100 Hz CSV들 (Event 라벨)
> └─ cosin_date/   # 결과물 저장 위치(자동 생성)
>    ├─ tdcsfog/
>    ├─ defog/
>    └─ notype/
> ```

In [15]:
from pathlib import Path
from glob import glob

In [17]:
# Root folder for all inputs and outputs: the current working directory.
BASE_DIR = Path.cwd()

# Input folders holding the raw label CSVs.
TDCSFOG_DIR = BASE_DIR / "train" / "tdcsfog"
DEFOG_DIR = BASE_DIR / "train" / "defog"

# Output folders for the smoothed CSVs.
OUT_BASE = BASE_DIR / "cosin_date"
OUT_TDCSFOG = OUT_BASE / "tdcsfog"
OUT_DEFOG = OUT_BASE / "defog"
# NOTE: the folders are not created here; process_folder() creates its
# out_path itself (mkdir with parents=True, exist_ok=True).

# File extension used when collecting input files.
EXT = ".csv"


In [None]:
def search_boundary(arr):
    """Locate the rising and falling edges of a 1-D label sequence.

    Parameters
    ----------
    arr : array-like
        One-dimensional label sequence (expected to hold 0/1 values).

    Returns
    -------
    tuple[list[int], list[int]]
        ``(starts, ends)``: indices where the integer-cast value steps up by
        exactly 1, and indices where it steps down by exactly 1, relative to
        the previous element.  Index 0 is never reported, so a run already
        active at the first element yields no start.

    Raises
    ------
    ValueError
        If the input is not one-dimensional.
    """
    values = np.array(arr)
    if values.ndim != 1:
        raise ValueError("1D array만 가능")
    if not len(values):
        return [], []

    # Prepending the first element makes the step at index 0 equal to zero and
    # keeps the step array aligned with the input indices.
    steps = np.diff(values.astype(int), prepend=values[0])
    rising = np.flatnonzero(steps == 1).tolist()
    falling = np.flatnonzero(steps == -1).tolist()
    return rising, falling

In [None]:
def cosine_up(len):
    """Return a rising raised-cosine ramp with ``len`` samples.

    Sample ``k`` (``k = 0 .. len - 1``) equals ``(1 - cos(pi * k / len)) / 2``,
    so the ramp starts at exactly 0 and stops one step short of 1.

    Note: the parameter name shadows the builtin ``len`` inside this function;
    it is kept so that existing keyword calls continue to work.
    """
    # len + 1 evenly spaced phases span [0, pi]; the last one (value 1) is
    # dropped so the ramp holds exactly `len` samples.
    phase = np.linspace(0, np.pi, len + 1, endpoint=True)[:-1]
    return (1 - np.cos(phase)) / 2.0

def cosine_dn(len):
    """Return a falling raised-cosine ramp with ``len`` samples.

    Sample ``k`` (``k = 0 .. len - 1``) equals
    ``(1 + cos(pi * (k + 1) / len)) / 2``, so the ramp starts one step below 1
    and its last sample is 0.

    Note: the parameter name shadows the builtin ``len`` inside this function;
    it is kept so that existing keyword calls continue to work.
    """
    # len + 1 evenly spaced phases span [0, pi]; the first one (value 1) is
    # dropped so the ramp holds exactly `len` samples.
    phase = np.linspace(0, np.pi, len + 1, endpoint=True)[1:]
    return (1 + np.cos(phase)) / 2.0

In [None]:
def smoodings(array, hz, intervals):
    """Soften the edges of a binary label series for several ramp lengths.

    For each ``interval`` a separate smoothed copy of ``array`` is built.
    Every run of ones keeps its hard value of 1; a rising ramp is written onto
    the ``L`` samples just before the run and a falling ramp onto the ``L``
    samples just after it, where ``L = ceil(interval * hz)``.  Ramps are
    merged with ``np.maximum``, so they never lower an existing value and
    overlapping ramps of neighbouring runs keep the larger value.

    Parameters
    ----------
    array : pandas.Series
        Label column holding 0/1 values.
    hz : float
        Sampling rate in samples per second.
    intervals : iterable of float
        Ramp lengths in seconds.

    Returns
    -------
    dict[str, pandas.Series]
        One entry per interval, keyed by the suffix ``"_cos<milliseconds>ms"``.
        Each value shares ``array``'s index and is named
        ``<array.name><suffix>`` (only the suffix when the input is unnamed).
    """
    arr = array.to_numpy(dtype=float)
    n = len(arr)
    sta, end = search_boundary(arr)

    # A run that is already active at index 0 has no detectable start.
    if end and (not sta or end[0] < sta[0]):
        sta = [0] + sta

    # A run still active on the last sample has no detectable end.
    if len(sta) > len(end):
        end = end + [n]

    out = {}

    for interval in intervals:
        # Rounding first removes float noise such as
        # 0.07 * 100 == 7.000000000000001, which would otherwise make ceil()
        # return one sample too many.
        L = int(ceil(round(interval * hz, 9)))
        up = cosine_up(L)
        dn = cosine_dn(L)
        sm = arr.copy()

        for s, e in zip(sta, end):
            # Rising ramp on [s - L, s), clipped at the start of the series.
            # The tail of `up` is used so a clipped ramp still ends right
            # before s.
            a = max(0, s - L)
            seg_len = s - a
            if seg_len > 0:
                sm[a:s] = np.maximum(sm[a:s], up[-seg_len:])

            # Falling ramp on [e, e + L), clipped at the end of the series.
            b2 = min(n, e + L)
            seg_len2 = b2 - e
            if seg_len2 > 0:
                sm[e:b2] = np.maximum(sm[e:b2], dn[:seg_len2])

        # round() instead of int() truncation: 1.005 * 1000 evaluates to
        # 1004.9999999999999 and must still be labelled 1005 ms.
        suff = f"_cos{int(round(interval * 1000))}ms"
        # An unnamed Series has name None; `None + str` would raise TypeError.
        name = suff if array.name is None else f"{array.name}{suff}"
        out[suff] = pd.Series(sm, index=array.index, name=name)

    return out

In [None]:
def apply_cosines(df, cols, hz=100, intervals=None):
    """Return a copy of ``df`` extended with cosine-smoothed label columns.

    Each column named in ``cols`` is cast to ``int`` and passed to
    ``smoodings``; every series it returns is stored in the copy under
    ``<column><suffix>``.  ``df`` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the label columns.
    cols : str or iterable of str
        Label column name(s) to smooth.
    hz : float, default 100
        Sampling rate in samples per second.
    intervals : float or iterable of float, optional
        Ramp length(s) in seconds.  When omitted, ``[0.3, 0.2, 0.1]`` is used.

    Returns
    -------
    pandas.DataFrame
        The extended copy.
    """
    if intervals is None:
        # The declared default used to crash ("'NoneType' object is not
        # iterable"); fall back to the ramp lengths used in this notebook.
        intervals = [0.3, 0.2, 0.1]
    elif isinstance(intervals, (int, float)):
        # Accept a single ramp length as well as a collection of them.
        intervals = [intervals]

    if isinstance(cols, str):
        # A bare string would otherwise be iterated character by character.
        cols = [cols]

    out_df = df.copy()
    for c in cols:
        sm_dict = smoodings(out_df[c].astype(int), hz, intervals)
        for suff, s in sm_dict.items():
            out_df[c + suff] = s

    return out_df

In [27]:
def _list_csvs(folder_path):
    """Return the entries of ``folder_path`` matching ``*<EXT>`` as a sorted list of ``Path`` objects."""
    pattern = str(folder_path / f"*{EXT}")
    found = [Path(hit) for hit in glob(pattern)]
    found.sort()
    return found

def process_folder(folder_path, out_path, hz, cols=None, intervals=None):
    """Smooth the label columns of every CSV in a folder and save the results.

    Each file returned by ``_list_csvs(folder_path)`` is read with pandas,
    passed through ``apply_cosines`` and written (without the index) to
    ``out_path`` under the same file name.  ``out_path`` is created if needed.
    Progress is printed every 25 files when the folder holds at least 100
    files, every 5 files otherwise, and always for the last file.

    Parameters
    ----------
    folder_path : pathlib.Path
        Folder containing the input CSV files.
    out_path : pathlib.Path
        Folder receiving the smoothed CSV files.
    hz : float
        Sampling rate of the recordings in samples per second.
    cols : iterable of str
        Label columns to smooth.  Required.
    intervals : iterable of float
        Ramp lengths in seconds.  Required.

    Raises
    ------
    TypeError
        If ``cols`` or ``intervals`` is not supplied.
    """
    # The former defaults were the builtin ``list`` *type*, which failed later
    # with an obscure "'type' object is not iterable".  Both arguments are
    # effectively required, so report that clearly (``list`` itself is still
    # treated as "missing" for backward compatibility).
    if cols is None or cols is list:
        raise TypeError("process_folder() missing required argument: 'cols'")
    if intervals is None or intervals is list:
        raise TypeError("process_folder() missing required argument: 'intervals'")

    files = _list_csvs(folder_path)

    out_path.mkdir(parents=True, exist_ok=True)
    file_count = len(files)
    # Report less often for big folders to keep the log short.
    step = 25 if file_count >= 100 else 5

    for i, fp in enumerate(files, 1):
        df = pd.read_csv(fp)

        out_df = apply_cosines(df, cols=cols, hz=hz, intervals=intervals)
        out_df.to_csv(out_path / fp.name, index=False)

        if i % step == 0 or i == file_count:
            print(f' - 진행도 {i}/{file_count}: {fp.name}')


In [22]:
# Ramp lengths in seconds and the label columns to smooth.
intervals = [0.1, 0.2, 0.3]
cols = ["StartHesitation", "Turn", "Walking"]

# Smooth every CSV under TDCSFOG_DIR (processed at 128 Hz) into OUT_TDCSFOG.
process_folder(TDCSFOG_DIR, OUT_TDCSFOG,  hz=128, cols=cols, intervals=intervals)

 - 진행도 25/833: 06422a906e.csv
 - 진행도 50/833: 0cbb7f4ed9.csv
 - 진행도 75/833: 13dd212d5a.csv
 - 진행도 100/833: 1bb45d06e1.csv
 - 진행도 125/833: 251c37f6e0.csv
 - 진행도 150/833: 2d481ad987.csv
 - 진행도 175/833: 3415b61278.csv
 - 진행도 200/833: 3c535f4851.csv
 - 진행도 225/833: 45a205e6ce.csv
 - 진행도 250/833: 4dd368b175.csv
 - 진행도 275/833: 56965f50de.csv
 - 진행도 300/833: 5e83737ac5.csv
 - 진행도 325/833: 690030e376.csv
 - 진행도 350/833: 758ddee3f7.csv
 - 진행도 375/833: 7d234837b6.csv
 - 진행도 400/833: 82fc8e0363.csv
 - 진행도 425/833: 88d3c2b077.csv
 - 진행도 450/833: 8ec5220619.csv
 - 진행도 475/833: 956d65047a.csv
 - 진행도 500/833: 9b5626b353.csv
 - 진행도 525/833: a079ea3f57.csv
 - 진행도 550/833: a7ac888810.csv
 - 진행도 575/833: af89698392.csv
 - 진행도 600/833: b670d1cddd.csv
 - 진행도 625/833: bcfd1d3dbf.csv
 - 진행도 650/833: c3ac51e605.csv
 - 진행도 675/833: cd23f96303.csv
 - 진행도 700/833: d606b42f46.csv
 - 진행도 725/833: de52475815.csv
 - 진행도 750/833: e5d573a30f.csv
 - 진행도 775/833: ef932ebc53.csv
 - 진행도 800/833: f6ea84ee34.csv
 - 진행도 825/

In [24]:
# Smooth every CSV under DEFOG_DIR at 100 Hz, reusing the module-level
# `cols` and `intervals`.
process_folder(DEFOG_DIR, OUT_DEFOG, hz=100, cols=cols, intervals=intervals)

 - 진행도 25/91: 41bc215f97.csv
 - 진행도 50/91: 850748a138.csv
 - 진행도 75/91: d99c7b2069.csv
 - 진행도 91/91: f9fc61ce85.csv


In [25]:
# Input and output folders for the "notype" recordings.
NOTYPE_DIR   = BASE_DIR / "train" / "notype"

OUT_NOTYPE = OUT_BASE / "notype"

# process_folder() also creates its out_path, so this call is only a safeguard.
OUT_NOTYPE.mkdir(parents=True, exist_ok=True)

# Only the single "Event" label column is smoothed for these files.
col = ["Event"]

process_folder(NOTYPE_DIR, OUT_NOTYPE, hz=100, cols=col, intervals=intervals)

 - 진행도 25/46: 72853af746.csv
 - 진행도 46/46: e8e530a4f9.csv
