In [1]:
# Mount Google Drive at /content/drive; the dataset and output paths used later
# in this notebook all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install nptdms

Collecting nptdms
  Downloading nptdms-1.10.0.tar.gz (181 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/181.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━[0m [32m133.1/181.5 kB[0m [31m4.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m181.5/181.5 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: nptdms
  Building wheel for nptdms (pyproject.toml) ... [?25l[?25hdone
  Created wheel for nptdms: filename=nptdms-1.10.0-py3-none-any.whl size=108456 sha256=56ef2a24ab252ec89220021f5246d8a76be60810daeaa016f95e65119dfb2c4e
  Stored in directory: /root/.cache/pip/wheels/1b/4b/17/21e8b03b37ea51ce7ec9f5570cdf0decca93f537d61c06880f
Successfully b

## RUL 생성 파이프라인

In [3]:
import os
import numpy as np
import pandas as pd
from nptdms import TdmsFile
from sklearn.linear_model import LinearRegression

데이터 불러오기

In [4]:
def load_tdms_file(file_path):
    """Read a TDMS file and return its first two groups as channel dictionaries.

    The first group in the file is treated as the vibration group and the
    second one as the operation group (purely by position).

    Args:
        file_path: Path handed to ``TdmsFile.read``.

    Returns:
        A ``(vib_data, operation_data)`` tuple. Each element maps the
        whitespace-stripped channel name to that channel's ``data``.
    """
    tdms = TdmsFile.read(file_path)

    def channels_by_name(group_position):
        # Resolve the group through its name, then collect every channel in it.
        group_name = tdms.groups()[group_position].name
        return {
            channel.name.strip(): channel.data
            for channel in tdms[group_name].channels()
        }

    vib_data = channels_by_name(0)
    operation_data = channels_by_name(1)
    return vib_data, operation_data

In [5]:
def load_summary_from_tdms(train_root, target_folders, interval_sec=600):
    """Build one summary DataFrame per folder from the TDMS operation data.

    For each folder, the ``.tdms`` files are processed in sorted file-name
    order. Each file contributes one row holding the first sample of the
    front temperature, rear temperature and torque channels, plus a
    synthetic ``time_sec`` equal to ``file position * interval_sec``.

    Args:
        train_root: Directory that contains the folders in ``target_folders``.
        target_folders: Iterable of folder names to load.
        interval_sec: Spacing in seconds assumed between consecutive files.
            Defaults to 600 (10 minutes), the value previously hard-coded.

    Returns:
        dict mapping folder name to a DataFrame with the columns
        ``time_sec``, ``TC SP Front[℃]``, ``TC SP Rear[℃]`` and ``Torque[Nm]``.
        A folder without any ``.tdms`` file yields an empty DataFrame that
        still carries those columns, so later column lookups do not fail.

    Raises:
        KeyError: If a file's operation data lacks one of the three channels.
    """
    operation_columns = ("TC SP Front[℃]", "TC SP Rear[℃]", "Torque[Nm]")
    summary = {}

    for folder in target_folders:
        folder_path = os.path.join(train_root, folder)
        # Sorting by name fixes the row order that time_sec is derived from.
        tdms_files = sorted(f for f in os.listdir(folder_path) if f.endswith(".tdms"))

        records = []
        for idx, tdms_file in enumerate(tdms_files):
            _, operation_data = load_tdms_file(os.path.join(folder_path, tdms_file))

            record = {"time_sec": idx * interval_sec}
            for column in operation_columns:
                # Only the first sample of each operation channel is kept.
                record[column] = operation_data[column][0]
            records.append(record)

        # Explicit columns keep the schema stable even when `records` is empty.
        summary[folder] = pd.DataFrame(records, columns=["time_sec", *operation_columns])

    return summary

중단 조건 만족 여부 판단

In [6]:
def check_stop_condition_met(df):
    """Tell whether any row of ``df`` reaches a stop condition.

    A row counts as a stop when its torque is at or below -17, or when
    either the front or the rear temperature is at or above 200.

    Args:
        df: DataFrame with the columns ``Torque[Nm]``, ``TC SP Front[℃]``
            and ``TC SP Rear[℃]``.

    Returns:
        Truthy value if at least one row satisfies a stop condition.
    """
    torque_low = df["Torque[Nm]"] <= -17
    front_hot = df["TC SP Front[℃]"] >= 200
    rear_hot = df["TC SP Rear[℃]"] >= 200

    # One combined row mask; a single hit anywhere is enough.
    row_hits = torque_low | front_hot | rear_hot
    return row_hits.any()

중단 조건 불만족 데이터의 처리 함수

In [7]:
# Linear-regression helper
def estimate_rul_from_temp_time_based(temp_values, start_index, interval_sec=600, file_id="Sample"):
    """Extrapolate a linear temperature trend to the value 200.

    The samples are placed on a time axis
    ``(start_index, start_index + 1, ...) * interval_sec`` and fitted with a
    straight line. The time at which that line reaches 200 is the estimated
    failure time; the remaining useful life is that time minus the time of
    the last sample.

    Args:
        temp_values: Sequence of temperature samples, oldest first.
        start_index: Row index of the first sample, used to build the time axis.
        interval_sec: Seconds between consecutive samples.
        file_id: Label used only in the warning messages.

    Returns:
        ``(estimated_failure_time_sec, rul_seconds)``, or ``(None, None)``
        when the fitted slope is not positive or the predicted crossing lies
        before the last sample (a warning is printed in both cases).
    """
    temps = np.array(temp_values)
    sample_count = len(temp_values)
    times = np.arange(start_index, start_index + sample_count) * interval_sec

    fit = LinearRegression().fit(times.reshape(-1, 1), temps)
    slope, intercept = fit.coef_[0], fit.intercept_

    # A flat or falling trend never reaches the threshold.
    if slope <= 0:
        print(f"[{file_id}] Warning: Temperature is not increasing (slope ≤ 0).")
        return None, None

    last_time = times[-1]
    estimated_failure_time_sec = (200 - intercept) / slope
    rul_seconds = estimated_failure_time_sec - last_time

    # The fitted line already crossed the threshold before the latest sample.
    if estimated_failure_time_sec < last_time or rul_seconds < 0:
        print(f"[{file_id}] Warning: Predicted failure is before current time. Invalid regression.")
        return None, None

    return estimated_failure_time_sec, rul_seconds

# Front / Rear column selector
def choose_temp_column_closer_to_200(df, front_col, rear_col):
    """Pick the temperature column whose latest reading is nearest to 200.

    Only the last row of ``df`` is inspected. The absolute distance of each
    column's last value from 200 is compared, and ties go to the front column.

    Args:
        df: DataFrame containing both columns (must have at least one row).
        front_col: Name of the front temperature column.
        rear_col: Name of the rear temperature column.

    Returns:
        ``front_col`` or ``rear_col``.
    """
    def gap_to_limit(column):
        return abs(df[column].iloc[-1] - 200)

    front_gap = gap_to_limit(front_col)
    rear_gap = gap_to_limit(rear_col)
    return front_col if front_gap <= rear_gap else rear_col

# 3-point / 5-point window selector
def select_best_rul_with_temp_choice(df, estimation_targets, train_id, threshold_r2=0.85, interval_sec=600):
    """Estimate RUL from the last 3 and last 5 samples and keep the better fit.

    The temperature column is chosen with ``choose_temp_column_closer_to_200``.
    For each window size a line is fitted to get its R², and
    ``estimate_rul_from_temp_time_based`` supplies the failure time and RUL.
    Windows whose estimate is invalid are dropped. Among the rest, the one
    with the highest R² wins.

    Args:
        df: Summary DataFrame for one dataset, rows ordered in time.
        estimation_targets: dict mapping ``train_id`` to
            ``(front_col, rear_col)``.
        train_id: Key into ``estimation_targets``; also used in log messages.
        threshold_r2: Preferred minimum R² (see the note in the body).
        interval_sec: Seconds between consecutive rows.

    Returns:
        ``(estimated_failure_time_sec, rul_seconds, used_n)``, or
        ``(None, None, None)`` when no window produced a valid estimate
        (including when ``df`` is empty or shorter than every window).

    Raises:
        KeyError: If ``train_id`` is missing from ``estimation_targets``.
    """
    front_col, rear_col = estimation_targets[train_id]

    row_count = len(df)
    if row_count == 0:
        # Nothing to inspect: the column chooser reads the last row.
        print(f"[{train_id}] No valid regression results.")
        return None, None, None

    target_col = choose_temp_column_closer_to_200(df, front_col, rear_col)

    results = []

    for n in [3, 5]:
        if row_count < n:
            # values[-n:] would return fewer than n samples while the time axis
            # below still has n entries, so the fit would fail on mismatched lengths.
            print(f"[{train_id}] Skipping last-{n} window: only {row_count} rows available.")
            continue

        temp_values = df[target_col].values[-n:]
        start_index = row_count - n
        times = (np.arange(start_index, start_index + n) * interval_sec).reshape(-1, 1)

        # This fit is only used for its R²; the helper below performs its own
        # fit so that its validity checks and warnings stay in one place.
        r2 = LinearRegression().fit(times, np.array(temp_values)).score(times, np.array(temp_values))

        est_time, rul = estimate_rul_from_temp_time_based(
            temp_values=temp_values,
            start_index=start_index,
            interval_sec=interval_sec,
            file_id=f"{train_id}_{target_col}_last{n}"
        )

        if est_time is not None and rul is not None:
            results.append((r2, est_time, rul, n))

    # Prefer candidates with R² >= threshold_r2, otherwise fall back to all.
    # NOTE(review): both branches pick the highest R², so the threshold does
    # not currently change which candidate is selected — confirm the intent.
    filtered = [res for res in results if res[0] >= threshold_r2]

    if filtered:
        best = max(filtered, key=lambda res: res[0])
    elif results:
        best = max(results, key=lambda res: res[0])
    else:
        print(f"[{train_id}] No valid regression results.")
        return None, None, None

    r2, est_time, rul, used_n = best
    print(f"[{train_id} | {target_col}] Selected {used_n} points (R² = {r2:.4f}) → RUL = {rul:.1f} sec")
    return est_time, rul, used_n

RUL 라벨링(만족)

In [8]:
def label_rul_by_stopping(summary, interval_sec=600):
    """Label RUL from the moment a stop condition is first reached.

    ``summary`` has the form ``{TrainID: DataFrame}``. For every entry, the
    failure time is the ``time_sec`` of the lowest index label at which the
    torque is <= -17 or either temperature is >= 200. When no row matches,
    the largest ``time_sec`` is used instead. ``RUL_sec`` is the failure
    time minus ``time_sec``, floored at 0.

    Args:
        summary: dict of DataFrames; each entry is replaced by a labeled copy.
        interval_sec: Accepted for signature parity with the other labelers;
            not used in the computation.

    Returns:
        The same ``summary`` dict, now holding the labeled copies.
    """
    for train_id, original in list(summary.items()):
        labeled = original.copy()

        # Rows where any stop condition holds.
        stop_mask = (
            (labeled["Torque[Nm]"] <= -17)
            | (labeled["TC SP Front[℃]"] >= 200)
            | (labeled["TC SP Rear[℃]"] >= 200)
        )
        stop_labels = labeled.index[stop_mask]

        if len(stop_labels) > 0:
            # Earliest stop = smallest matching index label.
            failure_time_sec = labeled.loc[stop_labels.min(), "time_sec"]
        else:
            failure_time_sec = labeled["time_sec"].max()

        remaining = failure_time_sec - labeled["time_sec"]
        labeled["RUL_sec"] = remaining.clip(lower=0)

        summary[train_id] = labeled

    return summary

RUL 라벨링(불만족)

In [9]:
def label_rul_by_estimation(summary, estimation_targets, threshold_r2=0.85, interval_sec=600):
    """Label RUL by regression for datasets that never hit a stop condition.

    ``summary`` has the form ``{TrainID: DataFrame}``. For every key in
    ``estimation_targets`` the failure time is estimated with
    ``select_best_rul_with_temp_choice``. On success the entry is replaced
    by a copy whose ``time_sec`` is rebuilt as ``row position * interval_sec``
    and whose ``RUL_sec`` is the estimated failure time minus ``time_sec``,
    floored at 0. On failure a message is printed and the entry is left as is.

    Args:
        summary: dict of DataFrames, updated in place.
        estimation_targets: dict mapping TrainID to ``(front_col, rear_col)``.
        threshold_r2: Forwarded to the window selector.
        interval_sec: Seconds between consecutive rows.

    Returns:
        The same ``summary`` dict.
    """
    for train_id in estimation_targets:
        source_df = summary[train_id]

        # Step 1: regression-based estimate of the failure time.
        est_time, _rul, _used_n = select_best_rul_with_temp_choice(
            source_df, estimation_targets, train_id, threshold_r2, interval_sec
        )

        if est_time is None:
            print(f"[{train_id}] RUL 예측 실패로 라벨링하지 않음.")
            continue

        # Step 2: rebuild the time axis, then derive RUL from it on a fresh copy.
        elapsed_sec = np.arange(len(source_df)) * interval_sec
        labeled = source_df.assign(
            time_sec=elapsed_sec,
            RUL_sec=lambda frame: (est_time - frame["time_sec"]).clip(lower=0),
        )

        # Step 3: store the labeled copy back.
        summary[train_id] = labeled

    return summary

중단 조건 만족 여부에 따른 RUL 라벨링 수행

In [10]:
def label_rul_pipeline(summary, estimation_targets, threshold_r2=0.85, interval_sec=600):
    """Label every dataset with the method that matches its stop status.

    - Stop condition met: label from the stop time (``label_rul_by_stopping``).
    - Stop condition not met and listed in ``estimation_targets``: label from
      a regression estimate (``label_rul_by_estimation``).
    - Otherwise: the dataset is skipped and left unlabeled.

    Args:
        summary: dict ``{TrainID: DataFrame}``; entries are replaced in place.
        estimation_targets: dict of the datasets that need regression-based
            estimation, mapping TrainID to ``(front_col, rear_col)``.
        threshold_r2: Forwarded to the regression labeler.
        interval_sec: Seconds between consecutive rows.

    Returns:
        The same ``summary`` dict.
    """
    # Iterate over a snapshot so reassigning entries cannot disturb the loop.
    for train_id, df in list(summary.items()):
        if check_stop_condition_met(df):
            updated = label_rul_by_stopping({train_id: df}, interval_sec=interval_sec)
            summary[train_id] = updated[train_id]
            print(f"[{train_id}] 조건 기반 라벨링 완료.")
        elif train_id in estimation_targets:
            updated = label_rul_by_estimation(
                {train_id: df},
                {train_id: estimation_targets[train_id]},
                threshold_r2,
                interval_sec,
            )
            result = updated[train_id]
            summary[train_id] = result

            # The regression labeler hands back the untouched input frame when
            # it cannot estimate an RUL, so only report success when a new,
            # labeled frame actually came back.
            if result is not df and "RUL_sec" in result.columns:
                print(f"[{train_id}] 회귀 기반 라벨링 완료.")
            else:
                print(f"[{train_id}] 회귀 기반 라벨링 실패 (RUL 추정 불가).")
        else:
            print(f"[{train_id}] 회귀 대상 정보가 없어 라벨링 생략.")

    return summary

train set 준비 수행

In [11]:
def get_train_folders(train_root):
    """List the training sub-folders directly under ``train_root``.

    An entry qualifies when its name starts with "train" (case-insensitive)
    and it is a directory.

    Args:
        train_root: Directory to scan.

    Returns:
        The qualifying folder names in sorted (lexicographic) order.
    """
    matches = []
    for entry in os.listdir(train_root):
        if not entry.lower().startswith("train"):
            continue
        if os.path.isdir(os.path.join(train_root, entry)):
            matches.append(entry)
    matches.sort()
    return matches

전처리 수행

In [12]:
train_root = "/content/drive/MyDrive/KSPHM-data-challenge/Train Set"

# Discover every train folder and pair each one with its (front_col, rear_col) names.
train_folders = get_train_folders(train_root)
targets = dict.fromkeys(train_folders, ("TC SP Front[℃]", "TC SP Rear[℃]"))

# Load the TDMS operation data into one summary DataFrame per folder.
summary = load_summary_from_tdms(train_root, train_folders)

# Datasets that never reach a stop condition need regression-based estimation.
estimation_targets = {
    train_id: targets[train_id]
    for train_id, df in summary.items()
    if not check_stop_condition_met(df)
}

# Run the labeling pipeline.
summary = label_rul_pipeline(summary, estimation_targets, threshold_r2=0.85, interval_sec=600)

# Sanity check: show the last 5 rows of every labeled dataset.
for train_id, df in summary.items():
    print(f"=== {train_id} 라벨링 결과 (마지막 5개) ===")
    print(df.tail(5), end="\n\n")

[Train1] 조건 기반 라벨링 완료.
[Train2 | TC SP Front[℃]] Selected 5 points (R² = 0.9353) → RUL = 215.7 sec
[Train2] 회귀 기반 라벨링 완료.
[Train3 | TC SP Front[℃]] Selected 3 points (R² = 0.7762) → RUL = 196.4 sec
[Train3] 회귀 기반 라벨링 완료.
[Train4] 조건 기반 라벨링 완료.
[Train5] 조건 기반 라벨링 완료.
[Train6 | TC SP Rear[℃]] Selected 5 points (R² = 0.7887) → RUL = 614.9 sec
[Train6] 회귀 기반 라벨링 완료.
[Train7] 조건 기반 라벨링 완료.
[Train8] 조건 기반 라벨링 완료.
=== Train1 라벨링 결과 (마지막 5개) ===
    time_sec  TC SP Front[℃]  TC SP Rear[℃]  Torque[Nm]  RUL_sec
89     53400        100.6211       153.8347   -7.372215     2400
90     54000        101.3231       156.6417   -6.857190     1800
91     54600        100.5940       181.6120   -8.219799     1200
92     55200        102.5969       175.5891   -7.352595      600
93     55800        106.0521       199.1974  -17.921889        0

=== Train2 라벨링 결과 (마지막 5개) ===
     time_sec  TC SP Front[℃]  TC SP Rear[℃]  Torque[Nm]      RUL_sec
144     86400        124.6276       110.6115   -7.380063  2615.691

In [None]:
output_dir = "/content/drive/MyDrive/KSPHM-data-challenge/data_csv/RUL_labeled"

# to_csv does not create missing parent directories, so make sure the
# destination exists before writing (no-op when it is already there).
os.makedirs(output_dir, exist_ok=True)

# Write each labeled DataFrame in `summary` to its own CSV file.
for train_id, df in summary.items():
    csv_path = os.path.join(output_dir, f"{train_id}_labeled.csv")
    df.to_csv(csv_path, index=False)
    print(f"[{train_id}] → {csv_path} 저장 완료")

[Train1] → /content/drive/MyDrive/KSPHM-data-challenge/data_csv/RUL_labeled/Train1_labeled.csv 저장 완료
[Train2] → /content/drive/MyDrive/KSPHM-data-challenge/data_csv/RUL_labeled/Train2_labeled.csv 저장 완료
[Train3] → /content/drive/MyDrive/KSPHM-data-challenge/data_csv/RUL_labeled/Train3_labeled.csv 저장 완료
[Train4] → /content/drive/MyDrive/KSPHM-data-challenge/data_csv/RUL_labeled/Train4_labeled.csv 저장 완료
[Train5] → /content/drive/MyDrive/KSPHM-data-challenge/data_csv/RUL_labeled/Train5_labeled.csv 저장 완료
[Train6] → /content/drive/MyDrive/KSPHM-data-challenge/data_csv/RUL_labeled/Train6_labeled.csv 저장 완료
[Train7] → /content/drive/MyDrive/KSPHM-data-challenge/data_csv/RUL_labeled/Train7_labeled.csv 저장 완료
[Train8] → /content/drive/MyDrive/KSPHM-data-challenge/data_csv/RUL_labeled/Train8_labeled.csv 저장 완료


In [13]:
# Display the labeled summary dict ({train_id: DataFrame with RUL_sec}).
summary

{'Train1':     time_sec  TC SP Front[℃]  TC SP Rear[℃]  Torque[Nm]  RUL_sec
 0          0         47.3274        56.6218  -10.352493    55800
 1        600         69.3558        83.0182   -9.625572    55200
 2       1200         84.9585       106.7807  -10.501605    54600
 3       1800         97.5308       123.6613   -9.528453    54000
 4       2400        107.2604       142.7980  -11.160837    53400
 ..       ...             ...            ...         ...      ...
 89     53400        100.6211       153.8347   -7.372215     2400
 90     54000        101.3231       156.6417   -6.857190     1800
 91     54600        100.5940       181.6120   -8.219799     1200
 92     55200        102.5969       175.5891   -7.352595      600
 93     55800        106.0521       199.1974  -17.921889        0
 
 [94 rows x 5 columns],
 'Train2':      time_sec  TC SP Front[℃]  TC SP Rear[℃]  Torque[Nm]       RUL_sec
 0           0         51.4771        51.2413  -10.167084  89015.691895
 1         600    

In [19]:
# Display length_dict; per the recorded output it maps "<folder>/<file>.tdms"
# to an integer length.
# NOTE(review): length_dict is not defined in any cell visible here — confirm
# the cell that builds it still exists so Restart & Run All succeeds.
length_dict

{'Train1/modified_KIMM Simulator_KIMM Bearing Test_20160325125839.tdms': 256000,
 'Train1/modified_KIMM Simulator_KIMM Bearing Test_20160325122639.tdms': 256000,
 'Train1/modified_KIMM Simulator_KIMM Bearing Test_20160325130839.tdms': 256000,
 'Train1/modified_KIMM Simulator_KIMM Bearing Test_20160325123839.tdms': 256000,
 'Train1/modified_KIMM Simulator_KIMM Bearing Test_20160325124839.tdms': 256000,
 'Train1/modified_KIMM Simulator_KIMM Bearing Test_20160325131839.tdms': 256000,
 'Train1/modified_KIMM Simulator_KIMM Bearing Test_20160325140839.tdms': 256000,
 'Train1/modified_KIMM Simulator_KIMM Bearing Test_20160325145839.tdms': 256000,
 'Train1/modified_KIMM Simulator_KIMM Bearing Test_20160325144839.tdms': 256000,
 'Train1/modified_KIMM Simulator_KIMM Bearing Test_20160325134839.tdms': 256000,
 'Train1/modified_KIMM Simulator_KIMM Bearing Test_20160325141839.tdms': 256000,
 'Train1/modified_KIMM Simulator_KIMM Bearing Test_20160325135839.tdms': 256000,
 'Train1/modified_KIMM Simul