# Подготовка датасета телеметрии экскаваторов

Этот ноутбук:
1. Загружает `telemetry_excavators.csv`.
2. Строит таблицы:
   - `dim_asset`
   - `raw_telemetry`
   - `ts_telemetry_5s` (ресемплинг до 5 секунд).
3. Сохраняет результат в формате Parquet.

При необходимости скорректируй пути к файлам во второй ячейке.

In [1]:
import os
import tqdm
import warnings

import pandas as pd
import numpy as np
from pathlib import Path

warnings.filterwarnings(
    "ignore",
    category=FutureWarning,
    message=".*Downcasting object dtype arrays on .fillna.*"
)

In [2]:
def optimize_dtypes(df):
    df_optimized = df.copy()

    # float64 → float32
    float_cols = df_optimized.select_dtypes(include='float64').columns
    df_optimized[float_cols] = df_optimized[float_cols].astype('float32')

    # int64 → int32
    int_cols = df_optimized.select_dtypes(include='int64').columns
    df_optimized[int_cols] = df_optimized[int_cols].astype('int32')

    # object → category (для строк, где мало уникальных значений)
    object_cols = df_optimized.select_dtypes(include='object').columns
    for col in object_cols:
        num_unique_values = df_optimized[col].nunique()
        num_total_values = len(df_optimized[col])
        if num_unique_values / num_total_values < 0.5:
            df_optimized[col] = df_optimized[col].astype('category')

    return df_optimized


def setup_pandas_options():
    """Настройка глобальных опций pandas для отображения."""
    pd.set_option("display.precision", 3)
    pd.set_option("expand_frame_repr", False)


setup_pandas_options()

## Настройки путей

Задай правильные пути к CSV с телеметрией и директории для сохранения Parquet.

In [3]:
BASE_DIR = Path("../dataset/_by_Dmitry")  # корень с данными
TELEMETRY_PATH = BASE_DIR / "telemetry" / "telemetry_excavators.csv"

# директория, куда будут сохраняться parquet-файлы
OUTPUT_DIR = Path("../dataset/raw/telemetry")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

TELEMETRY_PATH, OUTPUT_DIR

(PosixPath('../dataset/_by_Dmitry/telemetry/telemetry_excavators.csv'),
 PosixPath('../dataset/raw/telemetry'))

## Вспомогательные функции

In [4]:
def load_telemetry(path: Path) -> pd.DataFrame:
    """Загрузка исходной телеметрии экскаваторов."""
    df = pd.read_csv(path)
    df.columns = [c.strip() for c in df.columns]

    # Парсим время и приводим к UTC (в файле уже есть таймзона)
    df['create_dt'] = pd.to_datetime(df['create_dt'], utc=True)

    # Временный вариант: asset_id = mdm_object_uuid
    df['asset_id'] = df['mdm_object_uuid']

    return optimize_dtypes(df)

In [5]:
def build_dim_asset(df_telemetry: pd.DataFrame) -> pd.DataFrame:
    """dim_asset на основе телеметрии."""
    cols = [
        'asset_id',
        'mdm_object_uuid',
        'mdm_object_name',
        'mdm_model_name',
    ]
    existing = [c for c in cols if c in df_telemetry.columns]
    dim = (
        df_telemetry[existing]
        .drop_duplicates()
        .reset_index(drop=True)
    )
    return optimize_dtypes(dim)

In [6]:
def build_raw_telemetry(df_telemetry: pd.DataFrame) -> pd.DataFrame:
    """raw_telemetry: только нужные поля без агрегаций."""
    cols = [
        'asset_id', 'mdm_object_uuid', 'create_dt',
        'mdm_model_name', 'mdm_object_name',
        'lon', 'lat', 'alt', 'speed_gps', 'direction',
        'inclinom_platx', 'inclinom_platy', 'inclinom_boomx', 'inclinom_arm',
    ]
    existing = [c for c in cols if c in df_telemetry.columns]
    df_raw = df_telemetry[existing].copy()
    df_raw.rename(columns={'create_dt': 'timestamp_raw'}, inplace=True)
    return optimize_dtypes(df_raw)

In [7]:
def build_ts_telemetry_5s(df_raw: pd.DataFrame) -> pd.DataFrame:
    """
    Ресемплинг до 5 секунд по каждому asset_id и расчёт простых фичей.
    На вход ожидается raw_telemetry с колонкой timestamp_raw.
    """
    df = df_raw.copy()
    df.rename(columns={'timestamp_raw': 'timestamp'}, inplace=True)
    df.set_index('timestamp', inplace=True)

    # Числовые колонки для усреднения
    ignore_cols = {'asset_id', 'mdm_object_uuid', 'mdm_model_name', 'mdm_object_name'}
    num_cols = [c for c in df.columns if c not in ignore_cols]

    # Ресемплинг: mean по числовым
    # ✅ используем '5s' вместо '5S'
    # ✅ явно указываем observed=False в groupby, чтобы не было FutureWarning
    df_resampled = (
        df
        .groupby('asset_id', observed=False)
        .resample('5s')[num_cols]
        .mean()
        .reset_index()
    )

    # Переименование под yaml-схему
    rename_map = {
        'speed_gps': 'speed_gps_mean',
        'direction': 'direction_mean',
        'inclinom_platx': 'inclinom_platx_mean',
        'inclinom_platy': 'inclinom_platy_mean',
        'inclinom_boomx': 'inclinom_boomx_mean',
        'inclinom_arm': 'inclinom_arm_mean',
    }
    df_resampled.rename(columns=rename_map, inplace=True)

    # Сортировка для корректного diff
    df_resampled = df_resampled.sort_values(['asset_id', 'timestamp'])

    # Производные фичи
    if 'inclinom_boomx_mean' in df_resampled.columns:
        df_resampled['boom_angle_delta_5s'] = (
            df_resampled
            .groupby('asset_id', observed=False)['inclinom_boomx_mean']
            .diff()
        )
    else:
        df_resampled['boom_angle_delta_5s'] = np.nan

    if 'inclinom_arm_mean' in df_resampled.columns:
        df_resampled['arm_angle_delta_5s'] = (
            df_resampled
            .groupby('asset_id', observed=False)['inclinom_arm_mean']
            .diff()
        )
    else:
        df_resampled['arm_angle_delta_5s'] = np.nan

    # Результирующий наклон платформы
    df_resampled['platform_tilt_magnitude'] = np.sqrt(
        df_resampled.get('inclinom_platx_mean', 0).fillna(0) ** 2 +
        df_resampled.get('inclinom_platy_mean', 0).fillna(0) ** 2
    )

    # Простая категоризация режима работы
    def classify_state(row):
        speed = row.get('speed_gps_mean', np.nan)
        boom_delta = row.get('boom_angle_delta_5s', np.nan)
        arm_delta = row.get('arm_angle_delta_5s', np.nan)

        if pd.isna(speed):
            return None
        if abs(speed) < 0.1 and abs(boom_delta) < 0.01 and abs(arm_delta) < 0.01:
            return 'idle'
        elif abs(speed) > 1.0:
            return 'moving'
        else:
            return 'working'

    df_resampled['operating_state'] = df_resampled.apply(classify_state, axis=1)

    return df_resampled

In [8]:
def save_parquet(df: pd.DataFrame, path: Path):
    """Сохранение DataFrame в Parquet (требует pyarrow или fastparquet)."""
    path.parent.mkdir(parents=True, exist_ok=True)
    df.to_parquet(path, index=False)

## Запуск пайплайна

Эта ячейка:
1. Загружает CSV.
2. Строит `dim_asset`, `raw_telemetry`, `ts_telemetry_5s`.
3. Сохраняет всё в Parquet в директории `OUTPUT_DIR`.

In [9]:
print("Загружаю телеметрию из:", TELEMETRY_PATH)
df_tel = load_telemetry(TELEMETRY_PATH)
display(df_tel.head())
print(df_tel.shape)
save_parquet(df_tel, OUTPUT_DIR / 'df_tel.parquet')

Загружаю телеметрию из: ../dataset/_by_Dmitry/telemetry/telemetry_excavators.csv


Unnamed: 0.1,Unnamed: 0,create_dt,mdm_model_name,mdm_object_name,mdm_object_uuid,lon,lat,alt,speed_gps,direction,inclinom_platx,inclinom_platy,inclinom_boomx,inclinom_arm,asset_id
0,0,2024-10-01 06:55:56+00:00,Hitachi EX-3600 E GalileoSky,EX3600 E №69,49d1fb5c-995e-11ec-98ba-00155d5fc801,142.143,48.955,50.0,0.0,118.2,0,0,26095,0,49d1fb5c-995e-11ec-98ba-00155d5fc801
1,1,2024-10-01 06:55:56+00:00,ЭКГ 20,ЭКГ 20 №30,93f622e9-995f-11ec-98ba-00155d5fc801,142.172,48.958,-94.0,0.0,1.6,0,0,0,0,93f622e9-995f-11ec-98ba-00155d5fc801
2,2,2024-10-01 06:55:56+00:00,Hitachi EX-3600 E GalileoSky,EX 3600 E №72,0859678c-d12a-11ec-98bf-00155d5fc801,142.147,48.96,62.0,0.1,136.6,59728,62396,0,0,0859678c-d12a-11ec-98bf-00155d5fc801
3,3,2024-10-01 06:55:56+00:00,Hitachi EX-2600 GalileoSky,EX2600 №51,b50e48ae-c6d3-11ed-98d9-00155d5fc801,142.146,48.959,64.0,1.1,179.4,0,0,0,0,b50e48ae-c6d3-11ed-98d9-00155d5fc801
4,4,2024-10-01 06:55:56+00:00,WK 20,WK-20 №97,3385d5e0-21fd-11ee-98db-00155d5fc801,142.169,48.956,-92.0,2.4,35.5,0,0,0,0,3385d5e0-21fd-11ee-98db-00155d5fc801


(13179029, 15)


In [10]:
print("\nСтрою dim_asset...")
dim_asset = build_dim_asset(df_tel)
display(dim_asset.head())
print(dim_asset.shape)
save_parquet(dim_asset, OUTPUT_DIR / 'dim_asset.parquet')


Строю dim_asset...


Unnamed: 0,asset_id,mdm_object_uuid,mdm_object_name,mdm_model_name
0,49d1fb5c-995e-11ec-98ba-00155d5fc801,49d1fb5c-995e-11ec-98ba-00155d5fc801,EX3600 E №69,Hitachi EX-3600 E GalileoSky
1,93f622e9-995f-11ec-98ba-00155d5fc801,93f622e9-995f-11ec-98ba-00155d5fc801,ЭКГ 20 №30,ЭКГ 20
2,0859678c-d12a-11ec-98bf-00155d5fc801,0859678c-d12a-11ec-98bf-00155d5fc801,EX 3600 E №72,Hitachi EX-3600 E GalileoSky
3,b50e48ae-c6d3-11ed-98d9-00155d5fc801,b50e48ae-c6d3-11ed-98d9-00155d5fc801,EX2600 №51,Hitachi EX-2600 GalileoSky
4,3385d5e0-21fd-11ee-98db-00155d5fc801,3385d5e0-21fd-11ee-98db-00155d5fc801,WK-20 №97,WK 20


(6, 4)


In [11]:
print("\nСтрою raw_telemetry...")
raw_tel = build_raw_telemetry(df_tel)
display(raw_tel.head())
print(raw_tel.shape)
save_parquet(raw_tel, OUTPUT_DIR / 'raw_telemetry.parquet')


Строю raw_telemetry...


Unnamed: 0,asset_id,mdm_object_uuid,timestamp_raw,mdm_model_name,mdm_object_name,lon,lat,alt,speed_gps,direction,inclinom_platx,inclinom_platy,inclinom_boomx,inclinom_arm
0,49d1fb5c-995e-11ec-98ba-00155d5fc801,49d1fb5c-995e-11ec-98ba-00155d5fc801,2024-10-01 06:55:56+00:00,Hitachi EX-3600 E GalileoSky,EX3600 E №69,142.143,48.955,50.0,0.0,118.2,0,0,26095,0
1,93f622e9-995f-11ec-98ba-00155d5fc801,93f622e9-995f-11ec-98ba-00155d5fc801,2024-10-01 06:55:56+00:00,ЭКГ 20,ЭКГ 20 №30,142.172,48.958,-94.0,0.0,1.6,0,0,0,0
2,0859678c-d12a-11ec-98bf-00155d5fc801,0859678c-d12a-11ec-98bf-00155d5fc801,2024-10-01 06:55:56+00:00,Hitachi EX-3600 E GalileoSky,EX 3600 E №72,142.147,48.96,62.0,0.1,136.6,59728,62396,0,0
3,b50e48ae-c6d3-11ed-98d9-00155d5fc801,b50e48ae-c6d3-11ed-98d9-00155d5fc801,2024-10-01 06:55:56+00:00,Hitachi EX-2600 GalileoSky,EX2600 №51,142.146,48.959,64.0,1.1,179.4,0,0,0,0
4,3385d5e0-21fd-11ee-98db-00155d5fc801,3385d5e0-21fd-11ee-98db-00155d5fc801,2024-10-01 06:55:56+00:00,WK 20,WK-20 №97,142.169,48.956,-92.0,2.4,35.5,0,0,0,0


(13179029, 14)


In [12]:
print("\nСтрою ts_telemetry_5s (ресемплинг 5 секунд)...")
ts_5s = build_ts_telemetry_5s(raw_tel)
display(ts_5s.head())
print(ts_5s.shape)
save_parquet(ts_5s, OUTPUT_DIR / 'ts_telemetry_5s.parquet')

print("\nГотово! Файлы сохранены в:", OUTPUT_DIR.resolve())


Строю ts_telemetry_5s (ресемплинг 5 секунд)...


Unnamed: 0,asset_id,timestamp,lon,lat,alt,speed_gps_mean,direction_mean,inclinom_platx_mean,inclinom_platy_mean,inclinom_boomx_mean,inclinom_arm_mean,boom_angle_delta_5s,arm_angle_delta_5s,platform_tilt_magnitude,operating_state
0,0859678c-d12a-11ec-98bf-00155d5fc801,2024-09-30 13:00:00+00:00,142.147,48.96,67.0,0.14,40.2,63075.0,58903.6,0.0,0.0,,,86302.316,working
1,0859678c-d12a-11ec-98bf-00155d5fc801,2024-09-30 13:00:05+00:00,142.147,48.96,67.8,2.54,169.0,62160.8,59894.8,0.0,0.0,0.0,0.0,86321.215,moving
2,0859678c-d12a-11ec-98bf-00155d5fc801,2024-09-30 13:00:10+00:00,142.147,48.96,68.0,1.32,155.84,60512.6,61650.2,0.0,0.0,0.0,0.0,86385.89,moving
3,0859678c-d12a-11ec-98bf-00155d5fc801,2024-09-30 13:00:15+00:00,142.147,48.96,68.0,3.68,195.48,59422.6,63133.2,0.0,0.0,0.0,0.0,86699.748,moving
4,0859678c-d12a-11ec-98bf-00155d5fc801,2024-09-30 13:00:20+00:00,142.147,48.96,69.0,0.36,176.36,61074.6,62361.4,0.0,0.0,0.0,0.0,87287.175,working


(3214080, 15)

Готово! Файлы сохранены в: /Users/bobrsubr/PycharmProjects/breakdowns_of_mining_trucks/dataset/raw/telemetry


1. Подготовка расширенного датасета
1.1. Дозагрузить и обработать дополнительные источники
Если проект учебный — можно пропустить, но идеальный вариант:
- данные масла (oil_samples)
- рейсы самосвалов (truck_trips)
- ремонтные события (если дадут)

Цель:
имеем связь "телеметрия → масло → ремонт → рейсы".

1.2. Обогатить ts_telemetry_5s признаками
Пока там только углы, скорость, наклоны.

Для поиска аномалий должны появиться признаки работы механики и электрооборудования:
- вибрации (rolling std, rolling diff)
- резкие скачки (rate of change per second)
- кластеризация режимов (ручка, поворот, копание)
- тепловые индексы (если появятся температурные параметры)

Эти derived features будут основой для моделей.

In [13]:
# Пути к сырым данным
BASE_DIR = Path("../dataset/_by_Dmitry")
TRIPS_PATH = BASE_DIR / "reference" / "truck_trips.csv"
OIL_PATH = BASE_DIR / "oil" / "Масляная лаборатория 1.xlsx"

# Пути к уже подготовленным parquet
RAW_TELEMETRY_DIR = Path("../dataset/raw/telemetry")
RAW_TELEMETRY_DIR.mkdir(parents=True, exist_ok=True)

DIM_ASSET_PATH = RAW_TELEMETRY_DIR / "dim_asset.parquet"
TS_5S_PATH = RAW_TELEMETRY_DIR / "ts_telemetry_5s.parquet"

In [14]:
# Перечитываем dim_asset и ts_5s на всякий случай (если не в памяти)
dim_asset = pd.read_parquet(DIM_ASSET_PATH)
ts_5s = pd.read_parquet(TS_5S_PATH)
print("dim_asset:", dim_asset.shape)
print("ts_5s:", ts_5s.shape)

dim_asset: (6, 4)
ts_5s: (3214080, 15)


In [15]:
print("\nЗагружаю truck_trips:", TRIPS_PATH)
trips = pd.read_csv(TRIPS_PATH)
trips.columns = [c.strip() for c in trips.columns]


Загружаю truck_trips: ../dataset/_by_Dmitry/reference/truck_trips.csv


In [16]:
# Приведение времени к datetime с UTC
trips["start_time"] = pd.to_datetime(trips["start_time"], utc=True)
trips["end_time"] = pd.to_datetime(trips["end_time"], utc=True)

### Маппинг shovel_uuid -> asset_id (экскаватор)

In [17]:
uuid_to_asset = (
    dim_asset[["mdm_object_uuid", "asset_id"]]
    .drop_duplicates()
    .rename(columns={"mdm_object_uuid": "shovel_uuid"})
)

trips_enriched = trips.merge(uuid_to_asset, on="shovel_uuid", how="left")
trips_enriched.rename(columns={"asset_id": "shovel_asset_id"}, inplace=True)

print("Пример trips_enriched:")
display(trips_enriched[[
    "start_time", "end_time", "meta_model_name",
    "meta_object_name", "object_uuid",
    "shovel_uuid", "shovel_asset_id", "weight", "distance"
]].head())

TRIPS_PARQUET_PATH = RAW_TELEMETRY_DIR / "truck_trips.parquet"
trips_enriched.to_parquet(TRIPS_PARQUET_PATH, index=False)
print("Сохранил:", TRIPS_PARQUET_PATH.resolve())

Пример trips_enriched:


Unnamed: 0,start_time,end_time,meta_model_name,meta_object_name,object_uuid,shovel_uuid,shovel_asset_id,weight,distance
0,2024-10-12 23:12:46+00:00,2024-10-12 23:20:40+00:00,БелАЗ 75306 Weichai,1909,3caf2cc7-79cd-11ed-98d7-00155d5fc801,3385d5e0-21fd-11ee-98db-00155d5fc801,3385d5e0-21fd-11ee-98db-00155d5fc801,164,1980.91
1,2024-10-12 23:09:11+00:00,2024-10-12 23:21:01+00:00,БелАЗ 75306 Weichai,2122,923c086a-bf2c-11ee-98ec-bc2411df91be,b50e48ae-c6d3-11ed-98d9-00155d5fc801,b50e48ae-c6d3-11ed-98d9-00155d5fc801,188,3659.75
2,2024-10-12 23:12:18+00:00,2024-10-12 23:24:18+00:00,БелАЗ 75306 Weichai,1870,d01e7398-4743-11ed-98ce-00155d5fc801,49d1fb5c-995e-11ec-98ba-00155d5fc801,49d1fb5c-995e-11ec-98ba-00155d5fc801,201,3470.09
3,2024-10-13 05:38:26+00:00,2024-10-13 05:53:04+00:00,БелАЗ 75306 Weichai,1969,4c3036bc-d7da-11ed-98d9-00155d5fc801,0859678c-d12a-11ec-98bf-00155d5fc801,0859678c-d12a-11ec-98bf-00155d5fc801,201,4100.7
4,2024-10-12 23:10:52+00:00,2024-10-12 23:24:39+00:00,БелАЗ 75306 Weichai,2135,a4b984b7-bf2d-11ee-98ec-bc2411df91be,0859678c-d12a-11ec-98bf-00155d5fc801,0859678c-d12a-11ec-98bf-00155d5fc801,198,3859.39


Сохранил: /Users/bobrsubr/PycharmProjects/breakdowns_of_mining_trucks/dataset/raw/telemetry/truck_trips.parquet


### Масляная лаборатория

In [18]:
print("\nЗагружаю oil_samples:", OIL_PATH)
oil = pd.read_excel(OIL_PATH)
oil.columns = [str(c).strip() for c in oil.columns]
display(oil)


Загружаю oil_samples: ../dataset/_by_Dmitry/oil/Масляная лаборатория 1.xlsx


Unnamed: 0,SampleId,ComponentRef,UnitRef,CustomerId,ReportedDate,TakenDate,ReceivedDate,ViscMode,Particle,Parker,...,FleetIdField,ComponentDescField,ComponentTypeField,LocationField,ComponentEngineSizeField,TurboField,GearTypeField,ComponentModelField,ComponentMakeField,ComponentIdField
0,144357,ML200011MOS944,ML200011KOM184,ML200011OOO953,2024-08-21 17:58:24,2024-08-20 00:00:00,2024-08-20 00:00:00,Measured,False,True,...,0,CP-00003250,GearBox,REAR,0,False,DIFFERENTIAL,KOMATSU,KOMATSU,MOST HD785 100
1,144356,ML200011KPP774,ML200011KOM184,ML200011OOO953,2024-08-21 17:42:17,2024-08-20 00:00:00,2024-08-20 00:00:00,Measured,False,True,...,0,CP-00003249,GearBox,CENTER,0,False,BRAKE GEAR BOX,KOMATSU,KOMATSU,KPP 100
2,144355,ML200011DVS246,ML200011KOM184,ML200011OOO953,2024-08-21 17:25:37,2024-08-20 00:00:00,2024-08-20 00:00:00,Measured,False,True,...,0,CP-00003247,Diesel Engine,CENTER,0,True,,SAA12V140E,KOMATSU,DVS 100
3,144354,ML200011GS 493,ML200011KOM184,ML200011OOO953,2024-08-21 17:06:07,2024-08-20 00:00:00,2024-08-20 00:00:00,Measured,True,True,...,0,CP-00003248,Hydraulic,CENTER,0,False,,HD 785,KOMATSU,GS 100
4,144353,ML200011DVS208,ML200011BEL290,ML200011OOO953,2024-08-21 16:28:34,2024-08-20 00:00:00,2024-08-20 00:00:00,Measured,False,True,...,0,CP-00003246,Diesel Engine,CENTER,0,True,,QSK60,BELAZ,DVS 1404
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13822,5,ML200011NEWOILCOMP001,ML200011NEWOIL001,ML200011NEW264,2020-11-19 18:39:44,2020-11-19 18:39:37,2020-11-19 18:39:37,Measured,False,True,...,0,,Diesel Engine,CENTER,0,True,CENTER,GENERAL,GENERAL,NEWOILCOMP001
13823,4,ML200011NEWOILCOMP001,ML200011NEWOIL001,ML200011NEW264,2020-11-19 17:28:51,2020-11-19 17:28:22,2020-11-19 17:28:22,Measured,False,True,...,0,,Diesel Engine,CENTER,0,True,CENTER,GENERAL,GENERAL,NEWOILCOMP001
13824,3,ML200011LIE866,ML200011LIE605,ML200011OOO953,2020-11-16 20:03:45,2020-11-16 00:00:00,2020-11-16 00:00:00,Measured,False,True,...,0,,Diesel Engine,CENTER,0,True,,PR754,LIEBHERR,LIEBHERR PR754N3ODSCENTER
13825,2,ML200011NEWOILCOMP001,ML200011NEWOIL001,ML200011NEW264,2020-11-16 19:28:17,2020-11-16 19:27:23,2020-11-16 19:27:23,Measured,False,True,...,0,,Diesel Engine,CENTER,0,True,CENTER,GENERAL,GENERAL,NEWOILCOMP001


In [19]:
# Приведение дат
for col in ["ReportedDate", "TakenDate", "ReceivedDate"]:
    if col in oil.columns:
        oil[col] = pd.to_datetime(oil[col], errors="coerce", utc=True)

In [20]:
# Оставим ключевые поля (можно расширять)
keep_cols = [
    "SampleId", "ComponentRef", "UnitRef", "CustomerId",
    "ReportedDate", "TakenDate", "ReceivedDate",
    "ViscMode", "Particle", "Parker", "OilCapacityUnits", "AnalysisMode",
    "OilTime", "OilWeight", "OilBrand", "OilType", "SumpCapacity",
    "UnitTimeUnits", "OilChanged", "TimeOnComponent", "Condition",
    "DiagnosticStatement", "EvaluationComment",
    "CustUnitDescField", "UnitModelField", "UnitNumberField", "UnitMakeField",
    "UnitYearField", "CustUnitIdField", "FleetIdField",
    "ComponentDescField", "ComponentTypeField", "LocationField",
    "ComponentEngineSizeField", "TurboField", "GearTypeField",
    "ComponentModelField", "ComponentMakeField", "ComponentIdField",
]
existing_cols = [c for c in keep_cols if c in oil.columns]
oil_small = oil[existing_cols].copy()

In [21]:
# Простейший ключ техники из масла (понадобится позже, если будем стыковать)
if "UnitNumberField" in oil_small.columns:
    oil_small["asset_key_from_oil"] = oil_small["UnitNumberField"].astype(str)
else:
    oil_small["asset_key_from_oil"] = np.nan

print("Пример oil_small:")
display(oil_small.head())

Пример oil_small:


Unnamed: 0,SampleId,ComponentRef,UnitRef,CustomerId,ReportedDate,TakenDate,ReceivedDate,ViscMode,Particle,Parker,...,ComponentDescField,ComponentTypeField,LocationField,ComponentEngineSizeField,TurboField,GearTypeField,ComponentModelField,ComponentMakeField,ComponentIdField,asset_key_from_oil
0,144357,ML200011MOS944,ML200011KOM184,ML200011OOO953,2024-08-21 17:58:24+00:00,2024-08-20 00:00:00+00:00,2024-08-20 00:00:00+00:00,Measured,False,True,...,CP-00003250,GearBox,REAR,0,False,DIFFERENTIAL,KOMATSU,KOMATSU,MOST HD785 100,100
1,144356,ML200011KPP774,ML200011KOM184,ML200011OOO953,2024-08-21 17:42:17+00:00,2024-08-20 00:00:00+00:00,2024-08-20 00:00:00+00:00,Measured,False,True,...,CP-00003249,GearBox,CENTER,0,False,BRAKE GEAR BOX,KOMATSU,KOMATSU,KPP 100,100
2,144355,ML200011DVS246,ML200011KOM184,ML200011OOO953,2024-08-21 17:25:37+00:00,2024-08-20 00:00:00+00:00,2024-08-20 00:00:00+00:00,Measured,False,True,...,CP-00003247,Diesel Engine,CENTER,0,True,,SAA12V140E,KOMATSU,DVS 100,100
3,144354,ML200011GS 493,ML200011KOM184,ML200011OOO953,2024-08-21 17:06:07+00:00,2024-08-20 00:00:00+00:00,2024-08-20 00:00:00+00:00,Measured,True,True,...,CP-00003248,Hydraulic,CENTER,0,False,,HD 785,KOMATSU,GS 100,100
4,144353,ML200011DVS208,ML200011BEL290,ML200011OOO953,2024-08-21 16:28:34+00:00,2024-08-20 00:00:00+00:00,2024-08-20 00:00:00+00:00,Measured,False,True,...,CP-00003246,Diesel Engine,CENTER,0,True,,QSK60,BELAZ,DVS 1404,1404


In [22]:
OIL_PARQUET_PATH = RAW_TELEMETRY_DIR / "oil_samples.parquet"
oil_small.to_parquet(OIL_PARQUET_PATH, index=False)
print("Сохранил:", OIL_PARQUET_PATH.resolve())

Сохранил: /Users/bobrsubr/PycharmProjects/breakdowns_of_mining_trucks/dataset/raw/telemetry/oil_samples.parquet


### ШАГ 1.2. Обогащение ts_telemetry_5s признаками

In [23]:
# Убедимся, что timestamp в datetime и отсортирован
ts_5s["timestamp"] = pd.to_datetime(ts_5s["timestamp"], utc=True)
ts_5s = ts_5s.sort_values(["asset_id", "timestamp"])
ts_5s = ts_5s.set_index("timestamp")
display(ts_5s)

Unnamed: 0_level_0,asset_id,lon,lat,alt,speed_gps_mean,direction_mean,inclinom_platx_mean,inclinom_platy_mean,inclinom_boomx_mean,inclinom_arm_mean,boom_angle_delta_5s,arm_angle_delta_5s,platform_tilt_magnitude,operating_state
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2024-09-30 13:00:00+00:00,0859678c-d12a-11ec-98bf-00155d5fc801,142.147,48.96,67.0,0.14,40.20,63075.0,58903.6,0.0,0.0,,,86302.316,working
2024-09-30 13:00:05+00:00,0859678c-d12a-11ec-98bf-00155d5fc801,142.147,48.96,67.8,2.54,169.00,62160.8,59894.8,0.0,0.0,0.0,0.0,86321.215,moving
2024-09-30 13:00:10+00:00,0859678c-d12a-11ec-98bf-00155d5fc801,142.147,48.96,68.0,1.32,155.84,60512.6,61650.2,0.0,0.0,0.0,0.0,86385.890,moving
2024-09-30 13:00:15+00:00,0859678c-d12a-11ec-98bf-00155d5fc801,142.147,48.96,68.0,3.68,195.48,59422.6,63133.2,0.0,0.0,0.0,0.0,86699.748,moving
2024-09-30 13:00:20+00:00,0859678c-d12a-11ec-98bf-00155d5fc801,142.147,48.96,69.0,0.36,176.36,61074.6,62361.4,0.0,0.0,0.0,0.0,87287.175,working
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-10-31 12:59:35+00:00,b50e48ae-c6d3-11ed-98d9-00155d5fc801,142.146,48.96,62.0,0.18,63.30,0.0,0.0,0.0,25248.6,0.0,5653.2,0.000,working
2024-10-31 12:59:40+00:00,b50e48ae-c6d3-11ed-98d9-00155d5fc801,142.146,48.96,61.8,0.56,90.08,0.0,0.0,0.0,12388.4,0.0,-12860.2,0.000,working
2024-10-31 12:59:45+00:00,b50e48ae-c6d3-11ed-98d9-00155d5fc801,142.146,48.96,61.0,2.40,168.12,0.0,0.0,0.0,17830.0,0.0,5441.6,0.000,moving
2024-10-31 12:59:50+00:00,b50e48ae-c6d3-11ed-98d9-00155d5fc801,142.146,48.96,61.2,3.66,257.24,0.0,0.0,0.0,24440.4,0.0,6610.4,0.000,moving


In [24]:
# Группировка по технике
grouped = ts_5s.groupby("asset_id", group_keys=False, observed=False)

# --- вибрации: rolling std за 60 секунд -----------------------------------
ts_5s["boom_vibration_std_60s"] = grouped["inclinom_boomx_mean"].rolling("60s").std().values
ts_5s["arm_vibration_std_60s"] = grouped["inclinom_arm_mean"].rolling("60s").std().values
ts_5s["speed_std_60s"] = grouped["speed_gps_mean"].rolling("60s").std().values

# сглаженные средние
ts_5s["speed_mean_60s"] = grouped["speed_gps_mean"].rolling("60s").mean().values
ts_5s["platform_tilt_mean_60s"] = grouped["platform_tilt_magnitude"].rolling("60s").mean().values

# --- резкие скачки: rate of change ----------------------------------------
ts_5s = ts_5s.reset_index().sort_values(["asset_id", "timestamp"])

dt = 5.0  # шаг 5 секунд

# уже есть boom_angle_delta_5s / arm_angle_delta_5s -> переводим в град/сек
ts_5s["boom_angle_roc_deg_per_s"] = ts_5s["boom_angle_delta_5s"] / dt
ts_5s["arm_angle_roc_deg_per_s"] = ts_5s["arm_angle_delta_5s"] / dt

# изменения скорости
ts_5s["speed_gps_diff_5s"] = ts_5s.groupby("asset_id")["speed_gps_mean"].diff()
ts_5s["speed_roc_kmh_per_s"] = ts_5s["speed_gps_diff_5s"] / dt

print("\nПример обогащённого ts_5s:")
display(ts_5s.head())


Пример обогащённого ts_5s:


  ts_5s["speed_gps_diff_5s"] = ts_5s.groupby("asset_id")["speed_gps_mean"].diff()


Unnamed: 0,timestamp,asset_id,lon,lat,alt,speed_gps_mean,direction_mean,inclinom_platx_mean,inclinom_platy_mean,inclinom_boomx_mean,...,operating_state,boom_vibration_std_60s,arm_vibration_std_60s,speed_std_60s,speed_mean_60s,platform_tilt_mean_60s,boom_angle_roc_deg_per_s,arm_angle_roc_deg_per_s,speed_gps_diff_5s,speed_roc_kmh_per_s
0,2024-09-30 13:00:00+00:00,0859678c-d12a-11ec-98bf-00155d5fc801,142.147,48.96,67.0,0.14,40.2,63075.0,58903.6,0.0,...,working,,,,0.14,86302.316,,,,
1,2024-09-30 13:00:05+00:00,0859678c-d12a-11ec-98bf-00155d5fc801,142.147,48.96,67.8,2.54,169.0,62160.8,59894.8,0.0,...,moving,0.0,0.0,1.697,1.34,86311.765,0.0,0.0,2.4,0.48
2,2024-09-30 13:00:10+00:00,0859678c-d12a-11ec-98bf-00155d5fc801,142.147,48.96,68.0,1.32,155.84,60512.6,61650.2,0.0,...,moving,0.0,0.0,1.2,1.333,86336.473,0.0,0.0,-1.22,-0.244
3,2024-09-30 13:00:15+00:00,0859678c-d12a-11ec-98bf-00155d5fc801,142.147,48.96,68.0,3.68,195.48,59422.6,63133.2,0.0,...,moving,0.0,0.0,1.529,1.92,86427.292,0.0,0.0,2.36,0.472
4,2024-09-30 13:00:20+00:00,0859678c-d12a-11ec-98bf-00155d5fc801,142.147,48.96,69.0,0.36,176.36,61074.6,62361.4,0.0,...,working,0.0,0.0,1.496,1.608,86599.269,0.0,0.0,-3.32,-0.664


In [25]:
# --- простая связь с рейсами: payload_last_trip ---------------------------
trips_enriched = pd.read_parquet(TRIPS_PARQUET_PATH)

# берем только рейсы, где знаем экскаватор
trips_valid = trips_enriched[~trips_enriched["shovel_asset_id"].isna()].copy()

# подготовим правую таблицу для merge_asof
right = trips_valid[["shovel_asset_id", "end_time", "weight"]].copy()
right = right.rename(columns={
    "shovel_asset_id": "asset_id",
    "end_time": "trip_end_time",
    "weight": "payload_last_trip"
})

# левую таблицу (ts_5s) отсортируем, но merge_asof будем делать по asset_id по отдельности
left = ts_5s.copy()
left["timestamp"] = pd.to_datetime(left["timestamp"], utc=True)

In [26]:
result_parts = []
for asset_id, df_left in left.groupby("asset_id", sort=False):
    df_left = df_left.sort_values("timestamp")

    # Проверка наличия 'asset_id' в df_left
    if 'asset_id' not in df_left.columns:
        print(f"Warning: asset_id отсутствует в df_left для asset_id={asset_id}")
    else:
        print(f"asset_id присутствует в df_left для asset_id={asset_id}")

    # Если для этого экскаватора нет рейсов, добавляем строки с NaN в payload_last_trip
    df_right = right[right["asset_id"] == asset_id].copy()
    if df_right.empty:
        # Для этого экскаватора нет рейсов, но строки должны быть добавлены
        df_left["payload_last_trip"] = np.nan
        result_parts.append(df_left)
        continue  # Переходим к следующему экскаватору

    df_right = df_right.sort_values("trip_end_time")

    # Сохраняем asset_id в правой таблице
    if 'asset_id' not in df_right.columns:
        print(f"Warning: asset_id отсутствует в df_right для asset_id={asset_id}")

    # Выполняем merge_asof для этого экскаватора
    merged = pd.merge_asof(
        df_left,
        df_right,
        left_on="timestamp",
        right_on="trip_end_time",
        direction="backward",
        allow_exact_matches=True
    )

    # Проверка наличия 'asset_id' после merge_asof
    if 'asset_id' not in merged.columns:
        print(f"Warning: asset_id отсутствует после merge_asof для asset_id={asset_id}")
        merged['asset_id'] = asset_id  # Восстанавливаем asset_id, если он исчез

    result_parts.append(merged)

  for asset_id, df_left in left.groupby("asset_id", sort=False):


asset_id присутствует в df_left для asset_id=0859678c-d12a-11ec-98bf-00155d5fc801
asset_id присутствует в df_left для asset_id=3385d5e0-21fd-11ee-98db-00155d5fc801
asset_id присутствует в df_left для asset_id=3f934675-21fd-11ee-98db-00155d5fc801
asset_id присутствует в df_left для asset_id=49d1fb5c-995e-11ec-98ba-00155d5fc801
asset_id присутствует в df_left для asset_id=93f622e9-995f-11ec-98ba-00155d5fc801
asset_id присутствует в df_left для asset_id=b50e48ae-c6d3-11ed-98d9-00155d5fc801


In [28]:
# Если result_parts пуст, возможно нет данных для всех экскаваторов
if result_parts:
    ts_5s_enriched = pd.concat(result_parts, ignore_index=True)
    print("\nПример ts_5s_enriched с payload_last_trip:")
    display(ts_5s_enriched[[
        "asset_id", "timestamp", "speed_gps_mean",
        "boom_vibration_std_60s", "arm_vibration_std_60s",
        "payload_last_trip"
    ]].head())
else:
    print("Нет данных для объединения.")



Пример ts_5s_enriched с payload_last_trip:


Unnamed: 0,asset_id,timestamp,speed_gps_mean,boom_vibration_std_60s,arm_vibration_std_60s,payload_last_trip
0,0859678c-d12a-11ec-98bf-00155d5fc801,2024-09-30 13:00:00+00:00,0.14,,,
1,0859678c-d12a-11ec-98bf-00155d5fc801,2024-09-30 13:00:05+00:00,2.54,0.0,0.0,
2,0859678c-d12a-11ec-98bf-00155d5fc801,2024-09-30 13:00:10+00:00,1.32,0.0,0.0,
3,0859678c-d12a-11ec-98bf-00155d5fc801,2024-09-30 13:00:15+00:00,3.68,0.0,0.0,
4,0859678c-d12a-11ec-98bf-00155d5fc801,2024-09-30 13:00:20+00:00,0.36,0.0,0.0,


In [29]:
# Сохраняем обогащённый датасет
TS_5S_ENRICHED_PATH = RAW_TELEMETRY_DIR / "ts_telemetry_5s_enriched.parquet"
ts_5s_enriched.to_parquet(TS_5S_ENRICHED_PATH, index=False)
print("\nОбогащённый ts_telemetry_5s_enriched.parquet сохранён в:", TS_5S_ENRICHED_PATH.resolve())



Обогащённый ts_telemetry_5s_enriched.parquet сохранён в: /Users/bobrsubr/PycharmProjects/breakdowns_of_mining_trucks/dataset/raw/telemetry/ts_telemetry_5s_enriched.parquet
