# Предобработка телеметрии самосвалов
В этом ноутбуке выполняется загрузка сырых файлов телеметрии, объединение, оптимизация типов данных и базовая предобработка (удаление мусорных и слабоинформативных признаков, обработка пропусков).

In [1]:
import os
import tqdm
import warnings

import pandas as pd
from pandas.api.types import is_numeric_dtype

In [2]:
warnings.filterwarnings(
    "ignore",
    category=FutureWarning,
    message=".*Downcasting object dtype arrays on .fillna.*"
)

In [3]:
def optimize_dtypes(df):
    df_optimized = df.copy()

    # float64 → float32
    float_cols = df_optimized.select_dtypes(include='float64').columns
    df_optimized[float_cols] = df_optimized[float_cols].astype('float32')

    # int64 → int32
    int_cols = df_optimized.select_dtypes(include='int64').columns
    df_optimized[int_cols] = df_optimized[int_cols].astype('int32')

    # object → category (для строк, где мало уникальных значений)
    object_cols = df_optimized.select_dtypes(include='object').columns
    for col in object_cols:
        num_unique_values = df_optimized[col].nunique()
        num_total_values = len(df_optimized[col])
        if num_unique_values / num_total_values < 0.5:
            df_optimized[col] = df_optimized[col].astype('category')

    return df_optimized


def setup_pandas_options():
    """Настройка глобальных опций pandas для отображения."""
    pd.set_option("display.precision", 3)
    pd.set_option("expand_frame_repr", False)


setup_pandas_options()

In [4]:
# Loading raw data
data_root = '../dataset'
source_root = '../dataset/_by_Hack/telemetry'
files = os.listdir(f'{source_root}')
print(files)

['telemetry4.csv', 'telemetry3.csv', 'telemetry2.csv', 'telemetry1.csv']


In [5]:
data_frames = []
for file in files:
    if 'telemetry' in file and file.endswith('.csv'):
        print(f'Loading {file} for combine..')
        df = pd.read_csv(os.path.join(f'{source_root}', file))
        data_frames.append(df)
print('Объединение всех DataFrame в один..')
combined_data = pd.concat(data_frames, ignore_index=True)

Loading telemetry4.csv for combine..
Loading telemetry3.csv for combine..
Loading telemetry2.csv for combine..
Loading telemetry1.csv for combine..
Объединение всех DataFrame в один..


In [6]:
print('Optimize dtypes..')
combined_data_optimized = optimize_dtypes(combined_data)
print(f"Потребляемые ресурсы до оптимизации:")
print(combined_data.info(memory_usage='deep'))
print(f"\nПотребляемые ресурсы после оптимизации:")
print(combined_data_optimized.info(memory_usage='deep'))

Optimize dtypes..
Потребляемые ресурсы до оптимизации:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42854690 entries, 0 to 42854689
Data columns (total 51 columns):
 #   Column                        Dtype  
---  ------                        -----  
 0   Unnamed: 0                    int64  
 1   create_dt                     object 
 2   mdm_object_id                 int64  
 3   mdm_object_name               int64  
 4   mdm_model_id                  int64  
 5   mdm_model_name                object 
 6   alt                           float64
 7   speed_gps                     float64
 8   inst_fuel                     int64  
 9   weight                        int64  
 10  temp_engine                   int64  
 11  turn_engine                   int64  
 12  load_engine                   int64  
 13  pres_coolant_nn               int64  
 14  pres_rail_injector_nn         int64  
 15  pres_temp_engine_nn           int64  
 16  torque_nn                     int64  
 17  pres_des

In [7]:
display(combined_data_optimized)
print(combined_data_optimized.shape)

Unnamed: 0.1,Unnamed: 0,create_dt,mdm_object_id,mdm_object_name,mdm_model_id,mdm_model_name,alt,speed_gps,inst_fuel,weight,...,fuel_level_can,accelerator_pedal_position,crankcase_purge_pressure,engine_oil_level,error_belaz_11,error_belaz_12,spn,fmi,sutep_error,spn_weichai
0,0,2024-01-01 12:23:42+11:00,1661,1395,62,БелАЗ 75306 Cummins,-66.0,0.0,293,19,...,-1000000,-1000000,-1000000,-1000000,-1000000,-1000000,-1000000,-1000000,-1000000,-1000000
1,1,2024-01-01 12:23:47+11:00,1661,1395,62,БелАЗ 75306 Cummins,-66.0,0.0,257,34,...,-1000000,-1000000,-1000000,-1000000,-1000000,-1000000,-1000000,-1000000,-1000000,-1000000
2,2,2024-01-01 12:23:52+11:00,1661,1395,62,БелАЗ 75306 Cummins,-66.0,0.0,263,35,...,-1000000,-1000000,-1000000,-1000000,-1000000,-1000000,-1000000,-1000000,-1000000,-1000000
3,3,2024-01-01 12:23:57+11:00,1661,1395,62,БелАЗ 75306 Cummins,-66.0,0.0,494,35,...,-1000000,-1000000,-1000000,-1000000,-1000000,-1000000,-1000000,-1000000,-1000000,-1000000
4,4,2024-01-01 12:24:02+11:00,1661,1395,62,БелАЗ 75306 Cummins,-66.0,0.0,431,34,...,-1000000,-1000000,-1000000,-1000000,-1000000,-1000000,-1000000,-1000000,-1000000,-1000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42854685,14399729,2023-06-30 17:43:05+11:00,1661,1395,62,БелАЗ 75306 Cummins,144.0,18.6,8001,204,...,-1000000,-1000000,-1000000,-1000000,-1000000,-1000000,-1000000,-1000000,-1000000,-1000000
42854686,14399730,2023-06-30 17:43:06+11:00,1661,1395,62,БелАЗ 75306 Cummins,144.0,17.2,8124,204,...,-1000000,-1000000,-1000000,-1000000,-1000000,-1000000,-1000000,-1000000,-1000000,-1000000
42854687,14399731,2023-06-30 17:43:07+11:00,1661,1395,62,БелАЗ 75306 Cummins,144.0,15.9,7850,186,...,-1000000,-1000000,-1000000,-1000000,-1000000,-1000000,-1000000,-1000000,-1000000,-1000000
42854688,14399732,2023-06-30 17:43:09+11:00,1661,1395,62,БелАЗ 75306 Cummins,145.0,15.1,7290,194,...,-1000000,-1000000,-1000000,-1000000,-1000000,-1000000,-1000000,-1000000,-1000000,-1000000


(42854690, 51)


In [8]:
print(f'\nПроверка на пропуски в данных:\n{combined_data_optimized.isnull().sum()}')
print(f'\nПроверим типы данных:\n{combined_data_optimized.dtypes}')


Проверка на пропуски в данных:
Unnamed: 0                      0
create_dt                       0
mdm_object_id                   0
mdm_object_name                 0
mdm_model_id                    0
mdm_model_name                  0
alt                             0
speed_gps                       0
inst_fuel                       0
weight                          0
temp_engine                     0
turn_engine                     0
load_engine                     0
pres_coolant_nn                 0
pres_rail_injector_nn           0
pres_temp_engine_nn             0
torque_nn                       0
pres_des_rail_injector_nn       0
distance_nn                     0
tweather_nn                     0
purgepressure_nn                0
finjection                      0
pres_turbo                      0
temp_oil_engine_nn              0
dynamic_brake                   0
dfm_in_sum                      0
dfm_in_hour                     0
dfm_out_sum                     0
dfm_out_hour    

In [9]:
print(f'\nСтатистическое описание данных:')
display(combined_data_optimized.describe())


Статистическое описание данных:


Unnamed: 0.1,Unnamed: 0,mdm_object_id,mdm_object_name,mdm_model_id,alt,speed_gps,inst_fuel,weight,temp_engine,turn_engine,...,fuel_level_can,accelerator_pedal_position,crankcase_purge_pressure,engine_oil_level,error_belaz_11,error_belaz_12,spn,fmi,sutep_error,spn_weichai
count,42850000.0,42850000.0,42850000.0,42850000.0,42850000.0,42850000.0,42850000.0,42850000.0,42850000.0,42850000.0,...,42850000.0,42850000.0,42850000.0,42850000.0,42850000.0,42850000.0,42850000.0,42850000.0,42850000.0,42850000.0
mean,6159000.0,4395.0,1395.0,62.0,50.46,13.45,1171.0,-113700.0,-992.8,6606.0,...,-1000000.0,-1000000.0,-1000000.0,-1000000.0,-1000000.0,-1000000.0,-1000000.0,-1000000.0,-1000000.0,-1000000.0
std,4140000.0,6965.0,44.58,0.0,465.3,12.13,32730.0,317500.0,32520.0,33150.0,...,3075.0,0.0,3074.0,3074.0,0.0,0.0,0.0,0.0,877.5,0.0
min,0.0,1381.0,1349.0,62.0,-1000000.0,0.0,-1000000.0,-1000000.0,-1000000.0,-1000000.0,...,-1000000.0,-1000000.0,-1000000.0,-1000000.0,-1000000.0,-1000000.0,-1000000.0,-1000000.0,-1000000.0,-1000000.0
25%,2678000.0,1383.0,1374.0,62.0,-3.0,0.0,49.0,0.0,63.0,5645.0,...,-1000000.0,-1000000.0,-1000000.0,-1000000.0,-1000000.0,-1000000.0,-1000000.0,-1000000.0,-1000000.0,-1000000.0
50%,5393000.0,1581.0,1381.0,62.0,54.0,13.3,449.0,2.0,77.0,7960.0,...,-1000000.0,-1000000.0,-1000000.0,-1000000.0,-1000000.0,-1000000.0,-1000000.0,-1000000.0,-1000000.0,-1000000.0
75%,9366000.0,1661.0,1395.0,62.0,95.0,21.5,3867.0,212.0,87.0,11180.0,...,-1000000.0,-1000000.0,-1000000.0,-1000000.0,-1000000.0,-1000000.0,-1000000.0,-1000000.0,-1000000.0,-1000000.0
max,15050000.0,21190.0,1497.0,62.0,26190.0,863.9,9979.0,797.0,123.0,17200.0,...,187.0,-1000000.0,0.0,0.0,-1000000.0,-1000000.0,-1000000.0,-1000000.0,0.0,-1000000.0


In [10]:
print('parquet telemetry saving..')
os.makedirs(os.path.join(data_root, 'raw/telemetry'), exist_ok=True)
combined_data_optimized.to_parquet(os.path.join(data_root, 'raw/telemetry/telemetry.parquet'), index=False)

parquet telemetry saving..


In [11]:
# Чтение из Parquet вместо CSV
combined_data_optimized = pd.read_parquet(os.path.join(data_root, 'raw/telemetry/telemetry.parquet'))
# print(f"Потребляемые ресурсы:")
# print(combined_data_optimized.info(memory_usage='deep'))

Потребляемые ресурсы:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42854690 entries, 0 to 42854689
Data columns (total 51 columns):
 #   Column                        Dtype   
---  ------                        -----   
 0   Unnamed: 0                    int32   
 1   create_dt                     object  
 2   mdm_object_id                 int32   
 3   mdm_object_name               int32   
 4   mdm_model_id                  int32   
 5   mdm_model_name                category
 6   alt                           float32 
 7   speed_gps                     float32 
 8   inst_fuel                     int32   
 9   weight                        int32   
 10  temp_engine                   int32   
 11  turn_engine                   int32   
 12  load_engine                   int32   
 13  pres_coolant_nn               int32   
 14  pres_rail_injector_nn         int32   
 15  pres_temp_engine_nn           int32   
 16  torque_nn                     int32   
 17  pres_des_rail_injector

In [12]:
# Удалим не нужные данные
if 'Unnamed: 0' in combined_data_optimized.columns:
    combined_data_optimized = combined_data_optimized.drop(columns=['Unnamed: 0'])
# Удалим признаки, где нет разнообразия..
nunique = combined_data_optimized.nunique()
zero_variance_cols = nunique[nunique <= 1].index.tolist()
print("Бесполезные признаки:", zero_variance_cols)
combined_data_optimized = combined_data_optimized.drop(columns=zero_variance_cols)
print(combined_data_optimized.shape)

Бесполезные признаки: ['mdm_model_id', 'distance_nn', 'tweather_nn', 'purgepressure_nn', 'finjection', 'pres_turbo', 'temp_oil_engine_nn', 'meta_model_id', 'transmission_oil_temperature', 'accelerator_pedal_position', 'error_belaz_11', 'error_belaz_12', 'spn', 'fmi', 'spn_weichai']
(42854690, 35)


In [13]:
# Заменим -1000000 на NaN
combined_data_optimized.replace(-1000000, pd.NA, inplace=True)
combined_data_optimized.fillna(
    combined_data_optimized.median(numeric_only=True),
    inplace=True
)

In [14]:
# Заполнение пропусков медианой
if 'create_dt' in combined_data_optimized.columns:
    combined_data_optimized['create_dt'] = pd.to_datetime(
        combined_data_optimized['create_dt'], errors='coerce'
    )

In [15]:
# Удалим признаки, где более 70% пустых значений
threshold = 0.3  # нужно хотя бы 30% не-пустых значений
min_non_na = len(combined_data_optimized) * threshold
na_counts = combined_data_optimized.isna().sum()
valid_cols = na_counts[na_counts < (len(combined_data_optimized) - min_non_na)].index  # фильтрацию по Series
filtered_data = combined_data_optimized[valid_cols].copy()  # выбор колонок по списку
print(filtered_data.shape)

(42854690, 23)


In [16]:
filtered_path = os.path.join(data_root, 'raw/telemetry/telemetry_filtered.parquet')
filtered_data.to_parquet(filtered_path, index=False)
print('Размер фильтрованного датасета:', filtered_data.shape)
print('Проверка на пропуски после фильтрации:\n', filtered_data.isnull().sum())

Размер фильтрованного датасета: (42854690, 23)
Проверка на пропуски после фильтрации:
 create_dt                           0
mdm_object_id                       0
mdm_object_name                     0
mdm_model_name                      0
alt                                 9
speed_gps                           0
inst_fuel                       45355
weight                        4874706
temp_engine                     45355
turn_engine                     45355
load_engine                         9
pres_coolant_nn                 45355
pres_rail_injector_nn           45355
pres_temp_engine_nn          18393078
torque_nn                           9
pres_des_rail_injector_nn       45355
dynamic_brake                12469090
dfm_in_hour                    465865
dfm_out_sum                    465865
dfm_out_hour                   465865
mdm_object_uuid                     0
meta_object_name                    0
meta_model_name                     0
dtype: int64


In [17]:
# Чтение из Parquet вместо CSV
filtered_data_optimized = pd.read_parquet(os.path.join(data_root, 'raw/telemetry/telemetry_filtered.parquet'))
# filtered_data_optimized = optimize_dtypes(filtered_data)
# print(f"Потребляемые ресурсы:")
# print(filtered_data.info(memory_usage='deep'))

In [18]:
# Заполним оставшиеся пропуски
# Для категориальных — самым частым значением (mode)
# Для числовых данных — медианой (устойчиво к выбросам)
for col in filtered_data_optimized.columns:
    s = filtered_data_optimized[col]

    if is_numeric_dtype(s):
        median_val = s.median()  # Числовые: заполняем медианой
        filtered_data_optimized.loc[:, col] = s.fillna(median_val)
    else:
        if s.dropna().empty:
            continue
        mode_val = s.mode()[0]  # Всё остальное (object, category, datetime, bool) — модой
        filtered_data_optimized.loc[:, col] = s.fillna(mode_val)

print(f'Информация о финальном датасете:\n{filtered_data_optimized.info(memory_usage="deep")}')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42854690 entries, 0 to 42854689
Data columns (total 23 columns):
 #   Column                     Dtype                                
---  ------                     -----                                
 0   create_dt                  datetime64[ns, pytz.FixedOffset(660)]
 1   mdm_object_id              int32                                
 2   mdm_object_name            int32                                
 3   mdm_model_name             category                             
 4   alt                        float64                              
 5   speed_gps                  float32                              
 6   inst_fuel                  float64                              
 7   weight                     float64                              
 8   temp_engine                float64                              
 9   turn_engine                float64                              
 10  load_engine                float64      

In [19]:
final_path = os.path.join(data_root, 'raw/telemetry/telemetry_filtered_filled.parquet')
filtered_data_optimized.to_parquet(final_path, index=False)
print(f'Финальный очищенный датасет сохранён в: {final_path}')

Финальный очищенный датасет сохранён в: ../dataset/raw/telemetry/telemetry_filtered_filled.parquet


In [20]:
# Чтение из Parquet вместо CSV
filtered_data_filled = pd.read_parquet(os.path.join(data_root, 'raw/telemetry/telemetry_filtered_filled.parquet'))
filtered_data_optimized = optimize_dtypes(filtered_data_filled)

In [21]:
print(f'\ntelemetry:\n{filtered_data_optimized.shape}')
display(filtered_data_optimized.head())
print(f'\nПроверка на пропуски в данных:\n{filtered_data_optimized.isnull().sum()}')


telemetry:
(42854690, 23)


Unnamed: 0,create_dt,mdm_object_id,mdm_object_name,mdm_model_name,alt,speed_gps,inst_fuel,weight,temp_engine,turn_engine,...,pres_temp_engine_nn,torque_nn,pres_des_rail_injector_nn,dynamic_brake,dfm_in_hour,dfm_out_sum,dfm_out_hour,mdm_object_uuid,meta_object_name,meta_model_name
0,2024-01-01 12:23:42+11:00,1661,1395,БелАЗ 75306 Cummins,-66.0,0.0,293.0,19.0,66.0,6425.0,...,69.0,126.0,94.0,0.0,0.0,0.0,0.0,83397e13-90c4-11ec-98b9-00155d5fc801,1395,БелАЗ 75306 Cummins
1,2024-01-01 12:23:47+11:00,1661,1395,БелАЗ 75306 Cummins,-66.0,0.0,257.0,34.0,66.0,6395.0,...,69.0,127.0,99.0,0.0,0.0,0.0,0.0,83397e13-90c4-11ec-98b9-00155d5fc801,1395,БелАЗ 75306 Cummins
2,2024-01-01 12:23:52+11:00,1661,1395,БелАЗ 75306 Cummins,-66.0,0.0,263.0,35.0,66.0,6409.0,...,69.0,127.0,98.0,0.0,0.0,0.0,0.0,83397e13-90c4-11ec-98b9-00155d5fc801,1395,БелАЗ 75306 Cummins
3,2024-01-01 12:23:57+11:00,1661,1395,БелАЗ 75306 Cummins,-66.0,0.0,494.0,35.0,66.0,6392.0,...,69.0,127.0,101.0,0.0,0.0,0.0,0.0,83397e13-90c4-11ec-98b9-00155d5fc801,1395,БелАЗ 75306 Cummins
4,2024-01-01 12:24:02+11:00,1661,1395,БелАЗ 75306 Cummins,-66.0,0.0,431.0,34.0,66.0,6387.0,...,69.0,127.0,88.0,0.0,0.0,0.0,0.0,83397e13-90c4-11ec-98b9-00155d5fc801,1395,БелАЗ 75306 Cummins



Проверка на пропуски в данных:
create_dt                    0
mdm_object_id                0
mdm_object_name              0
mdm_model_name               0
alt                          0
speed_gps                    0
inst_fuel                    0
weight                       0
temp_engine                  0
turn_engine                  0
load_engine                  0
pres_coolant_nn              0
pres_rail_injector_nn        0
pres_temp_engine_nn          0
torque_nn                    0
pres_des_rail_injector_nn    0
dynamic_brake                0
dfm_in_hour                  0
dfm_out_sum                  0
dfm_out_hour                 0
mdm_object_uuid              0
meta_object_name             0
meta_model_name              0
dtype: int64


In [22]:
print(f'\nСтатистическое описание данных:')
display(filtered_data_optimized.describe())


Статистическое описание данных:


Unnamed: 0,mdm_object_id,mdm_object_name,alt,speed_gps,inst_fuel,weight,temp_engine,turn_engine,load_engine,pres_coolant_nn,pres_rail_injector_nn,pres_temp_engine_nn,torque_nn,pres_des_rail_injector_nn,dynamic_brake,dfm_in_hour,dfm_out_sum,dfm_out_hour,meta_object_name
count,42850000.0,42850000.0,42850000.0,42850000.0,42850000.0,42850000.0,42850000.0,42850000.0,42850000.0,42850000.0,42850000.0,42850000.0,42850000.0,42850000.0,42850000.0,42850000.0,42850000.0,42850000.0,42850000.0
mean,4395.0,1395.0,50.67,13.45,2230.0,84.18,65.62,7673.0,29.98,39.07,54.76,73.74,129.1,112.9,2406.0,-151900000.0,39750000.0,-151800000.0,1395.0
std,6965.0,44.58,80.64,12.13,3046.0,103.9,34.8,5020.0,39.56,32.6,81.75,31.96,71.65,81.18,6971.0,577900000.0,139500000.0,577900000.0,44.58
min,1381.0,1349.0,-1097.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-2147000000.0,0.0,-2147000000.0,1349.0
25%,1383.0,1374.0,-3.0,0.0,50.0,0.0,63.0,5669.0,0.0,16.0,2.0,84.0,125.0,73.0,0.0,0.0,0.0,0.0,1374.0
50%,1581.0,1381.0,54.0,13.3,450.0,5.0,77.0,7967.0,4.0,30.0,10.0,86.0,128.0,112.0,0.0,0.0,0.0,0.0,1381.0
75%,1661.0,1395.0,95.0,21.5,3867.0,212.0,87.0,11180.0,66.0,60.0,63.0,87.0,180.0,169.0,0.0,0.0,0.0,0.0,1395.0
max,21190.0,1497.0,26190.0,863.9,9979.0,797.0,123.0,17200.0,125.0,205.0,492.0,125.0,246.0,662.0,33690.0,2147000000.0,676100000.0,2147000000.0,1497.0


In [23]:
# Общая информация о данных
print(f"\nРаспределение по уникальным объектам (самосвалам):\n{filtered_data_optimized['mdm_object_id'].value_counts()}")


Распределение по уникальным объектам (самосвалам):
mdm_object_id
1581     10358781
1661      8847628
1381      7021025
21186     6289937
1383      5329156
1384      5008163
Name: count, dtype: int64


In [24]:
print('Оставим только температуру двигателя > 0..')
filtered_data_optimized = filtered_data_optimized[filtered_data_optimized['temp_engine'] > 0]
print(filtered_data_optimized.shape)

Оставим только температуру двигателя > 0..
(34314698, 23)


In [25]:
print('Оставим только разумные значения скорости (например, < 200 км/ч)..')
filtered_data_optimized = filtered_data_optimized[filtered_data_optimized['speed_gps'] < 200]
filtered_data_optimized['is_stopped'] = (filtered_data_optimized['speed_gps'] == 0).astype(int)
print(filtered_data_optimized.shape)

Оставим только разумные значения скорости (например, < 200 км/ч)..
(34314060, 24)


In [26]:
final_path = os.path.join(data_root, 'raw/telemetry/telemetry_filtered_optimized.parquet')
filtered_data_optimized.to_parquet(final_path, index=False)
print(f'Финальный датасет (без аномалий) сохранён в: {final_path}')

Финальный датасет (без аномалий) сохранён в: ../dataset/raw/telemetry/telemetry_filtered_optimized.parquet


In [27]:
filtered_data_optimized = pd.read_parquet(os.path.join(data_root, 'raw/telemetry/telemetry_filtered_optimized.parquet'))
print(f'Информация о финальном датасете:\n{filtered_data_optimized.info(memory_usage="deep")}')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34314060 entries, 0 to 34314059
Data columns (total 24 columns):
 #   Column                     Dtype                                
---  ------                     -----                                
 0   create_dt                  datetime64[ns, pytz.FixedOffset(660)]
 1   mdm_object_id              int32                                
 2   mdm_object_name            int32                                
 3   mdm_model_name             category                             
 4   alt                        float32                              
 5   speed_gps                  float32                              
 6   inst_fuel                  float32                              
 7   weight                     float32                              
 8   temp_engine                float32                              
 9   turn_engine                float32                              
 10  load_engine                float32      