In [59]:
import pandas as pd
import numpy as np
import warnings
import sys

from pathlib import Path
from scipy import stats

# Silence every warning category from this point on.
# NOTE(review): a blanket 'ignore' also hides pandas/numpy runtime and
# deprecation warnings raised by the cells below - consider narrowing it to
# the specific category/module that is actually noisy.
warnings.filterwarnings('ignore')

In [60]:
# Root of the dataset and the folder holding the per-vehicle dynamic traces.
DATA_DIR = Path('Data')
dynamic_data_path = DATA_DIR / 'VED_DynamicData'

# Only sub-directories whose name starts with 'VehId_' are considered,
# visited in sorted order.
veh_folders = sorted(
    entry
    for entry in dynamic_data_path.iterdir()
    if entry.is_dir() and entry.name.startswith('VehId_')
)

# Flatten to a single list of CSV paths (as strings): folders in sorted
# order, and files sorted within each folder.
csv_files = [
    str(csv_path)
    for folder in veh_folders
    for csv_path in sorted(folder.glob('*.csv'))
]

print(f'Cantidad de csv: {len(csv_files)}')


Cantidad de csv: 32552


In [61]:
# Dynamic-data file: peek at the first 1000 rows of the first CSV to inspect
# the column dtypes.
f = csv_files[0]
df0 = pd.read_csv(f, nrows=1000)
print('Dtypes:', df0.dtypes, sep='\n')

Dtypes:
DayNum                            float64
VehId                               int64
Trip                                int64
Timestamp(ms)                       int64
Latitude[deg]                     float64
Longitude[deg]                    float64
Vehicle Speed[km/h]               float64
MAF[g/sec]                        float64
Engine RPM[RPM]                   float64
Absolute Load[%]                  float64
OAT[DegC]                         float64
Fuel Rate[L/hr]                   float64
Air Conditioning Power[kW]        float64
Air Conditioning Power[Watts]     float64
Heater Power[Watts]               float64
HV Battery Current[A]             float64
HV Battery SOC[%]                 float64
HV Battery Voltage[V]             float64
Short Term Fuel Trim Bank 1[%]    float64
Short Term Fuel Trim Bank 2[%]    float64
Long Term Fuel Trim Bank 1[%]     float64
Long Term Fuel Trim Bank 2[%]     float64
dtype: object


In [62]:
# Columns that identify a row (day, vehicle, trip, time, position).
LABELS = {
    'DayNum',
    'VehId',
    'Trip',
    'Timestamp(ms)',
    'Latitude[deg]',
    'Longitude[deg]',
}

# Raw signals from which the targets are derived.
Y = {
    'Fuel Rate[L/hr]',
    'HV Battery Current[A]',
}

# Candidate input signals, grouped by theme for readability.
_X_FEATURE_GROUPS = {
    'engine': (
        'Engine RPM[RPM]',
        'Absolute Load[%]',
    ),
    'vehicle_operation': (
        'Vehicle Speed[km/h]',
        'MAF[g/sec]',  # Mass Air Flow - indicator of engine load
    ),
    'environment': (
        'OAT[DegC]',  # Outside Air Temperature
    ),
    'auxiliary_power': (
        'Air Conditioning Power[kW]',
        'Air Conditioning Power[Watts]',
        'Heater Power[Watts]',
    ),
    'battery': (
        'HV Battery SOC[%]',
        'HV Battery Voltage[V]',
    ),
    'fuel_trim': (  # affects fuel injection
        'Short Term Fuel Trim Bank 1[%]',
        'Short Term Fuel Trim Bank 2[%]',
        'Long Term Fuel Trim Bank 1[%]',
        'Long Term Fuel Trim Bank 2[%]',
    ),
}

# Flat set of every candidate input column.
X = {column for group in _X_FEATURE_GROUPS.values() for column in group}

In [64]:
dir_path = DATA_DIR / 'VED_DynamicData'

# Prefer the per-vehicle layout (VehId_*/Trip_*.csv); if nothing matches,
# fall back to CSVs sitting directly in the folder. Paths are kept as strings.
dynamic_files = sorted(dir_path.glob('VehId_*/Trip_*.csv')) or sorted(dir_path.glob('*.csv'))
dynamic_files = list(map(str, dynamic_files))

# Signals consumed when computing the targets; they are removed from the
# input feature set below.
Y_ONLY_COLUMNS = {
    'Fuel Rate[L/hr]',
    'HV Battery Current[A]',
    'HV Battery Voltage[V]',
}

# Row identifiers; they are never used as input features either.
IDENTIFIER_COLUMNS = {
    'DayNum',
    'VehId',
    'Trip',
    'Timestamp(ms)',
    'Latitude[deg]',
    'Longitude[deg]',
}

X_FEATURES = X.difference(Y_ONLY_COLUMNS)
X_ONLY_COLUMNS = X_FEATURES.difference(IDENTIFIER_COLUMNS)


def calculate_metrics(series):
    """Summarise a signal with basic descriptive statistics.

    Missing values are replaced by 0 before any statistic is computed, so
    they contribute to every figure below.

    Parameters
    ----------
    series : pandas.Series
        The signal to summarise.

    Returns
    -------
    dict
        Keys, in order: 'mean', 'median', 'std', 'min', 'max', 'q25', 'q75',
        'range' (max minus min).
    """
    filled = series.fillna(0)
    lowest = filled.min()
    highest = filled.max()
    q25, q75 = filled.quantile([0.25, 0.75])

    return {
        'mean': filled.mean(),
        'median': filled.median(),
        'std': filled.std(),
        'min': lowest,
        'max': highest,
        'q25': q25,
        'q75': q75,
        'range': highest - lowest,
    }

def apply_fourier_analysis(series, n_components=10):
    """Extract frequency-domain features from a signal.

    Missing values are replaced by 0, the discrete Fourier transform is taken,
    and the strongest components among bins ``1 .. n//2 - 1`` (the DC bin is
    excluded) are reported, strongest first.

    The returned dict ALWAYS contains the same keys for a given
    ``n_components``: when the signal is too short to provide that many
    components, the missing ranks are reported as 0. This keeps the feature
    schema identical across signals of different lengths.

    Parameters
    ----------
    series : pandas.Series
        The signal to analyse.
    n_components : int, default 10
        Number of dominant components to report.

    Returns
    -------
    dict
        ``fft_magnitude_<i>`` / ``fft_frequency_<i>`` for ``i`` in
        ``range(n_components)`` (frequency as returned by ``np.fft.fftfreq``
        with its default spacing, i.e. cycles per sample), plus
        ``fft_energy`` (sum of squared magnitudes over the full spectrum) and
        ``fft_spectral_centroid`` (magnitude-weighted mean of the retained
        frequencies; 0 when all retained magnitudes are 0).
    """
    values = series.fillna(0).values

    # Zero-filled template: every exit path returns the full key set.
    result = {}
    for rank in range(n_components):
        result[f'fft_magnitude_{rank}'] = 0.0
        result[f'fft_frequency_{rank}'] = 0.0
    result['fft_energy'] = 0.0
    result['fft_spectral_centroid'] = 0.0

    n_samples = len(values)
    if n_samples < 4:
        return result

    magnitude = np.abs(np.fft.fft(values))
    frequencies = np.fft.fftfreq(n_samples)

    # Keep the bins strictly between DC and n_samples // 2.
    half = n_samples // 2
    mag_positive = magnitude[1:half]
    freq_positive = frequencies[1:half]
    if mag_positive.size == 0:
        return result

    # Strongest bins first; ranks beyond the available bins keep their 0.
    n_top = max(0, min(n_components, mag_positive.size))
    strongest = np.argsort(mag_positive)[::-1][:n_top]
    for rank, bin_index in enumerate(strongest):
        result[f'fft_magnitude_{rank}'] = mag_positive[bin_index]
        result[f'fft_frequency_{rank}'] = freq_positive[bin_index]

    result['fft_energy'] = np.sum(magnitude ** 2)

    total_magnitude = np.sum(mag_positive)
    if total_magnitude > 0:
        result['fft_spectral_centroid'] = np.sum(freq_positive * mag_positive) / total_magnitude

    return result

# Build one feature/target row per trip file.
all_samples = []

# Loop-invariant: the feature columns are visited in the same sorted order
# for every file.
feature_columns = sorted(X_ONLY_COLUMNS)

for file_path in dynamic_files:
    file_name = Path(file_path).stem
    df = pd.read_csv(file_path)

    # At least two rows are needed to form one time interval.
    if len(df) < 2:
        continue

    # Interval between consecutive rows, in hours.
    time_intervals_hours = np.diff(df['Timestamp(ms)'].values) / (1000 * 3600)

    # Distance: speed at the start of each interval times its duration; only
    # strictly positive contributions are kept (drops NaN speeds and
    # non-increasing timestamps).
    distances_km = df['Vehicle Speed[km/h]'].iloc[:-1].values * time_intervals_hours
    total_distance_km = np.sum(distances_km[distances_km > 0])

    # Skip trips that barely moved; the per-distance targets would blow up.
    if total_distance_km < 0.01:
        continue

    # Integration weights shared by fuel and battery energy. They use the same
    # convention as the distance above: each sample is weighted by the interval
    # that FOLLOWS it, and non-positive / NaN intervals contribute nothing.
    # (Previously the sum over all n samples was multiplied by the mean
    # interval, which counts one sample too many and is wrong whenever the
    # sampling is irregular.)
    step_hours = np.where(time_intervals_hours > 0, time_intervals_hours, 0.0)

    # Y1: Combustion consumption in L/100km
    fuel_rate_l_per_hr = df['Fuel Rate[L/hr]'].fillna(0).values[:-1]
    total_fuel_liters = np.sum(fuel_rate_l_per_hr * step_hours)
    y_combustion = total_fuel_liters / total_distance_km * 100

    # Y2: Electric consumption in kWh/km
    # Average over the readings that exist: zero-filling missing voltages
    # would drag the average down. An all-missing column yields NaN, which
    # fails the `> 0` test and falls through to 0.0 like before.
    avg_voltage = df['HV Battery Voltage[V]'].mean()
    if avg_voltage > 0:
        battery_current_a = df['HV Battery Current[A]'].fillna(0).values[:-1]
        battery_power_kw = battery_current_a * avg_voltage / 1000
        total_energy_kwh = np.sum(battery_power_kw * step_hours)
        y_electric = total_energy_kwh / total_distance_km
    else:
        y_electric = 0.0

    # Identifiers come from the first row of the file; targets are clamped at 0.
    sample_row = {
        'filename': file_name,
        'VehId': df['VehId'].iloc[0],
        'DayNum': df['DayNum'].iloc[0],
        'Trip': df['Trip'].iloc[0],
        'Y_consumption_combustion_L_per_100km': max(0, y_combustion),
        'Y_consumption_electric_kWh_per_km': max(0, y_electric),
    }

    present_columns = [col for col in feature_columns if col in df.columns]

    # 1. METRICS for each X feature
    for col in present_columns:
        for metric_name, metric_value in calculate_metrics(df[col]).items():
            sample_row[f'{col}_{metric_name}'] = metric_value

    # 2. FOURIER ANALYSIS for each X feature
    # The metric names already start with 'fft_', so the resulting columns read
    # '<col>_fft_fft_...'; the naming is kept as-is so existing outputs and the
    # '_fft_' column filter used when saving keep working.
    for col in present_columns:
        for metric_name, metric_value in apply_fourier_analysis(df[col], n_components=5).items():
            sample_row[f'{col}_fft_{metric_name}'] = metric_value

    all_samples.append(sample_row)

# One row per retained trip.
df_combined = pd.DataFrame(all_samples)
display(df_combined.head())


Unnamed: 0,filename,VehId,DayNum,Trip,Y_consumption_combustion_L_per_100km,Y_consumption_electric_kWh_per_km,Absolute Load[%]_mean,Absolute Load[%]_median,Absolute Load[%]_std,Absolute Load[%]_min,...,Vehicle Speed[km/h]_fft_fft_magnitude_1,Vehicle Speed[km/h]_fft_fft_frequency_1,Vehicle Speed[km/h]_fft_fft_magnitude_2,Vehicle Speed[km/h]_fft_fft_frequency_2,Vehicle Speed[km/h]_fft_fft_magnitude_3,Vehicle Speed[km/h]_fft_fft_frequency_3,Vehicle Speed[km/h]_fft_fft_magnitude_4,Vehicle Speed[km/h]_fft_fft_frequency_4,Vehicle Speed[km/h]_fft_fft_energy,Vehicle Speed[km/h]_fft_fft_spectral_centroid
0,Trip_1558_VED_171101_week,10,1.719774,1558,0.0,0.0,0.0,0.0,0.0,0.0,...,583.824378,0.007752,389.20646,0.023256,277.893493,0.031008,212.250744,0.03876,25512220.0,0.080801
1,Trip_1561_VED_171101_week,10,2.893902,1561,0.0,0.0,0.0,0.0,0.0,0.0,...,2694.767652,0.009363,1823.436242,0.022472,1570.977231,0.033708,1544.284283,0.018727,221674400.0,0.073211
2,Trip_1567_VED_171101_week,10,4.540203,1567,0.0,0.0,0.0,0.0,0.0,0.0,...,1436.382084,0.016349,1296.185519,0.019074,1256.855049,0.027248,1232.222074,0.002725,76195950.0,0.071282
3,Trip_1568_VED_171101_week,10,4.575309,1568,0.0,0.0,0.0,0.0,0.0,0.0,...,1563.018969,0.007042,664.597742,0.010563,553.45147,0.038732,541.041985,0.014085,80529230.0,0.084564
4,Trip_1572_VED_171101_week,10,4.94122,1572,0.0,0.0,0.0,0.0,0.0,0.0,...,1895.963771,0.002232,1478.4527,0.011161,1403.259099,0.004464,1318.212136,0.013393,167411500.0,0.070505


In [65]:
# Split the combined table into three CSVs: summary-statistic features,
# Fourier features, and targets. Each file carries the identifier columns.

# Output folders (created if missing).
x_dir, y_dir = DATA_DIR / 'X', DATA_DIR / 'Y'
for out_dir in (x_dir, y_dir):
    out_dir.mkdir(parents=True, exist_ok=True)

# Column groups: identifiers, targets (prefixed 'Y_'), and everything else.
ID_columns = ['filename', 'VehId', 'DayNum', 'Trip']
Y_columns = [name for name in df_combined.columns if name.startswith('Y_')]
_non_feature_columns = set(ID_columns) | set(Y_columns)
X_columns = [name for name in df_combined.columns if name not in _non_feature_columns]

# Within the features, Fourier columns are the ones containing '_fft_'.
X_metrics_columns = [name for name in X_columns if '_fft_' not in name]
X_fourier_columns = [name for name in X_columns if '_fft_' in name]

df_X_metrics = df_combined[ID_columns + X_metrics_columns].copy()
df_X_fourier = df_combined[ID_columns + X_fourier_columns].copy()
df_Y = df_combined[ID_columns + Y_columns].copy()

x_metrics_file = x_dir / 'X_metrics.csv'
x_fourier_file = x_dir / 'X_fourier.csv'
y_output_file = y_dir / 'Y.csv'

# Write each table without the positional index.
for frame, destination in (
    (df_X_metrics, x_metrics_file),
    (df_X_fourier, x_fourier_file),
    (df_Y, y_output_file),
):
    frame.to_csv(destination, index=False)