In [1]:
import pandas as pd
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

# File paths
scaled_file = 'C:/Users/serha/PycharmProjects/Temp/scaledData/scaledData1.csv'
file_mcu = '/BatterySolutions/Validation_Dataset/Neocity_Parse_Dosyaları_18_Şubat_19_Mart_2025/mergedparse_Arac200727_MCU_18Subat_19Mart.csv'
file_dcdc = '/BatterySolutions/Validation_Dataset/Neocity_Parse_Dosyaları_18_Şubat_19_Mart_2025/mergedparse_Arac200728_DCDC_18Subat_19Mart.csv'

# Define feature mapping
feature_map = {
    '1_F230_SOC': 'soc',
    '1_F231_Avarage_Battery_Voltage': 'pack_voltage (V)',
    '1_F231_Current_Charge_Discharge': 'charge_current (A)',
    '1_F232_High_temp': 'max_temperature (℃)',
    '1_F232_Low_temp': 'min_temperature (℃)',
    '1_F230_The_Remaining_Capacity': 'available_capacity (Ah)',
}

# Load and prepare the scaled training data
train_cols = list(feature_map.values())
chunk_iterator = pd.read_csv(scaled_file, usecols=train_cols, chunksize=10000)
df_train_chunk = next(chunk_iterator)

# Split into features and target
X_train = df_train_chunk.drop(columns='available_capacity (Ah)')
y_train = df_train_chunk['available_capacity (Ah)']

# Train the XGBoost model
model = XGBRegressor(n_estimators=100, max_depth=6, learning_rate=0.1, random_state=42)
model.fit(X_train, y_train)

# Function to clean and prepare test data
def prepare_test_data(df, column_map, feature_names):
    available_cols = [col for col in column_map if col in df.columns]
    renamed = df[available_cols].rename(columns={k: column_map[k] for k in available_cols})
    for col in renamed.columns:
        renamed[col] = renamed[col].astype(str).str.replace(',', '.')
        renamed[col] = pd.to_numeric(renamed[col], errors='coerce')
    renamed = renamed.dropna()
    renamed = renamed[[column_map[col] for col in available_cols]]  # Ensure correct column order
    renamed = renamed.reindex(columns=feature_names + ['available_capacity (Ah)'], fill_value=0)  # Include target column
    return renamed

# Extract feature names from training data
feature_names = X_train.columns.tolist()

# Load and clean MCU data
df_mcu = pd.read_csv(file_mcu, delimiter=';', on_bad_lines='skip')
df_mcu_clean = prepare_test_data(df_mcu, feature_map, feature_names)
df_mcu_clean['predicted_available_capacity (Ah)'] = model.predict(df_mcu_clean[feature_names])

# Load and clean DCDC data
df_dcdc = pd.read_csv(file_dcdc, delimiter=';', on_bad_lines='skip')
df_dcdc_clean = prepare_test_data(df_dcdc, feature_map, feature_names)
df_dcdc_clean['predicted_available_capacity (Ah)'] = model.predict(df_dcdc_clean[feature_names])

# Check if 'available_capacity (Ah)' column exists before calculating error rates
if 'available_capacity (Ah)' in df_mcu_clean.columns:
    y_true_mcu = df_mcu_clean['available_capacity (Ah)']
    y_pred_mcu = df_mcu_clean['predicted_available_capacity (Ah)']
    mae_mcu = mean_absolute_error(y_true_mcu, y_pred_mcu)
    mse_mcu = mean_squared_error(y_true_mcu, y_pred_mcu)
    print(f"MCU Data - MAE: {mae_mcu}, MSE: {mse_mcu}")
else:
    print("MCU Data - 'available_capacity (Ah)' column not found")

if 'available_capacity (Ah)' in df_dcdc_clean.columns:
    y_true_dcdc = df_dcdc_clean['available_capacity (Ah)']
    y_pred_dcdc = df_dcdc_clean['predicted_available_capacity (Ah)']
    mae_dcdc = mean_absolute_error(y_true_dcdc, y_pred_dcdc)
    mse_dcdc = mean_squared_error(y_true_dcdc, y_pred_dcdc)
    print(f"DCDC Data - MAE: {mae_dcdc}, MSE: {mse_dcdc}")
else:
    print("DCDC Data - 'available_capacity (Ah)' column not found")

# Display result samples
print(df_mcu_clean.head())
print(df_dcdc_clean.head())

  df_mcu = pd.read_csv(file_mcu, delimiter=';', on_bad_lines='skip')
  df_dcdc = pd.read_csv(file_dcdc, delimiter=';', on_bad_lines='skip')


MCU Data - MAE: 1199.968505859375, MSE: 1439924.125
DCDC Data - MAE: 1005.2636587872819, MSE: 1014259.3584482796
   soc  pack_voltage (V)  charge_current (A)  max_temperature (℃)  \
0    0                 0                   0                    0   
1    0                 0                   0                    0   
2    0                 0                   0                    0   
3    0                 0                   0                    0   
4    0                 0                   0                    0   

   min_temperature (℃)  available_capacity (Ah)  \
0                    0                        0   
1                    0                        0   
2                    0                        0   
3                    0                        0   
4                    0                        0   

   predicted_available_capacity (Ah)  
0                        1199.968384  
1                        1199.968384  
2                        1199.968384  
3        

In [2]:
print(f"MCU Data - MAE: {mae_mcu}, MSE: {mse_mcu}")

MCU Data - MAE: 1199.968505859375, MSE: 1439924.125
