# use Daniel's dataset

In [1]:
# Load the shared overview table from the CSV in the working directory.
# The trailing expression displays the frame's (rows, columns) shape.
import pandas as pd
df_d_overv = pd.read_csv('shared_overview.csv')
df_d_overv.shape

(70, 15)

In [2]:
# Inspect the raw column names before any columns are dropped.
df_d_overv.columns

Index(['Trip', 'Date', 'Route/Area', 'Weather',
       'Battery Temperature (Start) [°C]', 'Battery Temperature (End)',
       'Battery State of Charge (Start)', 'Battery State of Charge (End)',
       'Ambient Temperature (Start) [°C]', 'Target Cabin Temperature',
       'Distance [km]', 'Duration [min]', 'Fan',
       'Mean Battery Temperature [°C]', 'SOC'],
      dtype='object')

In [3]:
# Columns removed from the frame before modelling: the trip identifiers
# ('Date', 'Trip'), the battery temperature / state-of-charge readings taken
# at trip start and end, and 'Fan'.
cols = [
    'Date', 'Trip',
    'Battery Temperature (Start) [°C]', 'Battery Temperature (End)',
    'Battery State of Charge (Start)', 'Battery State of Charge (End)',
    'Fan',
]
# Rebind the result instead of using inplace=True so the cell reads as a pure
# transformation. errors='ignore' skips names that are already absent, which
# keeps the cell safe to re-run.
df_d_overv = df_d_overv.drop(columns=cols, errors='ignore')

In [4]:
# Preview the first three rows of the frame after the column drop.
df_d_overv.head(3)

Unnamed: 0,Route/Area,Weather,Ambient Temperature (Start) [°C],Target Cabin Temperature,Distance [km],Duration [min],Mean Battery Temperature [°C],SOC
0,Munich East,sunny,25.5,23.0,7.42769,16.82,21.9401,0.048
1,Munich East,sunny,32.0,23.0,23.509709,23.55,24.973,0.13
2,Munich East,sunny,21.5,27.0,12.820846,11.18,25.1006,0.084


In [5]:
# Confirm which columns remain after the drop.
df_d_overv.columns

Index(['Route/Area', 'Weather', 'Ambient Temperature (Start) [°C]',
       'Target Cabin Temperature', 'Distance [km]', 'Duration [min]',
       'Mean Battery Temperature [°C]', 'SOC'],
      dtype='object')

In [6]:
# One-hot encode every object-dtype column (at this point in the notebook:
# 'Route/Area' and 'Weather'). The columns are discovered by dtype rather than
# listed by hand, and the indicator columns are emitted as integers (0/1).
object_cols = df_d_overv.select_dtypes(include='object').columns
df_d_overv = pd.get_dummies(
    data=df_d_overv,
    columns=object_cols,
    dtype=int,
)

In [7]:
# Preview the frame after one-hot encoding.
df_d_overv.head()

Unnamed: 0,Ambient Temperature (Start) [°C],Target Cabin Temperature,Distance [km],Duration [min],Mean Battery Temperature [°C],SOC,Route/Area_FTMRoute,Route/Area_FTMRoute (2x),Route/Area_FTMRoute reverse,Route/Area_Highway,...,Route/Area_Munich Northeast,Route/Area_Munich South,Weather_cloudy,Weather_dark,"Weather_dark, little rainy",Weather_rainy,Weather_slightly cloudy,Weather_sunny,Weather_sunrise,Weather_sunset
0,25.5,23.0,7.42769,16.82,21.9401,0.048,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,32.0,23.0,23.509709,23.55,24.973,0.13,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,21.5,27.0,12.820846,11.18,25.1006,0.084,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,24.0,22.0,10.727491,6.87,26.818,0.084,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,24.5,24.0,12.393223,22.776667,27.0,0.065,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [8]:
# List the full set of columns, including the generated indicator columns.
df_d_overv.columns

Index(['Ambient Temperature (Start) [°C]', 'Target Cabin Temperature',
       'Distance [km]', 'Duration [min]', 'Mean Battery Temperature [°C]',
       'SOC', 'Route/Area_FTMRoute', 'Route/Area_FTMRoute (2x)',
       'Route/Area_FTMRoute reverse', 'Route/Area_Highway',
       'Route/Area_Munich East', 'Route/Area_Munich North',
       'Route/Area_Munich North + Fast Charging',
       'Route/Area_Munich Northeast', 'Route/Area_Munich South',
       'Weather_cloudy', 'Weather_dark', 'Weather_dark, little rainy',
       'Weather_rainy', 'Weather_slightly cloudy', 'Weather_sunny',
       'Weather_sunrise', 'Weather_sunset'],
      dtype='object')

In [9]:
from sklearn.model_selection import train_test_split

# Target: the 'SOC' column divided by the trip distance, multiplied by 1000
# (the original author's note calls this "soc difference/distance").
# NOTE: a row with a distance of 0 would yield inf/NaN here.
y_overv = df_d_overv['SOC'].div(df_d_overv['Distance [km]']).mul(1000)

# Features: everything except the two columns the target is built from and
# the trip duration.
X_overv = df_d_overv.drop(columns=['SOC', 'Distance [km]', 'Duration [min]'])

# 80/20 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_overv, y_overv, test_size=0.2, random_state=30
)
y_overv

0     6.462305
1     5.529630
2     6.551830
3     7.830349
4     5.244802
        ...   
65    2.843643
66    9.014676
67    7.128320
68    9.047908
69    8.570431
Length: 70, dtype: float64

In [10]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

# Standardise only the continuous columns; the one-hot indicator columns are
# passed through unchanged.
numerical_features = [
    'Ambient Temperature (Start) [°C]',
    'Target Cabin Temperature',
    'Mean Battery Temperature [°C]',
]
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features)
    ],
    remainder='passthrough',  # leave every other column as-is
    # Keep the original column names (no 'num__' / 'remainder__' prefixes).
    verbose_feature_names_out=False,
)

# Fit the scaler on the training split only, then apply it to both splits.
train_array = preprocessor.fit_transform(X_train)

# Ask the fitted transformer for its output column order instead of
# rebuilding it by hand: the transformer emits the scaled columns first and
# the passthrough columns after them, and this stays correct if the
# transformer list changes.
feature_names = list(preprocessor.get_feature_names_out())

# The DataFrames are rebuilt from plain arrays, so they carry a fresh 0..n-1
# row index; rows correspond to y_train / y_test by position, not by label.
X_train_scaled = pd.DataFrame(train_array, columns=feature_names)
X_test_scaled = pd.DataFrame(preprocessor.transform(X_test), columns=feature_names)
X_train_scaled


Unnamed: 0,Ambient Temperature (Start) [°C],Target Cabin Temperature,Mean Battery Temperature [°C],Route/Area_FTMRoute,Route/Area_FTMRoute (2x),Route/Area_FTMRoute reverse,Route/Area_Highway,Route/Area_Munich East,Route/Area_Munich North,Route/Area_Munich North + Fast Charging,Route/Area_Munich Northeast,Route/Area_Munich South,Weather_cloudy,Weather_dark,"Weather_dark, little rainy",Weather_rainy,Weather_slightly cloudy,Weather_sunny,Weather_sunrise,Weather_sunset
0,-1.130005,-0.581894,-1.79537,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,-0.880477,-0.581894,-1.008872,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.880477,-0.581894,-0.724448,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,-0.73076,-0.581894,-0.579413,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.666596,0.845234,0.595723,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5,-1.229816,-0.581894,-1.468109,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
6,0.516879,-1.057604,0.873889,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
7,1.215557,0.369524,1.569302,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
8,1.315368,0.845234,0.806267,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
9,-0.680854,-0.581894,-0.265323,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


## Tree-based baseline models

In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.tree import plot_tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.inspection import permutation_importance

### decision tree

In [17]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Baseline: a single regression tree fitted on the scaled training features.
# NOTE: no depth or leaf-size limit is set, so the tree is free to fit the
# training rows exactly; treat the training MSE as a sanity check only.
dt_regressor = DecisionTreeRegressor(random_state=42)
dt_regressor.fit(X_train_scaled, y_train)

# === Training-set error ===
y_train_pred = dt_regressor.predict(X_train_scaled)
mse_train = mean_squared_error(y_train, y_train_pred)
rmse_train = np.sqrt(mse_train)
print(f"Decision Tree Training MSE: {mse_train:.4f}")

# === Test-set error ===
y_pred = dt_regressor.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f"Decision Tree Testing MSE: {mse:.4f}")

r2 = r2_score(y_test, y_pred)
print(f"Decision Tree R² Score: {r2:.4f}")


print("Decision Tree feature importance:")
# feature_importances_ follows the column order of the frame the tree was
# fitted on, so label it with X_train_scaled's columns (as the random-forest
# cell does) rather than X_train's, whose order is not guaranteed to match
# the transformer output.
importances = dt_regressor.feature_importances_
feature_names = X_train_scaled.columns
feature_importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    .sort_values('Importance', ascending=False)  # most important first
    .reset_index(drop=True)
)
print(feature_importance_df.to_string(index=False))

Decision Tree Training MSE: 0.0000
Decision Tree Testing MSE: 2.1595
Decision Tree R² Score: 0.3353
Decision Tree feature importance:
                                Feature  Importance
          Mean Battery Temperature [°C]    0.687399
       Ambient Temperature (Start) [°C]    0.100692
               Target Cabin Temperature    0.048932
            Route/Area_Munich Northeast    0.047316
                Route/Area_Munich North    0.045481
                         Weather_cloudy    0.018052
                          Weather_rainy    0.014773
                          Weather_sunny    0.014047
Route/Area_Munich North + Fast Charging    0.010933
                Weather_slightly cloudy    0.007666
                    Route/Area_FTMRoute    0.004084
                     Route/Area_Highway    0.000542
                           Weather_dark    0.000052
                 Route/Area_Munich East    0.000031
               Route/Area_FTMRoute (2x)    0.000000
            Route/Area_FTMRoute re

### random forest

In [13]:
from sklearn.ensemble import RandomForestRegressor

# Fit a 100-tree random forest on the scaled training features (fixed seed).
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=30)
rf_regressor.fit(X_train_scaled, y_train)

# --- Error on the data the forest was trained on ---
y_train_pred = rf_regressor.predict(X_train_scaled)
mse_train = mean_squared_error(y_train, y_train_pred)
rmse_train = np.sqrt(mse_train)
print(f"Random Forest Training MSE: {mse_train:.4f}")

# --- Error and R² on the held-out test split ---
y_pred = rf_regressor.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
print(f"Random Forest Testing MSE: {mse:.4f}")
print(f"Random Forest R² Score: {r2:.4f}")


print(f"Random Forest feature importance:")
# Pair each fitted column with its importance, most important first.
feature_importance_df = (
    pd.DataFrame({
        'Feature': X_train_scaled.columns,
        'Importance': rf_regressor.feature_importances_,
    })
    .sort_values('Importance', ascending=False)
    .reset_index(drop=True)
)
print(feature_importance_df.to_string(index=False))

Random Forest Training MSE: 0.2358
Random Forest Testing MSE: 1.9793
Random Forest R² Score: 0.3908
Random Forest feature importance:
                                Feature  Importance
          Mean Battery Temperature [°C]    0.534681
       Ambient Temperature (Start) [°C]    0.243554
               Target Cabin Temperature    0.084609
            Route/Area_Munich Northeast    0.051834
                         Weather_cloudy    0.023180
                Route/Area_Munich North    0.014181
                Weather_slightly cloudy    0.011336
                          Weather_rainy    0.008650
Route/Area_Munich North + Fast Charging    0.008515
                          Weather_sunny    0.006761
                    Route/Area_FTMRoute    0.003717
                           Weather_dark    0.003259
                 Route/Area_Munich East    0.002678
                        Weather_sunrise    0.000844
                Route/Area_Munich South    0.000827
            Route/Area_FTMRoute re

### XGBoost

In [14]:
import xgboost as xgb


def _clean_feature_name(name):
    """Return `name` with brackets, parentheses and commas removed and spaces turned into '_'."""
    for ch in '[](),':
        name = name.replace(ch, '')
    return name.replace(' ', '_')


# XGBoost does not accept feature names containing '[' or ']', so work on
# renamed copies of the scaled frames (the originals stay untouched).
X_train_xgb = X_train_scaled.rename(columns=_clean_feature_name)
X_test_xgb = X_test_scaled.rename(columns=_clean_feature_name)

# Create DMatrix for XGBoost with cleaned feature names
dtrain = xgb.DMatrix(X_train_xgb, label=y_train)
dtest = xgb.DMatrix(X_test_xgb, label=y_test)

# Set parameters
params = {
    'max_depth': 5,
    'eta': 0.3,
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse'
}

# Train the model
num_round = 100
xgb_model = xgb.train(params, dtrain, num_round)

# Make predictions
y_pred = xgb_model.predict(dtest)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f"XGBoost RMSE: {rmse}")
r2 = r2_score(y_test, y_pred)
print(f"XGBoost R² Score: {r2:.4f}")

# 'weight' importance = number of times a feature is used in a split.
# get_score() leaves out features that were never used, so look every column
# up explicitly and report 0 for the missing ones; the table then covers the
# same feature set as the decision-tree and random-forest tables.
importance = xgb_model.get_score(importance_type='weight')
feature_importance_df = pd.DataFrame({
    'Feature': list(X_train_xgb.columns),
    'Importance': [float(importance.get(col, 0.0)) for col in X_train_xgb.columns],
})
feature_importance_df = feature_importance_df.sort_values('Importance', ascending=False)
print("\nXGBoost feature importance:")
print(feature_importance_df.to_string(index=False))

XGBoost RMSE: 1.530061954295467
XGBoost R² Score: 0.2794

XGBoost feature importance:
                                Feature  Importance
           Ambient_Temperature_Start_°C       422.0
            Mean_Battery_Temperature_°C       416.0
               Target_Cabin_Temperature       118.0
                          Weather_rainy        29.0
                Route/Area_Munich_North        26.0
Route/Area_Munich_North_+_Fast_Charging        25.0
                    Route/Area_FTMRoute        24.0
                Weather_slightly_cloudy        24.0
            Route/Area_Munich_Northeast        21.0
                         Weather_cloudy        18.0
                           Weather_dark        13.0
                          Weather_sunny        11.0
                Route/Area_Munich_South         7.0
                        Weather_sunrise         6.0
              Weather_dark_little_rainy         5.0
                         Weather_sunset         3.0
                 Route/Area_Mu