In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm
from sklearn.compose import ColumnTransformer

df = pd.read_csv('D:/umd/ds602/my_final_project/database/shared_overview.csv')

df1 = df.drop(['Trip','Date','Battery Temperature (Start) [°C]','Battery Temperature (End)','Battery State of Charge (Start)', 'Battery State of Charge (End)','Fan'], axis = 1)
object_cols = df1.select_dtypes(include=['object']).columns
df1 = pd.get_dummies(df1, columns=object_cols, dtype=int)

# Do the linear regression of whole overview data
X = df1.drop(['SOC', 'Distance [km]','Duration [min]'], axis=1)
y = df1['SOC']  / df1['Distance [km]']
# print(X)
X = sm.add_constant(X)
model = sm.OLS(y, X).fit()
model.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.613
Model:,OLS,Adj. R-squared:,0.477
Method:,Least Squares,F-statistic:,4.491
Date:,"Tue, 03 Dec 2024",Prob (F-statistic):,1.18e-05
Time:,19:02:48,Log-Likelihood:,379.67
No. Observations:,70,AIC:,-721.3
Df Residuals:,51,BIC:,-678.6
Df Model:,18,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.0067,0.001,4.534,0.000,0.004,0.010
Ambient Temperature (Start) [°C],-0.0001,5.53e-05,-2.088,0.042,-0.000,-4.43e-06
Target Cabin Temperature,-3.617e-06,7.88e-05,-0.046,0.964,-0.000,0.000
Mean Battery Temperature [°C],1.572e-05,6.27e-05,0.251,0.803,-0.000,0.000
Route/Area_FTMRoute,0.0012,0.001,1.903,0.063,-6.7e-05,0.003
Route/Area_FTMRoute (2x),0.0007,0.001,0.567,0.573,-0.002,0.003
Route/Area_FTMRoute reverse,0.0008,0.001,0.642,0.524,-0.002,0.003
Route/Area_Highway,0.0008,0.001,1.010,0.317,-0.001,0.002
Route/Area_Munich East,0.0006,0.001,1.066,0.292,-0.001,0.002

0,1,2,3
Omnibus:,44.341,Durbin-Watson:,2.146
Prob(Omnibus):,0.0,Jarque-Bera (JB):,201.277
Skew:,-1.767,Prob(JB):,1.96e-44
Kurtosis:,10.518,Cond. No.,8.92e+17


In [8]:
# Use linear regression in sklearn to make the prediction
# Separate features and target variable
X = df1.drop('SOC', axis=1)
y = df1['SOC']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Initialize the scaler and fit it on the training data
# feature scaling for numerical data (except for those one-hot encoded columns)
numerical_features = [
    'Ambient Temperature (Start) [°C]',
    'Target Cabin Temperature', 
    'Distance [km]', 
    'Duration [min]',
    'Mean Battery Temperature [°C]',
]
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features)
    ],
    remainder='passthrough'  # This will leave non-numerical columns unchanged
)
# Fit on training data
X_train_scaled = preprocessor.fit_transform(X_train)

# Transform test data
X_test_scaled = preprocessor.transform(X_test)

# Fit the model on the standardized data
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# Predict on the train and test set
y_train_pred = model.predict(X_train_scaled)
y_test_pred = model.predict(X_test_scaled)

# Calculate MSE for training data
MSE_train = mean_squared_error(y_train, y_train_pred)
print('MSE of linear regression model on training data:', MSE_train)
# Calculate MSE for testing data
MSE_test = mean_squared_error(y_test, y_test_pred)
print('MSE of linear regression on testing model:', MSE_test)

# Calculate R-squared
R_square = r2_score(y_test, y_test_pred)
print('R_square of linear regression model:', R_square)

MSE of linear regression model on training data: 0.00036505293770798525
MSE of linear regression on testing model: 0.0005970968409005965
R_square of linear regression model: 0.7768121946206586


In [9]:
from sklearn.linear_model import Ridge, Lasso

# Fit Lasso and Ridge models (with intercept by default)
lasso_model = Lasso(alpha=0.01)
ridge_model = Ridge(alpha=1)

lasso_model.fit(X_train_scaled, y_train)
ridge_model.fit(X_train_scaled, y_train)

# Evaluate results
lasso_pred_train = lasso_model.predict(X_train_scaled)
ridge_pred_train = ridge_model.predict(X_train_scaled)

lasso_pred_test = lasso_model.predict(X_test_scaled)
ridge_pred_test = ridge_model.predict(X_test_scaled)

# Performance metrics
print("Lasso Regression Results:")
print(f"Intercept: {lasso_model.intercept_}")
print(f"Coefficients: {lasso_model.coef_}")
print(f"Mean Squared Error on training data: {mean_squared_error(y_train, lasso_pred_train)}")
print(f"Mean Squared Error on testing data: {mean_squared_error(y_test, lasso_pred_test)}")
print(f"R^2 Score: {r2_score(y_test, lasso_pred_test)}")

print("\nRidge Regression Results:")
print(f"Intercept: {ridge_model.intercept_}")
print(f"Coefficients: {ridge_model.coef_}")
print(f"Mean Squared Error on training data: {mean_squared_error(y_train, ridge_pred_train)}")
print(f"Mean Squared Error on testing data: {mean_squared_error(y_test, ridge_pred_test)}")
print(f"R^2 Score: {r2_score(y_test, ridge_pred_test)}")

Lasso Regression Results:
Intercept: 0.14005357142857136
Coefficients: [-0.0118153  -0.          0.07058091 -0.         -0.00220942  0.
  0.          0.          0.         -0.          0.          0.
 -0.         -0.          0.          0.          0.         -0.
  0.         -0.          0.          0.        ]
Mean Squared Error on training data: 0.0013863410803106114
Mean Squared Error on testing data: 0.00032182369154514276
R^2 Score: 0.8797060735965339

Ridge Regression Results:
Intercept: 0.1477827941855803
Coefficients: [-0.00983167 -0.0019799   0.09575213 -0.02763476 -0.01165923  0.00074703
  0.01231576  0.          0.03868517 -0.010994   -0.01739116  0.03596721
 -0.02043256 -0.03889745  0.00393978  0.01157793 -0.00114821 -0.02348773
 -0.00016072  0.00388779  0.00102451  0.00436666]
Mean Squared Error on training data: 0.00046703816622017735
Mean Squared Error on testing data: 0.0003627680848403324
R^2 Score: 0.8644015389613166
