In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm
from sklearn.compose import ColumnTransformer

df = pd.read_csv('D:/umd/ds602/my_final_project/database/shared_overview.csv')

df1 = df.drop(['Trip','Date','Battery Temperature (Start) [°C]','Battery Temperature (End)','Battery State of Charge (Start)', 'Battery State of Charge (End)','Fan'], axis = 1)
object_cols = df1.select_dtypes(include=['object']).columns
df1 = pd.get_dummies(df1, columns=object_cols, dtype=int)

# Do the linear regression of whole overview data
X = df1.drop(['SOC', 'Distance [km]','Duration [min]'], axis=1)
y = (df1['SOC']  / df1['Distance [km]'])*1000   
# print(X)
X = sm.add_constant(X)
model = sm.OLS(y, X).fit()
model.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.613
Model:,OLS,Adj. R-squared:,0.477
Method:,Least Squares,F-statistic:,4.491
Date:,"Tue, 03 Dec 2024",Prob (F-statistic):,1.18e-05
Time:,19:46:11,Log-Likelihood:,-103.87
No. Observations:,70,AIC:,245.7
Df Residuals:,51,BIC:,288.5
Df Model:,18,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,6.6945,1.476,4.534,0.000,3.731,9.658
Ambient Temperature (Start) [°C],-0.1155,0.055,-2.088,0.042,-0.226,-0.004
Target Cabin Temperature,-0.0036,0.079,-0.046,0.964,-0.162,0.155
Mean Battery Temperature [°C],0.0157,0.063,0.251,0.803,-0.110,0.141
Route/Area_FTMRoute,1.2183,0.640,1.903,0.063,-0.067,2.504
Route/Area_FTMRoute (2x),0.6896,1.216,0.567,0.573,-1.751,3.131
Route/Area_FTMRoute reverse,0.8173,1.272,0.642,0.524,-1.737,3.372
Route/Area_Highway,0.7502,0.743,1.010,0.317,-0.741,2.241
Route/Area_Munich East,0.6484,0.608,1.066,0.292,-0.573,1.870

0,1,2,3
Omnibus:,44.341,Durbin-Watson:,2.146
Prob(Omnibus):,0.0,Jarque-Bera (JB):,201.277
Skew:,-1.767,Prob(JB):,1.96e-44
Kurtosis:,10.518,Cond. No.,8.92e+17


In [11]:
# Use linear regression in sklearn to make the prediction
# Separate features and target variable
X = df1.drop(['SOC', 'Distance [km]','Duration [min]'], axis=1)
y = df1['SOC']  / df1['Distance [km]']*1000

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Initialize the scaler and fit it on the training data
# feature scaling for numerical data (except for those one-hot encoded columns)
numerical_features = [
    'Ambient Temperature (Start) [°C]',
    'Target Cabin Temperature',  
    'Mean Battery Temperature [°C]',
]
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features)
    ],
    remainder='passthrough'  # This will leave non-numerical columns unchanged
)
# Fit on training data
X_train_scaled = preprocessor.fit_transform(X_train)

# Transform test data
X_test_scaled = preprocessor.transform(X_test)

# Fit the model on the standardized data
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# Predict on the train and test set
y_train_pred = model.predict(X_train_scaled)
y_test_pred = model.predict(X_test_scaled)

# Calculate MSE for training data
MSE_train = mean_squared_error(y_train, y_train_pred)
print('MSE of linear regression model on training data:', MSE_train)
# Calculate MSE for testing data
MSE_test = mean_squared_error(y_test, y_test_pred)
print('MSE of linear regression on testing model:', MSE_test)

# Calculate R-squared
R_square = r2_score(y_test, y_test_pred)
print('R_square of linear regression model:', R_square)

MSE of linear regression model on training data: 1.2815539468426127
MSE of linear regression on testing model: 0.8062782175106913
R_square of linear regression model: 0.6034398722475607


In [12]:
from sklearn.linear_model import Ridge, Lasso

# Fit Lasso and Ridge models (with intercept by default)
lasso_model = Lasso(alpha=0.01)
ridge_model = Ridge(alpha=1)

lasso_model.fit(X_train_scaled, y_train)
ridge_model.fit(X_train_scaled, y_train)

# Evaluate results
lasso_pred_train = lasso_model.predict(X_train_scaled)
ridge_pred_train = ridge_model.predict(X_train_scaled)

lasso_pred_test = lasso_model.predict(X_test_scaled)
ridge_pred_test = ridge_model.predict(X_test_scaled)

# Performance metrics
print("Lasso Regression Results:")
print(f"Intercept: {lasso_model.intercept_}")
print(f"Coefficients: {lasso_model.coef_}")
print(f"Mean Squared Error on training data: {mean_squared_error(y_train, lasso_pred_train)}")
print(f"Mean Squared Error on testing data: {mean_squared_error(y_test, lasso_pred_test)}")
print(f"R^2 Score: {r2_score(y_test, lasso_pred_test)}")

print("\nRidge Regression Results:")
print(f"Intercept: {ridge_model.intercept_}")
print(f"Coefficients: {ridge_model.coef_}")
print(f"Mean Squared Error on training data: {mean_squared_error(y_train, ridge_pred_train)}")
print(f"Mean Squared Error on testing data: {mean_squared_error(y_test, ridge_pred_test)}")
print(f"R^2 Score: {r2_score(y_test, ridge_pred_test)}")

Lasso Regression Results:
Intercept: 7.101429420215124
Coefficients: [-1.10451333 -0.         -0.          0.26725893 -0.          0.
  0.         -0.11097009  0.42050146  1.78031961 -1.38467663 -2.00654172
 -0.0407623   0.34647582 -0.         -0.98228251 -0.08949163  0.20980224
  0.12927546  0.        ]
Mean Squared Error on training data: 1.3062611108675815
Mean Squared Error on testing data: 0.6418256779015007
R^2 Score: 0.6843242601676263

Ridge Regression Results:
Intercept: 6.984783562754568
Coefficients: [-9.40701661e-01 -9.93254371e-04 -8.11805758e-02  5.16990149e-01
  8.43704381e-03  0.00000000e+00  1.67951933e-01 -8.38575542e-02
  5.25481841e-01  1.24748322e+00 -9.98274129e-01 -1.38421251e+00
 -1.49588546e-02  4.64176401e-01 -1.24533862e-01 -8.38947059e-01
 -9.12891558e-02  1.80714719e-01  3.04364655e-01  1.20473156e-01]
Mean Squared Error on training data: 1.3501067923309515
Mean Squared Error on testing data: 0.5515479092681292
R^2 Score: 0.7287265058006359
