In [3]:
# Import libraries
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error, root_mean_squared_error

In [4]:
df = pd.read_csv("data/train.csv")
df.head()

Unnamed: 0,id,breath_id,R,C,time_step,u_in,u_out,pressure
0,1,1,20,50,0.0,0.083334,0,5.837492
1,2,1,20,50,0.033652,18.383041,0,5.907794
2,3,1,20,50,0.067514,22.509278,0,7.876254
3,4,1,20,50,0.101542,22.808822,0,11.742872
4,5,1,20,50,0.135756,25.35585,0,12.234987


In [5]:
# drop 'id', 'breath_id', and 'pressure' from the predictors
X = df[['R', 'C', 'time_step', 'u_in', 'u_out']]
y = df['pressure']

In [6]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

## Split the data

In [7]:
# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


In [8]:
param_grid = {
    'learning_rate': ['constant', 'adaptive', 'invscaling'],
    'eta0': [0.001, 0.01],
    'max_iter': [1000, 2000],
    'tol': [1e-3, 1e-4]
}

grid_search = GridSearchCV(SGDRegressor(penalty=None, random_state=42),
                           param_grid=param_grid,
                           cv=5,
                           scoring='neg_mean_absolute_error',
                           n_jobs=-1,
                           verbose=1)

# Fit the grid search on the training data
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


In [9]:
# Output the best hyperparameters
best_params = grid_search.best_params_
print("Best hyperparameters:", best_params)
print("Best CV Score (negative MAE):", grid_search.best_score_)

# Use the best estimator
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

Best hyperparameters: {'eta0': 0.01, 'learning_rate': 'invscaling', 'max_iter': 1000, 'tol': 0.0001}
Best CV Score (negative MAE): -3.945712508078801


## Evaluate the model performance

In [10]:
mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)

mse  = mean_squared_error(y_test, y_pred)
print("Mean Squared Error (MSE):", mse)

rmse  = root_mean_squared_error(y_test, y_pred)
print("Root Mean Squared Error (RMSE) Score:", rmse)

r2 = r2_score(y_test, y_pred)
print("R2 Score:", r2)

Mean Absolute Error (MAE): 3.967444728687525
Mean Squared Error (MSE): 40.55415672882915
Root Mean Squared Error (RMSE) Score: 6.368214563661399
R2 Score: 0.3834978833183824


## Prepare submission using the test dataset

In [11]:
df_test = pd.read_csv("data/test.csv")

# Prepare test features
X_test = df_test[['R', 'C', 'time_step', 'u_in', 'u_out']]

# Predict the pressure values for the test set
test_predictions = best_model.predict(X_test)

# Create a submission
submission_df = pd.DataFrame({
    "id": df_test["id"],
    "pressure": test_predictions
})

# Save the submission file as CSV
submission_df.to_csv("submission-simple_lr_with_gd_2.csv", index=False)



In [12]:
# # Visualize the predicted vs actual pressure values
# plt.figure(figsize=(8, 6))
# plt.scatter(y_test, y_pred, alpha=0.5)
# plt.xlabel("Actual Pressure")
# plt.ylabel("Predicted Pressure")
# plt.title("Actual vs. Predicted Pressure")
# plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--') 
# plt.show()