In [1]:
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error, root_mean_squared_error

In [2]:
# Read the training CSV into a DataFrame, then preview its first five rows
df = pd.read_csv(filepath_or_buffer="data/train.csv")
df.head(n=5)

NameError: name 'pd' is not defined

In [3]:
# Predictors: five columns picked by name; every other column of df
# (including the target) is left out of X
X = df.loc[:, ['R', 'C', 'time_step', 'u_in', 'u_out']]
# Target: the 'pressure' column
y = df.loc[:, 'pressure']

## Split the data

In [4]:
# Hold out 20% of the rows as a validation split; the fixed seed makes the
# split reproducible across runs.
# NOTE(review): this is a row-level shuffle. If rows sharing a 'breath_id'
# belong to one sequence, the same breath can end up in both splits and
# inflate validation scores -- consider a group-aware split; confirm.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training set shape:", X_train.shape)
# X_val is the validation hold-out, not the test set, so label it as such.
print("Validation set shape:", X_val.shape)

Training set shape: (4828800, 5)
Test set shape: (1207200, 5)


In [6]:
# Hyperparameter search for the random forest.
#
# The grid below is 48 candidates x 5 folds = 240 forest fits. On the full
# training split (~4.8M rows) the recorded run was interrupted before it
# finished, so the search runs on a seeded random subsample and only the
# winning configuration is retrained on the full training split afterwards.
TUNING_SAMPLE_SIZE = 200_000  # rows used for the search; raise if compute allows

rf_model = RandomForestRegressor(random_state=42)

param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2, 4]
}

# Seeded draw so the same rows are used on every run; min() keeps this valid
# when the training split is smaller than the requested sample.
X_tune = X_train.sample(n=min(TUNING_SAMPLE_SIZE, len(X_train)), random_state=42)
# Align the target by index label (assumes unique index labels, as produced
# by a default read_csv -- verify if the index is ever customised).
y_tune = y_train.loc[X_tune.index]

grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid,
                           cv=5, scoring='neg_mean_absolute_error', 
                           verbose=1, n_jobs=-1)

grid_search.fit(X_tune, y_tune)

# Print the best parameters and the corresponding score
# (the score is the mean cross-validated value on the tuning subsample)
print("Best Parameters:", grid_search.best_params_)
print("Best CV Score (Negative MAE):", grid_search.best_score_)

# The search's automatic refit only saw the tuning subsample. Retrain the
# selected estimator in place on the full training split so that
# grid_search.best_estimator_ is the final model; n_jobs=-1 parallelises the
# tree building for this single large fit.
grid_search.best_estimator_.set_params(n_jobs=-1).fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


KeyboardInterrupt: 

## Evaluate the model performance

In [None]:
# Score the estimator selected by the grid search on the held-out validation split
best_rf = grid_search.best_estimator_
y_val_pred = best_rf.predict(X_val)

# Compute all four regression metrics up front ...
mae = mean_absolute_error(y_true=y_val, y_pred=y_val_pred)
mse = mean_squared_error(y_true=y_val, y_pred=y_val_pred)
rmse = root_mean_squared_error(y_true=y_val, y_pred=y_val_pred)
r2 = r2_score(y_true=y_val, y_pred=y_val_pred)

# ... then report them in a fixed order
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE) Score:", rmse)
print("R2 Score:", r2)

## Prepare submission using the test dataset

In [None]:
# Load the test dataset
df_test = pd.read_csv("data/test.csv")

# Select the same five predictor columns, in the same order, that the model was fit on
X_test = df_test.loc[:, ['R', 'C', 'time_step', 'u_in', 'u_out']]

# Pressure predictions for every test row
test_predictions = best_rf.predict(X_test)

# Two-column submission frame: each row's id next to its predicted pressure
submission_df = df_test[["id"]].assign(pressure=test_predictions)

# Write the submission to disk without the DataFrame index
submission_df.to_csv("RFRegressor.csv", index=False)
print("Submission file 'RFRegressor.csv' created successfully.")

In [None]:
# Scatter of actual vs. predicted pressure on the validation set; points lying
# on the dashed red diagonal are exact predictions
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_val, y_val_pred, alpha=0.5)
ax.set_xlabel("Actual Pressure")
ax.set_ylabel("Predicted Pressure")
ax.set_title("Actual vs Predicted Pressure on Validation Set")
# Diagonal reference line spanning the observed range of actual pressures
pressure_range = [y_val.min(), y_val.max()]
ax.plot(pressure_range, pressure_range, 'r--')
plt.show()