In [14]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [15]:
# Paths to processed CSVs
X_train = pd.read_csv("../data/processed/X_train.csv")
X_test  = pd.read_csv("../data/processed/X_test.csv")
# Read target and clean '5+' entries, convert to float
# Squeeze returns a Series
raw_y_train = pd.read_csv("../data/processed/y_train.csv").squeeze()
raw_y_test  = pd.read_csv("../data/processed/y_test.csv").squeeze()
y_train = raw_y_train.replace('5+', 6).astype(float)
y_test  = raw_y_test.replace('5+', 6).astype(float)
print("Training set shape:", X_train.shape, y_train.shape)
print("Test set shape:    ", X_test.shape,  y_test.shape)


Training set shape: (80000, 27) (80000,)
Test set shape:     (20000, 27) (20000,)


In [16]:
# Clean target variables: replace '5+' with 6 and convert to numeric
y_train = y_train.replace('5+', 6).astype(float)
y_test = y_test.replace('5+', 6).astype(float)
print("Unique values in y_train after cleaning:", y_train.unique())

Unique values in y_train after cleaning: [ 5.  7.  6.  8.  2.  3.  9.  1. 10.  4. 11. 12. 16. 13. 14. 15. 17.]


In [17]:
# 1. Instantiate
model = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    random_state=42,
    n_jobs=-1
)

# 2. Fit on training data
model.fit(X_train, y_train)
print("Model training complete.")


ValueError: could not convert string to float: '5+'

In [None]:
# Predict on train and test sets
y_pred_train = model.predict(X_train)
y_pred_test  = model.predict(X_test)


NameError: name 'model' is not defined

In [None]:
# Define a helper for printing results
def print_metrics(true, pred, label):
    mae = mean_absolute_error(true, pred)
    rmse = mean_squared_error(true, pred, squared=False)
    r2   = r2_score(true, pred)
    print(f"{label} MAE:  {mae:.2f}")
    print(f"{label} RMSE: {rmse:.2f}")
    print(f"{label} R²:   {r2:.3f}\n")

# Evaluate
print_metrics(y_train, y_pred_train, "Train")
print_metrics(y_test,  y_pred_test,  "Test")


In [None]:
# Create a DataFrame of feature importances
importances = pd.DataFrame({
    'feature': X_train.columns,
    'importance': model.feature_importances_
}).sort_values(by='importance', ascending=False)

display(importances.head(10))


In [None]:
import joblib
joblib.dump(model, "../src/rf_baseline_model.joblib")
print("Model saved to src/rf_baseline_model.joblib")
