### This notebook is used to evaluate the performance of our trained model on new, unseen data.

In [1]:
import pandas as pd
import joblib
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# --- Data ---------------------------------------------------------------
# Read the labeled dataset (the CSV carries the target column as well).
data = pd.read_csv('/home/kenbaker-gif/ML-Projects/data/train.csv')

# Three predictor columns and the `medv` target.
X = data.loc[:, ['rm', 'lstat', 'crim']]
y = data.loc[:, 'medv']

# Hold out 20% of the rows; the fixed seed makes the split repeatable.
# NOTE(review): the held-out rows are only "unseen" if the saved model was
# fitted on the matching 80% split -- confirm against the training notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# --- Persisted artifacts ------------------------------------------------
# Both paths are relative, so they resolve against the kernel's working
# directory. joblib.load unpickles the file: only load trusted artifacts.
scaler = joblib.load('scaler.pkl')
model = joblib.load('sgd_regressor_lr_model.pkl')

# --- Inference ----------------------------------------------------------
# The loaded scaler is applied as-is (transform only, it is not refit here);
# a plain array is passed rather than the DataFrame.
X_test_scaled = scaler.transform(X_test.to_numpy())
y_pred = model.predict(X_test_scaled)

# --- Metrics on the held-out rows ---------------------------------------
print('R2-score:', r2_score(y_test, y_pred))
print('MSE:', mean_squared_error(y_test, y_pred))


R2-score: 0.6861759772077816
MSE: 28.243654507881192


In [2]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import pandas as pd

# --- Data ---------------------------------------------------------------
data = pd.read_csv('/home/kenbaker-gif/ML-Projects/data/train.csv')

# Predictors and target.
X = data.loc[:, ['rm', 'lstat', 'crim']]
y = data.loc[:, 'medv']

# 80/20 split with a fixed seed so the held-out rows are repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Model --------------------------------------------------------------
# Random forest with library-default hyperparameters; only the seed is set.
# `fit` returns the estimator itself, so construction and training are chained.
rf = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# --- Evaluation on the held-out rows ------------------------------------
# No feature scaling is applied in this cell; the raw columns go straight in.
y_pred = rf.predict(X_test)

print("R2:", r2_score(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))

R2: 0.8540893259337793
MSE: 13.131724686567187


In [46]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import pandas as pd

# --- Data ---------------------------------------------------------------
data = pd.read_csv('/home/kenbaker-gif/ML-Projects/data/train.csv')
X = data.loc[:, ['rm', 'lstat', 'crim']]
y = data.loc[:, 'medv']

# 80/20 split with a fixed seed so the held-out rows are repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Search space -------------------------------------------------------
# Deliberately small grid: 2 x 3 x 2 = 12 candidate settings.
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
}

# Base estimator that the search configures for each candidate.
rf = RandomForestRegressor(random_state=42)

# --- Hyperparameter search ----------------------------------------------
# 3-fold CV over the 12 candidates (36 fits), scored by R^2, using all
# available cores. The search only ever sees the training portion.
# `fit` returns the search object, so it is chained onto the constructor.
grid_search = GridSearchCV(
    rf,
    param_grid,
    scoring='r2',
    cv=3,
    n_jobs=-1,
    verbose=1,
).fit(X_train, y_train)

# --- Evaluation on the held-out rows ------------------------------------
# `best_estimator_` is the winning configuration as exposed by the search.
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test)

print("Best parameters:", grid_search.best_params_)
print("R2:", r2_score(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))

Fitting 3 folds for each of 12 candidates, totalling 36 fits
Best parameters: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 100}
R2: 0.8576066560420686
MSE: 12.815170665352353
