# Model Training for House Price Prediction


In [41]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Load the cleaned datasets (generated from EDA notebook)
def _read_cleaned_csv(path):
    """Read a cleaned CSV and discard any saved-index column.

    The training preview shows an 'Unnamed: 0' column, which is the header
    pandas assigns when a CSV was written with its row index and read back
    without `index_col`. Left in place it becomes a meaningless row-number
    feature in the model inputs, so it is removed here. The same rule is
    applied to both splits so their feature columns stay aligned.

    Parameters
    ----------
    path : str
        Location of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The file contents without any column whose name starts with 'Unnamed:'.
    """
    frame = pd.read_csv(path)
    index_artifacts = [col for col in frame.columns if str(col).startswith('Unnamed:')]
    return frame.drop(columns=index_artifacts)


train_cleaned = _read_cleaned_csv('../data/train_cleaned.csv')
test_cleaned = _read_cleaned_csv('../data/test_cleaned.csv')

In [43]:
# Preview the first five rows to sanity-check the loaded columns
train_cleaned.head(n=5)

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,SalePrice
0,60.0,65.0,8450.0,7.0,5.0,2003.0,2003.0,196.0,706.0,0.0,...,False,False,False,True,False,False,False,True,False,2.583824
1,20.0,80.0,9600.0,6.0,8.0,1976.0,1976.0,0.0,978.0,0.0,...,False,False,False,True,False,False,False,True,False,2.5733
2,60.0,68.0,11250.0,7.0,5.0,2001.0,2002.0,162.0,486.0,0.0,...,False,False,False,True,False,False,False,True,False,2.589054
3,70.0,60.0,9550.0,7.0,5.0,1915.0,1970.0,0.0,216.0,0.0,...,False,False,False,True,False,False,False,False,False,2.553297
4,60.0,84.0,14260.0,8.0,5.0,2000.0,2000.0,350.0,655.0,0.0,...,False,False,False,True,False,False,False,True,False,2.597433


In [45]:
# Pull the target vector (y) out of the training frame; everything else is a feature (X)
y = train_cleaned['SalePrice']
X = train_cleaned.drop(columns=['SalePrice'])

In [47]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [49]:
# Candidate regressors; both tree ensembles share the same size and seed
ensemble_params = dict(n_estimators=100, random_state=42)
lin_reg = LinearRegression()
rf_model = RandomForestRegressor(**ensemble_params)
gb_model = GradientBoostingRegressor(**ensemble_params)

In [51]:
# Cross-validated RMSE helper shared by all candidate models
def evaluate_model(model, X_train, y_train):
    """Return the mean RMSE of `model` across 5-fold cross-validation.

    Parameters
    ----------
    model : estimator
        Estimator handed to `cross_val_score`.
    X_train : array-like
        Feature matrix used for cross-validation.
    y_train : array-like
        Target values matching `X_train`.

    Returns
    -------
    float
        Average of the per-fold RMSE values.
    """
    fold_neg_mse = cross_val_score(
        model, X_train, y_train, cv=5, scoring='neg_mean_squared_error'
    )
    # The scorer reports negated MSE, so flip the sign before taking the root.
    fold_rmse = np.sqrt(np.negative(fold_neg_mse))
    return np.mean(fold_rmse)

In [53]:
# Report the cross-validated RMSE of every candidate model
candidates = {
    "Linear Regression": lin_reg,
    "Random Forest": rf_model,
    "Gradient Boosting": gb_model,
}
for label, candidate in candidates.items():
    print(f"{label} RMSE: {evaluate_model(candidate, X_train, y_train)}")

Linear Regression RMSE: 0.010821585994265616
Random Forest RMSE: 0.010877981635915957
Gradient Boosting RMSE: 0.00962728470820798


In [55]:
# Train the model selected from the cross-validation comparison
# (Gradient Boosting had the lowest mean RMSE of the three in the recorded run).
gb_model.fit(X_train, y_train)

In [57]:
# Make predictions on the validation set
# (X_valid was kept out of both the cross-validation and the fit, so these are out-of-sample)
gb_preds = gb_model.predict(X_valid)

In [59]:
# Score the held-out predictions: RMSE is the square root of the mean squared error
gb_mse = mean_squared_error(y_valid, gb_preds)
gb_rmse = np.sqrt(gb_mse)
print(f"Gradient Boosting RMSE on Validation Set: {gb_rmse}")

Gradient Boosting RMSE on Validation Set: 0.010358479518198866


## Hyperparameter Tuning with GridSearchCV

In [62]:
# Search space for Gradient Boosting: 3 x 3 x 3 = 27 parameter combinations
param_grid = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.05, 0.1, 0.2],
    max_depth=[3, 5, 7],
)


In [64]:
# Exhaustive 3-fold grid search over param_grid, ranked by negated MSE (higher is better)
grid_search = GridSearchCV(
    estimator=gb_model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,  # run the candidate fits in parallel on all available cores
)
grid_search.fit(X_train, y_train)

In [66]:
# Get the best estimator from GridSearchCV
# (no `refit` argument was passed, so scikit-learn's default refit=True applies and
# this estimator has already been refit on all of X_train with the winning parameters)
best_gb_model = grid_search.best_estimator_
print(best_gb_model)

GradientBoostingRegressor(learning_rate=0.05, n_estimators=300, random_state=42)


In [68]:
# Score the tuned model on the same held-out validation rows
best_gb_preds = best_gb_model.predict(X_valid)
best_gb_mse = mean_squared_error(y_valid, best_gb_preds)
best_gb_rmse = np.sqrt(best_gb_mse)

In [70]:
# Report the tuned validation score alongside the winning parameter combination
print(
    f"Best Gradient Boosting RMSE after tuning: {best_gb_rmse}",
    f"Best Hyperparameters: {grid_search.best_params_}",
    sep="\n",
)

Best Gradient Boosting RMSE after tuning: 0.010134659812292245
Best Hyperparameters: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 300}


## Final Predictions on Test Data and Submission

In [73]:
# Make predictions on the test set using the best model
transformed_preds = best_gb_model.predict(test_cleaned)

# Revert the target transformation back to price units.
# The SalePrice values in the training preview are about 2.58, which is far too
# small for a single log of a house price (that would be around 12). They match
# log1p applied twice: log1p(log1p(p)) is about 2.58 for p near 200,000.
# A single np.exp would therefore give "prices" of roughly 13, so the inverse
# has to undo both steps.
# NOTE(review): confirm in the EDA notebook that the target was log1p-transformed twice.
test_preds = np.expm1(np.expm1(transformed_preds))


In [77]:
# Build the submission frame: Ids from the raw test file paired with the predicted prices
test_ids = pd.read_csv('../data/test.csv')['Id']
submission = pd.DataFrame({'Id': test_ids, 'SalePrice': test_preds})

In [79]:
# Save the submission to a CSV file
# (index=False keeps the DataFrame row index out of the file, leaving only Id and SalePrice)
submission.to_csv('submission.csv', index=False)

# Confirmation message printed once the write call has returned
print("Submission file created successfully!")

Submission file created successfully!
