# 1. Importing basic libraries and the dataset

In [59]:
# Basic libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

In [60]:
# Load the dataset
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
df_train.head(5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [61]:
# Merge the training data and test data together
df_total = pd.concat([df_train.drop('SalePrice', axis=1), df_test], axis=0)

In [62]:
# Check the datatypes of each column
df_total.dtypes

Id                 int64
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
                  ...   
MiscVal            int64
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
Length: 80, dtype: object

# 2. Data cleaning

In [63]:
# Remove the column for ID
df_total.drop('Id', axis=1, inplace=True)

In [64]:
# Fill the empty numerical data with 0
for col in df_total.columns:
    if df_train[col].dtype != 'object':
        df_train[col] = df_train[col].fillna(0)

In [65]:
df_total.head(5)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


# 3. Converting categorical values into numerical values with one-hot encoding

In [34]:
# Obtain categorical columns
cat_cols = df_total.select_dtypes(include='object').columns.tolist()

# One-hot encoding for categorical columns (inputs)
df1 = pd.get_dummies(df_total, columns=cat_cols)

In [35]:
# Check dataset structure to confirm that one-hot encoding has been applied
print(df1.head(5))

   MSSubClass  LotFrontage  LotArea  OverallQual  OverallCond  YearBuilt  \
0          60         65.0     8450            7            5       2003   
1          20         80.0     9600            6            8       1976   
2          60         68.0    11250            7            5       2001   
3          70         60.0     9550            7            5       1915   
4          60         84.0    14260            8            5       2000   

   YearRemodAdd  MasVnrArea  BsmtFinSF1  BsmtFinSF2  ...  SaleType_ConLw  \
0          2003       196.0       706.0         0.0  ...           False   
1          1976         0.0       978.0         0.0  ...           False   
2          2002       162.0       486.0         0.0  ...           False   
3          1970         0.0       216.0         0.0  ...           False   
4          2000       350.0       655.0         0.0  ...           False   

   SaleType_New  SaleType_Oth  SaleType_WD  SaleCondition_Abnorml  \
0         False  

# 4. Training the model

In [44]:
# Features (X) and Target (y)
X = df1.iloc[:len(df_train)]
X_main_test = df1.iloc[len(df_train):]
y = np.log1p(df_train['SalePrice'])

In [45]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [46]:
# Step 1: Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],  # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],  # Minimum number of samples required to be at a leaf node
    'bootstrap': [True, False]  # Whether bootstrap samples are used when building trees
}

In [47]:
# Step 2: Initialize the RandomForestRegressor
rf_model = RandomForestRegressor(random_state=30, n_jobs=-1)

In [48]:
# Step 3: Set up GridSearchCV with cross-validation
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    cv=5,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
    verbose=2
)

In [49]:
# Step 4: Fit GridSearchCV to the training data
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits


In [50]:
# Step 5: Output the best parameters found during the grid search
print("Best Parameters:", grid_search.best_params_)

Best Parameters: {'bootstrap': True, 'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}


In [51]:
# Step 6: Make predictions on the test set using the best estimator
best_rf_model = grid_search.best_estimator_
y_pred = best_rf_model.predict(X_test)

In [52]:
best_rmse = -grid_search.best_score_
print(f"Best CV RMSE: {best_rmse:.4f}")

Best CV RMSE: 0.1519


# 5. Input test data into the random forest model to obtain the predicted SalePrice

In [53]:
best_model = grid_search.best_estimator_
best_model.fit(X, y)

In [55]:
test_preds_log = best_model.predict(X_main_test)
test_preds = np.expm1(test_preds_log)

In [57]:
df_final = pd.DataFrame({
    'Id': df_test['Id'],
    'SalePrice': test_preds
})

In [58]:
df_final.head(5)

Unnamed: 0,Id,SalePrice
0,1461,128796.903655
1,1462,151617.10455
2,1463,178692.583634
3,1464,181232.698436
4,1465,196016.505094
