In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import StackingRegressor

Matplotlib is building the font cache; this may take a moment.


In [3]:
# Read the training and test sets from the local data/ directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv'))

In [4]:
# Impute missing values. All fill statistics are learned from the training
# frame only and then applied to both frames, so train and test go through
# exactly the same transformation and the result does not depend on which
# rows happen to be in the test set.

# Numerical features: int64/float64 columns of train, minus the target.
numerical_features = train.select_dtypes(include=['int64', 'float64']).columns
numerical_features = numerical_features.drop('SalePrice')  # Exclude target variable

# One median per numerical column, computed once on train. fillna() with a
# Series matches its index against the column names.
train_medians = train[numerical_features].median()
train[numerical_features] = train[numerical_features].fillna(train_medians)
test[numerical_features] = test[numerical_features].fillna(train_medians)

# Categorical features: object columns of train; a missing value becomes the
# explicit category 'None'.
categorical_features = train.select_dtypes(include=['object']).columns
train[categorical_features] = train[categorical_features].fillna('None')
test[categorical_features] = test[categorical_features].fillna('None')

In [5]:
# One-hot encode the categorical columns of each frame, then align test to
# train's columns (left join on axis=1): columns that exist only in train are
# added to test filled with 0, and columns that exist only in test are dropped.
train, test = pd.get_dummies(train).align(
    pd.get_dummies(test), join='left', axis=1, fill_value=0
)

In [6]:
# Engineer TotalSF = basement + first-floor + second-floor square footage.
# A single total-area column can be a strong predictor of the house price.
# The same column is added to both frames.
for frame in (train, test):
    frame['TotalSF'] = frame['TotalBsmtSF'].add(frame['1stFlrSF']).add(frame['2ndFlrSF'])

In [7]:
# Pairwise Pearson correlation between every pair of columns in train.
corr_matrix = train.corr(method='pearson')

In [8]:
# Rank columns by absolute correlation with SalePrice and keep the first 11
# labels: SalePrice itself (it correlates perfectly with itself, so it ranks
# first) followed by the 10 features most correlated with it.
saleprice_corr = corr_matrix['SalePrice'].abs()
top_features = saleprice_corr.sort_values(ascending=False).index[:11]
print(top_features)

Index(['SalePrice', 'OverallQual', 'TotalSF', 'GrLivArea', 'GarageCars',
       'GarageArea', 'TotalBsmtSF', '1stFlrSF', 'ExterQual_TA', 'FullBath',
       'BsmtQual_Ex'],
      dtype='object')


In [9]:
# Feature matrix: the selected columns with the target removed; the target is
# kept separately as y.
X = train[top_features.drop('SalePrice')]
y = train['SalePrice']

# Hold out 20% of the rows for validation; the fixed seed makes the split
# reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Candidate regressors, keyed by the label used in the printed report.
models = {}
models['Linear Regression'] = LinearRegression()
models['Lasso'] = Lasso(alpha=0.001, max_iter=10000)
models['Random Forest'] = RandomForestRegressor(n_estimators=100, random_state=42)
models['Gradient Boosting'] = GradientBoostingRegressor(n_estimators=100, random_state=42)

# Fit each candidate on the training split and report its root mean squared
# log error (the square root of sklearn's MSLE) on the validation split.
for name in models:
    model = models[name].fit(X_train, y_train)
    preds = model.predict(X_val)
    rmsle = mean_squared_log_error(y_true=y_val, y_pred=preds) ** 0.5
    print(f'{name} RMSLE: {rmsle}')

Linear Regression RMSLE: 0.191803109046764
Lasso RMSLE: 0.1918031087105276
Random Forest RMSLE: 0.17917303543476454
Gradient Boosting RMSLE: 0.17424701111391305


In [12]:
# Tune the random forest with an exhaustive grid search: every combination of
# n_estimators and max_depth below is scored with 5-fold cross-validation on
# the full (X, y) using negated mean squared log error (higher is better).
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30]
}

grid_search = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_log_error',
    # 12 candidates x 5 folds = 60 independent fits; run them on all available
    # cores. The forest's random_state is fixed, so the selected parameters do
    # not depend on the degree of parallelism.
    n_jobs=-1,
)
grid_search.fit(X, y)

# With the default refit=True, best_estimator_ is the winning configuration
# refitted on all of (X, y).
best_model = grid_search.best_estimator_
print(grid_search.best_params_)

{'max_depth': 10, 'n_estimators': 300}
