In [1]:
# importing libraries
import numpy as np
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
from sklearn.linear_model import Ridge
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor

In [2]:
# loading dataset
train = pd.read_csv(r'/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
test = pd.read_csv(r'/kaggle/input/house-prices-advanced-regression-techniques/test.csv')

train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
# shape
train.shape

(1460, 81)

In [4]:
# features
print(f"Features - {train.columns.tolist()}")

Features - ['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea

In [5]:
# null data count
print(f"Null Values in training data - {train.isnull().sum().sum()}")
print(f"Null Values in testing data - {test.isnull().sum().sum()}")

Null Values in training data - 7829
Null Values in testing data - 7878


In [6]:
# Features with null values
print("Null Data-\n",dict(train.isnull().sum()))

Null Data-
 {'Id': 0, 'MSSubClass': 0, 'MSZoning': 0, 'LotFrontage': 259, 'LotArea': 0, 'Street': 0, 'Alley': 1369, 'LotShape': 0, 'LandContour': 0, 'Utilities': 0, 'LotConfig': 0, 'LandSlope': 0, 'Neighborhood': 0, 'Condition1': 0, 'Condition2': 0, 'BldgType': 0, 'HouseStyle': 0, 'OverallQual': 0, 'OverallCond': 0, 'YearBuilt': 0, 'YearRemodAdd': 0, 'RoofStyle': 0, 'RoofMatl': 0, 'Exterior1st': 0, 'Exterior2nd': 0, 'MasVnrType': 872, 'MasVnrArea': 8, 'ExterQual': 0, 'ExterCond': 0, 'Foundation': 0, 'BsmtQual': 37, 'BsmtCond': 37, 'BsmtExposure': 38, 'BsmtFinType1': 37, 'BsmtFinSF1': 0, 'BsmtFinType2': 38, 'BsmtFinSF2': 0, 'BsmtUnfSF': 0, 'TotalBsmtSF': 0, 'Heating': 0, 'HeatingQC': 0, 'CentralAir': 0, 'Electrical': 1, '1stFlrSF': 0, '2ndFlrSF': 0, 'LowQualFinSF': 0, 'GrLivArea': 0, 'BsmtFullBath': 0, 'BsmtHalfBath': 0, 'FullBath': 0, 'HalfBath': 0, 'BedroomAbvGr': 0, 'KitchenAbvGr': 0, 'KitchenQual': 0, 'TotRmsAbvGrd': 0, 'Functional': 0, 'Fireplaces': 0, 'FireplaceQu': 690, 'GarageTy

In [7]:
# Dropping Useless Features
columns=['Alley','MasVnrType','FireplaceQu','PoolQC','Fence','MiscFeature','Utilities','Street']
train=train.drop(columns,axis=1)
test=test.drop(columns,axis=1)

In [8]:
bsmt_garage_features = [
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 
    'Electrical', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
    'LotFrontage', 'MasVnrArea', 'GarageYrBlt', 'BsmtFinSF1', 'BsmtFinSF2', 
    'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 
    'GarageCars', 'GarageArea'
]

# Filling missing values in both the training and test datasets for the specified features using mean
for feature in bsmt_garage_features:
    train[feature] = train[feature].fillna(train[feature].mode()[0])  
    test[feature] = test[feature].fillna(test[feature].mode()[0])    

other_features = ['MSZoning', 'Exterior1st', 'Exterior2nd', 'KitchenQual', 'Functional', 'SaleType']

# Filling missing values in the test dataset for the specified features using mode
for feature in other_features:
    test[feature] = test[feature].fillna(test[feature].mode()[0])


In [9]:
# Feature Encoding 
encoder = LabelEncoder()

for i in train.columns:
    if train[i].dtype=='object':
        train[i]=encoder.fit_transform(train[i])
        
for i in test.columns:
    if test[i].dtype=='object':
        test[i]=encoder.fit_transform(test[i])

In [10]:
# Applying logarithmic transformation
train['SalePrice']=np.log1p(train['SalePrice'])

In [11]:
# splitting into dependent & independent features
X = train.drop(['SalePrice', 'Id'],axis=1)
y = train['SalePrice']

In [12]:
# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
# Base models
xgb_model = xgb.XGBRegressor(n_estimators=3200, random_state=42)
rf_model = RandomForestRegressor(n_estimators=2000, random_state=42)
lgb_model = lgb.LGBMRegressor(n_estimators=3200, random_state=42)
gboost_model = GradientBoostingRegressor(n_estimators=3300, random_state =42)
adaboost_model = AdaBoostRegressor(n_estimators=3200, random_state =42)

# Train base models
xgb_model.fit(X_train, y_train)
rf_model.fit(X_train, y_train)
lgb_model.fit(X_train, y_train)
gboost_model.fit(X_train, y_train)
adaboost_model.fit(X_train, y_train)

# Generate predictions from the base models
xgb_pred_train = xgb_model.predict(X_train)
rf_pred_train = rf_model.predict(X_train)
lgb_pred_train = lgb_model.predict(X_train)
gboost_pred_train = gboost_model.predict(X_train)
adaboost_pred_train = adaboost_model.predict(X_train)

xgb_pred_test = xgb_model.predict(X_test)
rf_pred_test = rf_model.predict(X_test)
lgb_pred_test = lgb_model.predict(X_test)
gboost_pred_test = gboost_model.predict(X_test)
adaboost_pred_test = adaboost_model.predict(X_test)

# Create a new dataset from the predictions of the base models
train_stack = np.column_stack((xgb_pred_train, rf_pred_train, lgb_pred_train, gboost_pred_train, adaboost_pred_train))
test_stack = np.column_stack((xgb_pred_test, rf_pred_test, lgb_pred_test, gboost_pred_test, adaboost_pred_test))

# Meta-model (Stacking model)
meta_model = Ridge()

# Train the meta-model on base model predictions
meta_model.fit(train_stack, y_train)

# Predict using the meta-model
final_predictions = meta_model.predict(test_stack)

# Evaluate the model
mse = mean_squared_error(y_test, final_predictions)
print(f"Root Mean Squared Error of Stacked Model: {np.sqrt(mse)}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005137 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3095
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 68
[LightGBM] [Info] Start training from score 12.030658
Root Mean Squared Error of Stacked Model: 0.13697688923495255


In [14]:
# Generate predictions from the base models for the test data (without the 'Id' column)
test_data = test.drop(columns=['Id'], axis=1)
xgb_pred_test = xgb_model.predict(test_data)
rf_pred_test = rf_model.predict(test_data)
lgb_pred_test = lgb_model.predict(test_data)
gboost_pred_test = gboost_model.predict(test_data)
adaboost_pred_test = adaboost_model.predict(test_data)

# Stack the predictions from the base models
test_stack = np.column_stack((xgb_pred_test, rf_pred_test, lgb_pred_test, gboost_pred_test, adaboost_pred_test))

# Use the meta-model to predict 'SalePrice' for the test data
stacked_predictions = meta_model.predict(test_stack)

# Reverse the log transformation (if applied during training)
final_predictions = np.exp(stacked_predictions)

# Prepare the submission DataFrame
submission = pd.DataFrame()
submission['Id'] = test['Id']  # Make sure the 'Id' column exists in the test dataset
submission['SalePrice'] = final_predictions

# Save the submission file
submission.to_csv('submission.csv', index=False)

print("Submission file created successfully!")

Submission file created successfully!
