In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv


In [2]:
# Load the datasets
train = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')

# Check the shape of train and test datasets
print("Original train dataset shape:", train.shape)
print("Original test dataset shape:", test.shape)


Original train dataset shape: (1460, 81)
Original test dataset shape: (1459, 80)


In [3]:
# Load the train and test datasets
train = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')

# Display the first few rows of each dataset to confirm successful loading
print("Train Dataset Head:")
print(train.head())
print("\nTest Dataset Head:")
print(test.head())


Train Dataset Head:
   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12   

  YrSold  SaleType  SaleCondition  SalePrice  
0

In [4]:
# Add a marker to differentiate between training and test datasets
train['Dataset'] = 'train'
test['Dataset'] = 'test'

# Save the target variable and remove it from the training set
train_labels = train['SalePrice']
train.drop(columns=['SalePrice'], inplace=True)

# Combine the datasets
combined = pd.concat([train, test], axis=0, ignore_index=True)

print("Combined dataset shape:", combined.shape)
print("Combined dataset head:")
print(combined.head())


Combined dataset shape: (2919, 81)
Combined dataset head:
   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12   

  YrSold  

# dropping top 10 missing features

In [5]:
# List of features to remove due to high missing values
features_to_remove = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'MasVnrType', 
                      'FireplaceQu', 'LotFrontage', 'GarageYrBlt', 'GarageFinish', 'GarageQual']

# Drop these features from the combined dataset
combined = combined.drop(columns=features_to_remove)

# Check the shape of the dataset after dropping the features
print("Shape of combined dataset after dropping features:", combined.shape)

# Check if there are any remaining missing values
print("Remaining missing values in combined dataset:")
print(combined.isnull().sum().sort_values(ascending=False).head(10))


Shape of combined dataset after dropping features: (2919, 71)
Remaining missing values in combined dataset:
GarageCond      159
GarageType      157
BsmtExposure     82
BsmtCond         82
BsmtQual         81
BsmtFinType2     80
BsmtFinType1     79
MasVnrArea       23
MSZoning          4
BsmtFullBath      2
dtype: int64


# imputing remaining missing features

In [6]:
from sklearn.impute import SimpleImputer

# Separate the features into numerical and categorical
numeric_features = combined.select_dtypes(include=[np.number]).columns
categorical_features = combined.select_dtypes(include=[object]).columns

# Impute missing values in numerical features with the median
numeric_imputer = SimpleImputer(strategy='median')
combined[numeric_features] = numeric_imputer.fit_transform(combined[numeric_features])

# Impute missing values in categorical features with the most frequent value
categorical_imputer = SimpleImputer(strategy='most_frequent')
combined[categorical_features] = categorical_imputer.fit_transform(combined[categorical_features])

# Check if there are any remaining missing values
print("Remaining missing values in combined dataset after imputation:")
print(combined.isnull().sum().sort_values(ascending=False).head(10))


Remaining missing values in combined dataset after imputation:
Id              0
BsmtHalfBath    0
TotRmsAbvGrd    0
KitchenQual     0
KitchenAbvGr    0
BedroomAbvGr    0
HalfBath        0
FullBath        0
BsmtFullBath    0
Fireplaces      0
dtype: int64


# adding house age and time since last renovation

In [7]:
# Create the 'HouseAge' feature
combined['HouseAge'] = combined['YrSold'] - combined['YearBuilt']

# Create the 'LastRenovated' feature
combined['LastRenovated'] = combined['YrSold'] - combined['YearRemodAdd']

# A quick look at the new features
print("New features 'HouseAge' and 'LastRenovated' added:")
print(combined[['HouseAge', 'LastRenovated']].head())


New features 'HouseAge' and 'LastRenovated' added:
   HouseAge  LastRenovated
0       5.0            5.0
1      31.0           31.0
2       7.0            6.0
3      91.0           36.0
4       8.0            8.0


In [8]:
# Convert 'HouseAge' and 'LastRenovated' to integers
combined['HouseAge'] = combined['HouseAge'].astype(int)
combined['LastRenovated'] = combined['LastRenovated'].astype(int)

# Verify the conversion
print(combined[['HouseAge', 'LastRenovated']].dtypes)


HouseAge         int64
LastRenovated    int64
dtype: object


In [9]:
# Identify categorical features
categorical_features = combined.select_dtypes(include=[object]).columns.tolist()

# Display the categorical features
print("Categorical features to be encoded:")
print(categorical_features)

# Display unique values in categorical features to determine the appropriate encoding method
for feature in categorical_features:
    print(f"\n{feature}: {combined[feature].unique()}")


Categorical features to be encoded:
['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'GarageType', 'GarageCond', 'PavedDrive', 'SaleType', 'SaleCondition', 'Dataset']

MSZoning: ['RL' 'RM' 'C (all)' 'FV' 'RH']

Street: ['Pave' 'Grvl']

LotShape: ['Reg' 'IR1' 'IR2' 'IR3']

LandContour: ['Lvl' 'Bnk' 'Low' 'HLS']

Utilities: ['AllPub' 'NoSeWa']

LotConfig: ['Inside' 'FR2' 'Corner' 'CulDSac' 'FR3']

LandSlope: ['Gtl' 'Mod' 'Sev']

Neighborhood: ['CollgCr' 'Veenker' 'Crawfor' 'NoRidge' 'Mitchel' 'Somerst' 'NWAmes'
 'OldTown' 'BrkSide' 'Sawyer' 'NridgHt' 'NAmes' 'SawyerW' 'IDOTRR'
 'MeadowV' 'Edwards' 'Timber' 'Gilbert' 'StoneBr' 'ClearCr' 

# converting data

In [10]:
from sklearn.preprocessing import LabelEncoder

# Define a dictionary for manual label encoding
ordinal_mapping = {
    'ExterQual': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'ExterCond': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'BsmtQual': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'BsmtCond': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'HeatingQC': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'KitchenQual': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'GarageCond': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'PavedDrive': {'N': 0, 'P': 1, 'Y': 2},
}

# Apply label encoding based on the ordinal mapping
for feature, mapping in ordinal_mapping.items():
    combined[feature] = combined[feature].map(mapping)

print("Ordinal features label encoded.")


Ordinal features label encoded.


In [11]:
# List of features to one-hot encode (excluding the already label-encoded ones)
one_hot_features = list(set(categorical_features) - set(ordinal_mapping.keys()) - {'Dataset'})

# Apply one-hot encoding
combined = pd.get_dummies(combined, columns=one_hot_features, drop_first=True)

print("Nominal features one-hot encoded.")
print("Shape of combined dataset after one-hot encoding:", combined.shape)


Nominal features one-hot encoded.
Shape of combined dataset after one-hot encoding: (2919, 207)


# splitting data back to train and test sets

In [12]:
# Split the data back into the original training and test sets
processed_train = combined[combined['Dataset'] == 'train'].drop(columns=['Dataset'])
processed_test = combined[combined['Dataset'] == 'test'].drop(columns=['Dataset'])

# Reattach the target variable to the training set
processed_train['SalePrice'] = train_labels.values

print("Final processed train shape:", processed_train.shape)
print("Final processed test shape:", processed_test.shape)


Final processed train shape: (1460, 207)
Final processed test shape: (1459, 206)


In [13]:
# Find columns that are in train but not in test
train_not_in_test = set(processed_train.columns) - set(processed_test.columns)

# Find columns that are in test but not in train
test_not_in_train = set(processed_test.columns) - set(processed_train.columns)

# Print the results
print("Columns in train but not in test:", train_not_in_test)
print("Columns in test but not in train:", test_not_in_train)


Columns in train but not in test: {'SalePrice'}
Columns in test but not in train: set()


# using xgboost model

In [14]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


In [15]:
# Prepare features and target
X = processed_train.drop(columns=['SalePrice'])
y = processed_train['SalePrice']

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert data into DMatrix format (optimized for XGBoost)
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)
dtest = xgb.DMatrix(processed_test)  # Test set


In [16]:
# Define the parameters for XGBoost
params = {
    'objective': 'reg:squarederror',  # Regression task
    'max_depth': 6,                   # Maximum depth of a tree
    'learning_rate': 0.1,             # Step size shrinkage
    'n_estimators': 1000,             # Number of boosting rounds
    'subsample': 0.8,                 # Subsample ratio of the training instance
    'colsample_bytree': 0.8,          # Subsample ratio of columns when constructing each tree
    'eval_metric': 'rmse'             # Evaluation metric for validation
}

# Set up the evaluation set
evals = [(dtrain, 'train'), (dval, 'eval')]


In [17]:
# Train the model
model = xgb.train(params, dtrain, num_boost_round=1000, evals=evals, early_stopping_rounds=50, verbose_eval=10)


[0]	train-rmse:71229.80461	eval-rmse:81384.44930
[10]	train-rmse:34026.33375	eval-rmse:44332.07387
[20]	train-rmse:19295.60330	eval-rmse:31515.10967


Parameters: { "n_estimators" } are not used.



[30]	train-rmse:13257.95344	eval-rmse:26835.38257
[40]	train-rmse:10709.30400	eval-rmse:25059.28871
[50]	train-rmse:9424.32948	eval-rmse:24412.99846
[60]	train-rmse:8565.68397	eval-rmse:24271.28220
[70]	train-rmse:7781.27204	eval-rmse:24008.09410
[80]	train-rmse:7092.21255	eval-rmse:23884.49740
[90]	train-rmse:6517.23968	eval-rmse:23749.09939
[100]	train-rmse:5912.36094	eval-rmse:23713.95521
[110]	train-rmse:5466.03883	eval-rmse:23642.43630
[120]	train-rmse:4992.17855	eval-rmse:23614.15993
[130]	train-rmse:4511.95218	eval-rmse:23641.63125
[140]	train-rmse:4170.84970	eval-rmse:23655.93376
[150]	train-rmse:3884.97989	eval-rmse:23651.41922
[160]	train-rmse:3539.56808	eval-rmse:23619.96133
[170]	train-rmse:3186.66039	eval-rmse:23610.42371
[180]	train-rmse:2933.50012	eval-rmse:23615.19748
[190]	train-rmse:2657.79306	eval-rmse:23603.78163
[200]	train-rmse:2446.47858	eval-rmse:23589.19577
[210]	train-rmse:2253.37003	eval-rmse:23568.22952
[220]	train-rmse:2064.61173	eval-rmse:23583.22155
[230]

model is starting overfitting. checking crossvalidation

In [18]:
from sklearn.model_selection import KFold
import xgboost as xgb

# Prepare features and target
X = processed_train.drop(columns=['SalePrice'])
y = processed_train['SalePrice']

# Convert data into DMatrix format (optimized for XGBoost)
dtrain = xgb.DMatrix(X, label=y)

# Define the parameters for XGBoost
params = {
    'objective': 'reg:squarederror',  # Regression task
    'max_depth': 6,                   # Maximum depth of a tree
    'learning_rate': 0.1,             # Step size shrinkage
    'subsample': 0.8,                 # Subsample ratio of the training instance
    'colsample_bytree': 0.8,          # Subsample ratio of columns when constructing each tree
    'eval_metric': 'rmse'             # Evaluation metric for validation
}

# Set up cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Perform cross-validation
cv_results = xgb.cv(
    params=params,
    dtrain=dtrain,
    num_boost_round=1000,
    folds=kf,
    early_stopping_rounds=50,
    metrics="rmse",
    as_pandas=True,
    seed=42,
    verbose_eval=10
)

# Print the results
print(cv_results)
print(f"Best RMSE: {cv_results['test-rmse-mean'].min()}")


[0]	train-rmse:73086.10501+1335.30267	test-rmse:73677.28542+5373.13654
[10]	train-rmse:34081.45858+567.25903	test-rmse:40634.46092+3355.19213
[20]	train-rmse:18959.61557+326.26080	test-rmse:31769.26127+4345.95418
[30]	train-rmse:12871.68930+246.79306	test-rmse:29597.97216+5980.71841
[40]	train-rmse:10197.73783+240.93550	test-rmse:28745.01497+6858.54906
[50]	train-rmse:8852.60161+250.90801	test-rmse:28539.23942+7346.90084
[60]	train-rmse:7961.59175+198.28700	test-rmse:28437.64935+7540.60635
[70]	train-rmse:7332.73359+179.73021	test-rmse:28320.49268+7610.20831
[80]	train-rmse:6614.41026+182.55521	test-rmse:28278.29636+7647.64142
[90]	train-rmse:6043.98991+185.89772	test-rmse:28260.27707+7666.45442
[100]	train-rmse:5487.64262+152.68442	test-rmse:28237.47630+7693.45940
[110]	train-rmse:4980.76940+112.44662	test-rmse:28256.34278+7697.27900
[120]	train-rmse:4529.92484+115.34464	test-rmse:28252.73039+7700.64654
[130]	train-rmse:4134.93779+91.92738	test-rmse:28231.94192+7684.94097
[140]	train-

# hyperparameter tuning

In [19]:
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score

# Initialize the XGBoost model with reasonable defaults
model = XGBRegressor(
    objective='reg:squarederror',
    max_depth=6,
    learning_rate=0.1,
    n_estimators=1000,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

# Use cross-validation with early stopping
cv_scores = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=3)

# Calculate RMSE from negative MSE
rmse_scores = (-cv_scores)**0.5

print("Cross-validated RMSE:", rmse_scores.mean())


Cross-validated RMSE: 27246.94588480625


In [20]:
# Calculate the mean sale price from the training data
mean_sale_price = y.mean()
print("Mean Sale Price:", mean_sale_price)


Mean Sale Price: 180921.19589041095


In [21]:
# Calculate RMSE as a percentage of the mean sale price
rmse_percentage = (27554.317281169406 / mean_sale_price) * 100
print(f"RMSE as a percentage of the mean sale price: {rmse_percentage:.2f}%")


RMSE as a percentage of the mean sale price: 15.23%


In [22]:
# Initialize the XGBoost model with the chosen parameters
final_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    max_depth=6,
    learning_rate=0.1,
    n_estimators=1000,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

# Train the model on the entire training set
final_model.fit(X, y)


In [23]:
# Predict on the test set
final_predictions = final_model.predict(processed_test)


In [24]:
# Prepare the submission DataFrame
submission = pd.DataFrame({
    'Id': test['Id'],  # Assuming 'Id' is in your test set
    'SalePrice': final_predictions
})

# Save the submission file
submission.to_csv('submission.csv', index=False)
print("Submission file created!")


Submission file created!
