In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import skew
from scipy.special import boxcox1p
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor

In [3]:
# Read the training and test tables from CSV files in the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [5]:
# Read the test file a second time into its own frame so the "Id" column
# stays available for the submission after "Id" is dropped from `test`.
test_raw = pd.read_csv('test.csv')
test_ID = test_raw.loc[:, 'Id']

In [7]:
# "Id" is only a row identifier, so remove it from both predictor tables.
train = train.drop(columns="Id")
test = test.drop(columns="Id")

# Split the target off the training table; `pop` returns the column and
# removes it from `train` in one step.
y = train.pop('SalePrice')

# Stack train on top of test so every preprocessing step is applied to both.
# ignore_index=True gives the combined frame a fresh 0..N-1 index; without it
# the row labels of `train` and `test` would both start at 0 and be duplicated,
# which makes any later label-aligned operation ambiguous or an error.
# The first len(y) rows are the training rows.
data = pd.concat([train, test], axis=0, sort=False, ignore_index=True)

In [9]:
# Categorical columns whose missing entries are replaced by the literal
# string "None".
none_fill_cols = [
    'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'MasVnrType',
]
for col in none_fill_cols:
    filled = data[col].fillna("None")
    data[col] = filled

In [11]:
# Numeric columns whose missing entries are replaced by 0.
zero_fill_cols = [
    'GarageYrBlt', 'GarageArea', 'GarageCars',
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
    'BsmtFullBath', 'BsmtHalfBath',
    'MasVnrArea',
]
for col in zero_fill_cols:
    filled = data[col].fillna(0)
    data[col] = filled

In [13]:
def _fill_with_median(values):
    """Return `values` with its missing entries replaced by its own median."""
    return values.fillna(values.median())


# Impute LotFrontage per neighbourhood: each missing value takes the median
# LotFrontage of the rows that share its "Neighborhood".
lot_frontage_by_neighborhood = data.groupby("Neighborhood")["LotFrontage"]
data['LotFrontage'] = lot_frontage_by_neighborhood.transform(_fill_with_median)

In [15]:
# Columns whose missing entries are replaced by the column's most frequent
# value (the first mode when there are ties).
mode_fill_cols = (
    'MSZoning', 'Electrical', 'KitchenQual',
    'Exterior1st', 'Exterior2nd', 'SaleType', 'Functional',
)
for col in mode_fill_cols:
    most_frequent = data[col].mode().iloc[0]
    data[col] = data[col].fillna(most_frequent)

In [19]:
# Remove the "Utilities" column from the combined frame (in place).
data.drop(columns=['Utilities'], inplace=True)

In [21]:
# Aggregate size features: plain sums of existing area / count columns.
data['TotalSF'] = data['TotalBsmtSF'] + data['1stFlrSF'] + data['2ndFlrSF']
data['TotalBath'] = (
    data['FullBath']
    + 0.5 * data['HalfBath']
    + data['BsmtFullBath']
    + 0.5 * data['BsmtHalfBath']
)
data['TotalPorchSF'] = (
    data['OpenPorchSF'] + data['EnclosedPorch'] + data['3SsnPorch'] + data['ScreenPorch']
)

# Presence flags: 1 when the source column is strictly positive, else 0.
presence_flags = {
    'HasPool': 'PoolArea',
    'HasGarage': 'GarageArea',
    'HasBsmt': 'TotalBsmtSF',
    'HasFireplace': 'Fireplaces',
}
for flag_name, source_col in presence_flags.items():
    data[flag_name] = (data[source_col] > 0).astype('int64')

In [23]:
# Columns that are converted from string labels to integer codes.
ordinal_cols = [
    'FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond',
    'ExterQual', 'ExterCond', 'HeatingQC', 'PoolQC', 'KitchenQual',
    'BsmtFinType1', 'BsmtFinType2', 'Functional', 'Fence',
    'BsmtExposure', 'GarageFinish', 'LandSlope', 'LotShape', 'PavedDrive',
    'Street', 'Alley', 'CentralAir',
]

# The encoder is re-fitted on every column, so each column gets its own
# independent code table.
# NOTE(review): LabelEncoder presumably assigns codes by sorted label text,
# not by quality ranking - confirm that is acceptable for these columns.
lbl = LabelEncoder()
for column_name in ordinal_cols:
    labels_as_text = data[column_name].astype(str)
    data[column_name] = lbl.fit_transform(labels_as_text)

In [25]:
# One-hot encode the combined frame with pandas' default settings, replacing
# `data` with the expanded result.
data = pd.get_dummies(data)

In [27]:
# Every non-object column of the combined frame. This cell runs after
# pd.get_dummies, so the selection also contains the one-hot indicator
# columns; it is kept as-is because the scaling cell below reads it.
numeric_feats = data.dtypes[data.dtypes != "object"].index

# Skew correction is only meaningful for real numeric features. One-hot
# indicators (bool in pandas >= 2.0, uint8 in earlier versions) are excluded
# so that rare-category dummies are not skew-tested and Box-Cox transformed.
skew_candidates = data[numeric_feats].select_dtypes(exclude=["bool", "uint8"]).columns

SKEW_THRESHOLD = 0.75   # |skewness| above which a feature is transformed
BOXCOX_LAMBDA = 0.15    # lambda passed to boxcox1p

# Sample skewness of each candidate column, ignoring missing values,
# sorted from most positive to most negative.
skewed_feats = (
    data[skew_candidates]
    .apply(lambda x: skew(x.dropna()))
    .sort_values(ascending=False)
)
skewness = skewed_feats[abs(skewed_feats) > SKEW_THRESHOLD]

# Apply the Box-Cox transform of (1 + x) to every strongly skewed feature.
for feat in skewness.index:
    data[feat] = boxcox1p(data[feat], BOXCOX_LAMBDA)

In [29]:
# Standardise the columns listed in `numeric_feats` (defined in an earlier
# cell). The scaler is fitted on the combined train + test frame.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(data[numeric_feats])
data[numeric_feats] = scaled_values

In [31]:
# Undo the earlier concatenation: the first len(y) rows are the training
# rows, everything after them belongs to the test set.
n_train_rows = len(y)
X_train = data.iloc[:n_train_rows]
X_test = data.iloc[n_train_rows:]

In [33]:
# Gradient-boosted regressor trained on the full training set.
xgb_params = {"n_estimators": 1000, "learning_rate": 0.05}
model = XGBRegressor(**xgb_params)
model.fit(X_train, y)

In [35]:
# 5-fold cross-validation. scikit-learn reports the negated MSE per fold, so
# flip the sign before taking the square root to get one RMSE per fold.
fold_mse = -cross_val_score(model, X_train, y, scoring="neg_mean_squared_error", cv=5)
rmse = np.sqrt(fold_mse)
print("Cross-validated RMSE:", rmse.mean())

Cross-validated RMSE: 27933.561994063406


In [36]:
# Predict the target for the test rows with the model fitted on the full
# training set in the earlier cell.
preds = model.predict(X_test)

In [37]:
# Pair each saved test Id with its predicted sale price.
submission_columns = {"Id": test_ID, "SalePrice": preds}
submission = pd.DataFrame(submission_columns)

In [41]:
# Write the submission table without the row index, then confirm on stdout.
submission.to_csv(path_or_buf="submission.csv", index=False)
print("✅ submission.csv created successfully!")

✅ submission.csv created successfully!
