# Import libraries and data upload

In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
from scipy.stats import skew, kurtosis
from sklearn.preprocessing import LabelEncoder

In [42]:
test = pd.read_csv(r'C:\Users\USER\OneDrive\Documents\Property_prices_predictions\test.csv')
test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


# Data cleaning

In [43]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1459 non-null   int64  
 1   MSSubClass     1459 non-null   int64  
 2   MSZoning       1455 non-null   object 
 3   LotFrontage    1232 non-null   float64
 4   LotArea        1459 non-null   int64  
 5   Street         1459 non-null   object 
 6   Alley          107 non-null    object 
 7   LotShape       1459 non-null   object 
 8   LandContour    1459 non-null   object 
 9   Utilities      1457 non-null   object 
 10  LotConfig      1459 non-null   object 
 11  LandSlope      1459 non-null   object 
 12  Neighborhood   1459 non-null   object 
 13  Condition1     1459 non-null   object 
 14  Condition2     1459 non-null   object 
 15  BldgType       1459 non-null   object 
 16  HouseStyle     1459 non-null   object 
 17  OverallQual    1459 non-null   int64  
 18  OverallC

In [44]:
test.isnull().sum()

Id                 0
MSSubClass         0
MSZoning           4
LotFrontage      227
LotArea            0
                ... 
MiscVal            0
MoSold             0
YrSold             0
SaleType           1
SaleCondition      0
Length: 80, dtype: int64

In [45]:
# Drop columns that were dropped in the train dataset for uniformity
test.drop(columns = ['MasVnrType','Fence','Alley','MiscFeature','PoolQC'],inplace = True )

In [46]:
(test.isna().sum().sort_values().tail(10) / len(test)) * 100

BsmtQual         3.015764
BsmtExposure     3.015764
BsmtCond         3.084304
GarageType       5.209047
GarageQual       5.346127
GarageFinish     5.346127
GarageCond       5.346127
GarageYrBlt      5.346127
LotFrontage     15.558602
FireplaceQu     50.034270
dtype: float64

### Missingness in FireplaceQu

In [47]:
fire_mask = []
for i in test.columns:
    if i.startswith("Fire"):
        fire_mask.append(i)

fire_mask = test[fire_mask]

def fill_missing(column , value ):
    if column in test.columns :
        test.fillna({ column : value } , inplace = True)
        print(f"Missing values in column {column} is replaced by {value}")
    else :
        print(f"column {column} does not exist")

fill_missing('FireplaceQu', "Zero")

Missing values in column FireplaceQu is replaced by Zero


### Missingness in GarageFinish, GarageQual, GarageCond, GarageYrBlt, GarageType

In [48]:
garage_mask = []
for i in test.columns :
    if i.startswith("Garage"):
        garage_mask.append(i)
garage_mask

garage_mask= test[garage_mask]

garage_mask[garage_mask["GarageType"].isna()][['GarageCars','GarageArea']].sum()

fill_missing('GarageType', "NF")
fill_missing('GarageFinish', "NF")
fill_missing('GarageQual', "NF")
fill_missing('GarageCond', "NF")
fill_missing('GarageYrBlt', 0)
fill_missing('GarageCars', 0)
fill_missing('GarageArea', 0)

Missing values in column GarageType is replaced by NF
Missing values in column GarageFinish is replaced by NF
Missing values in column GarageQual is replaced by NF
Missing values in column GarageCond is replaced by NF
Missing values in column GarageYrBlt is replaced by 0
Missing values in column GarageCars is replaced by 0
Missing values in column GarageArea is replaced by 0


### Missingness in Bsmt

In [49]:
Bsmnt_mask = []
for i in test.columns :
    if i.startswith("Bsmt"):
        Bsmnt_mask.append(i)
Bsmnt_mask

Bsmnt_mask= test[Bsmnt_mask]
Bsmnt_mask.isna().sum()


BsmtQual        44
BsmtCond        45
BsmtExposure    44
BsmtFinType1    42
BsmtFinSF1       1
BsmtFinType2    42
BsmtFinSF2       1
BsmtUnfSF        1
BsmtFullBath     2
BsmtHalfBath     2
dtype: int64

In [50]:
fill_missing('BsmtQual', "NF")
fill_missing('BsmtCond', "NF")
fill_missing('BsmtExposure', "NF")
fill_missing('BsmtFinType1', "NF")
fill_missing('BsmtFinType2', "NF")

Missing values in column BsmtQual is replaced by NF
Missing values in column BsmtCond is replaced by NF
Missing values in column BsmtExposure is replaced by NF
Missing values in column BsmtFinType1 is replaced by NF
Missing values in column BsmtFinType2 is replaced by NF


In [51]:
(test.isna().sum().sort_values().tail(10) / len(test)) * 100

BsmtFinSF1       0.068540
SaleType         0.068540
KitchenQual      0.068540
Utilities        0.137080
BsmtHalfBath     0.137080
BsmtFullBath     0.137080
Functional       0.137080
MSZoning         0.274160
MasVnrArea       1.028101
LotFrontage     15.558602
dtype: float64

### Missingness in MasVnrArea

In [52]:
test['MasVnrArea'].fillna(test['MasVnrArea'].median(), inplace= True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test['MasVnrArea'].fillna(test['MasVnrArea'].median(), inplace= True)


### Missingness in LotFrontage

In [53]:
LotFrontage_mean = test['LotFrontage'].mean()
fill_missing('LotFrontage', LotFrontage_mean)

Missing values in column LotFrontage is replaced by 68.58035714285714


In [54]:
# Fill missing categorical values
test['Exterior1st'].fillna(test['Exterior1st'].mode()[0], inplace=True)
test['SaleType'].fillna(test['SaleType'].mode()[0], inplace=True)
test['KitchenQual'].fillna(test['KitchenQual'].mode()[0], inplace=True)
test['Functional'].fillna('Typ', inplace=True)
test['MSZoning'].fillna(test['MSZoning'].mode()[0], inplace=True)
test['Utilities'].fillna(test['Utilities'].mode()[0], inplace=True)
test['BsmtFinSF2'] = test['BsmtFinSF2'].fillna(0)
test['BsmtUnfSF'] = test['BsmtUnfSF'].fillna(0)
test['TotalBsmtSF'].fillna(0, inplace=True)
test['BsmtFinSF1'].fillna(0, inplace=True)
test['BsmtHalfBath'].fillna(0, inplace=True)
test['BsmtFullBath'].fillna(0, inplace=True)
test['Exterior2nd'] = test['Exterior2nd'].fillna(test['Exterior2nd'].mode()[0])

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test['Exterior1st'].fillna(test['Exterior1st'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test['SaleType'].fillna(test['SaleType'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the interm

In [55]:
(test.isna().sum().sort_values().tail(10) / len(test)) * 100

OpenPorchSF      0.0
EnclosedPorch    0.0
3SsnPorch        0.0
ScreenPorch      0.0
PoolArea         0.0
MiscVal          0.0
MoSold           0.0
YrSold           0.0
SaleType         0.0
SaleCondition    0.0
dtype: float64

## Duplicates

In [56]:
test.duplicated().sum()

np.int64(0)

In [57]:
# log transformation
log_cols = ['LotArea', 'LotFrontage', 'MasVnrArea', 
            'OpenPorchSF', 'EnclosedPorch', 'BsmtFinSF1', 
            'WoodDeckSF', 'TotalBsmtSF', '1stFlrSF', 'GrLivArea']

for col in log_cols:
    test[col] = np.log1p(test[col])

# Binary conversion for mostly-zero features
zero_heavy = ['MiscVal', 'PoolArea', '3SsnPorch', 'LowQualFinSF',
              'BsmtFinSF2', 'BsmtHalfBath']
for col in zero_heavy:
    test[col] = (test[col] > 0).astype(int)

In [58]:
# encoding
cat_cols = test.select_dtypes(include=['object']).columns
# Ordinal Encoding
qual_mapping = {
    "Ex": 5,
    "Gd": 4,
    "TA": 3,
    "Fa": 2,
    "Po": 1,
    "NF": 0
}

ordinal_cols = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond',
                'HeatingQC', 'KitchenQual',
                'GarageQual', 'GarageCond']

for col in ordinal_cols:
    test[col] = test[col].fillna('NF').map(qual_mapping)

# Label Encoding
test['FireplaceQu'] = test['FireplaceQu'].fillna('Zero')
le = LabelEncoder()
test['FireplaceQu'] = le.fit_transform(test['FireplaceQu'])

# One-Hot Encoding
nominal_cols = [col for col in cat_cols if col not in ordinal_cols + ['FireplaceQu']]
test = pd.get_dummies(test, columns=nominal_cols, drop_first=True)

In [59]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Columns: 201 entries, Id to SaleCondition_Partial
dtypes: bool(155), float64(15), int64(31)
memory usage: 745.3 KB


# Feature engineering

In [60]:
# Total Square Footage
test['TotalSF'] = test['TotalBsmtSF'] + test['1stFlrSF'] + test['2ndFlrSF']

# Total Bathrooms
test['Total_Bathrooms'] = (test['FullBath'] + (0.5 * test['HalfBath']) +
                              test['BsmtFullBath'] + (0.5 * test['BsmtHalfBath']))

# House Age Features
test['RemodAge'] = test['YrSold'] - test['YearRemodAdd']
test['HouseAge'] = test['YrSold'] - test['YearBuilt']

# Quality × Size Interaction
test['OverallQualArea'] = test['OverallQual'] * test['TotalSF']

In [65]:
test.shape

(1459, 206)

In [62]:
test.to_csv("processed_test.csv", index=False)