Import libraries


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


Load

In [3]:
train_df = pd.read_csv('train (1).csv')
test_df = pd.read_csv('test.csv')


Explore

In [4]:
train_df.info()
train_df.describe()
train_df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


Check missing values

In [5]:
missing_values = train_df.isnull().sum().sort_values(ascending=False)
missing_values[missing_values > 0]


PoolQC          1453
MiscFeature     1406
Alley           1369
Fence           1179
MasVnrType       872
FireplaceQu      690
LotFrontage      259
GarageYrBlt       81
GarageCond        81
GarageType        81
GarageFinish      81
GarageQual        81
BsmtFinType2      38
BsmtExposure      38
BsmtQual          37
BsmtCond          37
BsmtFinType1      37
MasVnrArea         8
Electrical         1
dtype: int64

Handle missing data

In [6]:
# Categorical 
train_df['MasVnrType'].fillna(train_df['MasVnrType'].mode()[0], inplace=True)

# Numerical
train_df['LotFrontage'].fillna(train_df['LotFrontage'].median(), inplace=True)

# Drop columns with too many missing values
train_df.drop(['Alley', 'PoolQC', 'Fence', 'MiscFeature'], axis=1, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['MasVnrType'].fillna(train_df['MasVnrType'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['LotFrontage'].fillna(train_df['LotFrontage'].median(), inplace=True)


Feature Engineering

Combine train and test

In [7]:
train_df['TrainFlag'] = 1
test_df['TrainFlag'] = 0
test_df['SalePrice'] = np.nan  # Add target column for merging
full_df = pd.concat([train_df, test_df])


Label Encoding/One-hot Encoding

In [8]:
# Label Encoding for ordinal features
from sklearn.preprocessing import LabelEncoder
ordinal_cols = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC', 'KitchenQual']
for col in ordinal_cols:
    le = LabelEncoder()
    full_df[col] = le.fit_transform(full_df[col].astype(str))

# One-hot Encoding for nominal categorical variables
full_df = pd.get_dummies(full_df, drop_first=True)


Create new features

In [9]:
# Total square footage
full_df['TotalSF'] = full_df['TotalBsmtSF'] + full_df['1stFlrSF'] + full_df['2ndFlrSF']

# Age of house
full_df['HouseAge'] = full_df['YrSold'] - full_df['YearBuilt']

# Time since remodel
full_df['RemodAge'] = full_df['YrSold'] - full_df['YearRemodAdd']

# Total Bathrooms
full_df['TotalBath'] = full_df['FullBath'] + (0.5 * full_df['HalfBath']) + \
                       full_df['BsmtFullBath'] + (0.5 * full_df['BsmtHalfBath'])


Drop unnecessary features

In [10]:
full_df.drop(['TrainFlag', 'Id'], axis=1, inplace=True)


Split back into train and test

In [11]:
train_df = full_df[full_df['SalePrice'].notnull()]
test_df = full_df[full_df['SalePrice'].isnull()].drop('SalePrice', axis=1)
