In [22]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [23]:
# Read the training and test splits (in that order) from the working directory.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [24]:
# Plain-text preview of the first five rows, followed by the (rows, columns) shape.
head_rows = train_df.head()
print(head_rows)
train_df.shape

   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape   
0   1          60       RL         65.0     8450   Pave   NaN      Reg  \
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold   
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2  \
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12   

  YrSold  SaleType  SaleCondition  SalePrice  
0   2008        WD   

(1460, 81)

In [25]:
# Print every column that contains NaN values, together with its NaN count.
# The counts are computed once for the whole frame (vectorized) instead of
# being recomputed twice per column inside the loop.
nan_counts = train_df.isna().sum()
for col, n_missing in nan_counts[nan_counts > 0].items():
    print(col, n_missing)

LotFrontage 259
Alley 1369
MasVnrType 872
MasVnrArea 8
BsmtQual 37
BsmtCond 37
BsmtExposure 38
BsmtFinType1 37
BsmtFinType2 38
Electrical 1
FireplaceQu 690
GarageType 81
GarageYrBlt 81
GarageFinish 81
GarageQual 81
GarageCond 81
PoolQC 1453
Fence 1179
MiscFeature 1406


In [26]:
# Plain-text preview of the first five rows of the training frame.
print(train_df.head(n=5))

   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape   
0   1          60       RL         65.0     8450   Pave   NaN      Reg  \
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold   
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2  \
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12   

  YrSold  SaleType  SaleCondition  SalePrice  
0   2008        WD   

In [27]:
# # Drop columns with more than 700 NaN values (~48% of the 1460 rows; an exact 50% cutoff would be 730)
# for col in train_df.columns:
#     if train_df[col].isna().sum() > 700:
#         train_df = train_df.drop([col], axis=1)

# print(train_df.head())

In [28]:
# # Print out columns with NaN
# for col in train_df.columns:
#     if train_df[col].isna().sum() > 0:
#         print(col, train_df[col].isna().sum())

In [29]:
# Clean the training frame:
#   1. fill gaps in selected numeric columns with the column mean,
#   2. drop ROWS that lack basement / electrical information,
#   3. encode quality-like categorical columns as numbers (missing -> 0).
# The cell is safe to re-run: every step is a no-op on an already-cleaned frame.


def encode_ordinal(column, mapping, missing_value=0):
    """Encode a categorical Series as numeric codes.

    Parameters
    ----------
    column : pd.Series
        Raw categorical values; may contain NaN, None or the literal 'NA'.
    mapping : dict
        Category label -> numeric code.
    missing_value : int, default 0
        Code assigned to every value without an entry in ``mapping``
        (NaN, None, 'NA', or any unexpected label).

    Returns
    -------
    pd.Series
        The encoded column. A column that is already numeric is not mapped
        again (mapping numeric codes through a label dict would turn them all
        into NaN), which keeps the cell idempotent.
    """
    if pd.api.types.is_numeric_dtype(column):
        return column.fillna(missing_value)
    # Labels absent from ``mapping`` become NaN in ``map`` and are then
    # replaced by ``missing_value``.
    return column.map(mapping).fillna(missing_value)


# Numeric columns whose NaN values are replaced by the column mean.
mean_imputed_columns = ['LotFrontage', 'MasVnrArea', 'GarageYrBlt']

# Rows with a NaN in any of these columns are removed (rows, not columns).
required_columns = ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType2', 'Electrical']

# Label -> code tables. A missing value is encoded as 0 by encode_ordinal,
# so the tables only list the real categories.
fireplacequ_map = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1}
garagetype_map = {'2Types': 6, 'Attchd': 5, 'Basment': 4, 'BuiltIn': 3, 'CarPort': 2, 'Detchd': 1}
garagefinish_map = {'Fin': 3, 'RFn': 2, 'Unf': 1}
garagequal_map = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1}
garagecond_map = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1}
alley_map = {'Grvl': 2, 'Pave': 1}
MasVnrType_map = {'BrkCmn': 4, 'BrkFace': 3, 'CBlock': 2, 'Stone': 1}
poolqc_map = {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1}
fence_map = {'GdPrv': 4, 'MnPrv': 3, 'GdWo': 2, 'MnWw': 1}
miscFeature_map = {'Elev': 5, 'Gar2': 4, 'Othr': 3, 'Shed': 2, 'TenC': 1}

ordinal_maps = {
    'FireplaceQu': fireplacequ_map,
    'GarageType': garagetype_map,
    'GarageFinish': garagefinish_map,
    'GarageQual': garagequal_map,
    'GarageCond': garagecond_map,
    'Alley': alley_map,
    'MasVnrType': MasVnrType_map,
    'PoolQC': poolqc_map,
    'Fence': fence_map,
    'MiscFeature': miscFeature_map,
}

# 1. Mean imputation (means are taken before any rows are dropped).
train_df = train_df.assign(
    **{col: train_df[col].fillna(train_df[col].mean()) for col in mean_imputed_columns}
)

# 2. Drop rows with missing basement / electrical values.
train_df = train_df.dropna(subset=required_columns)

# 3. Ordinal encoding. A missing FireplaceQu is now 0 like every other column;
#    it was previously filled with 6, ranking "no fireplace" above 'Ex' (5).
train_df = train_df.assign(
    **{col: encode_ordinal(train_df[col], mapping) for col, mapping in ordinal_maps.items()}
)

In [30]:
# Rich preview of the first five rows of the cleaned training frame.
train_df.head(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,0.0,Reg,Lvl,AllPub,...,0,0.0,0.0,0.0,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,0.0,Reg,Lvl,AllPub,...,0,0.0,0.0,0.0,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,0.0,IR1,Lvl,AllPub,...,0,0.0,0.0,0.0,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,0.0,IR1,Lvl,AllPub,...,0,0.0,0.0,0.0,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,0.0,IR1,Lvl,AllPub,...,0,0.0,0.0,0.0,0,12,2008,WD,Normal,250000


In [31]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Build the model inputs: scale numeric columns, one-hot encode string columns,
# and return the result as a labelled DataFrame `X` aligned with target `y`.

# String (object-dtype) columns are one-hot encoded.
category_features = [col for col in train_df.columns if train_df[col].dtype == 'object']
print(category_features)

# Every other column is standardized, except the row identifier and the target.
numeric_features = [
    col for col in train_df.columns
    if train_df[col].dtype != 'object' and col not in ('SalePrice', 'Id')
]

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        # handle_unknown='ignore': a category not seen during fit is encoded as
        # all zeros instead of raising at transform time.
        # NOTE(review): presumably test_df is later passed through this fitted
        # preprocessor - confirm.
        ('cat', OneHotEncoder(handle_unknown='ignore'), category_features),
    ],
    # Always return a dense array. With the default threshold the output turns
    # sparse when the overall density is low, and pd.DataFrame(...) below
    # expects a dense 2-D array.
    sparse_threshold=0,
)

# Split into features and target.
y = train_df['SalePrice']
features_raw = train_df.drop(columns=['SalePrice'])

# Fit the preprocessor and transform the features.
features_encoded = preprocessor.fit_transform(features_raw)

# Get the column names for the one-hot encoded features.
onehot_columns = preprocessor.named_transformers_['cat'].get_feature_names_out(input_features=category_features)

# Combine the numeric and one-hot encoded feature names (same order as the
# transformers above: 'num' first, then 'cat').
all_feature_names = numeric_features + list(onehot_columns)

# Convert the transformed matrix back into a DataFrame with the feature names,
# since pandas provides functions for exploratory data analysis. The original
# row index is kept so that X stays aligned with y (rows were dropped earlier,
# so train_df's index has gaps and a fresh RangeIndex would not match).
X = pd.DataFrame(features_encoded, columns=all_feature_names, index=train_df.index)

print(X.head())

['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'PavedDrive', 'SaleType', 'SaleCondition']
   MSSubClass  LotFrontage   LotArea     Alley  OverallQual  OverallCond   
0    0.075750    -0.230858 -0.207565 -0.246794     0.629142    -0.533001  \
1   -0.870914     0.442867 -0.093716 -0.246794    -0.106757     2.170075   
2    0.075750    -0.096113  0.069632 -0.246794     0.629142    -0.533001   
3    0.312416    -0.455433 -0.098666 -0.246794     0.629142    -0.533001   
4    0.075750     0.622527  0.367618 -0.246794     1.365041    -0.533001   

   YearBuilt  YearRemodAdd  MasVnrType  MasVnrArea  ...  SaleType_ConLw   
0   1.034570      

In [10]:
# from sklearn.preprocessing import LabelEncoder

# # Convert categorical data to numerical data
# for col in train_df.columns:
#     if train_df[col].dtype == 'object':
#         le = LabelEncoder()
#         train_df[col] = le.fit_transform(train_df[col])
# print(train_df.head())



   Id  MSSubClass  MSZoning  LotFrontage  LotArea  Street  Alley  LotShape   
0   1          60         3         65.0     8450       1      2         3  \
1   2          20         3         80.0     9600       1      2         3   
2   3          60         3         68.0    11250       1      2         0   
3   4          70         3         60.0     9550       1      2         0   
4   5          60         3         84.0    14260       1      2         0   

   LandContour  Utilities  ...  PoolArea  PoolQC  Fence  MiscFeature  MiscVal   
0            3          0  ...         0       3      4            4        0  \
1            3          0  ...         0       3      4            4        0   
2            3          0  ...         0       3      4            4        0   
3            3          0  ...         0       3      4            4        0   
4            3          0  ...         0       3      4            4        0   

   MoSold  YrSold  SaleType  SaleCondition  