# Feature Engineering of Housing Prices

This notebook focuses on preparing the dataset by handling missing values, creating new features, encoding categorical variables, scaling numerical features, and splitting the data for modeling.


In [77]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import scipy.stats as stats
from scipy.stats import skew

from sklearn.preprocessing import StandardScaler

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [78]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
submission_data = pd.read_csv('sample_submission.csv')

In [79]:
y_train = train_data['SalePrice'] 
x_train = train_data.drop(columns='SalePrice') 

x_test = test_data 
y_test = submission_data['SalePrice'] 


In [80]:
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((1460, 80), (1460,), (1459, 80), (1459,))

In [81]:
train_test_data = pd.concat([x_train, x_test], axis=0, ignore_index=True)

train_test_y_data = pd.concat([y_train, y_test], axis=0, ignore_index=True)
dataset = pd.concat([train_test_data, train_test_y_data], axis=1)

print("Train-Test Data Shape:", train_test_data.shape)
print("Dataset Shape:", dataset.shape)


Train-Test Data Shape: (2919, 80)
Dataset Shape: (2919, 81)


In [82]:
dataset = dataset.drop(columns='Id')
train_test_data = train_test_data.drop(columns='Id')

In [83]:
dataset.describe()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,2919.0,2433.0,2919.0,2919.0,2919.0,2919.0,2919.0,2896.0,2918.0,2918.0,2918.0,2918.0,2919.0,2919.0,2919.0,2919.0,2917.0,2917.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2760.0,2918.0,2918.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0
mean,57.137718,69.305795,10168.11408,6.089072,5.564577,1971.312778,1984.264474,102.201312,441.423235,49.582248,560.772104,1051.777587,1159.581706,336.483727,4.694416,1500.759849,0.429894,0.061364,1.568003,0.380267,2.860226,1.044536,6.451524,0.597122,1978.113406,1.766621,472.874572,93.709832,47.486811,23.098321,2.602261,16.06235,2.251799,50.825968,6.213087,2007.792737,180052.854647
std,42.517628,23.344905,7886.996359,1.409947,1.113131,30.291442,20.894344,179.334253,455.610826,169.205611,439.543659,440.766258,392.362079,428.701456,46.396825,506.051045,0.524736,0.245687,0.552969,0.502872,0.822693,0.214462,1.569379,0.646129,25.574285,0.761624,215.394815,126.526589,67.575493,64.244246,25.188169,56.184365,35.663946,567.402211,2.714762,1.314964,57381.565721
min,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,0.0,0.0,0.0,334.0,0.0,0.0,334.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1895.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,20.0,59.0,7478.0,5.0,5.0,1953.5,1965.0,0.0,0.0,0.0,220.0,793.0,876.0,0.0,0.0,1126.0,0.0,0.0,1.0,0.0,2.0,1.0,5.0,0.0,1960.0,1.0,320.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2007.0,154795.084126
50%,50.0,68.0,9453.0,6.0,5.0,1973.0,1993.0,0.0,368.5,0.0,467.0,989.5,1082.0,0.0,0.0,1444.0,0.0,0.0,2.0,0.0,3.0,1.0,6.0,1.0,1979.0,2.0,480.0,0.0,26.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,176734.841494
75%,70.0,80.0,11570.0,7.0,6.0,2001.0,2004.0,164.0,733.0,0.0,805.5,1302.0,1387.5,704.0,0.0,1743.5,1.0,0.0,2.0,1.0,3.0,1.0,7.0,1.0,2002.0,2.0,576.0,168.0,70.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,191895.744157
max,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,1526.0,2336.0,6110.0,5095.0,2065.0,1064.0,5642.0,3.0,2.0,4.0,2.0,8.0,3.0,15.0,4.0,2207.0,5.0,1488.0,1424.0,742.0,1012.0,508.0,576.0,800.0,17000.0,12.0,2010.0,755000.0


## Handling Missing Values

- Identified continuous features with more than 20 unique values and categorical features with 20 or fewer unique values, including object features.
- Listed categorical features with meaningful NaN values (features where missing data has a specific meaning) and without meaningful NaN values.
- According to the dataset, the missing values in this feature were filled with 'Typ'
- Replaced meaningful NaNs in specific categorical features with 'None'.
- Filled non-meaningful NaNs in categorical features with the mode (most frequent value) of each column.


In [84]:
# List of continuous numerical features (more than 20 unique values)
continuous_features = [col for col in dataset.select_dtypes(include=['number']).columns
                       if dataset[col].nunique() > 20]

# List of categorical features (numerical categorical with 20 or fewer unique values + object categorical)
categorical_features = [col for col in dataset.select_dtypes(include=['number']).columns
                        if dataset[col].nunique() <= 20] + dataset.select_dtypes(include=['object']).columns.tolist()

print("Continuous Features:", continuous_features)
print("Categorical Features:", categorical_features)


Continuous Features: ['LotFrontage', 'LotArea', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'GarageYrBlt', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'MiscVal', 'SalePrice']
Categorical Features: ['MSSubClass', 'OverallQual', 'OverallCond', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'PoolArea', 'MoSold', 'YrSold', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu'

In [85]:
len_con = len(continuous_features)
len_cat = len(categorical_features)

len_con, len_cat, len_con+len_cat

(22, 58, 80)

In [86]:
# List of categorical features with meaningful NaN values
meaningful_nan_features = [
    'Alley', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'FireplaceQu', 'GarageType', 'GarageFinish',
    'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature', 'Functional', 'BsmtFinType1',
    'BsmtFinType2', 'MasVnrType'
]

# List of categorical features without meaningful NaN values
non_meaningful_nan_features = [feature for feature in categorical_features if feature not in meaningful_nan_features]

print("Categorical Features with Meaningful NaN Values:")
print(meaningful_nan_features)
print("\nCategorical Features Without Meaningful NaN Values:")
print(non_meaningful_nan_features)

Categorical Features with Meaningful NaN Values:
['Alley', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature', 'Functional', 'BsmtFinType1', 'BsmtFinType2', 'MasVnrType']

Categorical Features Without Meaningful NaN Values:
['MSSubClass', 'OverallQual', 'OverallCond', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'PoolArea', 'MoSold', 'YrSold', 'MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'ExterQual', 'ExterCond', 'Foundation', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'PavedDrive', 'SaleType', 'SaleCondition']


In [87]:
len(meaningful_nan_features), len(non_meaningful_nan_features), len(categorical_features)

(16, 42, 58)

In [88]:
dataset['Functional'].unique()

array(['Typ', 'Min1', 'Maj1', 'Min2', 'Mod', 'Maj2', 'Sev', nan],
      dtype=object)

In [89]:
dataset['Functional'].fillna('Typ', inplace=True)


for feature in meaningful_nan_features:
    dataset[feature].fillna('None', inplace=True)


for feature in non_meaningful_nan_features:
    mode_value = dataset[feature].mode()[0]  
    dataset[feature].fillna(mode_value, inplace=True)
    
for col in continuous_features:
    dataset[col].fillna(dataset[col].median(), inplace=True)

In [90]:
null_counts = dataset[categorical_features].isnull().sum().sum()
null_counts

0

In [91]:
null_counts = dataset[continuous_features].isnull().sum().sum()
null_counts

0

In [92]:
dataset.isnull().sum().sum()

0

## Correlation and New Features

- Calculated the correlation between numerical features and `SalePrice` to identify relevant features.
- Created new features based on existing columns:
  - `AgeSum`: Sum of construction and remodel years.
  - `LivAreaTotal`: Weighted living area including basement.
  - `BathScore`: Composite bathroom score.
  - `RoomAndBathScore`: Combined rooms and bathrooms score.
  - `OutdoorSpace`: Total outdoor usable area.
- Dropped original columns used for new feature creation and updated the list of categorical features accordingly.


In [93]:
numerical_features = [col for col in dataset.select_dtypes(include=['number']).columns]

In [94]:
dataset[numerical_features].head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
0,60,65.0,8450,7,5,2003,2003,196.0,706.0,0.0,150.0,856.0,856,854,0,1710,1.0,0.0,2,1,3,1,8,0,2003.0,2.0,548.0,0,61,0,0,0,0,0,2,2008,208500.0
1,20,80.0,9600,6,8,1976,1976,0.0,978.0,0.0,284.0,1262.0,1262,0,0,1262,0.0,1.0,2,0,3,1,6,1,1976.0,2.0,460.0,298,0,0,0,0,0,0,5,2007,181500.0
2,60,68.0,11250,7,5,2001,2002,162.0,486.0,0.0,434.0,920.0,920,866,0,1786,1.0,0.0,2,1,3,1,6,1,2001.0,2.0,608.0,0,42,0,0,0,0,0,9,2008,223500.0
3,70,60.0,9550,7,5,1915,1970,0.0,216.0,0.0,540.0,756.0,961,756,0,1717,1.0,0.0,1,0,3,1,7,1,1998.0,3.0,642.0,0,35,272,0,0,0,0,2,2006,140000.0
4,60,84.0,14260,8,5,2000,2000,350.0,655.0,0.0,490.0,1145.0,1145,1053,0,2198,1.0,0.0,2,1,4,1,9,1,2000.0,3.0,836.0,192,84,0,0,0,0,0,12,2008,250000.0


In [95]:
correlations = []


for feature in numerical_features:
    if feature in dataset.columns:
        corr = dataset[feature].corr(dataset['SalePrice'])
        correlations.append((feature, abs(corr)))
    else:
        print(f"Feature '{feature}' is not in the dataset. Skipping.")


correlation_df = pd.DataFrame(correlations, columns=['Feature', 'Correlation'])

# Sort by correlation in descending order
#correlation_df = correlation_df.sort_values(by='Correlation', ascending=False).reset_index(drop=True)


print(correlation_df)


          Feature  Correlation
0      MSSubClass     0.087910
1     LotFrontage     0.301460
2         LotArea     0.296497
3     OverallQual     0.550911
4     OverallCond     0.065785
5       YearBuilt     0.362066
6    YearRemodAdd     0.350032
7      MasVnrArea     0.351883
8      BsmtFinSF1     0.272215
9      BsmtFinSF2     0.007398
10      BsmtUnfSF     0.175218
11    TotalBsmtSF     0.453230
12       1stFlrSF     0.462865
13       2ndFlrSF     0.272140
14   LowQualFinSF     0.015395
15      GrLivArea     0.588010
16   BsmtFullBath     0.147705
17   BsmtHalfBath     0.012734
18       FullBath     0.433710
19       HalfBath     0.228837
20   BedroomAbvGr     0.229611
21   KitchenAbvGr     0.071760
22   TotRmsAbvGrd     0.469800
23     Fireplaces     0.353567
24    GarageYrBlt     0.313555
25     GarageCars     0.469236
26     GarageArea     0.464808
27     WoodDeckSF     0.238381
28    OpenPorchSF     0.236376
29  EnclosedPorch     0.070698
30      3SsnPorch     0.029979
31    Sc

In [96]:
dataset['AgeSum'] = dataset['YearBuilt'] + dataset['YearRemodAdd']  
dataset['LivAreaTotal'] = 2 * dataset['GrLivArea'] + dataset['TotalBsmtSF']  
dataset['BathScore'] = (
    dataset['BsmtFullBath'] + 0.5 * dataset['BsmtHalfBath'] +
    2 * dataset['FullBath'] + dataset['HalfBath']
)  
dataset['RoomAndBathScore'] = dataset['TotRmsAbvGrd'] + 2 * dataset['FullBath'] + dataset['HalfBath']  

dataset['OutdoorSpace'] = (
    dataset['WoodDeckSF'] + dataset['OpenPorchSF'] + dataset['EnclosedPorch'] +
    dataset['3SsnPorch'] + dataset['ScreenPorch'] + dataset['PoolArea']
)  


In [97]:
columns_to_drop = [
    'YearBuilt', 'YearRemodAdd', 'GrLivArea', 'TotalBsmtSF',
    'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
    'TotRmsAbvGrd', 'YrSold', 'WoodDeckSF', 'OpenPorchSF',
    'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea'
]

dataset.drop(columns=columns_to_drop, inplace=True)
categorical_features = [feature for feature in categorical_features if feature not in columns_to_drop]

In [98]:
dataset.shape

(2919, 69)

## Transformation

- Identified numerical features in the dataset.
- Calculated skewness for each numerical feature and flagged features with absolute skewness ≥ 0.5 as skewed.
- Applied `log1p` transformation to reduce skewness in the identified skewed features.


In [99]:
numerical_features = [col for col in dataset.columns if dataset[col].dtype != 'object']

skew_df = pd.DataFrame(numerical_features, columns=['Feature'])

skew_df['Skew'] = skew_df['Feature'].apply(lambda feature: skew(dataset[feature].dropna()))
skew_df['Absolute Skew'] = skew_df['Skew'].abs()
skew_df['Skewed'] = skew_df['Absolute Skew'] >= 0.5

skew_df

Unnamed: 0,Feature,Skew,Absolute Skew,Skewed
0,MSSubClass,1.375457,1.375457,True
1,LotFrontage,1.674852,1.674852,True
2,LotArea,12.822431,12.822431,True
3,OverallQual,0.19711,0.19711,False
4,OverallCond,0.570312,0.570312,True
5,MasVnrArea,2.613592,2.613592,True
6,BsmtFinSF1,1.425378,1.425378,True
7,BsmtFinSF2,4.146143,4.146143,True
8,BsmtUnfSF,0.919703,0.919703,True
9,1stFlrSF,1.469604,1.469604,True


In [100]:
skewed_features = skew_df[skew_df['Skewed']]['Feature'].tolist()

for feature in skewed_features:
    dataset[feature] = np.log1p(dataset[feature])

In [101]:

dataset.head(3)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,BedroomAbvGr,KitchenAbvGr,KitchenQual,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,MiscVal,MoSold,SaleType,SaleCondition,SalePrice,AgeSum,LivAreaTotal,BathScore,RoomAndBathScore,OutdoorSpace
0,4.110874,RL,4.189655,9.04204,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,1.791759,Gable,CompShg,VinylSd,VinylSd,BrkFace,5.283204,Gd,TA,PConc,Gd,TA,No,GLQ,6.561031,Unf,0.0,5.01728,GasA,Ex,Y,SBrkr,6.753438,6.751101,0.0,3,0.693147,Gd,Typ,0.0,,Attchd,2003.0,RFn,2.0,548.0,TA,TA,Y,,,,0.0,2,WD,Normal,12.247699,4006,8.361007,6.0,13,4.127134
1,3.044522,RL,4.394449,9.169623,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,2.197225,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,6.886532,Unf,0.0,5.652489,GasA,Ex,Y,SBrkr,7.141245,0.0,0.0,3,0.693147,TA,Typ,0.693147,TA,Attchd,1976.0,RFn,2.0,460.0,TA,TA,Y,,,,0.0,5,WD,Normal,12.109016,3952,8.239329,4.5,10,5.700444
2,4.110874,RL,4.234107,9.328212,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,1.791759,Gable,CompShg,VinylSd,VinylSd,BrkFace,5.09375,Gd,TA,PConc,Gd,TA,Mn,GLQ,6.188264,Unf,0.0,6.075346,GasA,Ex,Y,SBrkr,6.82546,6.765039,0.0,3,0.693147,Gd,Typ,0.693147,TA,Attchd,2001.0,RFn,2.0,608.0,TA,TA,Y,,,,0.0,9,WD,Normal,12.317171,4003,8.410276,6.0,11,3.7612


## Target Encoding

### Steps Performed:
1. **Dataset Duplication**: Created a copy of the dataset for encoding purposes.
2. **Target Encoding**:
   - Calculated the mean `SalePrice` for each category within a feature.
   - Sorted categories based on their mean `SalePrice`.
   - Assigned an ordinal value to each category based on its rank.
   - Replaced original category values in the dataset with their assigned ordinal value.
3. **Conversion**: Converted all columns in the encoded dataset to float type for further processing.

### Outcome:
- Target encoding applied successfully to all categorical features.
- Encoded features:
  - Listed in the variable `qual_encoded`.


In [102]:
encoded_dataset = dataset.copy()

def encode(frame, feature):
    ordering = pd.DataFrame()
    ordering['val'] = frame[feature].unique()
    ordering.index = ordering.val
    # Calculate the mean of 'SalePrice' for each category
    ordering['spmean'] = frame[[feature, 'SalePrice']].groupby(feature).mean()['SalePrice']
    # Sort categories by mean target value
    ordering = ordering.sort_values('spmean')
    # Assign an ordering based on the sorted means
    ordering['ordering'] = range(1, ordering.shape[0] + 1)
    # Convert ordering to a dictionary
    ordering = ordering['ordering'].to_dict()
    
    # Replace categories in the dataset with their assigned ordering
    for cat, o in ordering.items():
        frame.loc[frame[feature] == cat, feature] = o

# Apply encoding to all categorical features
catagory_encoded = []
for feature in categorical_features:
    encode(encoded_dataset, feature)
    catagory_encoded.append(feature )

encoded_dataset = encoded_dataset.astype(float)

print("Target encoding applied to all categorical features.")
print("Encoded features:", catagory_encoded)


Target encoding applied to all categorical features.
Encoded features: ['MSSubClass', 'OverallQual', 'OverallCond', 'BedroomAbvGr', 'KitchenAbvGr', 'Fireplaces', 'GarageCars', 'MoSold', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']


In [103]:
encoded_dataset.head(3)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,BedroomAbvGr,KitchenAbvGr,KitchenQual,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,MiscVal,MoSold,SaleType,SaleCondition,SalePrice,AgeSum,LivAreaTotal,BathScore,RoomAndBathScore,OutdoorSpace
0,16.0,4.0,4.189655,9.04204,2.0,3.0,1.0,2.0,2.0,1.0,1.0,16.0,4.0,5.0,5.0,7.0,7.0,8.0,2.0,3.0,13.0,14.0,3.0,5.283204,3.0,4.0,6.0,4.0,4.0,2.0,7.0,6.561031,6.0,0.0,5.01728,6.0,5.0,2.0,5.0,6.753438,6.751101,0.0,3.0,4.0,3.0,7.0,1.0,2.0,6.0,2003.0,3.0,5.0,548.0,4.0,6.0,3.0,1.0,4.0,3.0,0.0,4.0,5.0,5.0,12.247699,4006.0,8.361007,6.0,13.0,4.127134
1,14.0,4.0,4.394449,9.169623,2.0,3.0,1.0,2.0,2.0,3.0,1.0,21.0,2.0,5.0,5.0,4.0,6.0,5.0,2.0,3.0,6.0,8.0,2.0,0.0,2.0,4.0,3.0,4.0,4.0,5.0,5.0,6.886532,6.0,0.0,5.652489,6.0,5.0,2.0,5.0,7.141245,0.0,0.0,3.0,4.0,2.0,7.0,3.0,4.0,6.0,1976.0,3.0,5.0,460.0,4.0,6.0,3.0,1.0,4.0,3.0,0.0,5.0,5.0,5.0,12.109016,3952.0,8.239329,4.5,10.0,5.700444
2,16.0,4.0,4.234107,9.328212,2.0,3.0,2.0,2.0,2.0,1.0,1.0,16.0,4.0,5.0,5.0,7.0,7.0,8.0,2.0,3.0,13.0,14.0,3.0,5.09375,3.0,4.0,6.0,4.0,4.0,3.0,7.0,6.188264,6.0,0.0,6.075346,6.0,5.0,2.0,5.0,6.82546,6.765039,0.0,3.0,4.0,3.0,7.0,3.0,4.0,6.0,2001.0,3.0,5.0,608.0,4.0,6.0,3.0,1.0,4.0,3.0,0.0,12.0,5.0,5.0,12.317171,4003.0,8.410276,6.0,11.0,3.7612


## Standardization

- Fitted and transformed `encoded_dataset` using the scaler.



In [104]:
scaler = StandardScaler()

scaler.fit(encoded_dataset)

scaled_dataset = pd.DataFrame(scaler.transform(encoded_dataset), index=encoded_dataset.index, columns=encoded_dataset.columns)

In [105]:
scaled_dataset.head(3)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,BedroomAbvGr,KitchenAbvGr,KitchenQual,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,MiscVal,MoSold,SaleType,SaleCondition,SalePrice,AgeSum,LivAreaTotal,BathScore,RoomAndBathScore,OutdoorSpace
0,1.009364,0.282881,-0.040039,-0.103719,0.064249,0.257283,-0.656298,-0.133314,0.018512,-0.474733,-0.21626,0.47243,0.08348,0.021356,0.383156,1.408846,0.648263,0.747523,-0.488446,-0.106444,1.110645,1.078474,0.812649,1.22267,1.039805,0.341656,1.078627,0.611926,0.12267,-0.583117,0.999223,0.783116,0.331588,-0.362565,-0.326899,0.104271,0.846286,0.26829,0.261497,-0.777287,1.196876,-0.116932,-0.184805,0.21828,0.738935,0.245129,-0.994988,-0.911409,0.640088,0.998954,0.316551,-0.383934,0.348888,0.283251,0.302458,0.31503,-0.053507,0.286328,0.164382,-0.189329,-1.063542,-0.235232,0.189839,0.650638,1.092947,0.326801,1.482613,1.190349,-0.080419
1,0.511924,0.282881,0.598801,0.146544,0.064249,0.257283,-0.656298,-0.133314,0.018512,1.467447,-0.21626,1.266159,-1.922103,0.021356,0.383156,-0.497995,-0.066884,-0.77199,-0.488446,-0.106444,-1.273408,-0.867178,-0.701149,-0.793461,-0.683756,0.341656,-0.730516,0.611926,0.12267,2.221033,-0.135251,0.892327,0.331588,-0.362565,0.013485,0.104271,0.846286,0.26829,0.261497,0.424865,-0.86361,-0.116932,-0.184805,0.21828,-0.771535,0.245129,0.79894,0.560053,0.640088,-0.08694,0.316551,-0.383934,-0.059804,0.283251,0.302458,0.31503,-0.053507,0.286328,0.164382,-0.189329,-0.72239,-0.235232,0.189839,0.174847,-0.077539,-0.072216,0.383549,0.012642,0.690363
2,1.009364,0.282881,0.098625,0.457629,0.064249,0.257283,0.903746,-0.133314,0.018512,-0.474733,-0.21626,0.47243,0.08348,0.021356,0.383156,1.408846,0.648263,0.747523,-0.488446,-0.106444,1.110645,1.078474,0.812649,1.150372,1.039805,0.341656,1.078627,0.611926,0.12267,0.3516,0.999223,0.658046,0.331588,-0.362565,0.240077,0.104271,0.846286,0.26829,0.261497,-0.554028,1.201129,-0.116932,-0.184805,0.21828,0.738935,0.245129,0.79894,0.560053,0.640088,0.918517,0.316551,-0.383934,0.627542,0.283251,0.302458,0.31503,-0.053507,0.286328,0.164382,-0.189329,1.66567,-0.235232,0.189839,0.888982,1.02792,0.488368,1.482613,0.405211,-0.259694


# Preparing Data for Modeling

- Created `train_x` and `test_x` from `scaled_dataset`, excluding `SalePrice`.
- Transformed `y_train` using `log1p` and stored it as `log_train_y`.


In [106]:
train_test_data = scaled_dataset.drop(columns='SalePrice')
train_x = train_test_data.iloc[:y_train.shape[0], :]
test_x = train_test_data.iloc[y_train.shape[0]:, :]

In [107]:
log_train_y = np.log1p(y_train)

In [108]:
train_x.shape, log_train_y.shape, test_x.shape

((1460, 68), (1460,), (1459, 68))

In [109]:
#train_x.to_csv('train_x.csv', index=False)
#log_train_y.to_csv('log_train_y.csv', index=False)
#test_x.to_csv('test_x.csv', index=False)
