# CS5785 HW1 Part1

Yufan Zhang (yz2894)

Tian Jin (tj299)

## Preparation

In [1]:
# Import libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

# Ignore any warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
# PATH variables
DATA_PATH = "./data/Part_I/"
IMG_PATH = "./img/Part_I/"

if not os.path.exists(IMG_PATH):
    os.makedirs(IMG_PATH)

## Data Ingestion

In [3]:
# Load the training and test data
def load_data_from_csv(filename):
    """
    Load the data to a Pandas dataframe from a file
    
    Args:
        filename: string containing the path to the dataset
    
    Return:
        df: a Pandas dataframe containing the loaded data
    """
    df = pd.read_csv(
        os.path.join(DATA_PATH, filename),
        index_col=0
    )
    
    return df

df = load_data_from_csv("train.csv")
print(f"Shape of the training data: {df.shape}")
Y_train = df['SalePrice']
print(f"Shape of the training Y: {Y_train.shape}")
df_test = load_data_from_csv("test.csv")
print(f"Shape of the test data: {df_test.shape}")

df.head()

df = pd.concat([df, df_test])
print(f"Shape of the contacted data: {df.shape}")

Shape of the training data: (1460, 80)
Shape of the training Y: (1460,)
Shape of the test data: (1459, 79)
Shape of the contacted data: (2919, 80)


In [4]:
# Get the descriptive statistics of numerical and categorical features
column_types = df.dtypes    # Get the data types of all columns

numerical_features = column_types[column_types != 'object'].index.tolist()
categorical_features = column_types[column_types == 'object'].index.tolist()

print(f"> Numerical features: ({len(numerical_features)})")
print(numerical_features)
print()

print(f"> Categorical features: ({len(categorical_features)})")
print(categorical_features)

> Numerical features: (37)
['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold', 'SalePrice']

> Categorical features: (43)
['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional',

## Feature Engineering

### Feature selection

In [5]:
# Select the representative features based on the data deciption
numerical_features = [
    "LotFrontage",
    "LotArea",
    "OverallQual",
    "OverallCond",
    "YearBuilt",
    "YearRemodAdd",
    "MasVnrArea",
    "TotalBsmtSF",
    "GrLivArea",
    "BsmtFullBath",
    "BsmtHalfBath",
    "FullBath",
    "HalfBath",
    "BedroomAbvGr",
    "KitchenAbvGr",
    "TotRmsAbvGrd",
    "Fireplaces",
    "GarageArea",
    "PoolArea",
    "YrSold",
]

categorical_features = [
    "MSZoning",
    "Utilities",
    "LandSlope",
    # "Neighborhood",
    "BldgType",
    "HouseStyle",
    "ExterQual",   # -> numerical
    "ExterCond",   # -> numerical
    "BsmtQual",    # -> numerical
    "BsmtCond",    # -> numerical
    "HeatingQC",   # -> numerical
    "CentralAir",  # -> 0, 1
    "KitchenQual", 
    "FireplaceQu",
    "GarageQual",
    "GarageCond",
    "PoolQC",
    "SaleType",
]

features_to_keep = numerical_features + categorical_features

print("# of numerical features: ", len(numerical_features))
print("# of categorical features: ", len(categorical_features))
print("total: ", len(features_to_keep))

# of numerical features:  20
# of categorical features:  17
total:  37


In [6]:
# Keep a copy of the original data and drop the unneeded features
df_ori = df.copy()


def keep_needed_features(df, features_to_keep):
    """
    Keep only the needed feature columns in the dataset

    Args:
        df: a Pandas dataframe of the dataset
        features_to_keep: a list of features to be keeped

    Return:
        A Pandas dataframe containing the needed features
    """
    return df[features_to_keep]


df = keep_needed_features(df, features_to_keep)

df


Unnamed: 0_level_0,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,TotalBsmtSF,GrLivArea,BsmtFullBath,...,BsmtQual,BsmtCond,HeatingQC,CentralAir,KitchenQual,FireplaceQu,GarageQual,GarageCond,PoolQC,SaleType
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,65.0,8450,7,5,2003,2003,196.0,856.0,1710,1.0,...,Gd,TA,Ex,Y,Gd,,TA,TA,,WD
2,80.0,9600,6,8,1976,1976,0.0,1262.0,1262,0.0,...,Gd,TA,Ex,Y,TA,TA,TA,TA,,WD
3,68.0,11250,7,5,2001,2002,162.0,920.0,1786,1.0,...,Gd,TA,Ex,Y,Gd,TA,TA,TA,,WD
4,60.0,9550,7,5,1915,1970,0.0,756.0,1717,1.0,...,TA,Gd,Gd,Y,Gd,Gd,TA,TA,,WD
5,84.0,14260,8,5,2000,2000,350.0,1145.0,2198,1.0,...,Gd,TA,Ex,Y,Gd,TA,TA,TA,,WD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2915,21.0,1936,4,7,1970,1970,0.0,546.0,1092,0.0,...,TA,TA,Gd,Y,TA,,,,,WD
2916,21.0,1894,4,5,1970,1970,0.0,546.0,1092,0.0,...,TA,TA,TA,Y,TA,,TA,TA,,WD
2917,160.0,20000,5,7,1960,1996,0.0,1224.0,1224,1.0,...,TA,TA,Ex,Y,TA,TA,TA,TA,,WD
2918,62.0,10441,5,5,1992,1992,0.0,912.0,970,0.0,...,Gd,TA,TA,Y,TA,,,,,WD


### Handling missing values

In [7]:
# Determine which features to be dropped based on the percentage of NA values
def get_features_to_drop(df_train, feature_list, threshold):
    """
    Determine which features to be dropped based on the percentage of NA values
    in the training data

    Args:
        df_train: a Pandas dataframe containing the training data
        feature_list: a list of features to be checked
        threshold: a float number indicating the threshold of NA percentage

    Return:
        A list containing the features to be dropped
    """
    null_counts = df_train.loc[:, feature_list].isna().sum() / len(df_train)
    return list(null_counts[null_counts > threshold].index)

# Drop the features with NA values more than 30%
feature_to_drop = get_features_to_drop(df, categorical_features, threshold=0.3)

print("The features need to be dropped:")
print(feature_to_drop)

The features need to be dropped:
['FireplaceQu', 'PoolQC']


In [8]:
# Drop the categorical features from the previous analysis
def drop_cat_fea_by_na(df, feature_to_drop):
    """
    Drop the provided categorical features

    Args:
        df: a Pandas dataframe for dataset
        feature_to_drop: a list of feature names to be dropped

    Return:
        A Pandas dataframe after dropping
    """
    return df.drop(feature_to_drop, axis=1)

categorical_features = list(set(categorical_features) - set(feature_to_drop))
df = drop_cat_fea_by_na(df, feature_to_drop)

df

Unnamed: 0_level_0,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,TotalBsmtSF,GrLivArea,BsmtFullBath,...,ExterQual,ExterCond,BsmtQual,BsmtCond,HeatingQC,CentralAir,KitchenQual,GarageQual,GarageCond,SaleType
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,65.0,8450,7,5,2003,2003,196.0,856.0,1710,1.0,...,Gd,TA,Gd,TA,Ex,Y,Gd,TA,TA,WD
2,80.0,9600,6,8,1976,1976,0.0,1262.0,1262,0.0,...,TA,TA,Gd,TA,Ex,Y,TA,TA,TA,WD
3,68.0,11250,7,5,2001,2002,162.0,920.0,1786,1.0,...,Gd,TA,Gd,TA,Ex,Y,Gd,TA,TA,WD
4,60.0,9550,7,5,1915,1970,0.0,756.0,1717,1.0,...,TA,TA,TA,Gd,Gd,Y,Gd,TA,TA,WD
5,84.0,14260,8,5,2000,2000,350.0,1145.0,2198,1.0,...,Gd,TA,Gd,TA,Ex,Y,Gd,TA,TA,WD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2915,21.0,1936,4,7,1970,1970,0.0,546.0,1092,0.0,...,TA,TA,TA,TA,Gd,Y,TA,,,WD
2916,21.0,1894,4,5,1970,1970,0.0,546.0,1092,0.0,...,TA,TA,TA,TA,TA,Y,TA,TA,TA,WD
2917,160.0,20000,5,7,1960,1996,0.0,1224.0,1224,1.0,...,TA,TA,TA,TA,Ex,Y,TA,TA,TA,WD
2918,62.0,10441,5,5,1992,1992,0.0,912.0,970,0.0,...,TA,TA,Gd,TA,TA,Y,TA,,,WD


In [9]:
# Create a dict to store the mode/mean/median of each feature in the training dataset
def create_fea_mesature_dict(df_train, feature_list, measure="mode"):
    """
    Create a dict to map the mode/mean/median of each feature in the training dataset,
    which can be used to transform the test dataset

    Args:
        df_train: a Pandas dataframe containing the training data
        feature_list: a list of features to measure on
        measure: mode/mean/median

    Return:
        A dict containing the measure of each feature
    """
    measure_dict = dict()
    for col in feature_list:
        if measure == "mode":
            measure_dict[col] = df_train[col].mode()[0]
        elif measure == "mean":
            measure_dict[col] = df_train[col].mean()
        elif measure == "median":
            measure_dict[col] = df_train[col].median()
        else:
            raise ValueError("measure should be mode/mean/median")
    return measure_dict

cat_mode_dict = create_fea_mesature_dict(df, categorical_features, "mode")
num_mean_dict = create_fea_mesature_dict(df, numerical_features, "mean")

In [10]:
# Fill the NA values in categorical/numerical features with the mode/mean of the features in the training dataset
def fill_na_with_measure(df, measure_dict):
    """
    Fill the NA values in features with the measures of the features in the training dataset

    Args:
        df: a Pandas dataframe containing the data to be transformed
        measure_dict: a dict containing the measure of each feature
    
    Return:
        A Pandas dataframe after NA values being filled
    """
    for col in measure_dict.keys():
        df[col].fillna(measure_dict[col], inplace=True)
    return df

df = fill_na_with_measure(df, cat_mode_dict)  # Fill categorical features NA with the mode
df = fill_na_with_measure(df, num_mean_dict)  # Fill numerical features NA with the mean

# Check if there are still NA values in the numerical features
print("Number of NA values in the training data's numerical features:")
print(df.isna().sum().sum())

df

Number of NA values in the training data's numerical features:
0


Unnamed: 0_level_0,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,TotalBsmtSF,GrLivArea,BsmtFullBath,...,ExterQual,ExterCond,BsmtQual,BsmtCond,HeatingQC,CentralAir,KitchenQual,GarageQual,GarageCond,SaleType
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,65.0,8450,7,5,2003,2003,196.0,856.0,1710,1.0,...,Gd,TA,Gd,TA,Ex,Y,Gd,TA,TA,WD
2,80.0,9600,6,8,1976,1976,0.0,1262.0,1262,0.0,...,TA,TA,Gd,TA,Ex,Y,TA,TA,TA,WD
3,68.0,11250,7,5,2001,2002,162.0,920.0,1786,1.0,...,Gd,TA,Gd,TA,Ex,Y,Gd,TA,TA,WD
4,60.0,9550,7,5,1915,1970,0.0,756.0,1717,1.0,...,TA,TA,TA,Gd,Gd,Y,Gd,TA,TA,WD
5,84.0,14260,8,5,2000,2000,350.0,1145.0,2198,1.0,...,Gd,TA,Gd,TA,Ex,Y,Gd,TA,TA,WD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2915,21.0,1936,4,7,1970,1970,0.0,546.0,1092,0.0,...,TA,TA,TA,TA,Gd,Y,TA,TA,TA,WD
2916,21.0,1894,4,5,1970,1970,0.0,546.0,1092,0.0,...,TA,TA,TA,TA,TA,Y,TA,TA,TA,WD
2917,160.0,20000,5,7,1960,1996,0.0,1224.0,1224,1.0,...,TA,TA,TA,TA,Ex,Y,TA,TA,TA,WD
2918,62.0,10441,5,5,1992,1992,0.0,912.0,970,0.0,...,TA,TA,Gd,TA,TA,Y,TA,TA,TA,WD


### Encoding categorical features

In [11]:
# Drop the skewed categorical features
def drop_skewed_cat_fea(df, feature_list, threshold=0.5):
    """
    Drop the skewed categorical features

    Args:
        df: a Pandas dataframe containing the data to be transformed
        feature_list: a list of features to be checked
        threshold: a float number indicating the threshold of NA percentage
    
    Return:
        A Pandas dataframe after dropping
    """
    feature_to_drop = list()
    for col in feature_list:
        if df[col].value_counts(normalize=True).values[0] > threshold:
            feature_to_drop.append(col)
            # df.drop(col, axis=1, inplace=True)
    return feature_to_drop

feature_to_drop = drop_skewed_cat_fea(df, categorical_features)

print("The features need to be dropped:")
print(feature_to_drop)

# Drop the skewed categorical features
df.drop(feature_to_drop, axis=1, inplace=True)

# Update the categorical features list
categorical_features = list(set(categorical_features) - set(feature_to_drop))

df

The features need to be dropped:
['ExterCond', 'Utilities', 'SaleType', 'LandSlope', 'GarageQual', 'MSZoning', 'BsmtCond', 'KitchenQual', 'ExterQual', 'BldgType', 'HouseStyle', 'GarageCond', 'CentralAir', 'HeatingQC']


Unnamed: 0_level_0,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,TotalBsmtSF,GrLivArea,BsmtFullBath,...,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageArea,PoolArea,YrSold,BsmtQual
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,65.0,8450,7,5,2003,2003,196.0,856.0,1710,1.0,...,2,1,3,1,8,0,548.0,0,2008,Gd
2,80.0,9600,6,8,1976,1976,0.0,1262.0,1262,0.0,...,2,0,3,1,6,1,460.0,0,2007,Gd
3,68.0,11250,7,5,2001,2002,162.0,920.0,1786,1.0,...,2,1,3,1,6,1,608.0,0,2008,Gd
4,60.0,9550,7,5,1915,1970,0.0,756.0,1717,1.0,...,1,0,3,1,7,1,642.0,0,2006,TA
5,84.0,14260,8,5,2000,2000,350.0,1145.0,2198,1.0,...,2,1,4,1,9,1,836.0,0,2008,Gd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2915,21.0,1936,4,7,1970,1970,0.0,546.0,1092,0.0,...,1,1,3,1,5,0,0.0,0,2006,TA
2916,21.0,1894,4,5,1970,1970,0.0,546.0,1092,0.0,...,1,1,3,1,6,0,286.0,0,2006,TA
2917,160.0,20000,5,7,1960,1996,0.0,1224.0,1224,1.0,...,1,0,4,1,7,1,576.0,0,2006,TA
2918,62.0,10441,5,5,1992,1992,0.0,912.0,970,0.0,...,1,0,3,1,6,0,0.0,0,2006,Gd


In [12]:
# Create a dict containing all the feature names of each categorical feature
def get_categories_list(df_train, categorical_features):
    """
    Create a dict containing all the feature names of each categorical feature from the training data

    Arg:
        df_train: a Pandas dataframe for training data
        categorical_features: a list of categorical feature names

    Return:
        A dict whose key is each categorical feature and value is the category name
    """
    categories = dict()
    for col in categorical_features:
        categories[col] = df_train[col].unique().tolist()

    return categories

categories_dict = get_categories_list(df, categorical_features)

In [27]:
categories_dict

{'BsmtQual': ['Gd', 'TA', 'Ex', 'Fa']}

In [13]:
# Perform one-hot encoding to the training dataset
def apply_one_hot_encode(df, categories_dict):
    """
    Perform one-hot encoding to the dataset given the categories dict generated from the training dataset

    Args:
        df: a Pandas dataframe containing the data
        categories_dict: a dict whose key is each categorical feature and value is the category name

    Return:
        A pandas dataframe after one-hot encoding on the categorical data
    """
    for col, categories in categories_dict.items():
        for category in categories:
            df[f"{col}_{category}"] = (df[col] == category).astype(int)
        df.drop(col, axis=1, inplace=True)
    return df

df = apply_one_hot_encode(df, categories_dict)

df

Unnamed: 0_level_0,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,TotalBsmtSF,GrLivArea,BsmtFullBath,...,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageArea,PoolArea,YrSold,BsmtQual_Gd,BsmtQual_TA,BsmtQual_Ex,BsmtQual_Fa
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,65.0,8450,7,5,2003,2003,196.0,856.0,1710,1.0,...,1,8,0,548.0,0,2008,1,0,0,0
2,80.0,9600,6,8,1976,1976,0.0,1262.0,1262,0.0,...,1,6,1,460.0,0,2007,1,0,0,0
3,68.0,11250,7,5,2001,2002,162.0,920.0,1786,1.0,...,1,6,1,608.0,0,2008,1,0,0,0
4,60.0,9550,7,5,1915,1970,0.0,756.0,1717,1.0,...,1,7,1,642.0,0,2006,0,1,0,0
5,84.0,14260,8,5,2000,2000,350.0,1145.0,2198,1.0,...,1,9,1,836.0,0,2008,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2915,21.0,1936,4,7,1970,1970,0.0,546.0,1092,0.0,...,1,5,0,0.0,0,2006,0,1,0,0
2916,21.0,1894,4,5,1970,1970,0.0,546.0,1092,0.0,...,1,6,0,286.0,0,2006,0,1,0,0
2917,160.0,20000,5,7,1960,1996,0.0,1224.0,1224,1.0,...,1,7,1,576.0,0,2006,0,1,0,0
2918,62.0,10441,5,5,1992,1992,0.0,912.0,970,0.0,...,1,6,0,0.0,0,2006,1,0,0,0


In [14]:
len(list(df.columns))

24

### Scaling numerical features

In [15]:
list(df.columns)[-20:]

['YearBuilt',
 'YearRemodAdd',
 'MasVnrArea',
 'TotalBsmtSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageArea',
 'PoolArea',
 'YrSold',
 'BsmtQual_Gd',
 'BsmtQual_TA',
 'BsmtQual_Ex',
 'BsmtQual_Fa']

In [16]:
temp_features = numerical_features + list(df.columns)[-10:0]

# ["LandSlope_Sev", "LandSlope_Mod", "LandSlope_Gtl"]

In [17]:
from sklearn.preprocessing import Normalizer

# X_normalized = Normalizer().fit_transform(df[temp_features])
X_normalized = Normalizer().fit_transform(df)
X_normalized

array([[6.95044002e-03, 9.03557202e-01, 7.48508925e-04, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [7.71945337e-03, 9.26334405e-01, 5.78959003e-04, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [5.68558246e-03, 9.40629451e-01, 5.85280547e-04, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [7.85216591e-03, 9.81520739e-01, 2.45380185e-04, ...,
        4.90760369e-05, 0.00000000e+00, 0.00000000e+00],
       [5.59601358e-03, 9.42386738e-01, 4.51291418e-04, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [7.05305087e-03, 9.17563793e-01, 6.67180488e-04, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [18]:
# # Set the standard scaler based on the mean and std from the training dataset
# def get_scaler_config_dict(df_train, numerical_features):
#     """
#     Calculate the scaler confis (mean and std of each feature)

#     Args:
#         df_train: a Pandas dataframe containing the training data
#         numerical_features: a list of numerical features

#     Return:
#         A dict containing two dicts, containing the mean and std of each feature repectively
#     """
#     scaler_configs = dict()
#     scaler_configs["mean"] = dict()
#     scaler_configs["std"] = dict()

#     for col in numerical_features:
#         scaler_configs["mean"][col] = df_train[col].mean()
#         scaler_configs["std"][col] = df_train[col].std()

#     return scaler_configs

# scaler_configs = get_scaler_config_dict(df, numerical_features)

In [19]:
# # Apply the scaling to the training dataset
# def apply_scalar(df, numerical_features, scaler_configs):
#     """
#     Apply the standard scaling to the data

#     Args:
#         df: a Pandas dataframe for data
#         numerical_features: a list of numerical features in data
#         scalar_configs: A dict containing two dicts, containing the mean and std of each feature repectively

#     Return:
#         A updated dataframe with numerical features scaled
#     """
#     for col in numerical_features:
#         df[col] = (df[col] - scaler_configs["mean"][col]) / scaler_configs["std"][col]
#     return df

# # df = apply_scalar(df, numerical_features, scaler_configs)

# df

### Target transformation

In [20]:
# plt.figure(figsize=(15, 4))
# plt.subplot(1, 2, 2)

# plt.subplot(1, 2, 1)
# sns.distplot(Y_train, kde=True, fit=stats.norm)
# plt.title("Skewed values")

# # Apply log() to the training target to reduce skewiness
# Y_train_transformed = np.log(Y_train)

# plt.subplot(1, 2, 2)
# sns.distplot(Y_train_transformed, kde=True, fit=stats.norm)
# plt.title("Normalized values")
# plt.xlabel("log SalePrice")

# plt.show()


In [21]:
df

Unnamed: 0_level_0,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,TotalBsmtSF,GrLivArea,BsmtFullBath,...,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageArea,PoolArea,YrSold,BsmtQual_Gd,BsmtQual_TA,BsmtQual_Ex,BsmtQual_Fa
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,65.0,8450,7,5,2003,2003,196.0,856.0,1710,1.0,...,1,8,0,548.0,0,2008,1,0,0,0
2,80.0,9600,6,8,1976,1976,0.0,1262.0,1262,0.0,...,1,6,1,460.0,0,2007,1,0,0,0
3,68.0,11250,7,5,2001,2002,162.0,920.0,1786,1.0,...,1,6,1,608.0,0,2008,1,0,0,0
4,60.0,9550,7,5,1915,1970,0.0,756.0,1717,1.0,...,1,7,1,642.0,0,2006,0,1,0,0
5,84.0,14260,8,5,2000,2000,350.0,1145.0,2198,1.0,...,1,9,1,836.0,0,2008,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2915,21.0,1936,4,7,1970,1970,0.0,546.0,1092,0.0,...,1,5,0,0.0,0,2006,0,1,0,0
2916,21.0,1894,4,5,1970,1970,0.0,546.0,1092,0.0,...,1,6,0,286.0,0,2006,0,1,0,0
2917,160.0,20000,5,7,1960,1996,0.0,1224.0,1224,1.0,...,1,7,1,576.0,0,2006,0,1,0,0
2918,62.0,10441,5,5,1992,1992,0.0,912.0,970,0.0,...,1,6,0,0.0,0,2006,1,0,0,0


## Model Training

In [22]:
Y_train.size

1460

In [23]:
X_train = X_normalized[:1460, :] 
X_test = X_normalized[1460:, :]
X_test

array([[6.55556798e-03, 9.52360138e-01, 4.09722999e-04, ...,
        8.19445997e-05, 0.00000000e+00, 0.00000000e+00],
       [5.47468422e-03, 9.64287898e-01, 4.05532164e-04, ...,
        6.75886940e-05, 0.00000000e+00, 0.00000000e+00],
       [5.14283705e-03, 9.61154547e-01, 3.47488990e-04, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [7.85216591e-03, 9.81520739e-01, 2.45380185e-04, ...,
        4.90760369e-05, 0.00000000e+00, 0.00000000e+00],
       [5.59601358e-03, 9.42386738e-01, 4.51291418e-04, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [7.05305087e-03, 9.17563793e-01, 6.67180488e-04, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [24]:
# Compute OLS from scratch
def OLS(X, Y):
    theta = np.linalg.inv(X.T.dot(X)).dot(X.T).dot(Y)   
    return theta
    # return X.dot(theta) 
    
theta = OLS(X_train, Y_train)

Y_train_pred = X_train.dot(theta)

## Model Validation

In [25]:
# Compute MSE from scratch
MSE = 0.5 * np.mean((Y_train - Y_train_pred)**2)

# Compute r_sqaured from scratch
Y_train_mean = np.mean(Y_train)
sst = np.sum((Y_train - Y_train_mean) ** 2) # total sum of squares (SST)
ssr = np.sum((Y_train - Y_train_pred) ** 2) # residual sum of squares (SSR)
r_squared = 1 - (ssr / sst)

print('Evaluate predictions on training set: MSE: '+ str(MSE)+', r^2: '+str(r_squared))

Evaluate predictions on training set: MSE: 967962138.1769713, r^2: 0.6930411967746779


In [26]:
Y_test_pred = X_test.dot(theta)
Y_test_pred

# Save the predictions to a csv file with Id starting from 1461
def save_predictions_to_csv(Y_test_pred, filename):
    """
    Save the predictions to a csv file with Id starting from 1461

    Args:
        Y_test_pred: a numpy array containing the predictions
        filename: a string containing the path to the csv file
    """
    df = pd.DataFrame(Y_test_pred, columns=["SalePrice"])
    df.index += 1461
    df.index.name = "Id"
    df.to_csv(filename)
    
save_predictions_to_csv(Y_test_pred, os.path.join(DATA_PATH, "submission.csv"))

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=12719f73-9bbb-45d0-aa0c-e4182d582bf3' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>