In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.linear_model import LinearRegression

# Load the housing dataset.
# The path is relative, so 'D600 Task 2 Dataset 1 Housing Information.csv'
# must be in the notebook's working directory.
# The ID column is dropped right after loading (it is not used for prediction).
df = pd.read_csv('D600 Task 2 Dataset 1 Housing Information.csv').drop(columns=['ID'])

# Report the dimensions, then show the first rows as the cell output.
print("Data Shape:", df.shape)
df.head()

Data Shape: (7000, 21)


Unnamed: 0,Price,SquareFootage,NumBathrooms,NumBedrooms,BackyardSpace,CrimeRate,SchoolRating,AgeOfHome,DistanceToCityCenter,EmploymentRate,...,RenovationQuality,LocalAmenities,TransportAccess,Fireplace,HouseColor,Garage,Floors,Windows,PreviousSalePrice,IsLuxury
0,255614.8992,566.62,1.0,4,779.42,20.56,5.62,39.46,10.08,97.29,...,4.93,4.44,4.55,Yes,Blue,No,1,13,181861.5423,0
1,155586.0947,1472.34,1.0,2,656.13,15.62,5.63,40.51,7.89,93.22,...,4.08,5.56,6.83,No,Green,No,1,17,50042.59757,0
2,131050.8324,550.0,1.779354,3,754.57,12.47,9.2,48.38,23.74,96.6,...,4.26,8.07,8.48,Yes,Green,Yes,2,34,48400.3444,0
3,151361.7125,941.81,2.035254,2,439.59,22.22,7.08,94.67,5.22,91.45,...,4.45,5.0,6.27,Yes,Red,No,1,14,84594.12145,0
4,113167.6128,550.0,1.064644,3,353.03,8.28,5.93,16.8,43.13,86.5,...,3.36,5.46,6.99,No,White,Yes,1,21,22934.59654,0


In [2]:
# Cleaning / encoding cell: encodes the Yes/No and colour columns, removes rows
# with negative values, and reports nulls. Every step is guarded so re-running
# the cell on the already-cleaned frame leaves it unchanged.

# 1. Binary Encoding (Yes=1, No=0)
binary_cols = ['Fireplace', 'Garage']
yes_no_codes = {'Yes': 1, 'No': 0}
for col in binary_cols:
    # Decide from the values rather than from `dtype == 'object'`: newer pandas
    # versions may load text columns with a dedicated string dtype, and a
    # dtype test would then silently skip the mapping. On a re-run the column
    # already holds 0/1 categories, so no 'Yes'/'No' label is found.
    labels = df[col].astype(str).str.strip()
    if labels.isin(list(yes_no_codes)).any():
        # Labels other than 'Yes'/'No' become NaN here and are therefore
        # reported by the null check in step 4.
        df[col] = labels.map(yes_no_codes)
    # Convert to category type to ensure describe() shows categorical stats.
    df[col] = df[col].astype('category')

# Same reason for the target: category dtype makes describe() report
# count/unique/top/freq instead of numeric statistics.
df['IsLuxury'] = df['IsLuxury'].astype('category')

# 2. One-Hot Encoding for HouseColor
# The column check keeps the cell re-runnable: get_dummies removes HouseColor.
# drop_first=True drops the first colour level to avoid a redundant dummy.
if 'HouseColor' in df.columns:
    df = pd.get_dummies(df, columns=['HouseColor'], drop_first=True)

# 3. Remove rows with negative values.
initial_rows = df.shape[0]
non_negative_cols = ['PreviousSalePrice', 'Windows']
# Flag only values that are actually < 0. A `>= 0` filter would also discard
# NaN rows, count them as "negative" in the message below, and hide them from
# the null check in step 4.
has_negative = (df[non_negative_cols] < 0).any(axis=1)
df = df[~has_negative]
removed_rows = initial_rows - df.shape[0]
print(f"Removed {removed_rows} rows with negative 'PreviousSalePrice' or 'Windows'.")

# 4. Check for Nulls.
print("\nNull Values:\n", df.isnull().sum())

# Show cleaned dataframe info
print("\nCleaned Data Info:")
df.info()

Removed 78 rows with negative 'PreviousSalePrice' or 'Windows'.

Null Values:
 Price                   0
SquareFootage           0
NumBathrooms            0
NumBedrooms             0
BackyardSpace           0
CrimeRate               0
SchoolRating            0
AgeOfHome               0
DistanceToCityCenter    0
EmploymentRate          0
PropertyTaxRate         0
RenovationQuality       0
LocalAmenities          0
TransportAccess         0
Fireplace               0
Garage                  0
Floors                  0
Windows                 0
PreviousSalePrice       0
IsLuxury                0
HouseColor_Green        0
HouseColor_Red          0
HouseColor_White        0
HouseColor_Yellow       0
dtype: int64

Cleaned Data Info:
<class 'pandas.core.frame.DataFrame'>
Index: 6922 entries, 0 to 6999
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   Price                 6922 non-null   float64 
 1   S

In [3]:
# Descriptive-statistics cell: summarises the target (IsLuxury) and every
# other column, split by dtype into "continuous" and "categorical" groups.

# Render floats with two decimals in all subsequent displays (global option).
pd.set_option('display.float_format', lambda value: '%.2f' % value)
# To show every column of wide frames, uncomment the next line:
# pd.set_option('display.max_columns', None)

# Summary of the dependent variable.
print("Descriptive Statistics for IsLuxury:\n")
display(df['IsLuxury'].describe())

# Every column except the target is treated as an independent feature.
initial_independent_features = [name for name in df.columns if name != 'IsLuxury']

# Columns with category or bool dtype are treated as categorical;
# everything else goes into the continuous group.
categorical_independent_features = [
    name for name in initial_independent_features
    if df[name].dtype == 'category' or df[name].dtype == 'bool'
]
continuous_independent_features = [
    name for name in initial_independent_features
    if name not in categorical_independent_features
]

# Continuous variables: one describe() per column, stacked so that each
# feature becomes a row of the summary table.
print("\nDescriptive Statistics for Continuous Independent Variables:\n")
continuous_stats_list = [
    df[name].describe().rename(name) for name in continuous_independent_features
]
continuous_stats_df = pd.concat(continuous_stats_list, axis=1).T
display(continuous_stats_df)

# Categorical variables: same layout, built from each column's describe().
print("\nDescriptive Statistics for Categorical Independent Variables:\n")
categorical_stats_list = [
    df[name].describe().rename(name) for name in categorical_independent_features
]
categorical_stats_df = pd.concat(categorical_stats_list, axis=1).T
display(categorical_stats_df)

Descriptive Statistics for IsLuxury:



count     6922
unique       2
top          1
freq      3501
Name: IsLuxury, dtype: int64


Descriptive Statistics for Continuous Independent Variables:



Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Price,6922.0,308243.94,150002.9,85000.0,193197.56,280439.37,392825.24,1046675.64
SquareFootage,6922.0,1050.99,425.84,550.0,663.71,1001.09,1344.42,2874.7
NumBathrooms,6922.0,2.13,0.95,1.0,1.29,2.0,2.77,5.81
NumBedrooms,6922.0,3.01,1.02,1.0,2.0,3.0,4.0,7.0
BackyardSpace,6922.0,511.23,280.0,0.39,300.71,495.81,703.9,1631.36
CrimeRate,6922.0,31.22,18.03,0.03,17.41,30.34,43.64,99.73
SchoolRating,6922.0,6.95,1.89,0.22,5.66,7.01,8.36,10.0
AgeOfHome,6922.0,46.86,31.82,0.01,20.77,42.69,67.31,178.68
DistanceToCityCenter,6922.0,17.44,11.99,0.0,7.81,15.6,25.17,65.2
EmploymentRate,6922.0,93.71,4.51,72.05,90.62,94.01,97.41,99.9



Descriptive Statistics for Categorical Independent Variables:



Unnamed: 0,count,unique,top,freq
Fireplace,6922,2,0,5111
Garage,6922,2,0,4436
HouseColor_Green,6922,2,False,5566
HouseColor_Red,6922,2,False,5592
HouseColor_White,6922,2,False,5490
HouseColor_Yellow,6922,2,False,5513
