In [3]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

In [4]:
# Load the data into a pandas DataFrame
data_df = pd.read_csv("./Resource/clean_data.csv")
print(data_df.isnull().sum())

LotArea          0
LotShape         0
BldgType         0
YearBuilt        0
Neighborhood     0
YearRemodAdd     0
Heating          0
TtlHouseSF       0
FullBath         0
HalfBath         0
BedroomAbvGr     0
Fireplaces       0
GarageType      71
GarageCars       0
PavedDrive       0
PoolArea         0
YrSold           0
SalePrice        0
dtype: int64


In [5]:
data_df.dropna(inplace=True)
data_df.fillna(data_df.mean(), inplace=True)
print(data_df.isnull().sum())
data_df

LotArea         0
LotShape        0
BldgType        0
YearBuilt       0
Neighborhood    0
YearRemodAdd    0
Heating         0
TtlHouseSF      0
FullBath        0
HalfBath        0
BedroomAbvGr    0
Fireplaces      0
GarageType      0
GarageCars      0
PavedDrive      0
PoolArea        0
YrSold          0
SalePrice       0
dtype: int64


  


Unnamed: 0,LotArea,LotShape,BldgType,YearBuilt,Neighborhood,YearRemodAdd,Heating,TtlHouseSF,FullBath,HalfBath,BedroomAbvGr,Fireplaces,GarageType,GarageCars,PavedDrive,PoolArea,YrSold,SalePrice
0,8400,Reg,1Fam,1950,Sawyer West,1950,GasA,1562,1,0,2,False,CarPort,1,N,False,2009,82000
1,9000,Reg,1Fam,1928,Sawyer West,1950,GasA,1568,1,0,2,False,Detchd,2,N,False,2009,76000
2,8400,Reg,Duplex,1977,Sawyer West,1977,GasA,2020,2,0,4,True,Detchd,2,Y,False,2007,144000
3,7800,Reg,1Fam,1965,Sawyer West,1965,GasA,1793,1,0,3,False,Detchd,1,Y,False,2006,119900
4,8900,Reg,1Fam,1966,Sawyer West,1966,GasA,2112,1,0,2,False,Detchd,1,Y,False,2006,107000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1145,1680,Reg,Twnhs,1972,Briardale,1972,GasA,1932,2,1,3,False,Detchd,1,Y,False,2009,119500
1146,2368,Reg,TwnhsE,1970,Briardale,1970,GasA,2130,1,1,3,False,Attchd,2,Y,False,2009,125000
1147,1953,Reg,Twnhs,1973,Briardale,1973,GasA,1470,1,1,2,False,Detchd,1,Y,False,2006,83000
1148,1950,Reg,Twnhs,1980,Bluestem,1980,GasA,2272,2,1,3,True,Attchd,2,Y,False,2008,151000


In [6]:
# Define the predictor variables and the target variable
X = data_df[["LotArea", "LotShape", "BldgType", "YearBuilt", "Neighborhood", "YearRemodAdd", "Heating", "TtlHouseSF",
             "FullBath", "HalfBath", "BedroomAbvGr", "Fireplaces", "GarageType", "GarageCars", "PavedDrive", "PoolArea",
            "YrSold"]]
y = data_df["SalePrice"]


In [7]:

# Split the data into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Remove rows with NaN values in y_train
y_train = y_train.dropna()

# Remove corresponding rows in X_train
X_train = X_train.loc[y_train.index]

# Create an instance of the LinearRegression model
reg = LinearRegression()


In [8]:
# Make sure sample datasets have the same number of records
print(data_df.shape[0]) 
print(X_train.shape[0])
print(y_train.shape[0])
print(X_test.shape[0])
print(y_test.shape[0])

1079
863
863
216
216


## Encode the other categorical variables to convert categorical values to numeric values

In [9]:

# Create an instance of OneHotEncoder
ohe = OneHotEncoder(handle_unknown='ignore')

# Fit and transform the LotShape column of the training set
X_train_encoded = ohe.fit_transform(X_train[['LotShape']])

# Transform the LotShape column of the test set
X_test_encoded = ohe.transform(X_test[['LotShape']])

# Convert the encoded data to a DataFrame
X_train_encoded = pd.DataFrame(X_train_encoded.toarray(), columns=ohe.get_feature_names_out(['LotShape']))
X_test_encoded = pd.DataFrame(X_test_encoded.toarray(), columns=ohe.get_feature_names_out(['LotShape']))

# Concatenate the encoded DataFrame to the original DataFrame
X_train = pd.concat([X_train, X_train_encoded], axis=1)
X_test = pd.concat([X_test, X_test_encoded], axis=1)

# drop the original column
X_train.drop(columns=["LotShape"], inplace=True)
X_test.drop(columns=["LotShape"], inplace=True)

In [10]:
# Fit and transform the LotShape column of the training set
X_train_encoded = ohe.fit_transform(X_train[['BldgType']])

# Transform the LotShape column of the test set
X_test_encoded = ohe.transform(X_test[['BldgType']])

# Convert the encoded data to a DataFrame
X_train_encoded = pd.DataFrame(X_train_encoded.toarray(), columns=ohe.get_feature_names_out(['BldgType']))
X_test_encoded = pd.DataFrame(X_test_encoded.toarray(), columns=ohe.get_feature_names_out(['BldgType']))

# Concatenate the encoded DataFrame to the original DataFrame
X_train = pd.concat([X_train, X_train_encoded], axis=1)
X_test = pd.concat([X_test, X_test_encoded], axis=1)

# drop the original column
X_train.drop(columns=["BldgType"], inplace=True)
X_test.drop(columns=["BldgType"], inplace=True)

In [11]:

# Fit and transform the LotShape column of the training set
X_train_encoded = ohe.fit_transform(X_train[['Neighborhood']])

# Transform the LotShape column of the test set
X_test_encoded = ohe.transform(X_test[['Neighborhood']])

# Convert the encoded data to a DataFrame
X_train_encoded = pd.DataFrame(X_train_encoded.toarray(), columns=ohe.get_feature_names_out(['Neighborhood']))
X_test_encoded = pd.DataFrame(X_test_encoded.toarray(), columns=ohe.get_feature_names_out(['Neighborhood']))

# Concatenate the encoded DataFrame to the original DataFrame
X_train = pd.concat([X_train, X_train_encoded], axis=1)
X_test = pd.concat([X_test, X_test_encoded], axis=1)

# drop the original column
X_train.drop(columns=["Neighborhood"], inplace=True)
X_test.drop(columns=["Neighborhood"], inplace=True)

In [12]:

# Fit and transform the LotShape column of the training set
X_train_encoded = ohe.fit_transform(X_train[['Heating']])

# Transform the LotShape column of the test set
X_test_encoded = ohe.transform(X_test[['Heating']])

# Convert the encoded data to a DataFrame
X_train_encoded = pd.DataFrame(X_train_encoded.toarray(), columns=ohe.get_feature_names_out(['Heating']))
X_test_encoded = pd.DataFrame(X_test_encoded.toarray(), columns=ohe.get_feature_names_out(['Heating']))

# Concatenate the encoded DataFrame to the original DataFrame
X_train = pd.concat([X_train, X_train_encoded], axis=1)
X_test = pd.concat([X_test, X_test_encoded], axis=1)

# drop the original column
X_train.drop(columns=["Heating"], inplace=True)
X_test.drop(columns=["Heating"], inplace=True)

In [13]:

# Fit and transform the LotShape column of the training set
X_train_encoded = ohe.fit_transform(X_train[['Fireplaces']])

# Transform the LotShape column of the test set
X_test_encoded = ohe.transform(X_test[['Fireplaces']])

# Convert the encoded data to a DataFrame
X_train_encoded = pd.DataFrame(X_train_encoded.toarray(), columns=ohe.get_feature_names_out(['Fireplaces']))
X_test_encoded = pd.DataFrame(X_test_encoded.toarray(), columns=ohe.get_feature_names_out(['Fireplaces']))

# Concatenate the encoded DataFrame to the original DataFrame
X_train = pd.concat([X_train, X_train_encoded], axis=1)
X_test = pd.concat([X_test, X_test_encoded], axis=1)

# drop the original column
X_train.drop(columns=["Fireplaces"], inplace=True)
X_test.drop(columns=["Fireplaces"], inplace=True)

In [14]:

# Fit and transform the LotShape column of the training set
X_train_encoded = ohe.fit_transform(X_train[['GarageType']])

# Transform the LotShape column of the test set
X_test_encoded = ohe.transform(X_test[['GarageType']])

# Convert the encoded data to a DataFrame
X_train_encoded = pd.DataFrame(X_train_encoded.toarray(), columns=ohe.get_feature_names_out(['GarageType']))
X_test_encoded = pd.DataFrame(X_test_encoded.toarray(), columns=ohe.get_feature_names_out(['GarageType']))

# Concatenate the encoded DataFrame to the original DataFrame
X_train = pd.concat([X_train, X_train_encoded], axis=1)
X_test = pd.concat([X_test, X_test_encoded], axis=1)

# drop the original column
X_train.drop(columns=["GarageType"], inplace=True)
X_test.drop(columns=["GarageType"], inplace=True)

In [15]:

# Fit and transform the LotShape column of the training set
X_train_encoded = ohe.fit_transform(X_train[['PavedDrive']])

# Transform the LotShape column of the test set
X_test_encoded = ohe.transform(X_test[['PavedDrive']])

# Convert the encoded data to a DataFrame
X_train_encoded = pd.DataFrame(X_train_encoded.toarray(), columns=ohe.get_feature_names_out(['PavedDrive']))
X_test_encoded = pd.DataFrame(X_test_encoded.toarray(), columns=ohe.get_feature_names_out(['PavedDrive']))

# Concatenate the encoded DataFrame to the original DataFrame
X_train = pd.concat([X_train, X_train_encoded], axis=1)
X_test = pd.concat([X_test, X_test_encoded], axis=1)

# drop the original column
X_train.drop(columns=["PavedDrive"], inplace=True)
X_test.drop(columns=["PavedDrive"], inplace=True)

In [16]:
# Fit and transform the LotShape column of the training set
X_train_encoded = ohe.fit_transform(X_train[['PoolArea']])

# Transform the LotShape column of the test set
X_test_encoded = ohe.transform(X_test[['PoolArea']])

# Convert the encoded data to a DataFrame
X_train_encoded = pd.DataFrame(X_train_encoded.toarray(), columns=ohe.get_feature_names_out(['PoolArea']))
X_test_encoded = pd.DataFrame(X_test_encoded.toarray(), columns=ohe.get_feature_names_out(['PoolArea']))

# Concatenate the encoded DataFrame to the original DataFrame
X_train = pd.concat([X_train, X_train_encoded], axis=1)
X_test = pd.concat([X_test, X_test_encoded], axis=1)

# drop the original column
X_train.drop(columns=["PoolArea"], inplace=True)
X_test.drop(columns=["PoolArea"], inplace=True)

In [17]:
# Remove rows with NaN values in y_train
y_train = y_train.dropna()

# Remove corresponding rows in X_train
X_train = X_train.loc[y_train.index]
X_test = X_test.loc[y_test.index]

print(data_df.shape[0]) 
print(X_train.shape[0])
print(y_train.shape[0])
print(X_test.shape[0])
print(y_test.shape[0])
#print(y_pred.shape[0])

1079
863
863
216
216


In [19]:
X_test.dropna(inplace=True)
y_test = y_test[X_test.index]
X_train.dropna(inplace=True)
y_train = y_train[X_train.index]

#print(y_pred.shape[0])# should be equal to y_train.shape[0]

print(y_test.shape[0])
print(X_test.shape[0])

37
37


In [20]:
# Fit the model to the training data
reg.fit(X_train, y_train)

# Make predictions on the test set
y_pred = reg.predict(X_test)


In [21]:
# Add a new column to the original dataset DataFrame
data_df["Predicted_Price"] = np.nan

data_df.reset_index(drop=True, inplace=True)

# Add the actual value of SalePrice to the original dataset for comparison purpose
data_df.loc[X_test.index, "Actual_Price"] = y_test

# Assign the predicted values to the new column
data_df.loc[X_test.index, "Predicted_Price"] = y_pred
data_df


Unnamed: 0,LotArea,LotShape,BldgType,YearBuilt,Neighborhood,YearRemodAdd,Heating,TtlHouseSF,FullBath,HalfBath,BedroomAbvGr,Fireplaces,GarageType,GarageCars,PavedDrive,PoolArea,YrSold,SalePrice,Predicted_Price,Actual_Price
0,8400,Reg,1Fam,1950,Sawyer West,1950,GasA,1562,1,0,2,False,CarPort,1,N,False,2009,82000,,
1,9000,Reg,1Fam,1928,Sawyer West,1950,GasA,1568,1,0,2,False,Detchd,2,N,False,2009,76000,,
2,8400,Reg,Duplex,1977,Sawyer West,1977,GasA,2020,2,0,4,True,Detchd,2,Y,False,2007,144000,126156.508353,144000.0
3,7800,Reg,1Fam,1965,Sawyer West,1965,GasA,1793,1,0,3,False,Detchd,1,Y,False,2006,119900,105451.087831,119900.0
4,8900,Reg,1Fam,1966,Sawyer West,1966,GasA,2112,1,0,2,False,Detchd,1,Y,False,2006,107000,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1074,1680,Reg,Twnhs,1972,Briardale,1972,GasA,1932,2,1,3,False,Detchd,1,Y,False,2009,119500,,
1075,2368,Reg,TwnhsE,1970,Briardale,1970,GasA,2130,1,1,3,False,Attchd,2,Y,False,2009,125000,,
1076,1953,Reg,Twnhs,1973,Briardale,1973,GasA,1470,1,1,2,False,Detchd,1,Y,False,2006,83000,,
1077,1950,Reg,Twnhs,1980,Bluestem,1980,GasA,2272,2,1,3,True,Attchd,2,Y,False,2008,151000,,


In [23]:
# Write the dataframe to a CSV file
data_df.to_csv('./Resource/predicted_data1.csv', index=False)

## Evaluate the model

In [24]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, explained_variance_score, mean_squared_log_error, median_absolute_error

# Mean Absolute Error
mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error:", mae)

# Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Root Mean Squared Error
rmse = np.sqrt(mse)
print("Root Mean Squared Error:", rmse)

# R-squared
r2 = r2_score(y_test, y_pred)
print("R-squared:", r2)

#Adjusted R-squared
n = X_test.shape[0]
p = X_test.shape[1]
adjusted_r2 = 1 - (1-r2)*(n-1)/(n-p-1)
print("Adjusted R-squared:", adjusted_r2)

#Explained variance score
exp_var_score = explained_variance_score(y_test, y_pred)
print("Explained variance score:", exp_var_score)

#Mean squared logarithmic error
msle = mean_squared_log_error(y_test, y_pred)
print("Mean squared logarithmic error:", msle)

#Median absolute error
med_ae = median_absolute_error(y_test, y_pred)
print("Median absolute error:", med_ae)


Mean Absolute Error: 20260.396655587145
Mean Squared Error: 617347195.2608962
Root Mean Squared Error: 24846.472491299366
R-squared: 0.8080992697887824
Adjusted R-squared: 1.2228524608904463
Explained variance score: 0.8226524384423772
Mean squared logarithmic error: 0.034032733359603974
Median absolute error: 18077.995505210944
