In [1]:
import pandas as pd # python's data handling package
import numpy as np # python's scientific computing package
import matplotlib.pyplot as plt # python's plotting package
import openpyxl
from sklearn.metrics import mean_squared_error as mse
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.linear_model import Ridge, Lasso
from sklearn.preprocessing import StandardScaler



In [2]:
original_data = pd.read_excel('Original_Data.xlsx', engine='openpyxl', header=3) 
original_data = original_data.iloc[:,:-2]
print(original_data.head())

print(original_data['LotFrontage'].describe())
print(original_data.isnull().sum())

   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12   

  YrSold  SaleType  SaleCondition  SalePrice  
0   2008        WD   

In [3]:
original_data

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2903,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,,,,0,6,2006,WD,Normal,90500
2904,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2006,WD,Abnorml,71000
2905,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,,,,0,9,2006,WD,Abnorml,131000
2906,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,Shed,700,7,2006,WD,Normal,132000


In [4]:
features = ['LotFrontage', 'OverallQual', 'YearBuilt', 'LotArea', 'SalePrice']
subset = original_data[features]

In [5]:
train = subset.iloc[:1800] 
val = subset.iloc[1800:2400]

X_train, X_val = train.drop('SalePrice', axis=1), val.drop('SalePrice', axis=1)
y_train, y_val = train[['SalePrice']], val[['SalePrice']] 


In [6]:
print(X_train.isnull().sum())

LotFrontage    311
OverallQual      0
YearBuilt        0
LotArea          0
dtype: int64


In [7]:
mses_mean_imputation = {}
mses_median_imputation = {}
mses_knn_imputation = {}

# Mean Imputer
mean_imputer = SimpleImputer(strategy='mean')
# Deep copy before transformation
X_train_mean_dc = X_train.copy(deep=True)
X_val_mean_dc = X_val.copy(deep=True)
X_train_mean = mean_imputer.fit_transform(X_train_mean_dc)
X_val_mean = mean_imputer.transform(X_val_mean_dc)

# Median Imputer
median_imputer = SimpleImputer(strategy='median')
# Deep copy before transformation
X_train_median_dc = X_train.copy(deep=True)
X_val_median_dc = X_val.copy(deep=True)
X_train_median = median_imputer.fit_transform(X_train_median_dc)
X_val_median = median_imputer.transform(X_val_median_dc)

# KNN Imputer
knn_imputer = KNNImputer(n_neighbors=5)
# Deep copy before transformation
X_train_knn_dc = X_train.copy(deep=True)
X_val_knn_dc = X_val.copy(deep=True)
X_train_knn = knn_imputer.fit_transform(X_train_knn_dc)
X_val_knn = knn_imputer.transform(X_val_knn_dc)

# Repeat deep copying for any additional transformations as necessary, using the same pattern:
# For example, if applying transformations again for a specific analysis:
ls_X_train_mean_dc = X_train.copy(deep=True)
ls_X_val_mean_dc = X_val.copy(deep=True)
ls_X_train_mean = mean_imputer.fit_transform(ls_X_train_mean_dc)
ls_X_val_mean = mean_imputer.transform(ls_X_val_mean_dc)

ls_X_train_median_dc = X_train.copy(deep=True)
ls_X_val_median_dc = X_val.copy(deep=True)
ls_X_train_median = median_imputer.fit_transform(ls_X_train_median_dc)
ls_X_val_median = median_imputer.transform(ls_X_val_median_dc)

ls_X_train_knn_dc = X_train.copy(deep=True)
ls_X_val_knn_dc = X_val.copy(deep=True)
ls_X_train_knn = knn_imputer.fit_transform(ls_X_train_knn_dc)
ls_X_val_knn = knn_imputer.transform(ls_X_val_knn_dc)


scaler = StandardScaler()

# Standardize X_train_mean and overwrite
X_train_mean = scaler.fit_transform(X_train_mean)

# Standardize X_train_median and overwrite
# It's important to fit the scaler again for accurate mean and std
X_train_median = scaler.fit_transform(X_train_median)

# Standardize X_train_knn and overwrite
# Fit the scaler again for this dataset
X_train_knn = scaler.fit_transform(X_train_knn)

scaler_mean = StandardScaler().fit(X_train_mean)
scaler_median = StandardScaler().fit(X_train_median)
scaler_knn = StandardScaler().fit(X_train_knn)

# Apply the fitted scalers to the corresponding validation datasets
X_val_mean = scaler_mean.transform(X_val_mean)
X_val_median = scaler_median.transform(X_val_median)
X_val_knn = scaler_knn.transform(X_val_knn)

In [8]:
# Assumed datasets
# X_train_mean, X_val_mean, y_train, y_val_mean (mean-imputed training and validation sets)
# X_train_median, X_val_median, y_train, y_val_median (median-imputed training and validation sets)
# X_train_knn, X_val_knn, y_train, y_val_knn (KNN-imputed training and validation sets)

# Define your alpha values
alphas = [0.01*1800, 0.02*1800, 0.03*1800, 0.04*1800, 0.05*1800, 0.075*1800, 0.1*1800, 0.2*1800, 0.4*1800]

# Placeholder for MSE results
mses = {
    'mean': [],
    'median': [],
    'knn': []
}

# Datasets dictionary for easier iteration
datasets = {
    'mean': (X_train_mean, X_val_mean, y_val),
    'median': (X_train_median, X_val_median, y_val),
    'knn': (X_train_knn, X_val_knn, y_val)
}

# Loop over each alpha
for alpha in alphas:
    print(f"Alpha: {alpha}")
    
    # Evaluate each dataset
    for imputation_type, (X_train, X_val, y_val) in datasets.items():
        ridge = Ridge(alpha=alpha)
        ridge.fit(X_train, y_train)  # Fit model using training data
        pred = ridge.predict(X_val)  # Predict on respective validation set
        mse_value = mse(y_val, pred)  # Calculate MSE for the validation set
        mses[imputation_type].append(mse_value)  # Store MSE
        print(f"{imputation_type} MSE: {mse_value}")


Alpha: 18.0
mean MSE: 2.5615811826563556e+16
median MSE: 2.5665524453118476e+16
knn MSE: 2.2491230587414508e+16
Alpha: 36.0
mean MSE: 2.552168919665235e+16
median MSE: 2.557051607312204e+16
knn MSE: 2.2452905222510584e+16
Alpha: 54.0
mean MSE: 2.5418371967882764e+16
median MSE: 2.5466331114123936e+16
knn MSE: 2.2404761535591856e+16
Alpha: 72.0
mean MSE: 2.530682237038894e+16
median MSE: 2.5353931668114492e+16
knn MSE: 2.2347694225895616e+16
Alpha: 90.0
mean MSE: 2.5187911759063516e+16
median MSE: 2.5234188904206364e+16
knn MSE: 2.228251925389069e+16
Alpha: 135.0
mean MSE: 2.486343538072442e+16
median MSE: 2.49077080240653e+16
knn MSE: 2.2088842475882024e+16
Alpha: 180.0
mean MSE: 2.4508170445407776e+16
median MSE: 2.4550543519963396e+16
knn MSE: 2.1858975177303696e+16
Alpha: 360.0
mean MSE: 2.2920597785182456e+16
median MSE: 2.2956322518505652e+16
knn MSE: 2.0716598650019476e+16
Alpha: 720.0
mean MSE: 1.966980959234326e+16
median MSE: 1.9695806394200964e+16
knn MSE: 1.811696460000892e+

Modify ^ so that it only selects the best mean media knn alpha for each run - have a comparison loop here to do so 

In [9]:

lr1 = LinearRegression()
lr2 = LinearRegression()
lr3 = LinearRegression()
lr1.fit(X_train_mean, y_train)
pred_lr = lr1.predict(X_val_mean)
mses_mean_imputation['Linear Regression'] = mse(y_val, pred_lr)

lr2.fit(X_train_median, y_train)
pred_lr = lr2.predict(X_val_median)
mses_median_imputation['Linear Regression'] = mse(y_val, pred_lr)

lr3.fit(X_train_knn, y_train)
pred_lr = lr3.predict(X_val_knn)
mses_knn_imputation['Linear Regression'] = mse(y_val, pred_lr)


In [10]:
ridge = Ridge(alpha=0.1 * len(X_train))  # Adjust alpha as necessary
ridge.fit(X_train_mean, y_train)
pred_ridge = ridge.predict(X_val_mean)
mses_mean_imputation['Ridge'] = mse(y_val, pred_ridge)

ridge.fit(X_train_median, y_train)
pred_ridge = ridge.predict(X_val_median)
mses_median_imputation['Ridge'] = mse(y_val, pred_ridge)

ridge.fit(X_train_knn, y_train)
pred_ridge = ridge.predict(X_val_knn)
mses_knn_imputation['Ridge'] = mse(y_val, pred_ridge)

In [13]:
lasso = Lasso(alpha=0.05)  # Adjust alpha as necessary
lasso.fit(X_train_mean, y_train)
pred_lasso = lasso.predict(X_val_mean)
mses_mean_imputation['Lasso'] = mse(y_val, pred_lasso)

lasso.fit(X_train_median, y_train)
pred_lasso = lasso.predict(X_val_median)
mses_median_imputation['Lasso'] = mse(y_val, pred_lasso)

lasso.fit(X_train_knn, y_train)
pred_lasso = lasso.predict(X_val_knn)
mses_knn_imputation['Lasso'] = mse(y_val, pred_lasso)

In [15]:
print("MSE with Mean Imputation:")
for model, mse_val in mses_mean_imputation.items():
    print(f"{model}: {mse_val}")
    
print("\nMSE with Median Imputation:")
for model, mse_val in mses_median_imputation.items():
    print(f"{model}: {mse_val}")
    
print("\nMSE with KNN Imputation:")
for model, mse_val in mses_knn_imputation.items():
    print(f"{model}: {mse_val}")

MSE with Mean Imputation:
Linear Regression: 2.569967621697147e+16
Ridge: 2.4508170445407776e+16
Lasso: 2.569951684895292e+16

MSE with Median Imputation:
Linear Regression: 2.5750292692141772e+16
Ridge: 2.4550543519963396e+16
Lasso: 2.575013307790789e+16

MSE with KNN Imputation:
Linear Regression: 2.2518755595402316e+16
Ridge: 2.1858975177303696e+16
Lasso: 2.2518611652947492e+16


Make a copy of this entire process and redo it when including Lot Shape 