In [1]:
import pandas as pd # python's data handling package
import numpy as np # python's scientific computing package
import matplotlib.pyplot as plt # python's plotting package
import openpyxl
from sklearn.metrics import mean_squared_error as mse
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.linear_model import Ridge, Lasso
from sklearn.preprocessing import StandardScaler

In [2]:
original_data = pd.read_excel('Original_Data.xlsx', engine='openpyxl', header=3) 
original_data = original_data.iloc[:,:-2]
print(original_data.head())

print(original_data['LotFrontage'].describe())
print(original_data.isnull().sum())

   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12   

  YrSold  SaleType  SaleCondition  SalePrice  
0   2008        WD   

In [3]:
original_data.keys()

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [4]:
features = ['LotFrontage', 'OverallQual', 'YearBuilt', 'LotArea', 'SalePrice', 'LotShape', 'GarageArea', 'OpenPorchSF']
subset = original_data[features]

In [5]:
train = subset.iloc[:1800] 
val = subset.iloc[1800:2400]

X_train, X_val = train.drop('SalePrice', axis=1), val.drop('SalePrice', axis=1)
y_train, y_val = train[['SalePrice']], val[['SalePrice']] 


In [6]:
X_train

Unnamed: 0,LotFrontage,OverallQual,YearBuilt,LotArea,LotShape,GarageArea,OpenPorchSF
0,65.0,7,2003,8450,Reg,548,61
1,80.0,6,1976,9600,Reg,460,0
2,68.0,7,2001,11250,IR1,608,42
3,60.0,7,1915,9550,IR1,642,35
4,84.0,8,2000,14260,IR1,836,84
...,...,...,...,...,...,...,...
1795,80.0,5,1959,8000,Reg,288,0
1796,60.0,4,1949,10800,Reg,384,204
1797,60.0,6,1948,8064,Reg,576,36
1798,76.0,6,1964,7570,Reg,483,55


In [7]:
print(X_train.isnull().sum())

LotFrontage    311
OverallQual      0
YearBuilt        0
LotArea          0
LotShape         0
GarageArea       0
OpenPorchSF      0
dtype: int64


In [8]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.compose import ColumnTransformer

# Assuming X_train and X_val are pandas DataFrames
mses_mean_imputation = {}
mses_median_imputation = {}
mses_knn_imputation = {}
# Select numerical and categorical columns
numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = ['LotShape']  # Specify categorical columns; add more if needed

# Define transformers
one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
mean_imputer = SimpleImputer(strategy='mean')
median_imputer = SimpleImputer(strategy='median')
knn_imputer = KNNImputer(n_neighbors=5)

# Create column transformers for one-hot encoding of categorical columns
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', one_hot_encoder, categorical_cols),
    ],
    remainder='passthrough'  # passthrough numerical columns as is
)

# Deep copy datasets
X_train_dc = X_train.copy(deep=True)
X_val_dc = X_val.copy(deep=True)

# Apply one-hot encoding to 'LotShape' in the deep copied data
X_train_encoded = preprocessor.fit_transform(X_train_dc)
X_val_encoded = preprocessor.transform(X_val_dc)

# Convert encoded arrays back to DataFrames
# Extract new column names for the one-hot encoded categories
columns_transformed = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_cols)
new_columns = list(columns_transformed) + [col for col in numerical_cols if col not in categorical_cols]

X_train_encoded_df = pd.DataFrame(X_train_encoded, columns=new_columns)
X_val_encoded_df = pd.DataFrame(X_val_encoded, columns=new_columns)

# Now apply imputation strategies on the one-hot encoded data
# Mean Imputation
X_train_mean = mean_imputer.fit_transform(X_train_encoded_df)
X_val_mean = mean_imputer.transform(X_val_encoded_df)

# Median Imputation
X_train_median = median_imputer.fit_transform(X_train_encoded_df)
X_val_median = median_imputer.transform(X_val_encoded_df)

# KNN Imputation
X_train_knn = knn_imputer.fit_transform(X_train_encoded_df)
X_val_knn = knn_imputer.transform(X_val_encoded_df)

scaler = StandardScaler()

# Standardize X_train_mean and overwrite
X_train_mean = scaler.fit_transform(X_train_mean)

# Standardize X_train_median and overwrite
# It's important to fit the scaler again for accurate mean and std
X_train_median = scaler.fit_transform(X_train_median)

# Standardize X_train_knn and overwrite
# Fit the scaler again for this dataset
X_train_knn = scaler.fit_transform(X_train_knn)

scaler_mean = StandardScaler().fit(X_train_mean)
scaler_median = StandardScaler().fit(X_train_median)
scaler_knn = StandardScaler().fit(X_train_knn)

# Apply the fitted scalers to the corresponding validation datasets
X_val_mean = scaler_mean.transform(X_val_mean)
X_val_median = scaler_median.transform(X_val_median)
X_val_knn = scaler_knn.transform(X_val_knn)




In [9]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error as mse

# Assumed datasets
# X_train_mean, X_val_mean, y_train, y_val_mean (mean-imputed training and validation sets)
# X_train_median, X_val_median, y_train, y_val_median (median-imputed training and validation sets)
# X_train_knn, X_val_knn, y_train, y_val_knn (KNN-imputed training and validation sets)

# Define your alpha values
alphas = [0.01*1800, 0.02*1800, 0.03*1800, 0.04*1800, 0.05*1800, 0.075*1800, 0.1*1800, 0.2*1800, 0.4*1800]

# Placeholder for MSE results
mses = {
    'mean': [],
    'median': [],
    'knn': []
}

# Datasets dictionary for easier iteration
datasets = {
    'mean': (X_train_mean, X_val_mean, y_val),
    'median': (X_train_median, X_val_median, y_val),
    'knn': (X_train_knn, X_val_knn, y_val)
}

# Loop over each alpha
for alpha in alphas:
    print(f"Alpha: {alpha}")
    
    # Evaluate each dataset
    for imputation_type, (X_train, X_val, y_val) in datasets.items():
        ridge = Ridge(alpha=alpha)
        ridge.fit(X_train, y_train)  # Fit model using training data
        pred = ridge.predict(X_val)  # Predict on respective validation set
        mse_value = mse(y_val, pred)  # Calculate MSE for the validation set
        mses[imputation_type].append(mse_value)  # Store MSE
        print(f"{imputation_type} MSE: {mse_value}")


Alpha: 18.0
mean MSE: 1.821182021300393e+16
median MSE: 1.8219679942431436e+16
knn MSE: 1.7121972649608796e+16
Alpha: 36.0
mean MSE: 1.8090600228059852e+16
median MSE: 1.8098194098572336e+16
knn MSE: 1.701915930620342e+16
Alpha: 54.0
mean MSE: 1.7966948989914642e+16
median MSE: 1.7974296649147826e+16
knn MSE: 1.691348514799728e+16
Alpha: 72.0
mean MSE: 1.7841332106950028e+16
median MSE: 1.7848451456674236e+16
knn MSE: 1.6805425532583872e+16
Alpha: 90.0
mean MSE: 1.771415780821307e+16
median MSE: 1.772106517628459e+16
knn MSE: 1.6695398664833564e+16
Alpha: 135.0
mean MSE: 1.7391651989548258e+16
median MSE: 1.7398091849886846e+16
knn MSE: 1.6414034759465558e+16
Alpha: 180.0
mean MSE: 1.7066033928231266e+16
median MSE: 1.7072081095147458e+16
knn MSE: 1.6127218097893594e+16
Alpha: 360.0
mean MSE: 1.5785165472521816e+16
median MSE: 1.5790140262386798e+16
knn MSE: 1.498112702743423e+16
Alpha: 720.0
mean MSE: 1.35242954917264e+16
median MSE: 1.3528234572647024e+16
knn MSE: 1.291666016836939e+

Modify ^ so that it only selects the best mean media knn alpha for each run 

In [10]:

lr1 = LinearRegression()
lr2 = LinearRegression()
lr3 = LinearRegression()
lr1.fit(X_train_mean, y_train)
pred_lr = lr1.predict(X_val_mean)
mses_mean_imputation['Linear Regression'] = mse(y_val, pred_lr)

lr2.fit(X_train_median, y_train)
pred_lr = lr2.predict(X_val_median)
mses_median_imputation['Linear Regression'] = mse(y_val, pred_lr)

lr3.fit(X_train_knn, y_train)
pred_lr = lr3.predict(X_val_knn)
mses_knn_imputation['Linear Regression'] = mse(y_val, pred_lr)


In [11]:
ridge = Ridge(alpha=0.1 * len(X_train))  # Adjust alpha as necessary
ridge.fit(X_train_mean, y_train)
pred_ridge = ridge.predict(X_val_mean)
mses_mean_imputation['Ridge'] = mse(y_val, pred_ridge)

ridge.fit(X_train_median, y_train)
pred_ridge = ridge.predict(X_val_median)
mses_median_imputation['Ridge'] = mse(y_val, pred_ridge)

ridge.fit(X_train_knn, y_train)
pred_ridge = ridge.predict(X_val_knn)
mses_knn_imputation['Ridge'] = mse(y_val, pred_ridge)

In [12]:
# from sklearn.linear_model import Lasso
# lasso = Lasso(alpha=0.05)
# lasso.fit(X_train, y_train)

In [13]:
# coeffs = pd.DataFrame(
#     [
#         ['intercept'] + list(X_train.columns),
#         list(lasso.intercept_) + list(lasso.coef_)
#     ]
# ).transpose().set_index(0)
# coeffs

In [14]:
lasso = Lasso(alpha=0.05)  # Adjust alpha as necessary
lasso.fit(X_train_mean, y_train)
pred_lasso = lasso.predict(X_val_mean)
mses_mean_imputation['Lasso wiht Lot Shape'] = mse(y_val, pred_lasso)

lasso.fit(X_train_median, y_train)
pred_lasso = lasso.predict(X_val_median)
mses_median_imputation['Lasso with Lot Shape'] = mse(y_val, pred_lasso)

lasso.fit(X_train_knn, y_train)
pred_lasso = lasso.predict(X_val_knn)
mses_knn_imputation['Lasso with Lot Shape'] = mse(y_val, pred_lasso)

In [15]:
# lasso.fit(ls_X_train_mean, y_train)
# ls_pred_lasso_mean = lasso.predict(ls_X_val_mean)
# mses_mean_imputation['Lasso with Lot Shape'] = mse(y_val, ls_pred_lasso_mean)

# lasso.fit(ls_X_train_median, y_train)
# ls_pred_lasso_median = lasso.predict(ls_X_val_median)
# mses_median_imputation['Lasso with Lot Shape'] = mse(y_val, ls_pred_lasso_median)

# lasso.fit(ls_X_train_knn, y_train)
# ls_pred_lasso_knn = lasso.predict(ls_X_val_knn)
# mses_knn_imputation['Lasso with Lot Shape'] = mse(y_val, ls_pred_lasso_knn)

In [16]:
print("MSE with Mean Imputation:")
for model, mse_val in mses_mean_imputation.items():
    print(f"{model}: {mse_val}")
    
print("\nMSE with Median Imputation:")
for model, mse_val in mses_median_imputation.items():
    print(f"{model}: {mse_val}")
    
print("\nMSE with KNN Imputation:")
for model, mse_val in mses_knn_imputation.items():
    print(f"{model}: {mse_val}")

MSE with Mean Imputation:
Linear Regression: 2.4334171595377352e+33
Ridge: 1.7066033928231266e+16
Lasso wiht Lot Shape: 1.833029049772613e+16

MSE with Median Imputation:
Linear Regression: 2.4247373547433552e+33
Ridge: 1.7072081095147458e+16
Lasso with Lot Shape: 1.833844026681451e+16

MSE with KNN Imputation:
Linear Regression: 2.7898504182232445e+32
Ridge: 1.6127218097893594e+16
Lasso with Lot Shape: 1.7221533758671714e+16
