In [1]:
import pandas as pd # python's data handling package
import numpy as np # python's scientific computing package
import matplotlib.pyplot as plt # python's plotting package
import openpyxl
from sklearn.metrics import mean_squared_error as mse
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.linear_model import Ridge, Lasso
from sklearn.preprocessing import StandardScaler

In [2]:
original_data = pd.read_excel('Original_Data.xlsx', engine='openpyxl', header=3) 
original_data = original_data.iloc[:,:-2]
print(original_data.head())

print(original_data['LotFrontage'].describe())
print(original_data.isnull().sum())

   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12   

  YrSold  SaleType  SaleCondition  SalePrice  
0   2008        WD   

In [3]:
original_data.keys()

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [4]:
features = ['LotFrontage', 'OverallQual', 'YearBuilt', 'LotArea', 'SalePrice', 'LotShape', 'GarageArea', 'OpenPorchSF']
subset = original_data[features]

In [5]:
X = subset.drop('SalePrice', axis=1)  # Features
y = subset['SalePrice']  # Target variable
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# test_size=0.2 means 20% of the data is used for the test set, and 80% for the training set
# random_state is set to a fixed number for reproducibility


In [6]:
X_train

Unnamed: 0,LotFrontage,OverallQual,YearBuilt,LotArea,LotShape,GarageArea,OpenPorchSF
1505,80.0,5,1966,9600,Reg,490,224
435,43.0,7,1996,10667,IR2,550,61
2785,90.0,6,1962,33120,IR3,576,0
237,,7,1993,9453,IR1,457,70
2698,,5,1967,8789,IR1,288,0
...,...,...,...,...,...,...,...
1638,,8,2000,7500,Reg,588,134
1095,,8,1986,3696,Reg,461,74
1130,90.0,6,1880,9900,Reg,205,48
1294,35.0,6,2005,3675,Reg,525,44


In [7]:
print(X_train.isnull().sum())

LotFrontage    391
OverallQual      0
YearBuilt        0
LotArea          0
LotShape         0
GarageArea       0
OpenPorchSF      0
dtype: int64


In [8]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.compose import ColumnTransformer

# Assuming X_train and X_val are pandas DataFrames
mses_mean_imputation = {}
mses_median_imputation = {}
mses_knn_imputation = {}
# Select numerical and categorical columns
numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = ['LotShape']  # Specify categorical columns; add more if needed

# Define transformers
one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
mean_imputer = SimpleImputer(strategy='mean')
median_imputer = SimpleImputer(strategy='median')
knn_imputer = KNNImputer(n_neighbors=5)

# Create column transformers for one-hot encoding of categorical columns
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', one_hot_encoder, categorical_cols),
    ],
    remainder='passthrough'  # passthrough numerical columns as is
)

# Deep copy datasets
X_train_dc = X_train.copy(deep=True)
X_val_dc = X_val.copy(deep=True)

# Apply one-hot encoding to 'LotShape' in the deep copied data
X_train_encoded = preprocessor.fit_transform(X_train_dc)
X_val_encoded = preprocessor.transform(X_val_dc)

# Convert encoded arrays back to DataFrames
# Extract new column names for the one-hot encoded categories
columns_transformed = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_cols)
new_columns = list(columns_transformed) + [col for col in numerical_cols if col not in categorical_cols]

X_train_encoded_df = pd.DataFrame(X_train_encoded, columns=new_columns)
X_val_encoded_df = pd.DataFrame(X_val_encoded, columns=new_columns)

# Now apply imputation strategies on the one-hot encoded data
# Mean Imputation
X_train_mean = mean_imputer.fit_transform(X_train_encoded_df)
X_val_mean = mean_imputer.transform(X_val_encoded_df)

# Median Imputation
X_train_median = median_imputer.fit_transform(X_train_encoded_df)
X_val_median = median_imputer.transform(X_val_encoded_df)

# KNN Imputation
X_train_knn = knn_imputer.fit_transform(X_train_encoded_df)
X_val_knn = knn_imputer.transform(X_val_encoded_df)

scaler = StandardScaler()

# Standardize X_train_mean and overwrite
X_train_mean = scaler.fit_transform(X_train_mean)

# Standardize X_train_median and overwrite
# It's important to fit the scaler again for accurate mean and std
X_train_median = scaler.fit_transform(X_train_median)

# Standardize X_train_knn and overwrite
# Fit the scaler again for this dataset
X_train_knn = scaler.fit_transform(X_train_knn)

scaler_mean = StandardScaler().fit(X_train_mean)
scaler_median = StandardScaler().fit(X_train_median)
scaler_knn = StandardScaler().fit(X_train_knn)

# Apply the fitted scalers to the corresponding validation datasets
X_val_mean = scaler_mean.transform(X_val_mean)
X_val_median = scaler_median.transform(X_val_median)
X_val_knn = scaler_knn.transform(X_val_knn)




In [9]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error as mse

# Assumed datasets
# X_train_mean, X_val_mean, y_train, y_val_mean (mean-imputed training and validation sets)
# X_train_median, X_val_median, y_train, y_val_median (median-imputed training and validation sets)
# X_train_knn, X_val_knn, y_train, y_val_knn (KNN-imputed training and validation sets)

# Define your alpha values
alphas = [0.01*1800, 0.02*1800, 0.03*1800, 0.04*1800, 0.05*1800, 0.075*1800, 0.1*1800, 0.2*1800, 0.4*1800]

# Placeholder for MSE results
mses = {
    'mean': [],
    'median': [],
    'knn': []
}

# Datasets dictionary for easier iteration
datasets = {
    'mean': (X_train_mean, X_val_mean, y_val),
    'median': (X_train_median, X_val_median, y_val),
    'knn': (X_train_knn, X_val_knn, y_val)
}

# Loop over each alpha
for alpha in alphas:
    print(f"Alpha: {alpha}")
    
    # Evaluate each dataset
    for imputation_type, (X_train, X_val, y_val) in datasets.items():
        ridge = Ridge(alpha=alpha)
        ridge.fit(X_train, y_train)  # Fit model using training data
        pred = ridge.predict(X_val)  # Predict on respective validation set
        mse_value = mse(y_val, pred)  # Calculate MSE for the validation set
        mses[imputation_type].append(mse_value)  # Store MSE
        print(f"{imputation_type} MSE: {mse_value}")


Alpha: 18.0
mean MSE: 2.932822896800378e+16
median MSE: 2.9339707295281516e+16
knn MSE: 2.788762333638185e+16
Alpha: 36.0
mean MSE: 2.9071388943174868e+16
median MSE: 2.9082464235887544e+16
knn MSE: 2.7646947367059344e+16
Alpha: 54.0
mean MSE: 2.8818038767623932e+16
median MSE: 2.882875484820494e+16
knn MSE: 2.740950152436076e+16
Alpha: 72.0
mean MSE: 2.856818891994951e+16
median MSE: 2.8578585409734064e+16
knn MSE: 2.7175304501745584e+16
Alpha: 90.0
mean MSE: 2.8321838712015052e+16
median MSE: 2.833195144577683e+16
knn MSE: 2.6944363011805344e+16
Alpha: 135.0
mean MSE: 2.7721187753009256e+16
median MSE: 2.773072625871225e+16
knn MSE: 2.6381208531941692e+16
Alpha: 180.0
mean MSE: 2.7141968880636164e+16
median MSE: 2.7151091657773036e+16
knn MSE: 2.58380915892962e+16
Alpha: 360.0
mean MSE: 2.50264756267808e+16
median MSE: 2.503492288447124e+16
knn MSE: 2.3854428948342456e+16
Alpha: 720.0
mean MSE: 2.1599122610337004e+16
median MSE: 2.1607955314546884e+16
knn MSE: 2.0640730138791396e+16


Modify ^ so that it only selects the best mean media knn alpha for each run - ask chatgpt to do it

In [10]:

lr1 = LinearRegression()
lr2 = LinearRegression()
lr3 = LinearRegression()
lr1.fit(X_train_mean, y_train)
pred_lr = lr1.predict(X_val_mean)
mses_mean_imputation['Linear Regression'] = mse(y_val, pred_lr)

lr2.fit(X_train_median, y_train)
pred_lr = lr2.predict(X_val_median)
mses_median_imputation['Linear Regression'] = mse(y_val, pred_lr)

lr3.fit(X_train_knn, y_train)
pred_lr = lr3.predict(X_val_knn)
mses_knn_imputation['Linear Regression'] = mse(y_val, pred_lr)


In [11]:
ridge = Ridge(alpha=0.1 * len(X_train))  # Adjust alpha as necessary
ridge.fit(X_train_mean, y_train)
pred_ridge = ridge.predict(X_val_mean)
mses_mean_imputation['Ridge'] = mse(y_val, pred_ridge)

ridge.fit(X_train_median, y_train)
pred_ridge = ridge.predict(X_val_median)
mses_median_imputation['Ridge'] = mse(y_val, pred_ridge)

ridge.fit(X_train_knn, y_train)
pred_ridge = ridge.predict(X_val_knn)
mses_knn_imputation['Ridge'] = mse(y_val, pred_ridge)

In [12]:
# from sklearn.linear_model import Lasso
# lasso = Lasso(alpha=0.05)
# lasso.fit(X_train, y_train)

In [13]:
# coeffs = pd.DataFrame(
#     [
#         ['intercept'] + list(X_train.columns),
#         list(lasso.intercept_) + list(lasso.coef_)
#     ]
# ).transpose().set_index(0)
# coeffs

In [14]:
lasso = Lasso(alpha=0.05)  # Adjust alpha as necessary
lasso.fit(X_train_mean, y_train)
pred_lasso = lasso.predict(X_val_mean)
mses_mean_imputation['Lasso wiht Lot Shape'] = mse(y_val, pred_lasso)

lasso.fit(X_train_median, y_train)
pred_lasso = lasso.predict(X_val_median)
mses_median_imputation['Lasso with Lot Shape'] = mse(y_val, pred_lasso)

lasso.fit(X_train_knn, y_train)
pred_lasso = lasso.predict(X_val_knn)
mses_knn_imputation['Lasso with Lot Shape'] = mse(y_val, pred_lasso)

In [15]:
# lasso.fit(ls_X_train_mean, y_train)
# ls_pred_lasso_mean = lasso.predict(ls_X_val_mean)
# mses_mean_imputation['Lasso with Lot Shape'] = mse(y_val, ls_pred_lasso_mean)

# lasso.fit(ls_X_train_median, y_train)
# ls_pred_lasso_median = lasso.predict(ls_X_val_median)
# mses_median_imputation['Lasso with Lot Shape'] = mse(y_val, ls_pred_lasso_median)

# lasso.fit(ls_X_train_knn, y_train)
# ls_pred_lasso_knn = lasso.predict(ls_X_val_knn)
# mses_knn_imputation['Lasso with Lot Shape'] = mse(y_val, ls_pred_lasso_knn)

In [16]:
print("MSE with Mean Imputation:")
for model, mse_val in mses_mean_imputation.items():
    print(f"{model}: {mse_val}")
    
print("\nMSE with Median Imputation:")
for model, mse_val in mses_median_imputation.items():
    print(f"{model}: {mse_val}")
    
print("\nMSE with KNN Imputation:")
for model, mse_val in mses_knn_imputation.items():
    print(f"{model}: {mse_val}")

MSE with Mean Imputation:
Linear Regression: 1.350010786324756e+33
Ridge: 2.649133016215304e+16
Lasso wiht Lot Shape: 2.9588762573626616e+16

MSE with Median Imputation:
Linear Regression: 1.3619809496560625e+33
Ridge: 2.6500121383179332e+16
Lasso with Lot Shape: 2.96006980302188e+16

MSE with KNN Imputation:
Linear Regression: 8.082907199106508e+32
Ridge: 2.522798451631518e+16
Lasso with Lot Shape: 2.8131686876463536e+16


Make a copy of this entire process and redo it when including Lot Shape 