In [29]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model as lm
from sklearn.metrics import mean_squared_error

original_data = pd.read_excel("Original_Data.xlsx", header = 3, usecols = "A:CC")
hull_data = pd.read_csv('Houseprice_data_scaled.csv') 

lr = lm.LinearRegression()
rg = lm.Ridge(alpha = 0.01)
ls = lm.Lasso(alpha = 0.01)

scaler = StandardScaler()


In [47]:
#Imputation Strategy X: Remove all missing data (Scrapped)

# imputed_data_1 = original_data.copy()

# nan_map = imputed_data_1.isna()

# for i, row in nan_map.iterrows():
#     if row['LotFrontage']:
#         imputed_data_1 = imputed_data_1.drop([i])

# X = imputed_data_1.loc[:, ["LotFrontage"]]
# y = np.dot(X, np.array(imputed_data_1.loc[:, ['SalePrice']]))

# coeffs = pd.DataFrame(
#     [
#         ['intercept'] + list(X_train.columns),
#         list(lr.intercept_) + list(lr.coef_[0])
#     ]
# ).transpose().set_index(0)

# strat1_linear = lm.LinearRegression().fit(X, y)
# strat1_lasso = lm.Lasso(alpha = 0.01).fit(X, y)
# strat1_ridge = lm.Ridge(alpha = 0.01).fit(X, y)

# print(strat1_linear.score(X, y))
# print(strat1_lasso.score(X, y))
# print(strat1_ridge.score(X, y))

In [None]:
#Scaling the data

In [30]:
#Finding Optimal Alphas
def AlphaOptimizer(method, X_train, y_train, X_val, y_val):
    
    mses=[]
    
    if method == "Ridge":
        alphas=[0.01*1800, 0.02*1800, 0.03*1800, 0.04*1800, 0.05*1800, 0.075*1800,0.1*1800,0.2*1800, 0.4*1800]
        for alpha in alphas:
            ridge = lm.Ridge(alpha=alpha)
            ridge.fit(X_train,y_train)
            pred = ridge.predict(X_val)
            mses.append(mean_squared_error(y_val,pred))
            return min(mses)
        
    elif method == "Lasso":
        alphas=[0.01/2, 0.02/2, 0.03/2, 0.04/2, 0.05/2, 0.075/2, 0.1/2]
        for alpha in alphas:
            lasso = lm.Lasso(alpha=alpha)
            lasso.fit(X_train,y_train)
            pred = lasso.predict(X_val)
            mses.append(mean_squared_error(y_val,pred))
            return min(mses)
        
    else:
        return("Not available method")

In [32]:
#Imputation Strategy 1: Replace missing data with the mean of data
imputed_data_1 = original_data.copy()

nan_map = imputed_data_1.isna()

mean = imputed_data_1["LotFrontage"].mean()

for i, row in nan_map.iterrows():
    if row['LotFrontage']:
        imputed_data_1["LotFrontage"][i] = mean

train1 = hull_data[:1800]

train1.insert(0, "LotFrontage", imputed_data_1.loc[:1799, ["LotFrontage"]].values)

val1 = hull_data[1800:2400]

val1.insert(0, "LotFrontage", imputed_data_1.loc[1800:2399, ["LotFrontage"]].values)

X_train1, X_val1 = train1.drop('Sale Price', axis=1), val1.drop('Sale Price', axis=1)
y_train1, y_val1 = train1[['Sale Price']], val1[['Sale Price']] 

X_train1 = scaler.fit_transform(X_train1)
X_val1 = scaler.transform(X_val1)

#Linear Regression
strat1_linear = lr.fit(X_train1, y_train1)
val1_lr_pred = strat1_linear.predict(X_val1)
print("Linear Regression:")
print("R-Score:", strat1_linear.score(X_train1, y_train1))
print("MSE:", mean_squared_error(y_val1, val1_lr_pred))

#Ridge
strat1_ridge = rg.fit(X_train1, y_train1)
val1_rg_pred = strat1_ridge.predict(X_val1)
print("\nRidge:")
print("R-Score:", strat1_ridge.score(X_train1, y_train1))
print("MSE:", AlphaOptimizer("Ridge", X_train1, y_train1, X_val1, y_val1))

#Lasso
strat1_lasso = ls.fit(X_train1, y_train1)
val1_ls_pred = strat1_lasso.predict(X_val1)
print("\nLasso:")
print("R-Score:", strat1_lasso.score(X_train1, y_train1))
print("MSE:", AlphaOptimizer("Lasso", X_train1, y_train1, X_val1, y_val1))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imputed_data_1["LotFrontage"][i] = mean


Linear Regression:
R-Score: 0.883964526068077
MSE: 0.11494154465047382

Ridge:
R-Score: 0.8878587627288108
MSE: 0.11326369106126791

Lasso:
R-Score: 0.8844910428276486
MSE: 0.11282838839994151


In [34]:
#Imputation Strategy 2: Replace missing data with the median of data
imputed_data_2 = original_data.copy()

nan_map = imputed_data_2.isna()

median = imputed_data_2["LotFrontage"].median()

for i, row in nan_map.iterrows():
    if row['LotFrontage']:
        imputed_data_2["LotFrontage"][i] = median

train2 = hull_data[:1800]

train2.insert(0, "LotFrontage", imputed_data_2.loc[:1799, ["LotFrontage"]].values)

val2 = hull_data[1800:2400]

val2.insert(0, "LotFrontage", imputed_data_2.loc[1800:2399, ["LotFrontage"]].values)

X_train2, X_val2 = train2.drop('Sale Price', axis=1), val2.drop('Sale Price', axis=1)
y_train2, y_val2 = train2[['Sale Price']], val2[['Sale Price']] 

X_train2 = scaler.fit_transform(X_train2)
X_val2 = scaler.transform(X_val2)

#Linear Regression
strat2_linear = lr.fit(X_train2, y_train2)
val2_lr_pred = strat2_linear.predict(X_val2)
print("Linear Regression:")
print("R-Score:", strat2_linear.score(X_train2, y_train2))
print("MSE:", mean_squared_error(y_val2, val2_lr_pred))

#Ridge
strat2_ridge = rg.fit(X_train2, y_train2)
val2_rg_pred = strat2_ridge.predict(X_val2)
print("\nRidge:")
print("R-Score:", strat2_ridge.score(X_train2, y_train2))
print("MSE:", AlphaOptimizer("Ridge", X_train2, y_train2, X_val2, y_val2))

#Lasso
strat2_lasso = ls.fit(X_train2, y_train2)
val2_ls_pred = strat2_lasso.predict(X_val2)
print("\nLasso:")
print("R-Score:", strat2_lasso.score(X_train2, y_train2))
print("MSE:", AlphaOptimizer("Lasso", X_train2, y_train2, X_val2, y_val2))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imputed_data_2["LotFrontage"][i] = median


Linear Regression:
R-Score: 0.8878859534584692
MSE: 0.1132128495398123

Ridge:
R-Score: 0.8878856877940828
MSE: 0.11322040864225225

Lasso:
R-Score: 0.8845191770319231
MSE: 0.11277532846444549


In [33]:
#Imputation Strategy 3: Replace missing data with linear interpolation prediction
imputed_data_3 = original_data.copy()

nan_map = imputed_data_3.isna()

last_valid_value = imputed_data_3["LotFrontage"][imputed_data_3["LotFrontage"].first_valid_index()]

for i, row in nan_map.iterrows():
    if row['LotFrontage']:
        j = i
        while nan_map["LotFrontage"][j]:
            j += 1
        next_valid_value = imputed_data_3["LotFrontage"][j]
        imputed_data_3["LotFrontage"][i] = (next_valid_value + last_valid_value) / 2
    else:
        last_valid_value = imputed_data_3["LotFrontage"][i]

train3 = hull_data[:1800]

train3.insert(0, "LotFrontage", imputed_data_3.loc[:1799, ["LotFrontage"]].values)

val3 = hull_data[1800:2400]

val3.insert(0, "LotFrontage", imputed_data_3.loc[1800:2399, ["LotFrontage"]].values)

X_train3, X_val3 = train3.drop('Sale Price', axis=1), val3.drop('Sale Price', axis=1)
y_train3, y_val3 = train3[['Sale Price']], val3[['Sale Price']] 

X_train3 = scaler.fit_transform(X_train3)
X_val3 = scaler.transform(X_val3)

#Linear Regression
strat3_linear = lr.fit(X_train3, y_train3)
val3_lr_pred = strat3_linear.predict(X_val3)
print("Linear Regression:")
print("R-Score:", strat3_linear.score(X_train3, y_train3))
print("MSE:", mean_squared_error(y_val3, val3_lr_pred))

#Ridge
strat3_ridge = rg.fit(X_train3, y_train3)
val3_rg_pred = strat3_ridge.predict(X_val3)
print("\nRidge:")
print("R-Score:", strat3_ridge.score(X_train3, y_train3))
print("MSE:", AlphaOptimizer("Ridge", X_train3, y_train3, X_val3, y_val3))

#Lasso
strat3_lasso = ls.fit(X_train3, y_train3)
val3_ls_pred = strat3_lasso.predict(X_val3)
print("\nLasso:")
print("R-Score:", strat3_lasso.score(X_train3, y_train3))
print("MSE:", AlphaOptimizer("Lasso", X_train3, y_train3, X_val3, y_val3))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imputed_data_3["LotFrontage"][i] = (next_valid_value + last_valid_value) / 2


Linear Regression:
R-Score: 0.8875411443569411
MSE: 0.11358578953124135

Ridge:
R-Score: 0.8875437620895417
MSE: 0.11364028204620193

Lasso:
R-Score: 0.8841481871317547
MSE: 0.11321248417972676
