In [136]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso,ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import joblib

import matplotlib.pyplot as plt

class CONFIG:
    cols_to_drop = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu']
    features_to_check = ['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 
                        'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
                        'TotRmsAbvGrd', 'GarageYrBlt', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'MiscVal', 'MoSold','TotalArea']
    final_features = ['OverallQual', 'TotalArea', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'ExterQual', 'GarageArea', '1stFlrSF', 'BsmtQual', 'KitchenQual', 'FullBath', 'GarageFinish', 
                      'TotRmsAbvGrd', 'YearBuilt', 'HouseAge', 'YearsSinceRemodel', 'YearRemodAdd', 'Fireplaces', 'GarageYrBlt','MasVnrArea', 'LotArea', 'BsmtFinSF1', 'HeatingQC', 
                      'Foundation', 'LotFrontage', 'OpenPorchSF', 'WoodDeckSF', 'MasVnrType', '2ndFlrSF', 'GarageType']
    top_features = ['OverallQual', 'TotalArea', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'GarageArea', '1stFlrSF', 'FullBath', 'TotRmsAbvGrd', 'YearBuilt', 'HouseAge', 
                    'YearsSinceRemodel', 'YearRemodAdd', 'Fireplaces', 'GarageYrBlt', 'MasVnrArea', 'LotArea', 'BsmtFinSF1', 'LotFrontage', 'OpenPorchSF', 'WoodDeckSF', '2ndFlrSF']

    target = 'SalePrice'
    test_size = 0.25
    random_state = 42
    models = {
    #"Linear Regression": LinearRegression(),
    "Ridge": Ridge(tol=0.005, solver='auto', max_iter=3000, alpha=0.5),
    "Lasso": Lasso( tol=0.005, max_iter=3000, alpha=0.5),
    "ElasticNet": ElasticNet( tol=0.005, max_iter=3000),
    "Random Forest": RandomForestRegressor(n_estimators=500, max_depth=100),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=500, max_depth=50),
    "Decision Tree": DecisionTreeRegressor( max_depth=10, min_samples_split=100),
    "XGBoost": XGBRegressor( n_estimators=500, max_depth=10, learning_rate=0.1),
    "AdaBoost": AdaBoostRegressor( n_estimators=500, learning_rate=0.05),
    "CatBoost": CatBoostRegressor( n_estimators=1000, learning_rate=0.05, verbose=False)
}
    
def load_data(path):
    df = pd.read_csv(path)
    return df

def clean_data(df):
    df = df.drop(CONFIG.cols_to_drop, axis=1)
    return df

def fill_na(df):
    numerical_columns_with_na = df.select_dtypes(include=['float64', 'int64']).columns[df.select_dtypes(include=['float64', 'int64']).isnull().any()].tolist()
    categorical_columns_with_na = df.select_dtypes(include=['object']).columns[df.select_dtypes(include=['object']).isnull().any()].tolist()

    for column in numerical_columns_with_na:
        df[column].fillna(df[column].median(), inplace=True)

    for column in categorical_columns_with_na:
        df[column].fillna(df[column].mode()[0], inplace=True)
    return df

def feature_engieering(df):
    df['YearsSinceRemodel'] = df['YrSold'] - df['YearRemodAdd']
    df['TotalArea'] = df['TotalBsmtSF'] + df['1stFlrSF'] + df['2ndFlrSF']
    df['HouseAge'] = df['YrSold'] - df['YearBuilt']
    df['HasPool'] = df['PoolArea'].apply(lambda x: 1 if x > 0 else 0)
    df['HasGarage'] = df['GarageArea'].apply(lambda x: 1 if x > 0 else 0)
    df['HasBsmt'] = df['TotalBsmtSF'].apply(lambda x: 1 if x > 0 else 0)
    return df


def remove_outliers(df):
    features_to_check = CONFIG.features_to_check
    
    for feature in features_to_check:
        Q1 = df[feature].quantile(0.25)
        Q3 = df[feature].quantile(0.75)
        IQR = Q3 - Q1
        
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        
        df[feature] = df[feature].apply(lambda x: upper_bound if x > upper_bound else (lower_bound if x < lower_bound else x))   
    return df

def select_features(df, new_cols, type='train'):
    X = df[CONFIG.final_features + new_cols]
    if type == 'train':
        y = df[CONFIG.target]
        return X, y
    else:
        return X

def train_test_split_data(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=CONFIG.test_size, random_state=CONFIG.random_state)
    return X_train, X_test, y_train, y_test

def label_encode(data, type='train'):
    categorical_columns = data.select_dtypes(include=['object']).columns
    for column in categorical_columns:
        if type == 'train':
            le = LabelEncoder()
            data[column] = le.fit(data[column])
            joblib.dump(le, './models/label_encoder.joblib')
        else:
            le = joblib.load('./models/label_encoder.joblib')
        data[column] = le.transform(data[column])
    return data
    
        

def one_hot_encode(data, type='train'):
    selected_categorical_columns = data.select_dtypes(include=['object']).columns
    if type == 'train':
        encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
        data_encoded = encoder.fit(data[selected_categorical_columns])
        joblib.dump(encoder, './models/one_hot_encoder.joblib')
    else:
        encoder = joblib.load('./models/one_hot_encoder.joblib')
    data_encoded = encoder.transform(data[selected_categorical_columns])
    data_encoded_df = pd.DataFrame(data_encoded, columns=encoder.get_feature_names_out(selected_categorical_columns))
    data = data.drop(columns=selected_categorical_columns).reset_index(drop=True)
    data = pd.concat([data, data_encoded_df], axis=1)
    return data

def scale_data(data, type='train'):
    numerical_columns = data.select_dtypes(include=['float64', 'int64']).columns
    if type == 'train':
        scaler = StandardScaler()
        data[numerical_columns] = scaler.fit_transform(data[numerical_columns])
        joblib.dump(scaler, './models/standard_scaler.joblib')
    else:
        scaler = joblib.load('./models/standard_scaler.joblib')
        data[numerical_columns] = scaler.transform(data[numerical_columns])
    return data


def train_model(X_train, y_train, X_test, y_test, model):
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    mae = mean_absolute_error(y_test, predictions)
    rmse = mean_squared_error(y_test, predictions, squared=False)
    print("MAE:", mae)
    print("RMSE:", rmse)
    print("R2:", model.score(X_test, y_test))
    return model

def pred_test_data(model, test_data,id, model_name):
    predictions = model.predict(test_data)
    submission = pd.DataFrame({'Id': id, 'SalePrice': predictions})
    submission.to_csv(f"./submissions//submission_{model_name}.csv", index=False)
    return predictions




In [137]:
train_data = load_data('./data/train.csv')
test_data = load_data('./data/test.csv')
test_id = test_data['Id']

train_data = clean_data(train_data)
test_data = clean_data(test_data)

train_data = fill_na(train_data)
test_data = fill_na(test_data)

train_data = feature_engieering(train_data)
test_data = feature_engieering(test_data)


train_data = remove_outliers(train_data)
test_data = remove_outliers(test_data)

In [138]:
def gen_poly_features(data):
    for feature in  CONFIG.top_features:
        data[feature + '_squared'] = data[feature] ** 2
    return data

def gen_poly_features_with_interaction(data):
    new_columns = []
    for i in range(len(CONFIG.top_features)):
        for j in range(i+1, len(CONFIG.top_features)):
            new_column = CONFIG.top_features[i] + "_x_" + CONFIG.top_features[j]
            new_columns.append(new_column)
            data[new_column] = data[CONFIG.top_features[i]] * data[CONFIG.top_features[j]]    
    return data , new_columns


train_data = gen_poly_features(train_data)
test_data = gen_poly_features(test_data)

train_data, train_new_columns = gen_poly_features_with_interaction(train_data)
test_data, test_new_columns = gen_poly_features_with_interaction(test_data)


  data[new_column] = data[CONFIG.top_features[i]] * data[CONFIG.top_features[j]]
  data[new_column] = data[CONFIG.top_features[i]] * data[CONFIG.top_features[j]]
  data[new_column] = data[CONFIG.top_features[i]] * data[CONFIG.top_features[j]]
  data[new_column] = data[CONFIG.top_features[i]] * data[CONFIG.top_features[j]]
  data[new_column] = data[CONFIG.top_features[i]] * data[CONFIG.top_features[j]]
  data[new_column] = data[CONFIG.top_features[i]] * data[CONFIG.top_features[j]]
  data[new_column] = data[CONFIG.top_features[i]] * data[CONFIG.top_features[j]]
  data[new_column] = data[CONFIG.top_features[i]] * data[CONFIG.top_features[j]]
  data[new_column] = data[CONFIG.top_features[i]] * data[CONFIG.top_features[j]]
  data[new_column] = data[CONFIG.top_features[i]] * data[CONFIG.top_features[j]]
  data[new_column] = data[CONFIG.top_features[i]] * data[CONFIG.top_features[j]]
  data[new_column] = data[CONFIG.top_features[i]] * data[CONFIG.top_features[j]]
  data[new_column] = data[CO

In [139]:
# # Log transformations for skewed features
# numerical_features = train_data.select_dtypes(include=['float64', 'int64']).columns
# skewed_features = train_data[numerical_features].apply(lambda x: x.skew()).sort_values(ascending=False)
# skewness = pd.DataFrame({'Skew': skewed_features})

# # Use Box Cox Transformation for highly skewed features (we'll use a threshold of 0.75 for skewness)
# from scipy.special import boxcox1p
# skewed_features = skewness[abs(skewness) > 0.75].index
# lambda_ = 0.15
# for feature in skewed_features:
#     train_data[feature] = boxcox1p(train_data[feature], lambda_)
#     test_data[feature] = boxcox1p(test_data[feature], lambda_)

# # Add interaction terms for some of the top features
# for i in range(len(CONFIG.top_features)):
#     for j in range(i+1, len(CONFIG.top_features)):
#         new_col_name = CONFIG.top_features[i] + "_x_" + CONFIG.top_features[j]
#         train_data[new_col_name] = train_data[CONFIG.top_features[i]] * train_data[CONFIG.top_features[j]]
#         test_data[new_col_name] = test_data[CONFIG.top_features[i]] * test_data[CONFIG.top_features[j]]

# train_data.shape, test_data.shape

In [140]:
# train_data.columns

In [141]:
X, y = select_features(train_data, train_new_columns, type='train')
test_data = select_features(test_data, test_new_columns, type='test')

X_train, X_test, y_train, y_test = train_test_split_data(X, y)


# X_train = label_encode(X_train, type='train')
# X_test = label_encode(X_test, type='test')
# test_data = label_encode(test_data, type='test')


X_train = one_hot_encode(X_train, type='train')
X_test = one_hot_encode(X_test, type='test')
test_data = one_hot_encode(test_data, type='test')

X_train = scale_data(X_train, type='train')
X_test = scale_data(X_test, type='test')
test_data = scale_data(test_data, type='test')



In [142]:
for model_name, model in CONFIG.models.items():
    print(f"Model: {model_name}")
    model = train_model(X_train, y_train, X_test, y_test, model)
    print('===' * 20)
    pred_test_data(model, test_data, test_id, model_name)
    

Model: Ridge
MAE: 18233.13686103512
RMSE: 27379.83910431886
R2: 0.892987506507218
Model: Lasso


  model = cd_fast.enet_coordinate_descent(


MAE: 18336.5643119549
RMSE: 27430.30902037037
R2: 0.8925926254885569
Model: ElasticNet
MAE: 17769.757679791794
RMSE: 28483.387206311872
R2: 0.8841873583758924
Model: Random Forest
MAE: 16887.91705923027
RMSE: 27089.952172829693
R2: 0.8952415227874726
Model: Gradient Boosting
MAE: 23065.228041605897
RMSE: 35038.789727254494
R2: 0.8247448076340362
Model: Decision Tree
MAE: 23334.516902936954
RMSE: 39141.23806888457
R2: 0.7813035076513152
Model: XGBoost
MAE: 17070.68846318493
RMSE: 26400.689364588896
R2: 0.9005045468061448
Model: AdaBoost
MAE: 21031.730924251595
RMSE: 30017.499977569732
R2: 0.8713760808878936
Model: CatBoost
MAE: 15941.466778120082
RMSE: 24856.331866595312
R2: 0.9118044287202427
