In [None]:
# Authors: Haben Abaye & Payal Singh
# Course: CS697A
# Date: 12/27/2020

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import warnings
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from collections import Counter
from collections import defaultdict
from scipy.special import boxcox1p
from sklearn.preprocessing import RobustScaler
from sklearn import metrics
from scipy.stats import norm,skew,probplot
warnings.filterwarnings('ignore')

from sklearn.ensemble import RandomForestRegressor,StackingRegressor,GradientBoostingRegressor,AdaBoostRegressor
from sklearn.model_selection import RepeatedKFold
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from catboost import CatBoostRegressor
from sklearn.pipeline import make_pipeline
from sklearn.neural_network import MLPRegressor
from lightgbm import LGBMRegressor
from mlxtend.regressor import StackingCVRegressor

In [None]:
def get_outliers(data, features, outlier_number=5, iqr_factor=1.7):
    """Find rows that are IQR outliers in more than ``outlier_number`` features.

    For each column in ``features`` a row is flagged when its value lies below
    ``Q1 - iqr_factor * IQR`` or above ``Q3 + iqr_factor * IQR``. Rows flagged
    in strictly more than ``outlier_number`` columns are reported.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the columns to screen.
    features : iterable
        Column labels of ``data`` to screen.
    outlier_number : int, default 5
        A row is reported only when it is flagged in MORE than this many
        columns.
    iqr_factor : float, default 1.7
        Multiple of the inter-quartile range added beyond each quartile.

    Returns
    -------
    collections.defaultdict
        Maps each reported row label to a one-element list containing the
        number of columns in which that row was flagged.
    """
    flag_counts = Counter()
    for col in features:
        values = data[col]
        # nanpercentile ignores missing values. Plain np.percentile yields NaN
        # bounds for such a column, which makes both comparisons below False
        # and silently exempts the whole column from screening.
        lower_bound, upper_bound = np.nanpercentile(values, [25, 75])
        outlier_range = iqr_factor * (upper_bound - lower_bound)
        is_outlier = (values < lower_bound - outlier_range) | (values > upper_bound + outlier_range)
        flag_counts.update(data.index[is_outlier])

    all_outliers = defaultdict(list)
    for row_label, hits in flag_counts.items():
        if hits > outlier_number:
            all_outliers[row_label].append(hits)
    return all_outliers
    
def clean_data(data, threshold_value=0.8):
    """Drop columns that are mostly missing.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter.
    threshold_value : float, default 0.8
        A column is kept only when its fraction of null entries is strictly
        below this value.

    Returns
    -------
    pandas.DataFrame
        A new frame (independent copy) containing only the retained columns,
        in their original order.
    """
    keep_mask = (data.isnull().mean() < threshold_value).to_numpy()
    # Copy explicitly so that later column assignments on the result never
    # write into (or warn about) a slice of the caller's frame.
    return data.loc[:, keep_mask].copy()

def handle_missing_value(data):
    """Impute missing values in place and return the same frame.

    int64/float64 columns have their nulls replaced by the column median.
    Object-dtype columns located between the 'MSSubClass' and 'SaleCondition'
    labels (inclusive) have their nulls replaced by the string 'NA'.
    """
    numeric_names = data.select_dtypes(include=['int64', 'float64']).columns
    for name in numeric_names:
        column = data[name]
        data[name] = column.fillna(column.median())

    labelled_span = data.loc[:, 'MSSubClass':'SaleCondition']
    text_names = labelled_span.select_dtypes(include='object').columns
    for name in text_names:
        data[name] = data[name].fillna('NA')
    return data

def encode_data(data):
    """Encode categorical columns: ordinal ones to integers, the rest one-hot.

    Columns named in ``ordinal`` are translated through ``label_mapping``;
    values that are not keys of the mapping become NaN (``Series.map``
    behaviour). Every other object-dtype column located between the
    'MSSubClass' and 'SaleCondition' labels (inclusive) is expanded with
    ``pd.get_dummies(..., drop_first=True)``.

    Ordinal columns that are absent from ``data`` (for example because an
    earlier cleaning step removed them) are skipped instead of raising
    KeyError. The input frame is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the 'MSSubClass' and 'SaleCondition' column labels.

    Returns
    -------
    pandas.DataFrame
        A new, encoded frame.
    """
    # Work on a copy: the original code rewrote the ordinal columns of the
    # caller's frame and then returned a different object from get_dummies,
    # leaving the argument half-encoded.
    data = data.copy()
    object_column = data.loc[:,'MSSubClass':'SaleCondition'].select_dtypes(include='object').columns
    #handle ordinal data
    ordinal = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond' ,'HeatingQC', 'KitchenQual', 
               'FireplaceQu','GarageQual', 'GarageCond','Utilities','BsmtFinType1','BsmtFinType2',
               'GarageFinish']
    # One shared lookup for all ordinal columns; keys that never occur in a
    # given column are simply unused for that column.
    label_mapping = {'Ex':5, 'Gd':4, 'TA':3, 'Fa':2, 'Po':1, 'NA':0,
                     'AllPub':4, 'NoSewr':3, 'NoSeWa':2, 'ELO':1,
                    'Av':3, 'Mn':2, 'No':1, 'GLQ':6, 'ALQ':5, 'BLQ':4,
                    'Rec':3, 'LwQ':2, 'Unf':1, 'Y':2, 'N':1, 'Fin':3,
                    'RFn': 2, 'GdPrv':4, 'MnPrv':3, 'GdWo':2, 'MnWw':1}
    for col in ordinal:
        if col in data.columns:
            data[col] = data[col].map(label_mapping)

    #handle nominal data
    nominal = [item for item in object_column if item not in ordinal]
    return pd.get_dummies(data, columns=nominal, drop_first=True)

def add_features(data):
    """Append four engineered columns to ``data`` in place and return it.

    Total_area   : LotArea + 1stFlrSF + 2ndFlrSF
    Overall_type : OverallQual + OverallCond
    age          : absolute difference between YrSold and YearBuilt
    remodeled    : YearRemodAdd - YearBuilt
    """
    year_built = data['YearBuilt']
    engineered = {
        'Total_area': data['LotArea'] + data['1stFlrSF'] + data['2ndFlrSF'],
        'Overall_type': data['OverallQual'] + data['OverallCond'],
        'age': (data['YrSold'] - year_built).abs(),
        'remodeled': data['YearRemodAdd'] - year_built,
    }
    for name, values in engineered.items():
        data[name] = values
    return data


def handle_skewness(data, max_skew=0.60, lmbda=0.22):
    """Apply a Box-Cox(1 + x) transform to strongly skewed columns.

    The sample skewness of every non-object column is computed (nulls
    dropped). Columns whose absolute skewness exceeds ``max_skew`` are
    replaced by ``scipy.special.boxcox1p(column, lmbda)``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to transform; the selected columns are overwritten in place.
    max_skew : float, default 0.60
        Absolute-skewness cut-off above which a column is transformed.
    lmbda : float, default 0.22
        Box-Cox power parameter passed to ``boxcox1p``.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, for call chaining.
    """
    # NOTE(review): every non-object column qualifies, which presumably also
    # includes one-hot indicator columns produced upstream - confirm intended.
    numerical_value = data.select_dtypes(exclude=object).columns
    skew_value = data[numerical_value].apply(lambda x: skew(x.dropna()))
    # Columns with undefined (NaN) skewness fail the comparison and are kept
    # as they are.
    skew_columns = skew_value[skew_value.abs() > max_skew].index
    for col in skew_columns:
        data[col] = boxcox1p(data[col], lmbda)
    return data

In [None]:
# Load the training data.
url = "train.csv"
train_csv = pd.read_csv(url)
# read_csv already returns a DataFrame, so re-wrapping it in pd.DataFrame()
# adds nothing and (by default) does not copy the data. Take an explicit copy
# instead so the raw frame stays untouched by the edits made to `train` below.
train = train_csv.copy()
# Load the testing data.
url = "test.csv"
test_csv = pd.read_csv(url)
test = test_csv.copy()

In [None]:
# Model the target on a log(1 + x) scale; predictions are mapped back with
# expm1 when the submission is built.
# NOTE: this cell is not idempotent - re-running it applies the log again.
train = train.assign(SalePrice=np.log1p(train['SalePrice']))

In [None]:
# Numeric (non-object) feature columns of the training set, used for the
# outlier screen.
numeric_data = train.loc[:, 'MSSubClass':'SaleCondition'].select_dtypes(exclude=['object'])
numeric_column = numeric_data.columns

# Rows flagged by get_outliers are removed and the index is renumbered so the
# positional train/test split further down lines up with `train`.
outliers = get_outliers(train, numeric_column)
train = train.drop(index=list(outliers)).reset_index(drop=True)

# Stack train and test features so both receive identical preprocessing,
# then run the steps in order: engineered features, sparse-column removal,
# imputation, categorical encoding, skew correction.
total_data = (
    pd.concat([
        train.loc[:, 'MSSubClass':'SaleCondition'],
        test.loc[:, 'MSSubClass':'SaleCondition'],
    ])
    .pipe(add_features)
    .pipe(clean_data)
    .pipe(handle_missing_value)
    .pipe(encode_data)
    .pipe(handle_skewness)
)

In [None]:
# total_data holds the training rows first and the test rows after them, so
# a positional cut at len(train) separates the two again.
X_train_data = total_data.iloc[:len(train)]
X_test_house = total_data.iloc[len(train):]
# Target: the log1p-transformed sale price.
label_data = train['SalePrice']

In [None]:
# Hold out 7% of the labelled rows for local validation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X_train_data,
    label_data,
    test_size=0.07,
    random_state=42,
)

In [None]:
# Random-forest base model.
# The split criterion is deliberately left at scikit-learn's default (squared
# error): the old spelling 'mse' is rejected by current releases, while the
# new spelling 'squared_error' is rejected by older ones. Omitting the
# argument gives the same model on both.
rf = RandomForestRegressor(
    n_estimators=1500,
    max_depth=45,
    max_features=30,
    bootstrap=True,
    random_state=42,
    n_jobs=-1,
    verbose=2,
)
rf.fit(X_train, y_train)

In [None]:
# XGBoost base model.
# - objective: 'reg:linear' is the deprecated name of squared-error
#   regression; 'reg:squarederror' is its replacement.
# - silent=None / nthread=None were removed: both are deprecated, were passed
#   as None (i.e. unset), and are superseded by verbosity / n_jobs below.
xgb = XGBRegressor(
    booster='gbtree',
    objective='reg:squarederror',
    n_estimators=2500,
    learning_rate=0.01,
    max_depth=45,
    min_child_weight=4,
    gamma=0,
    max_delta_step=0,
    subsample=0.8,
    colsample_bytree=0.6,
    colsample_bylevel=0.6,
    colsample_bynode=.4,
    reg_alpha=0.6,
    reg_lambda=0.6,
    scale_pos_weight=1,
    importance_type='gain',
    n_jobs=-1,
    verbosity=2,
)
xgb.fit(X_train, y_train)

In [None]:
# LightGBM base model (gradient-boosted trees, RMSE metric).
# NOTE(review): feature_fraction and colsample_bytree look like aliases of the
# same LightGBM setting (both 0.6 here) - confirm and keep only one.
lgb = LGBMRegressor(
    max_depth=45,
    num_leaves=50,
    learning_rate=0.05,
    n_jobs=-1,
    boosting_type='gbdt',
    objective='regression',
    metric='rmse',
    verbosity=2,
    bagging_fraction=0.7,
    feature_fraction=0.6,
    bagging_freq=4,
    bagging_seed=42,
    seed=42,
    colsample_bynode=.6,
    colsample_bytree=.6,
)
lgb.fit(X_train, y_train)

In [None]:
# k-nearest-neighbours base model: 4 neighbours, distance-weighted votes.
# NOTE(review): p=1 and metric='manhattan' appear to request the same L1
# distance - confirm one of them is redundant.
knn = KNeighborsRegressor(
    n_neighbors=4,
    weights='distance',
    metric='manhattan',
    p=1,
    leaf_size=60,
)
knn.fit(X_train, y_train)

In [None]:
# Support-vector regressor, preceded by a RobustScaler inside one pipeline so
# the scaling is fitted together with the model.
svr = make_pipeline(
    RobustScaler(),
    SVR(C=20, epsilon=0.008, gamma=0.0003),
)
svr.fit(X_train, y_train)

In [None]:
# scikit-learn gradient boosting with Huber loss: many shallow trees and a
# small learning rate.
gbr = GradientBoostingRegressor(
    loss='huber',
    n_estimators=3000,
    learning_rate=0.01,
    max_depth=4,
    max_features='sqrt',
    min_samples_leaf=15,
    min_samples_split=5,
    random_state=42,
)
gbr.fit(X_train, y_train)

In [None]:
# Shallow decision tree; it is not fitted here, only handed to AdaBoost as
# its base learner in the next cell.
# The split criterion is left at scikit-learn's default (squared error): the
# old spelling 'mse' is rejected by current releases and 'squared_error' by
# older ones, so omitting it is the only form that works on both.
dtr = DecisionTreeRegressor(
    max_depth=5,
    max_features='sqrt',
    min_samples_leaf=15,
    min_samples_split=10,
    random_state=42,
)

In [None]:
# AdaBoost over the decision tree `dtr` defined in the previous cell. The base
# learner is passed positionally.
abr = AdaBoostRegressor(
    dtr,
    n_estimators=800,
    learning_rate=0.5,
    random_state=42,
)
abr.fit(X_train, y_train)

In [None]:
# CatBoost base model, evaluated with RMSE.
# NOTE(review): no seed is passed here, unlike the other models - confirm
# CatBoost's default seeding is acceptable for reproducibility.
cat = CatBoostRegressor(
    n_estimators=3500,
    learning_rate=0.1,
    depth=6,
    eval_metric='RMSE',
)
cat.fit(X_train, y_train)

In [None]:
# Multi-layer perceptron: nine hidden layers of 84 ReLU units, trained with
# Adam. Training stops after 200 consecutive iterations without an
# improvement of at least `tol`, or after `max_iter` iterations.
# NOTE(review): the inputs are not scaled here (the SVR cell uses a
# RobustScaler) - confirm this is intended.
# NOTE(review): learning_rate='adaptive' is presumably only honoured by the
# 'sgd' solver - verify against the scikit-learn docs.
nn = MLPRegressor(
    hidden_layer_sizes=(84,) * 9,
    activation='relu',
    solver='adam',
    alpha=.0000001,
    learning_rate='adaptive',
    learning_rate_init=.0001,
    max_iter=30000,
    n_iter_no_change=200,
    tol=0.000001,
    random_state=761,
    verbose=True,
)
nn.fit(X_train, y_train)

In [None]:
# Hold-out RMSE (on the log1p price scale) for every fitted base model,
# reported in the same order as before. After the loop `y_pred_model` holds
# the predictions of the last model listed (gbr).
fitted_models = [
    ('RF', rf), ('SVR', svr), ('XGB', xgb),
    ('ABR', abr), ('LGB', lgb), ('KNN', knn),
    ('NN', nn), ('CAT', cat), ('GBR', gbr),
]
for model_label, fitted_model in fitted_models:
    y_pred_model = fitted_model.predict(X_test)
    print('RMSE %s: %2f' % (model_label, np.sqrt(mean_squared_error(y_test, y_pred_model))))

In [None]:
# Level-0 (base) estimators of the stacked ensemble.
level0 = [('rf',rf),('xgb',xgb),('gbr',gbr),('lgb',lgb),
          ('abr',abr),('cat',cat),('svr',svr),('nn',nn),
         ('knn',knn)]
# Level-1 (meta) estimator: a fresh XGBoost regressor.
# It is bound to `level1` only. The previous `level1 = xgb = ...` also rebound
# the name `xgb`, replacing the fitted base model with an unfitted estimator
# and breaking any later (re-)use of `xgb`, e.g. re-running the scoring cell.
# As for the base model: 'reg:linear' -> 'reg:squarederror', and the
# deprecated silent=None / nthread=None arguments are dropped.
# Alternative meta-learner left by the authors: LinearRegression()
level1 = XGBRegressor(
    booster='gbtree',
    objective='reg:squarederror',
    n_estimators=2500,
    learning_rate=0.01,
    max_depth=45,
    min_child_weight=4,
    gamma=0,
    max_delta_step=0,
    subsample=0.8,
    colsample_bytree=0.6,
    colsample_bylevel=0.6,
    colsample_bynode=.4,
    reg_alpha=0.6,
    reg_lambda=0.6,
    scale_pos_weight=1,
    importance_type='gain',
    n_jobs=-1,
    verbosity=2,
)
# 5-fold cross-validated stacking: the meta-learner is trained on
# out-of-fold predictions of the base estimators.
stack = StackingRegressor(estimators=level0,final_estimator=level1,
                          cv=5, verbose=2, n_jobs=-1)
stack.fit(X_train, y_train)
# Hold-out RMSE of the stacked model (log1p price scale).
y_pred_stack= stack.predict(X_test)
print('RMSE: %2f'% np.sqrt(mean_squared_error(y_test, y_pred_stack)))

In [None]:
# Predict the competition rows (the part of total_data after the training
# rows) with the stacked model. Values are on the log1p price scale because
# the target was log-transformed before training.
predict_test_data = stack.predict(X_test_house)
# Bare expression: the notebook echoes the array as the cell output.
predict_test_data

In [None]:
# Re-derive `test` from the raw CSV frame, then pair every test Id with its
# prediction mapped back from the log1p scale to a price via expm1.
test = pd.DataFrame(test_csv)
sub_result = pd.DataFrame({
    'Id': test['Id'],
    'SalePrice': np.expm1(predict_test_data),
})
sub_result

In [None]:
# Write the Id / SalePrice table to submission.csv, leaving out the
# DataFrame index column.
sub_result.to_csv('submission.csv', index=False)