In [3]:
## Main aim is to understand more about the data

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from scipy import stats
from scipy.stats import norm, skew
from sklearn.preprocessing import LabelEncoder
from scipy import stats
from scipy.stats import norm, skew
from scipy.special import boxcox1p

from sklearn import model_selection

## Display all the columns of the dataframe
# Use the public top-level option API; `pd.pandas` is not a documented attribute.
pd.set_option('display.max_columns', None)


In [4]:
from sklearn.linear_model import ElasticNetCV, LassoCV, RidgeCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error, mean_squared_log_error, mean_absolute_error
from mlxtend.regressor import StackingCVRegressor
from xgboost import XGBRegressor


In [5]:
# Hyper-parameter grids searched by the cross-validated linear models.
#elasticnet
elasticnet_alphas = [5e-5, 1e-4, 5e-4, 1e-3]
elasticnet_l1ratios = [0.8, 0.85, 0.9, 0.95, 1]
#lasso
lasso_alphas = [5e-5, 1e-4, 5e-4, 1e-3]
#ridge
ridge_alphas = [13.5, 14, 14.5, 15, 15.5]

# Iteration cap for the coordinate-descent solvers.  It must be an int:
# scikit-learn validates `max_iter` as an integer, so the float 1e7 is rejected
# by releases that check constructor parameters.
LINEAR_MAX_ITER = 10_000_000

# Base regressors, keyed by the short name used when calling run(fold, model).
# The linear models and the SVR are wrapped in a RobustScaler pipeline; the
# tree ensembles are used on the unscaled features.
MODELS = {
    "elasticnet" : make_pipeline(RobustScaler(),
                                 ElasticNetCV(max_iter=LINEAR_MAX_ITER,
                                              alphas=elasticnet_alphas,
                                              l1_ratio=elasticnet_l1ratios)),

    "lasso" : make_pipeline(RobustScaler(),
                            LassoCV(max_iter=LINEAR_MAX_ITER,
                                    alphas=lasso_alphas,
                                    random_state=42)),

    "ridge" : make_pipeline(RobustScaler(), RidgeCV(alphas=ridge_alphas)),

    "gradb" : GradientBoostingRegressor(n_estimators=6000, learning_rate=0.01,
                                        max_depth=4, max_features='sqrt',
                                        min_samples_leaf=15, min_samples_split=10,
                                        loss='huber', random_state=42),

    "svr" : make_pipeline(RobustScaler(),
                          SVR(C=20, epsilon=0.008, gamma=0.0003)),

    # NOTE(review): both `seed` and `random_state` are passed here; which one
    # takes effect depends on the installed xgboost version - confirm and keep
    # only one.
    "xgboost" : XGBRegressor(learning_rate=0.01, n_estimators=6000,
                             max_depth=3, min_child_weight=0,
                             gamma=0, subsample=0.7,
                             colsample_bytree=0.7,
                             objective='reg:squarederror', nthread=-1,
                             scale_pos_weight=1, seed=27,
                             reg_alpha=0.00006, random_state=42)}

# Stacked ensemble: all six base models feed an XGBoost meta-regressor, which
# also sees the original features (use_features_in_secondary=True).
MODELS_stack = StackingCVRegressor(regressors=(MODELS['elasticnet'], MODELS['gradb'], MODELS['lasso'],
                                               MODELS['ridge'], MODELS['svr'], MODELS['xgboost']),
                                   meta_regressor=MODELS['xgboost'],
                                   use_features_in_secondary=True)

In [7]:
def feature_engineering(df):
    """Impute, encode and transform a raw house-price frame.

    The frame is modified in place and also returned.  Steps, in order:

    1. Year-like columns (except 'YrSold'/'MoSold') become "years before sale"
       (``YrSold - value``); 'YrSold', 'MoSold' and 'MSSubClass' become strings.
    2. Missing values: numeric columns get the per-Neighborhood median, object
       columns get the label "Missing", year-difference columns get 0.
    3. Aggregate features are added (total area, lot, basement, baths, porch).
    4. A fixed list of ordinal-like columns is label-encoded and two quality
       interaction features are derived from the encoded values.
    5. Every non-object column except Id/kfold/SalePrice is Box-Cox(1+x)
       transformed with lambda 0.15.
    6. In the remaining object columns, labels covering less than 1% of the
       rows are replaced by "rare".

    Parameters
    ----------
    df : pandas.DataFrame
        Raw train or test frame.  'Id', 'kfold' and 'SalePrice' are left
        untouched when present.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.

    Notes
    -----
    Label encoders and rare-label thresholds are fitted on whatever frame is
    passed in, so calling this separately on train and test can assign
    different codes to the same category.
    NOTE(review): confirm whether a shared fit is wanted.
    """
    bookkeeping_cols = ("Id", "kfold", "SalePrice")
    sale_date_cols = ('YrSold', 'MoSold')

    temporal_features = [col for col in df.columns
                         if 'Yr' in col or 'Year' in col or 'Mo' in col]
    # MSSubClass is cast to str below, so it must not go through median
    # imputation (a median over strings raises on current pandas).
    numeric_features = [col for col in df.columns
                        if df[col].dtype != 'O'
                        and col not in temporal_features
                        and col not in bookkeeping_cols
                        and col != 'MSSubClass']
    categorical_features = [col for col in df.columns
                            if df[col].dtype == 'O' and col not in temporal_features]

    # --- temporal features: express years relative to the sale year ---
    for col in temporal_features:
        if col not in sale_date_cols:
            df[col] = df['YrSold'] - df[col]

    df['YrSold'] = df['YrSold'].astype(str)
    df['MoSold'] = df['MoSold'].astype(str)
    df['MSSubClass'] = df['MSSubClass'].apply(str)

    # --- missing values ---
    for col in numeric_features:
        df[col] = df.groupby("Neighborhood")[col].transform(lambda x: x.fillna(x.median()))

    for col in categorical_features:
        df[col] = df[col].fillna("Missing")

    for col in temporal_features:
        fill_value = "Missing" if col in sale_date_cols else 0
        df[col] = df[col].fillna(fill_value)

    # --- aggregate features ---
    df['TotalHouseSF'] = df['TotalBsmtSF'] + df['1stFlrSF'] + df['2ndFlrSF']
    df['TotalLot'] = df['LotFrontage'] + df['LotArea']
    df['TotalBsmtFin'] = df['BsmtFinSF1'] + df['BsmtFinSF2']
    df['TotalBath'] = df['FullBath'] + df['HalfBath']
    df['TotalPorch'] = df['OpenPorchSF'] + df['EnclosedPorch'] + df['ScreenPorch']

    # --- label-encode ordinal-like categoricals ---
    cols = ('FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond',
            'ExterQual', 'ExterCond', 'HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1',
            'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LandSlope',
            'LotShape', 'PavedDrive', 'Street', 'Alley', 'CentralAir', 'MSSubClass', 'YrSold', 'MoSold')
    for c in cols:
        df[c] = LabelEncoder().fit_transform(list(df[c].values))

    # Interaction features built from the encoded quality/condition codes.
    df["TotalGarageQual"] = df["GarageQual"] * df["GarageCond"]
    df["TotalExteriorQual"] = df["ExterQual"] * df["ExterCond"]

    # --- Box-Cox transform ---
    # The previous skew filter (`skewness[abs(skewness) > 0.75]`) masked values
    # but kept every row, so every numeric column was transformed.  That
    # effective behaviour is kept on purpose: callers in this file process the
    # train and test frames separately, and a per-frame skew filter would give
    # them different sets of transformed columns.
    # NOTE(review): boxcox1p returns NaN for inputs below -1 (possible for a
    # negative "years before sale" value) - confirm whether such values should
    # be clipped here instead of being patched by the caller.
    numeric_feats = [col for col in df.columns
                     if df[col].dtype != "object" and col not in bookkeeping_cols]
    lam = 0.15
    for feat in numeric_feats:
        df[feat] = boxcox1p(df[feat], lam)

    # --- rare labels ---
    # A single .loc assignment replaces the former chained assignment
    # (`df.loc[:, f][mask] = ...`), which writes to a temporary object when
    # pandas copy-on-write is active and then changes nothing.
    n_rows = len(df)
    object_cols = [col for col in df.columns if df[col].dtype == 'O']
    for col in object_cols:
        label_share = df[col].map(df[col].value_counts() / n_rows)
        df.loc[label_share < 0.01, col] = "rare"

    return df

In [18]:
def run(fold,model):
    """Fit one base model with one fold held out and predict the test set.

    Parameters
    ----------
    fold : int
        Fold number used for validation; rows assigned to any other fold are
        used for fitting.  Folds are numbered 0-3.
    model : str
        Key into ``MODELS`` selecting the regressor to fit.

    Returns
    -------
    numpy.ndarray
        Predictions for the test rows, on the log1p(SalePrice) scale.

    Side effects: reads the train/test CSV files, prints the fold sizes and the
    validation RMSE/MAE (both on the log1p scale), and refits the shared
    estimator stored in ``MODELS[model]``.
    """
    # loading
    df = pd.read_csv('/home/hazim/Desktop/Advanced-house-price-prediction/input/train.csv')
    df_test = pd.read_csv('/home/hazim/Desktop/Advanced-house-price-prediction/input/test.csv')

    # fold assignment
    # NOTE(review): StratifiedKFold is given the raw SalePrice values as class
    # labels - confirm that stratifying on un-binned prices is intended.
    df["kfold"] = -1
    kf = model_selection.StratifiedKFold(n_splits=4, shuffle=True, random_state=42)
    for f, (train_idx, val_idx) in enumerate(kf.split(X=df, y=df.SalePrice.values)):
        print(len(train_idx), len(val_idx))
        df.loc[val_idx, 'kfold'] = f

    # feature engineering (train and test are processed separately)
    df = feature_engineering(df)
    df_test = feature_engineering(df_test)

    # The test frame can still contain missing values after feature
    # engineering; they are zero-filled here.
    df_test = df_test.fillna(0)

    df["SalePrice"] = np.log1p(df["SalePrice"])

    # One-hot encode train and test together so both get the same columns.
    # dtype is pinned because the get_dummies default became bool in pandas 2,
    # which turns the mixed-dtype `.values` below into an object array.
    n_train = len(df)
    combined = pd.get_dummies(pd.concat([df, df_test], axis=0), dtype=np.uint8)

    # Drop quasi-constant columns (most frequent value covers > 99% of rows).
    # Columns are collected first and dropped once, instead of dropping in
    # place while iterating over the column index.
    bookkeeping_cols = ("Id", "kfold", "SalePrice")
    n_rows = len(combined)
    quasi_constant = []
    for feature in combined.columns:
        if feature in bookkeeping_cols:
            continue
        counts = combined[feature].value_counts()
        # value_counts() is sorted descending, so iloc[0] is the top count;
        # an all-NaN column yields an empty result and is left alone.
        if not counts.empty and counts.iloc[0] / n_rows > 0.99:
            quasi_constant.append(feature)
    combined = combined.drop(columns=quasi_constant)

    # Split back by position (concat keeps train rows first) rather than by
    # hard-coded Id ranges.
    df = combined.iloc[:n_train]
    df_test = combined.iloc[n_train:]

    # train / validation split for this fold
    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    numeric_features = [feature for feature in df_train.columns
                        if feature not in bookkeeping_cols]

    x_train = df_train[numeric_features].values
    x_valid = df_valid[numeric_features].values

    # fit and predict
    reg = MODELS[model]
    reg.fit(x_train, df_train.SalePrice.values)
    valid_preds = reg.predict(x_valid)
    test_preds = reg.predict(df_test[numeric_features].values)

    # scoring
    rmse = np.sqrt(mean_squared_error(df_valid.SalePrice.values, valid_preds))
    mae = mean_absolute_error(df_valid.SalePrice.values, valid_preds)

    print(f"FOLD={fold}, MODEL = {model}, RMSE = {rmse}, MAE ={mae} ")
    return test_preds


In [9]:
# Collect test-set predictions: one column per (fold, model) pair, named
# "fold<k><model>".
preds_df = pd.DataFrame()

for fold in range(3):
    for keys, items in MODELS.items():
        column_label = f"fold{fold}{keys}"
        preds_df[column_label] = run(fold, keys)
        

FOLD=0, MODEL = elasticnet, RMSE = 0.14725744693691908, MAE =0.08390364493367412 
FOLD=0, MODEL = lasso, RMSE = 0.14793166888193054, MAE =0.08436734511887815 
FOLD=0, MODEL = ridge, RMSE = 0.1475926769008778, MAE =0.08417537938578107 
FOLD=0, MODEL = gradb, RMSE = 0.13762515330268787, MAE =0.08245005123861744 
FOLD=0, MODEL = svr, RMSE = 0.1539241032735953, MAE =0.08459413195601814 
FOLD=0, MODEL = xgboost, RMSE = 0.13923803706916, MAE =0.08674371715519258 
FOLD=1, MODEL = elasticnet, RMSE = 0.11335730173230714, MAE =0.07398889316711908 
FOLD=1, MODEL = lasso, RMSE = 0.11335730173230714, MAE =0.07398889316711908 
FOLD=1, MODEL = ridge, RMSE = 0.11342953121900298, MAE =0.07429427480447665 
FOLD=1, MODEL = gradb, RMSE = 0.10914808622096173, MAE =0.06996841602700234 
FOLD=1, MODEL = svr, RMSE = 0.10693056388911608, MAE =0.06791104658510982 
FOLD=1, MODEL = xgboost, RMSE = 0.11004067929134877, MAE =0.07070801957679702 
FOLD=2, MODEL = elasticnet, RMSE = 0.1240207562343852, MAE =0.086633479

In [17]:
# Optional: also run the stacked regressor. (It takes time, but the results are good.)

def run(fold,model):
    """Fit the stacked regressor with one fold held out and predict the test set.

    This definition replaces any earlier ``run`` in the notebook: it always
    fits ``MODELS_stack``.

    Parameters
    ----------
    fold : int
        Fold number used for validation; rows assigned to any other fold are
        used for fitting.  Folds are numbered 0-3.
    model : str
        Only used as the label in the printed score line; it does not select
        the estimator.

    Returns
    -------
    numpy.ndarray
        Predictions for the test rows, on the log1p(SalePrice) scale.

    Side effects: reads the train/test CSV files, prints the fold sizes and the
    validation RMSE/MAE (both on the log1p scale), and refits the shared
    ``MODELS_stack`` estimator.
    """
    # loading
    df = pd.read_csv('/home/hazim/Desktop/Advanced-house-price-prediction/input/train.csv')
    df_test = pd.read_csv('/home/hazim/Desktop/Advanced-house-price-prediction/input/test.csv')

    # fold assignment
    # NOTE(review): StratifiedKFold is given the raw SalePrice values as class
    # labels - confirm that stratifying on un-binned prices is intended.
    df["kfold"] = -1
    kf = model_selection.StratifiedKFold(n_splits=4, shuffle=True, random_state=42)
    for f, (train_idx, val_idx) in enumerate(kf.split(X=df, y=df.SalePrice.values)):
        print(len(train_idx), len(val_idx))
        df.loc[val_idx, 'kfold'] = f

    # feature engineering (train and test are processed separately)
    df = feature_engineering(df)
    df_test = feature_engineering(df_test)

    # The test frame can still contain missing values after feature
    # engineering; they are zero-filled here.
    df_test = df_test.fillna(0)

    df["SalePrice"] = np.log1p(df["SalePrice"])

    # One-hot encode train and test together so both get the same columns.
    # dtype is pinned because the get_dummies default became bool in pandas 2,
    # which turns the mixed-dtype `.values` below into an object array.
    n_train = len(df)
    combined = pd.get_dummies(pd.concat([df, df_test], axis=0), dtype=np.uint8)

    # Drop quasi-constant columns (most frequent value covers > 99% of rows).
    # Columns are collected first and dropped once, instead of dropping in
    # place while iterating over the column index.
    bookkeeping_cols = ("Id", "kfold", "SalePrice")
    n_rows = len(combined)
    quasi_constant = []
    for feature in combined.columns:
        if feature in bookkeeping_cols:
            continue
        counts = combined[feature].value_counts()
        # value_counts() is sorted descending, so iloc[0] is the top count;
        # an all-NaN column yields an empty result and is left alone.
        if not counts.empty and counts.iloc[0] / n_rows > 0.99:
            quasi_constant.append(feature)
    combined = combined.drop(columns=quasi_constant)

    # Split back by position (concat keeps train rows first) rather than by
    # hard-coded Id ranges.
    df = combined.iloc[:n_train]
    df_test = combined.iloc[n_train:]

    # train / validation split for this fold
    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    numeric_features = [feature for feature in df_train.columns
                        if feature not in bookkeeping_cols]

    x_train = df_train[numeric_features].values
    x_valid = df_valid[numeric_features].values

    # fit and predict
    reg = MODELS_stack
    reg.fit(x_train, df_train.SalePrice.values)
    valid_preds = reg.predict(x_valid)
    test_preds = reg.predict(df_test[numeric_features].values)

    # scoring
    rmse = np.sqrt(mean_squared_error(df_valid.SalePrice.values, valid_preds))
    mae = mean_absolute_error(df_valid.SalePrice.values, valid_preds)

    print(f"FOLD={fold}, MODEL = {model}, RMSE = {rmse}, MAE ={mae} ")
    return test_preds


# Add one stacked-regressor column per fold.  The second argument of run() is
# only the label printed with the scores, so pass an explicit "stack" instead
# of a loop variable left over from an earlier cell.
for fold in range(3):
    preds_df[f"fold{fold}stack"] = run(fold, "stack")

1095 365
1095 365
1095 365
1095 365


KeyboardInterrupt: 

In [331]:
# Display the collected prediction table (one column per fold/model pair).
preds_df

Unnamed: 0,fold0elasticnet,fold0lasso,fold0ridge,fold0gradb,fold0svr,fold0xgboost,fold1elasticnet,fold1lasso,fold1ridge,fold1gradb,fold1svr,fold1xgboost,fold2elasticnet,fold2lasso,fold2ridge,fold2gradb,fold2svr,fold2xgboost,fold0stack,fold1stack,fold2stack,SalePrice
0,114403.640786,114172.701971,113653.436570,125472.560813,116187.997511,127085.296875,111087.582845,111087.582845,109704.871615,119968.787138,112538.308375,121812.109375,111886.273014,111499.982283,113795.261731,129559.604746,116075.358142,134144.031250,124069.625000,116233.101562,135502.593750,120980.533995
1,155481.424279,155280.438493,156563.334103,158616.863844,188605.914184,168763.843750,153711.088886,153711.088886,153119.133427,159580.818147,172586.477317,164025.593750,159164.920221,158924.176879,165242.898224,165169.401095,178463.256188,168221.843750,176975.906250,168921.156250,177016.765625,166977.421546
2,177058.378439,177064.572110,179670.012902,190683.837742,188898.104398,191321.453125,179615.861128,179615.861128,179021.714271,187164.202817,188690.361535,189356.125000,179946.044122,179945.606024,178129.780628,192703.057585,190221.539893,204268.562500,196286.562500,184666.859375,207934.203125,189681.111318
3,197831.202756,198193.370135,195844.429916,193106.454321,199404.978834,200636.718750,201762.109390,201762.109390,201345.337381,190601.749482,201432.637619,195654.296875,201623.004721,201611.168605,200364.054317,192276.842692,201094.780724,204251.218750,202652.437500,191000.046875,216013.343750,199801.358292
4,195556.464756,193227.706610,193712.568856,194723.662508,186192.991397,193445.859375,190190.555452,190190.555452,194492.761572,188413.786936,189492.132130,186901.234375,194795.403016,193499.424791,195394.926042,198679.935585,195482.681005,209314.984375,194762.187500,192752.328125,212811.453125,195964.511947
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,87141.131838,86761.804485,86049.771152,80323.552209,89174.729196,78944.281250,88466.306102,88466.306102,87108.199362,81022.279088,85587.232984,82179.656250,90525.457470,90614.842525,88123.539585,88090.145011,89968.824670,88825.953125,83156.218750,85082.296875,92469.125000,86352.743976
1455,79517.919732,79783.412802,79622.892699,78115.888082,82423.854960,78522.140625,83597.834091,83597.834091,81730.900836,79789.978092,81391.114588,83464.734375,87974.476337,87833.018309,86904.642420,90926.143295,88628.121850,96494.437500,79472.679688,92416.929688,98044.218750,86026.868687
1456,164899.234693,166153.653972,167760.061751,151705.591070,176585.627093,168415.109375,175228.977027,175228.977027,177974.238180,163048.217502,182360.627752,169279.500000,169610.172166,170464.940681,166337.886159,169408.261529,188849.224548,174543.312500,163912.859375,177495.843750,185286.937500,171781.765019
1457,112517.019210,112865.114830,112402.212724,119076.802312,116067.165078,117482.390625,116059.433804,116059.433804,116355.511355,120027.869605,122762.402244,119866.281250,114406.230886,114536.584053,112692.445888,115566.526416,121387.181440,118632.820312,123069.468750,121243.390625,122604.515625,118821.337096


In [248]:
# run() returns predictions of log1p(SalePrice); map every column back with
# expm1.  This overwrites the columns in place, so executing the cell a second
# time applies expm1 again.
for cols in preds_df.columns:
    log_scale_values = preds_df[cols]
    preds_df[cols] = np.expm1(log_scale_values)


In [329]:
# Weighted average aka blending: stacked-regressor columns count four times,
# every other model column once.  Only the "fold..." prediction columns are
# summed, so re-running this cell after 'SalePrice' or 'id' have been added
# gives the same result instead of folding those columns into the blend.
# The value stored here is the weighted SUM; it is normalised by the next cell.

stack_cols = ['fold0stack', 'fold1stack', 'fold2stack']
prediction_cols = [col for col in preds_df.columns if str(col).startswith('fold')]

if prediction_cols:
    blend_weights = pd.Series(
        [4 if col in stack_cols else 1 for col in prediction_cols],
        index=prediction_cols,
    )
    # skipna=False keeps a missing prediction visible as NaN in the blend.
    preds_df['SalePrice'] = (
        preds_df[prediction_cols].mul(blend_weights, axis=1).sum(axis=1, skipna=False)
    )


In [330]:
 # Normalise the weighted sum by the total blend weight (stack columns weigh 4,
 # every other prediction column 1).  With 18 single-model columns and 3 stack
 # columns this is 30; deriving it keeps the average correct when the stacked
 # regressor was not run.
 total_blend_weight = sum(
     4 if col in ('fold0stack', 'fold1stack', 'fold2stack') else 1
     for col in preds_df.columns
     if str(col).startswith('fold')
 )
 preds_df['SalePrice'] = preds_df['SalePrice'] / total_blend_weight

In [302]:
# Re-read only the Id column of the test set for the submission file.
df_test = pd.read_csv(
    '/home/hazim/Desktop/Advanced-house-price-prediction/input/test.csv',
    usecols=["Id"],
)

In [332]:
# Attach the test ids as a flat 1-D array alongside the blended predictions.
preds_df["id"] = df_test.to_numpy().flatten()

In [333]:
# Keep only the two columns written to the submission file.
abc = preds_df.loc[:, ['id', 'SalePrice']]

In [334]:

# Write the submission without the DataFrame index.
abc.to_csv(
    "/home/hazim/Desktop/Advanced-house-price-prediction/output/abc.csv",
    index=False,
)