In [1]:
import warnings
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score,GridSearchCV
import matplotlib.pyplot as plt
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter("ignore", category=ConvergenceWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter("ignore", category=ConvergenceWarning)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [72]:
def grab_col_names(dataframe, cat_th=20, car_th=100, num_th = 118):
    cat_cols = [col for col in dataframe.columns if dataframe[col].dtypes == "O"]
    num_but_cat = [col for col in dataframe.columns if dataframe[col].nunique() < cat_th and
                   dataframe[col].dtypes != "O"]
    cat_but_car = [col for col in dataframe.columns if dataframe[col].nunique() > car_th and
                   dataframe[col].dtypes == "O"]
    cat_cols = cat_cols + num_but_cat
    cat_cols = [col for col in cat_cols if col not in cat_but_car]
    num_cols = [col for col in dataframe.columns if dataframe[col].dtypes != "O" and col not in "SalePrice"]
    num_cols = [col for col in num_cols if col not in num_but_cat and dataframe[col].nunique() > num_th]
    print(f"Observations: {dataframe.shape[0]}")
    print(f"Variables: {dataframe.shape[1]}")
    print(f'cat_cols: {len(cat_cols)}')
    print(f'num_cols: {len(num_cols)}')
    print(f'cat_but_car: {len(cat_but_car)}')
    print(f'num_but_cat: {len(num_but_cat)}')
    return cat_cols, cat_but_car, num_cols, num_but_cat


cat_cols, cat_but_car, num_cols, num_but_cat = grab_col_names(df)

Observations: 2919
Variables: 100
cat_cols: 86
num_cols: 8
cat_but_car: 0
num_but_cat: 54


In [121]:
train = pd.read_csv("../input/house-prices-advanced-regression-techniques/train.csv")
test = pd.read_csv("../input/house-prices-advanced-regression-techniques/test.csv")
df = train.append(test).reset_index()

In [122]:
df["Functional"] = np.where(df.Functional.isin(["Sev", "Maj2", "Maj1","Mod"]), "smmm", df["Functional"])
df["BldgType"] = np.where(df.BldgType.isin(["2fmCon", "Twnhs"]), "2T", df["BldgType"])
df["HouseStyle"] = np.where(df.HouseStyle.isin(["2.5Fin", "1.5Unf" ,"2.5Unf","SFoyer"]), "121", df["HouseStyle"])
df["ExterQual"] = np.where(df.ExterQual.isin(["Fa", "Ex"]), "FaEx", df["ExterQual"])
df["Foundation"] = np.where(df.Foundation.isin(["Wood", "Stone","Slab","BrkTil"]), "2T", df["Foundation"])
df["MSZoning"] = np.where(df.MSZoning.isin(["C (all)", "RH"]), "CRH", df["MSZoning"])
df["Fireplaces"] = np.where(df.Fireplaces.isin(["4", "3"]), "43", df["Fireplaces"])
df["GarageCars"] = np.where(df.GarageCars.isin(["4.000", "5.000"]), "45", df["GarageCars"])
df["Condition1"] = np.where(df.Condition1.isin(["RRNn", "RRNe","RRAn"]), "RRR", df["Condition1"])
df["Exterior2nd"] = np.where(df.Exterior2nd.isin(["AsbShng", "AsphShn"]), "AA", df["Exterior2nd"])
df["Exterior2nd"] = np.where(df.Exterior2nd.isin(["Brk Cmn", "BrkFace"]), "BB", df["Exterior2nd"])
df["Exterior2nd"] = np.where(df.Exterior2nd.isin(["Other", "Stone","CBlock"]), "OSC", df["Exterior2nd"])
df["Total_Floor"] = df["1stFlrSF"] + df["2ndFlrSF"]
df["Build"] =df["YearRemodAdd"]- df["YearBuilt"]
df["Std_Build"] = df[["YearRemodAdd","YearBuilt"]].std()
df["_Floor_cross"] = df["1stFlrSF"] * df["2ndFlrSF"]
df['Build'].replace(-1, np.nan, inplace= True)
df.drop("PoolArea", axis=1, inplace=True)
drop_list = ["MiscFeature","Fence","PoolQC","GarageCond","GarageQual","FireplaceQu"]
df.drop("Alley", axis=1, inplace=True)
df.drop(drop_list, axis=1, inplace=True)
df.drop("Heating", axis=1, inplace=True)
df.drop("index", axis=1, inplace=True)

In [123]:
aggregations_FC = { "GarageCars" : ["min", "max"],
                     "GarageArea" : ["mean"],
                     "EnclosedPorch": ["mean"],
                     "1stFlrSF" : ["mean", "min", "std", "max"],
                     "KitchenAbvGr": ["min", "max"],
                    "BedroomAbvGr": ["mean", "std", "var"],
                     "TotRmsAbvGrd": ["mean"]
                       }
Functional = df.groupby('Functional').agg(aggregations_FC)
 
Functional.columns = pd.Index(['Functional' + e[0] + "_" + e[1].upper() for e in Functional.columns.tolist()])

df.drop(["GarageCars","GarageArea","EnclosedPorch","1stFlrSF","KitchenAbvGr", "BedroomAbvGr","TotRmsAbvGrd"],axis=1, inplace=True)

In [124]:
aggregations_MS = {"LotFrontage":["sum","mean"],
                    "LotArea": ["min", "mean","sum","max"],
                    "Street":["nunique", "size"],
                     "LandContour": ["nunique"],
                    "LandSlope": ["nunique"],
                     "GrLivArea":["std","var"]
                        }
MSZoning = df.groupby('MSZoning').agg(aggregations_MS)

MSZoning.columns = pd.Index(['MSZoning' + e[0] + "_" + e[1].upper() for e in MSZoning.columns.tolist()])
 
df.drop(["LotFrontage","MiscVal", "LotArea", "Street", "LandContour", "LandSlope" ],axis=1, inplace=True)
    
df = df.join(MSZoning, how='left', on='MSZoning')   


In [128]:
aggregations_BC = { "WoodDeckSF": ["mean","sum","max"],
                    "OpenPorchSF":["min", "mean","sum","max"],
                    "TotalBsmtSF": ["mean","sum","max"],
                     "GrLivArea": ["min", "max", "size"],
                      "Build" : ["min","max"],
                       "2ndFlrSF" : ["var", "std", "mean","min"]
                       }
MSSubClass = df.groupby('MSSubClass').agg(aggregations_BC)

MSSubClass.columns = pd.Index(['MSSubClass' + e[0] + "_" + e[1].upper() for e in MSSubClass.columns.tolist()])
    
df.drop(["2ndFlrSF","Build", "WoodDeckSF", "OpenPorchSF", "GrLivArea",  "TotalBsmtSF" ],axis=1, inplace=True)

df = df.join(MSSubClass, how='left', on='MSSubClass')   



In [129]:
def one_hot_encoder(df, drop_first=True):
    original_columns = list(df.columns)
    categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
    df = pd.get_dummies(df, columns= categorical_columns, drop_first=drop_first)
    new_columns = [c for c in df.columns if c not in original_columns]
    return df, new_columns

In [130]:
df, cat_colls = one_hot_encoder(df)

In [131]:
train_df = df[df['SalePrice'].notnull()]
test_df = df[df['SalePrice'].isnull()]

In [135]:
X =train_df.drop(["SalePrice", "Id"], axis=1)
y = np.log1p(train_df['SalePrice'])

In [136]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=17)

In [138]:
from lightgbm import LGBMRegressor

lgbm_model = LGBMRegressor(random_state=46)

rmse = np.mean(np.sqrt(-cross_val_score(lgbm_model,
                                        X, y, cv=10, scoring="neg_mean_squared_error")))
rmse

0.132641833635066

In [143]:
lgbm_params = {"learning_rate": [0.01,0.02, 0.04, 0.05, 0.1],
               "n_estimators": [500, 1500,2000, 5000, 6000, 7000, 8000, 9000, 10000, 15000, 20000]}
 
lgbm_gs_best = GridSearchCV(lgbm_model,
                            lgbm_params,
                            cv=10,
                            n_jobs=-1,
                            verbose=True).fit(X_train, y_train)

Fitting 10 folds for each of 55 candidates, totalling 550 fits


KeyboardInterrupt: 

In [None]:

final_model = lgbm_model.set_params(**lgbm_gs_best.best_params_).fit(X, y)

rmse = np.mean(np.sqrt(-cross_val_score(final_model, X, y, cv=10, scoring="neg_mean_squared_error")))
rmse