In [None]:
import pandas as pd
from sklearn.preprocessing import Imputer
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
import numpy as np
from sklearn.kernel_ridge import KernelRidge
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lasso
from sklearn.pipeline import make_pipeline

In [None]:
# Load the training data and remove outlier rows BEFORE splitting off the
# target, so the filter actually reaches the modelling frames and X / y stay
# aligned row for row.
train = pd.read_csv('../input/train.csv')

# Rows to keep: basement areas below 4000, and not a very large living area
# (GrLivArea > 4000) sold for less than 300000.
keep_rows = (
    (train['TotalBsmtSF'] < 4000)
    & (train['BsmtFinSF1'] < 4000)
    & ~((train['GrLivArea'] > 4000) & (train['SalePrice'] < 300000))
)
# Fresh 0..n-1 index so positional and label-based access agree downstream.
train_filtered = train[keep_rows].reset_index(drop=True)

y = train_filtered['SalePrice']
X = train_filtered.drop(['SalePrice', 'Id'], axis=1)

In [None]:
# Ordinal codes for ranked categorical columns; preprocess() applies each
# inner dict to the column of the same name with Series.map.
# NOTE(review): the 'NA' entries only match if the literal string 'NA' is
# still present in the column when the mapping runs - confirm against how the
# CSV is parsed and how missing values are filled beforehand.
_QUALITY = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1}
_QUALITY_OR_NA = dict(_QUALITY, NA=-1)
_BSMT_FINISH = {'GLQ': 5, 'ALQ': 4, 'BLQ': 3, 'Rec': 2, 'LwQ': 1, 'Unf': 0, 'NA': -1}

LABEL_MAP = {
    'ExterQual': dict(_QUALITY),
    'ExterCond': dict(_QUALITY),
    'BsmtQual': dict(_QUALITY_OR_NA),
    'BsmtCond': dict(_QUALITY_OR_NA),
    'BsmtExposure': {'Gd': 3, 'Av': 2, 'Mn': 1, 'No': 0, 'NA': -1},
    'BsmtFinType1': dict(_BSMT_FINISH),
    'BsmtFinType2': dict(_BSMT_FINISH),
    'HeatingQC': dict(_QUALITY),
    'CentralAir': {'N': 0, 'Y': 1},
    'KitchenQual': dict(_QUALITY),
    'GarageQual': dict(_QUALITY_OR_NA),
    'GarageCond': dict(_QUALITY_OR_NA),
    'PavedDrive': {'Y': 1, 'P': 0, 'N': -1},
    'Functional': {
        'Typ': 7, 'Min1': 6, 'Min2': 5, 'Mod': 4,
        'Maj1': 3, 'Maj2': 2, 'Sev': 1, 'Sal': 0,
    },
}

# Nominal columns that preprocess() replaces with per-category relative
# frequencies.
CATE_COLS = [
    'MSZoning',
    'Street',
    'LotShape',
    'LandContour',
    'Utilities',
    'LotConfig',
    'MasVnrType',
    'LandSlope',
    'Neighborhood',
    'RoofStyle',
    'RoofMatl',
    'Exterior1st',
    'Exterior2nd',
    'Heating',
    'Electrical',
    'GarageType',
    'GarageFinish',
    'Condition2',
    'SaleCondition',
    'SaleType',
    'BldgType',
    'HouseStyle',
    'Foundation',
    'Condition1',
]

In [None]:
def preprocess(data):
    """Return a cleaned and encoded copy of a housing feature frame.

    The input is never modified; all work happens on a new frame.

    Steps, in order:
      1. Drop MiscFeature, Fence, PoolQC, Alley and FireplaceQu.
      2. Fill missing LotFrontage / MasVnrArea / GarageYrBlt with the column
         mean, then cast those columns to int (truncating the fraction).
      3. Fill every other missing value with its column's mode.
      4. Replace each CATE_COLS column with the relative frequency of each
         category. Frequencies are computed from the frame passed in, so two
         frames preprocessed separately can receive different codes for the
         same category.
      5. Map each LABEL_MAP column to its ordinal code; values absent from
         the mapping become NaN.

    Parameters
    ----------
    data : DataFrame (or anything ``pd.DataFrame`` accepts)
        Must contain the dropped columns, the three mean-imputed columns and
        every column named in CATE_COLS and LABEL_MAP.

    Returns
    -------
    DataFrame
        New frame with the same rows as ``data``.

    Raises
    ------
    KeyError
        If an expected column is missing.
    """
    # drop() returns a new frame, so the assignments below cannot leak back
    # into the caller's data (which may itself be a slice of a larger frame).
    proc = pd.DataFrame(data).drop(
        ['MiscFeature', 'Fence', 'PoolQC', 'Alley', 'FireplaceQu'], axis=1)

    for column in ['LotFrontage', 'MasVnrArea', 'GarageYrBlt']:
        # Series.mean() skips NaN, which is the same mean-imputation rule the
        # sklearn Imputer(strategy='mean') applied here previously.
        filled = proc[column].fillna(proc[column].mean())
        proc[column] = filled.astype(int)

    for column in proc.columns:
        if not proc[column].isnull().any():
            continue
        modes = proc[column].mode()
        # An all-missing column has no mode; leave it untouched rather than
        # failing on an empty lookup.
        if len(modes) > 0:
            # Assign back instead of filling in place: an in-place fill on
            # proc[column] may operate on a temporary and be lost.
            proc[column] = proc[column].fillna(modes.iloc[0])

    for col in CATE_COLS:
        frequency = proc.groupby(col).size() / len(proc)
        proc[col] = proc[col].map(frequency)

    for key, value in LABEL_MAP.items():
        proc[key] = proc[key].map(value)
    return proc

In [None]:
# Fit a random forest on the preprocessed training features and list the
# (column, importance) pairs from most to least important.
from sklearn.ensemble import RandomForestRegressor

rf = preprocess(X)
clf = RandomForestRegressor(
    n_estimators=1000,
    max_depth=7,
    min_samples_leaf=2,
    random_state=111,
)
model = clf.fit(rf, y)
values = sorted(
    zip(rf.columns, model.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
)
values

In [None]:
# Feature subsets used by the linear / kernel models further down.
# NOTE(review): presumably picked from the random-forest importance ranking -
# confirm the selection criteria.
important_index = [
    'OverallQual', 'GrLivArea', 'TotalBsmtSF', '2ndFlrSF', 'BsmtFinSF1',
    '1stFlrSF', 'GarageCars', 'GarageArea', 'LotArea', 'YearBuilt',
]
# Each column appears exactly once: a repeated name would select the same
# column twice and feed the downstream models a duplicated feature.
medium_index = [
    'KitchenQual', 'BsmtQual', 'GarageFinish', 'BsmtUnfSF', 'OpenPorchSF',
    'WoodDeckSF', 'GarageYrBlt', 'OverallCond', 'Fireplaces', 'ExterQual',
    'CentralAir', 'TotRmsAbvGrd', 'YearRemodAdd', 'Neighborhood',
    'LotFrontage', 'FullBath', 'MasVnrArea', 'BsmtExposure',
]

In [None]:
# Read the test set and run it through the same preprocess() function that is
# applied to the training features.
test = preprocess(pd.read_csv('../input/test.csv'))

In [None]:
# LightGBM parameters handed to lgb.train() inside kfold_train().
param = {
    # tree structure
    'num_leaves': 13,
    'min_data_in_leaf': 127,
    # task
    'objective': 'regression',
    'max_depth': 7,
    'learning_rate': 0.005,
    'boosting': 'gbdt',
    # row / column subsampling
    'feature_fraction': 0.75,
    'bagging_freq': 1,
    'bagging_fraction': 0.7,
    'bagging_seed': 17,
    # evaluation, seeding, regularisation, logging
    'metric': 'rmse',
    'random_state': 133,
    'lambda_l1': 0.27,
    'verbosity': -1,
}

In [None]:
def kfold_train(X_data,y_data, X_test):
    """Train LightGBM on PCA-reduced features with 5-fold CV and predict X_test.

    For every fold the training and validation rows are preprocessed
    separately, projected onto 25 principal components fitted on the training
    rows, and used to train a LightGBM model with early stopping. The fold
    models' test predictions are averaged.

    Parameters
    ----------
    X_data : DataFrame
        Raw (not yet preprocessed) training features.
    y_data : Series
        Target values, positionally aligned with ``X_data``.
    X_test : DataFrame
        Already preprocessed test features that still contain an ``Id``
        column; that column is dropped before the PCA projection.

    Returns
    -------
    numpy.ndarray
        Mean of the per-fold predictions for ``X_test``.

    Side effects
    ------------
    Prints the fold number for each fold and the out-of-fold RMSE at the end.
    """
    folds = KFold(n_splits=5, shuffle=True, random_state=15)
    oof = np.zeros(len(X_data))
    predictions = np.zeros(len(X_test))
    num_round = 10000
    # Loop-invariant: the test features without the identifier column.
    test_features = X_test.drop('Id',axis=1)

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_data.values, y_data.values)):
        print("fold n°{}".format(fold_))
        X_train = preprocess(X_data.iloc[trn_idx])
        X_val = preprocess(X_data.iloc[val_idx])

        # PCA is fitted on the training rows only and reused for the
        # validation and test rows of this fold.
        pca = PCA(n_components=25)
        X_train = pd.DataFrame(pca.fit_transform(X_train))
        X_val = pd.DataFrame(pca.transform(X_val))

        trn_data = lgb.Dataset(X_train, label=y_data.iloc[trn_idx])
        # Labels come from the y_data argument, not from a module-level
        # variable, so the function works for whatever target is passed in.
        val_data = lgb.Dataset(X_val, label=y_data.iloc[val_idx])

        # NOTE(review): verbose_eval / early_stopping_rounds are accepted by
        # lgb.train only in LightGBM releases before 4.0; newer releases
        # expect callbacks instead - confirm the installed version.
        clf = lgb.train(param,trn_data,num_round,valid_sets=[trn_data, val_data],verbose_eval=100,early_stopping_rounds=200)

        pca_test = pd.DataFrame(pca.transform(test_features))
        oof[val_idx] = clf.predict(X_val, num_iteration=clf.best_iteration)
        predictions += clf.predict(pca_test, num_iteration=clf.best_iteration) / folds.n_splits

    # Conventional (y_true, y_pred) order; the squared error is symmetric, so
    # the reported score is the same either way.
    print("CV score: {:<8.5f}".format(mean_squared_error(y_data, oof) ** 0.5))
    return predictions

In [None]:
# Show the (rows, columns) dimensions of the preprocessed test frame.
test.shape

In [None]:
# Fold-averaged LightGBM predictions for the test set (see kfold_train).
gdbt_pred = kfold_train(X, y, test)

In [None]:
# Kernel ridge regression with a degree-2 polynomial kernel on robust-scaled
# values of the important_index columns; fitted on the training frame and
# applied to the test frame.
KRR = make_pipeline(
    RobustScaler(),
    KernelRidge(kernel='polynomial', degree=2, coef0=2.5, alpha=0.005),
)
krr_pred = KRR.fit(rf[important_index], y).predict(test[important_index])

In [None]:
# Preview the first 13 LightGBM predictions.
gdbt_pred[:13]

In [None]:
# Preview the first 13 kernel-ridge predictions.
krr_pred[:13]

In [None]:
# Lasso on robust-scaled values of the combined medium + important feature
# lists; fitted on the training frame and applied to the test frame.
lasso_ind = [*medium_index, *important_index]
lasso = make_pipeline(
    RobustScaler(),
    Lasso(alpha=0.0001, random_state=1),
)
lasso.fit(rf[lasso_ind], y)
la_test = test[lasso_ind]
lasso_pred = lasso.predict(la_test)

In [None]:
# Preview the first 13 Lasso predictions.
lasso_pred[:13]

In [None]:
# Elastic net on robust-scaled values of the medium_index columns; fitted on
# the training frame and applied to the test frame.
enet = make_pipeline(
    RobustScaler(),
    ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3),
)
enet_pred = enet.fit(rf[medium_index], y).predict(test[medium_index])

In [None]:
# Preview the first 13 elastic-net predictions.
enet_pred[0:13]

In [None]:
# Weighted blend of the four models' test predictions (weights sum to 1),
# paired with the test Ids to form the submission table.
ensemble = (
    0.15*lasso_pred
    + 0.4*gdbt_pred
    + 0.3*krr_pred
    + 0.15*enet_pred
)
sub = pd.DataFrame({'Id': test['Id'], 'SalePrice': ensemble})

In [None]:
# Preview the first 13 rows of the submission table.
sub[:13]

In [None]:
# Write the Id / SalePrice table to submission.csv without the index column.
sub.to_csv('submission.csv',index=False)