In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below the Kaggle input directory and echo its path.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# Any results you write to the current directory are saved as output.

# REFERENCE: https://www.kaggle.com/serigne/stacked-regressions-top-4-on-leaderboard

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor
import xgboost as xgb
import lightgbm as lgb

In [None]:
# Load the training and test tables (presumably already one-hot encoded, judging
# by the file names). No index column is specified, so both frames get the
# default 0..n-1 index.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/onehot/oneHotTrain.csv', '../input/onehot/oneHotTest.csv')
)

In [None]:
# Number of training rows before any rows are removed.
train.shape[0]

In [None]:
# Remove seven training rows by index label in a single, re-runnable step.
#
# The previous version dropped one row at a time by *position*
# (train.index[523], [88], [332], [410], [588], [825], [1000]). Every drop shifts
# the rows behind it, so on the default 0..n-1 index produced by read_csv those
# positions resolved to the labels listed below. Dropping by label removes the
# same rows on a first run, and errors='ignore' turns a re-run of this cell into
# a no-op instead of silently removing seven more rows.
#
# NOTE(review): if labels 332, 410, 588, 825 and 1000 were what was actually
# intended, replace the list accordingly - confirm with the notebook author,
# since the recorded leaderboard scores were produced with the rows below.
ROWS_TO_DROP = [523, 88, 333, 412, 592, 830, 1006]
train = train.drop(index=ROWS_TO_DROP, errors='ignore')

## Attempt 1: Random Forest

In [None]:
# Separate features from the SalePrice target, then hold out 25% of the rows
# (fixed seed) for comparing models locally. The final models further down are
# fitted on the full tmpX / tmpY rather than on this split.
tmpX = train.drop('SalePrice', axis=1)
tmpY = train['SalePrice']

X_train, X_test, y_train, y_test = train_test_split(
    tmpX,
    tmpY,
    test_size=0.25,
    random_state=0,
)

In [None]:
# best_score = 0
# for i in range(10,190,20):
#     for j in range(5,13,3):
#         for k in range(2,6):
#             for l in range(1,4):
#                 rf = RandomForestClassifier(n_estimators=i, 
#                                             criterion='entropy', 
#                                             max_depth=j, 
#                                             min_samples_split=k, 
#                                             min_samples_leaf=l)
#                 rf.fit(X_train, y_train)

#                 print('n_estimators=', i, ', max_depth=', j, ', min_samples_split=', k, ', min_samples_leaf=', l)
#                 score = rf.score(X_test, y_test)
#                 print(score)
#                 print()

#                 if score > best_score:
#                     best_score = score

# print('best: ', best_score)

I'm taking the four best classifiers and averaging their predictions with equal weights as an ensemble for the submission.

In [None]:
# Four random forests that differ in size, split criterion and split/leaf
# settings; their test predictions are averaged with equal weights.
#
# NOTE(review): these are classifiers fitted on SalePrice, which every later
# attempt in this notebook models with a regressor. RandomForestRegressor is
# probably the better fit - confirm before changing, because the score recorded
# below was produced with classifiers.

# Fixed seed so the forests (and the later blends that reuse them) give the same
# predictions on every Restart & Run All.
RF_SEED = 0

rf_g_0 = RandomForestClassifier(n_estimators=50,
                                criterion='gini',
                                max_depth=5,
                                min_samples_split=4,
                                min_samples_leaf=2,
                                random_state=RF_SEED)

rf_g_1 = RandomForestClassifier(n_estimators=170,
                                criterion='gini',
                                max_depth=5,
                                min_samples_split=3,
                                min_samples_leaf=1,
                                random_state=RF_SEED)

rf_e_0 = RandomForestClassifier(n_estimators=90,
                                criterion='entropy',
                                max_depth=5,
                                min_samples_split=2,
                                min_samples_leaf=2,
                                random_state=RF_SEED)

rf_e_1 = RandomForestClassifier(n_estimators=170,
                                criterion='entropy',
                                max_depth=5,
                                min_samples_split=5,
                                min_samples_leaf=1,
                                random_state=RF_SEED)

# Fit every forest on the full training data (not the hold-out split).
rf_models = (rf_g_0, rf_g_1, rf_e_0, rf_e_1)
for rf_model in rf_models:
    rf_model.fit(tmpX, tmpY)

# Equal-weight average of the four forests' predicted SalePrice labels.
prediction = sum(0.25 * rf_model.predict(test) for rf_model in rf_models)

In [None]:
# Pair each test ID with the random-forest ensemble prediction and write the
# submission file (any existing submission.csv is overwritten).
sub = pd.DataFrame({'ID': test['ID'], 'SalePrice': prediction})
sub.to_csv('submission.csv', index=False)

## Result: 0.27306
### Definitely room for improvement...

## Attempt 2: GradientBoostingRegressor

In [None]:
# Gradient boosting regressor; settings taken from the referenced code.
# Uses Huber loss and a fixed seed.
gboost = GradientBoostingRegressor(
    loss='huber',
    learning_rate=0.05,
    n_estimators=3000,
    max_depth=4,
    max_features='sqrt',
    min_samples_split=10,
    min_samples_leaf=15,
    random_state=0,
)

In [None]:
# Fit on the full training data, then predict SalePrice for the test set.
# fit() returns the fitted estimator, so the two steps can be chained.
prediction = gboost.fit(tmpX, tmpY).predict(test)

In [None]:
# Display the GradientBoosting predictions computed in the previous cell.
prediction

In [None]:
# Pair each test ID with the GradientBoosting prediction and write the
# submission file (any existing submission.csv is overwritten).
sub = pd.DataFrame({'ID': test['ID'], 'SalePrice': prediction})
sub.to_csv('submission.csv', index=False)

## Result: 0.12328
### That's a lot better... now at number 918

## Attempt 3: XGBoost

In [None]:
# XGBoost regressor with the same hyperparameters as before. Two deprecated
# keyword arguments are replaced by their current equivalents in the
# scikit-learn wrapper:
#   silent=1   -> verbosity=0  (no log output)
#   nthread=-1 -> n_jobs=-1    (use all available threads)
model_xgb = xgb.XGBRegressor(
    colsample_bytree=0.4603,
    gamma=0.0468,
    learning_rate=0.05,
    max_depth=3,
    min_child_weight=1.7817,
    n_estimators=2200,
    reg_alpha=0.4640,
    reg_lambda=0.8571,
    subsample=0.5213,
    verbosity=0,
    random_state=7,
    n_jobs=-1,
)

In [None]:
# Fit XGBoost on the full training data and predict the test set in one chain
# (fit() returns the fitted model), then display the predictions.
prediction = model_xgb.fit(tmpX, tmpY).predict(test)
prediction

In [None]:
# Pair each test ID with the XGBoost prediction and write the submission file
# (any existing submission.csv is overwritten).
sub = pd.DataFrame({'ID': test['ID'], 'SalePrice': prediction})
sub.to_csv('submission.csv', index=False)

## Result: 0.13220
### Slightly worse result - current place is 925

## Attempt 4: LGBMRegressor

In [None]:
# LightGBM regressor; settings come from the reference code.
# Bagging and feature sub-sampling both use a fixed seed of 9.
model_lgb = lgb.LGBMRegressor(
    objective='regression',
    n_estimators=720,
    learning_rate=0.05,
    num_leaves=5,
    max_bin=55,
    min_data_in_leaf=6,
    min_sum_hessian_in_leaf=11,
    bagging_fraction=0.8,
    bagging_freq=5,
    bagging_seed=9,
    feature_fraction=0.2319,
    feature_fraction_seed=9,
)

In [None]:
# Fit LightGBM on the full training data and predict the test set in one chain
# (fit() returns the fitted model), then display the predictions.
prediction = model_lgb.fit(tmpX, tmpY).predict(test)
prediction

In [None]:
# Pair each test ID with the LightGBM prediction and write the submission file
# (any existing submission.csv is overwritten).
sub = pd.DataFrame({'ID': test['ID'], 'SalePrice': prediction})
sub.to_csv('submission.csv', index=False)

## Result: 0.12583
### Not terrible, but not better... maybe useful in an ensemble though

## Let's try combining these and see what happens

In [None]:
# Test-set predictions from the three fitted boosters, kept separately so that
# different blend weights can be tried without re-predicting.
pred_gb, pred_xgb, pred_lgb = (
    fitted.predict(test) for fitted in (gboost, model_xgb, model_lgb)
)

In [None]:
# Weighted blend: 40% XGBoost, 30% GradientBoosting, 30% LightGBM.
xgb_part = 0.4 * pred_xgb
gb_part = 0.3 * pred_gb
lgb_part = 0.3 * pred_lgb
prediction = xgb_part + gb_part + lgb_part
prediction

In [None]:
# Pair each test ID with the blended prediction and write the submission file
# (any existing submission.csv is overwritten).
sub = pd.DataFrame({'ID': test['ID'], 'SalePrice': prediction})
sub.to_csv('submission.csv', index=False)

## Result: 0.12262
### Sweet! That's some progress - now 892 on leaderboard. Let's try something slightly different...

In [None]:
# Weighted blend: 25% XGBoost, 40% GradientBoosting, 35% LightGBM.
xgb_part = 0.25 * pred_xgb
gb_part = 0.4 * pred_gb
lgb_part = 0.35 * pred_lgb
prediction = xgb_part + gb_part + lgb_part
prediction

In [None]:
# Pair each test ID with the blended prediction and write the submission file
# (any existing submission.csv is overwritten).
sub = pd.DataFrame({'ID': test['ID'], 'SalePrice': prediction})
sub.to_csv('submission.csv', index=False)

## Result: 0.12181
### Wow, we're on a roll! Number 851 now (top ~18.5%)

In [None]:
# Equal-weight average of the four already-fitted random forests, kept under its
# own name so it can be mixed into the booster blend.
rf_votes = [forest.predict(test) for forest in (rf_g_0, rf_g_1, rf_e_0, rf_e_1)]
rf_pred = 0.25 * rf_votes[0] + 0.25 * rf_votes[1] + 0.25 * rf_votes[2] + 0.25 * rf_votes[3]

In [None]:
# Weighted blend: 15% XGBoost, 45% GradientBoosting, 30% LightGBM,
# 10% random-forest average.
xgb_part = 0.15 * pred_xgb
gb_part = 0.45 * pred_gb
lgb_part = 0.3 * pred_lgb
rf_part = 0.1 * rf_pred
prediction = xgb_part + gb_part + lgb_part + rf_part
prediction

In [None]:
# Pair each test ID with the blended prediction and write the submission file
# (any existing submission.csv is overwritten).
sub = pd.DataFrame({'ID': test['ID'], 'SalePrice': prediction})
sub.to_csv('submission.csv', index=False)

## Result: 0.12270
### Slightly worse, but not too big of a deal

In [None]:
# Weighted blend: 15% XGBoost, 45% GradientBoosting, 35% LightGBM,
# 5% random-forest average.
xgb_part = 0.15 * pred_xgb
gb_part = 0.45 * pred_gb
lgb_part = 0.35 * pred_lgb
rf_part = 0.05 * rf_pred
prediction = xgb_part + gb_part + lgb_part + rf_part
prediction

In [None]:
# Pair each test ID with the blended prediction and write the submission file
# (any existing submission.csv is overwritten).
sub = pd.DataFrame({'ID': test['ID'], 'SalePrice': prediction})
sub.to_csv('submission.csv', index=False)

## Result: 0.12163
### Nice, got a little better! Number 842 now. (top 18.2%)