## Load Required Libraries

In [3]:
### Import required libraries

import numpy as np
import pandas as pd
import gc

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor

from IPython.display import display # Allows the use of display() for DataFrames

import warnings
warnings.filterwarnings('ignore')

## Load Train and Test Data

In [4]:
# Read train and test files
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

### Train Data

In [5]:
train_df.head()

Unnamed: 0,ID,target,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,...,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
0,000d6aaf2,38000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
1,000fbd867,600000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
2,0027d6b71,10000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
3,0028cbf45,2000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
4,002a68644,14400000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0


### Test Data

In [6]:
test_df.head()

Unnamed: 0,ID,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,20aa07010,...,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
0,000137c73,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,00021489f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0004d7953,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,00056a333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,00056d8eb,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Train and Test Data Info

In [7]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4459 entries, 0 to 4458
Columns: 4993 entries, ID to 9fc776466
dtypes: float64(1845), int64(3147), object(1)
memory usage: 169.9+ MB


So there are a total of 4993 columns out of which 1845 are of type float64, 3147 are int64 and 1 is object (ID is the object column)

In [8]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49342 entries, 0 to 49341
Columns: 4992 entries, ID to 9fc776466
dtypes: float64(4991), object(1)
memory usage: 1.8+ GB


So there are a total of 4992 columns in the test set out of which 4991 are of type float64 and 1 is object (ID is the object column)

## Check for Missing Values

In [9]:
#### Check if there are any NULL values in Train Data
print("Total Train Features with NaN Values = " + str(train_df.columns[train_df.isnull().sum() != 0].size))
if (train_df.columns[train_df.isnull().sum() != 0].size):
    print("Features with NaN => {}".format(list(train_df.columns[train_df.isnull().sum() != 0])))
    train_df[train_df.columns[train_df.isnull().sum() != 0]].isnull().sum().sort_values(ascending = False)

Total Train Features with NaN Values = 0


In [10]:
#### Check if there are any NULL values in Test Data
print("Total Test Features with NaN Values = " + str(test_df.columns[test_df.isnull().sum() != 0].size))
if (test_df.columns[test_df.isnull().sum() != 0].size):
    print("Features with NaN => {}".format(list(test_df.columns[test_df.isnull().sum() != 0])))
    test_df[test_df.columns[test_df.isnull().sum() != 0]].isnull().sum().sort_values(ascending = False)

Total Test Features with NaN Values = 0


## Check and Remove Constant Features

In [11]:
# check and remove constant columns
colsToRemove = []
for col in train_df.columns:
    if col != 'ID' and col != 'target':
        if train_df[col].std() == 0: 
            colsToRemove.append(col)
        
# remove constant columns in the training set
train_df.drop(colsToRemove, axis=1, inplace=True)

# remove constant columns in the test set
test_df.drop(colsToRemove, axis=1, inplace=True) 

print("Removed `{}` Constant Columns\n".format(len(colsToRemove)))
print(colsToRemove)

Removed `256` Constant Columns

['d5308d8bc', 'c330f1a67', 'eeac16933', '7df8788e8', '5b91580ee', '6f29fbbc7', '46dafc868', 'ae41a98b6', 'f416800e9', '6d07828ca', '7ac332a1d', '70ee7950a', '833b35a7c', '2f9969eab', '8b1372217', '68322788b', '2288ac1a6', 'dc7f76962', '467044c26', '39ebfbfd9', '9a5ff8c23', 'f6fac27c8', '664e2800e', 'ae28689a2', 'd87dcac58', '4065efbb6', 'f944d9d43', 'c2c4491d5', 'a4346e2e2', '1af366d4f', 'cfff5b7c8', 'da215e99e', '5acd26139', '9be9c6cef', '1210d0271', '21b0a54cb', 'da35e792b', '754c502dd', '0b346adbd', '0f196b049', 'b603ed95d', '2a50e001c', '1e81432e7', '10350ea43', '3c7c7e24c', '7585fce2a', '64d036163', 'f25d9935c', 'd98484125', '95c85e227', '9a5273600', '746cdb817', '6377a6293', '7d944fb0c', '87eb21c50', '5ea313a8c', '0987a65a1', '2fb7c2443', 'f5dde409b', '1ae50d4c3', '2b21cd7d8', '0db8a9272', '804d8b55b', '76f135fa6', '7d7182143', 'f88e61ae6', '378ed28e0', 'ca4ba131e', '1352ddae5', '2b601ad67', '6e42ff7c7', '22196a84c', '0e410eb3d', '992e6d1d3', '90a7

## Remove Duplicate Columns

In [12]:
%%time
def duplicate_columns(frame):
    groups = frame.columns.to_series().groupby(frame.dtypes).groups
    dups = []

    for t, v in groups.items():

        cs = frame[v].columns
        vs = frame[v]
        lcs = len(cs)

        for i in range(lcs):
            ia = vs.iloc[:,i].values
            for j in range(i+1, lcs):
                ja = vs.iloc[:,j].values
                if np.array_equal(ia, ja):
                    dups.append(cs[i])
                    break

    return dups

colsToRemove = duplicate_columns(train_df)
print(colsToRemove)

['34ceb0081', '8d57e2749', '168b3e5bc', 'a765da8bc', 'acc5b709d']
CPU times: user 5min 31s, sys: 719 ms, total: 5min 32s
Wall time: 5min 33s


In [44]:
groups = train_df.columns.to_series().groupby(train_df.dtypes).groups
for t,v in groups.items():
    print(train_df[v])

      0deb4b6a8  a8cb14b00  2f0771a37  30347e683  d08d1fbe3  6ee66e115  \
0             0          0          0          0          0          0   
1             0          0          0          0          0          0   
2             0          0          0          0          0          0   
3             0          0          0          0          0          0   
4             0          0          0          0          0          0   
...         ...        ...        ...        ...        ...        ...   
4454          0          0          0          0          0          0   
4455          0          0          0          0          0          0   
4456          0          0          0          0          0          0   
4457          0          0          0          0          0          0   
4458          0          0          0          0          0          0   

      77c9823f2  8d6c2a0b2  4681de4fd  adf119b9a  ...  9437d8b64  2e84e09c5  \
0             0          0      

In [13]:
# remove duplicate columns in the training set
train_df.drop(colsToRemove, axis=1, inplace=True) 

# remove duplicate columns in the testing set
test_df.drop(colsToRemove, axis=1, inplace=True)

print("Removed `{}` Duplicate Columns\n".format(len(colsToRemove)))
print(colsToRemove)

Removed `5` Duplicate Columns

['34ceb0081', '8d57e2749', '168b3e5bc', 'a765da8bc', 'acc5b709d']


## Drop Sparse Data

In [14]:
def drop_sparse(train, test):
    flist = [x for x in train.columns if not x in ['ID','target']]
    for f in flist:
        if len(np.unique(train[f]))<2:
            train.drop(f, axis=1, inplace=True)
            test.drop(f, axis=1, inplace=True)
    return train, test

In [15]:
%%time
train_df, test_df = drop_sparse(train_df, test_df)

CPU times: user 716 ms, sys: 49.1 ms, total: 765 ms
Wall time: 845 ms


In [16]:
gc.collect()
print("Train set size: {}".format(train_df.shape))
print("Test set size: {}".format(test_df.shape))

Train set size: (4459, 4732)
Test set size: (49342, 4731)


## Build Train and Test Data for Modeling

In [17]:
X_train = train_df.drop(["ID", "target"], axis=1)
y_train = np.log1p(train_df["target"].values)

X_test = test_df.drop(["ID"], axis=1)

In [18]:
dev_X, val_X, dev_y, val_y = train_test_split(X_train, y_train, test_size = 0.2, random_state = 42)

## LightGBM

In [19]:
def run_lgb(train_X, train_y, val_X, val_y, test_X):
    params = {
        "objective" : "regression",
        "metric" : "rmse",
        "num_leaves" : 40,
        "learning_rate" : 0.004,
        "bagging_fraction" : 0.6,
        "feature_fraction" : 0.6,
        "bagging_frequency" : 6,
        "bagging_seed" : 42,
        "verbosity" : -1,
        "seed": 42
    }
    
    lgtrain = lgb.Dataset(train_X, label=train_y)
    lgval = lgb.Dataset(val_X, label=val_y)
    evals_result = {}
    model = lgb.train(params, lgtrain, 5000, 
                      valid_sets=[lgtrain, lgval], 
                      early_stopping_rounds=100, 
                      verbose_eval=150, 
                      evals_result=evals_result)
    
    pred_test_y = np.expm1(model.predict(test_X, num_iteration=model.best_iteration))
    return pred_test_y, model, evals_result

In [20]:
# Training LGB
pred_test, model, evals_result = run_lgb(dev_X, dev_y, val_X, val_y, X_test)
print("LightGBM Training Completed...")

Training until validation scores don't improve for 100 rounds
[150]	training's rmse: 1.5082	valid_1's rmse: 1.53919
[300]	training's rmse: 1.34436	valid_1's rmse: 1.46593
[450]	training's rmse: 1.23324	valid_1's rmse: 1.43393
[600]	training's rmse: 1.14931	valid_1's rmse: 1.41848
[750]	training's rmse: 1.08371	valid_1's rmse: 1.41315
[900]	training's rmse: 1.03011	valid_1's rmse: 1.41131
Early stopping, best iteration is:
[934]	training's rmse: 1.01913	valid_1's rmse: 1.41118
LightGBM Training Completed...


In [21]:
# feature importance
print("Features Importance...")
gain = model.feature_importance('gain')
featureimp = pd.DataFrame({'feature':model.feature_name(), 
                   'split':model.feature_importance('split'), 
                   'gain':100 * gain / gain.sum()}).sort_values('gain', ascending=False)
print(featureimp[:50])

Features Importance...
        feature  split      gain
4130  f190486d6    752  8.880463
2375  58e2e02e6    702  5.344246
3465  eeb9cd3aa    677  4.358434
4020  15ace8c9f    542  3.131752
2614  9fd594eec    360  2.882999
8     20aa07010    414  2.151287
3571  58232a6fb    359  1.414442
834   6eef030c1    324  1.369470
1457  b43a7cfd5    406  1.252368
3661  491b9ee45    284  1.047073
2687  fb0f5dbfe    416  1.002580
1482  024c577b9    266  1.001906
4508  c47340d97    326  0.889082
2079  58e056e12    343  0.886709
3867  2288333b4    187  0.857178
4343  1702b5bf0    289  0.853304
566   66ace2992    274  0.833897
4185  f74e8f13d    362  0.820876
4028  5c6487af1    207  0.801488
3791  ed8ff54b5    170  0.792289
828   6786ea46d    179  0.776514
3722  d6bb78916    292  0.747566
3220  ced6a7e91    242  0.703042
3886  50e4f96cf    141  0.692474
863   fc99f9426    234  0.673585
34    87ffda550    172  0.628633
1378  6cf7866c1    164  0.627456
3983  45f6d00da    237  0.614132
3811  adb64ff71    2

## XGB Modeling

In [22]:
def run_xgb(train_X, train_y, val_X, val_y, test_X):
    params = {'objective': 'reg:linear', 
          'eval_metric': 'rmse',
          'eta': 0.001,
          'max_depth': 10, 
          'subsample': 0.6, 
          'colsample_bytree': 0.6,
          'alpha':0.001,
          'random_state': 42, 
          'silent': True}
    
    tr_data = xgb.DMatrix(train_X, train_y)
    va_data = xgb.DMatrix(val_X, val_y)
    
    watchlist = [(tr_data, 'train'), (va_data, 'valid')]
    
    model_xgb = xgb.train(params, tr_data, 2000, watchlist, maximize=False, early_stopping_rounds = 100, verbose_eval=100)
    
    dtest = xgb.DMatrix(test_X)
    xgb_pred_y = np.expm1(model_xgb.predict(dtest, ntree_limit=model_xgb.best_ntree_limit))
    
    return xgb_pred_y, model_xgb

In [23]:
# Training XGB
pred_test_xgb, model_xgb = run_xgb(dev_X, dev_y, val_X, val_y, X_test)
print("XGB Training Completed...")

[0]	train-rmse:14.0877	valid-rmse:14.0768
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 100 rounds.
[100]	train-rmse:12.7684	valid-rmse:12.7562
[200]	train-rmse:11.5764	valid-rmse:11.5631
[300]	train-rmse:10.4999	valid-rmse:10.4855
[400]	train-rmse:9.52805	valid-rmse:9.51345
[500]	train-rmse:8.65078	valid-rmse:8.63645
[600]	train-rmse:7.85866	valid-rmse:7.84551
[700]	train-rmse:7.14378	valid-rmse:7.13295
[800]	train-rmse:6.49895	valid-rmse:6.49063
[900]	train-rmse:5.91695	valid-rmse:5.91283
[1000]	train-rmse:5.39254	valid-rmse:5.39331
[1100]	train-rmse:4.91972	valid-rmse:4.92571
[1200]	train-rmse:4.49371	valid-rmse:4.50632
[1300]	train-rmse:4.10977	valid-rmse:4.13031
[1400]	train-rmse:3.76482	valid-rmse:3.79406
[1500]	train-rmse:3.45498	valid-rmse:3.49404
[1600]	train-rmse:3.17657	valid-rmse:3.22676
[1700]	train-rmse:2.92688	valid-rmse:2.98922
[1800]	train-rmse:2.70353	valid-rmse:2.77872
[1900]	trai

## Catboost

In [24]:
cb_model = CatBoostRegressor(iterations=500,
                             learning_rate=0.05,
                             depth=10,
                             eval_metric='RMSE',
                             random_seed = 42,
                             bagging_temperature = 0.2,
                             od_type='Iter',
                             metric_period = 50,
                             od_wait=20)

In [25]:
cb_model.fit(dev_X, dev_y,
             eval_set=(val_X, val_y),
             use_best_model=True,
             verbose=50)



0:	learn: 1.7526762	test: 1.6875093	best: 1.6875093 (0)	total: 838ms	remaining: 6m 57s
50:	learn: 1.4761562	test: 1.5161235	best: 1.5161235 (50)	total: 40s	remaining: 5m 52s
100:	learn: 1.3754008	test: 1.4772765	best: 1.4772765 (100)	total: 1m 22s	remaining: 5m 24s
150:	learn: 1.3164265	test: 1.4622808	best: 1.4620402 (148)	total: 2m 2s	remaining: 4m 43s
200:	learn: 1.2485595	test: 1.4467180	best: 1.4467180 (200)	total: 2m 43s	remaining: 4m 2s
250:	learn: 1.1708898	test: 1.4352949	best: 1.4352949 (250)	total: 3m 23s	remaining: 3m 21s
300:	learn: 1.1167592	test: 1.4323582	best: 1.4322308 (299)	total: 4m 3s	remaining: 2m 40s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 1.431742338
bestIteration = 305

Shrink model to first 306 iterations.


<catboost.core.CatBoostRegressor at 0x131b147d0>

In [26]:
pred_test_cat = np.expm1(cb_model.predict(X_test))

## Combine Predictions

In [27]:
sub = pd.read_csv('../input/sample_submission.csv')

sub_lgb = pd.DataFrame()
sub_lgb["target"] = pred_test

sub_xgb = pd.DataFrame()
sub_xgb["target"] = pred_test_xgb

sub_cat = pd.DataFrame()
sub_cat["target"] = pred_test_cat

sub["target"] = (sub_lgb["target"] * 0.5 + sub_xgb["target"] * 0.3 + sub_cat["target"] * 0.2)

FileNotFoundError: [Errno 2] File b'../input/sample_submission.csv' does not exist: b'../input/sample_submission.csv'

In [None]:
print(sub.head())
sub.to_csv('sub_lgb_xgb_cat.csv', index=False)