## Library install and download data

In [0]:
!pip install catboost
!pip install lightgbm
!pip install xgboost

In [0]:
!pip install kaggle

In [0]:
from google.colab import files
files.upload()

In [4]:
!pwd

/content


In [0]:
!mkdir ~/.kaggle
!cp /content/kaggle.json ~/.kaggle/kaggle.json
!chmod 600 ~/.kaggle/kaggle.json

In [6]:
!kaggle competitions download -c santander-value-prediction-challenge

Downloading sample_submission.csv.zip to /content
  0% 0.00/266k [00:00<?, ?B/s]
100% 266k/266k [00:00<00:00, 64.0MB/s]
Downloading test.csv.zip to /content
 21% 5.00M/24.2M [00:01<00:06, 3.10MB/s]
100% 24.2M/24.2M [00:01<00:00, 14.2MB/s]
Downloading train.csv.zip to /content
 72% 5.00M/6.95M [00:00<00:00, 9.01MB/s]
100% 6.95M/6.95M [00:00<00:00, 12.3MB/s]


In [0]:
!ls

In [7]:
!unzip test.csv.zip 
!unzip train.csv.zip
!unzip sample_submission.csv.zip

Archive:  test.csv.zip
  inflating: test.csv                
Archive:  train.csv.zip
  inflating: train.csv               
Archive:  sample_submission.csv.zip
  inflating: sample_submission.csv   


In [0]:
!ls 

## Starting with the imports and working with data

In [0]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor

%matplotlib inline
sns.set()

import warnings
warnings.filterwarnings('ignore')

In [0]:
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')
sample = pd.read_csv('./sample_submission.csv')

In [10]:
train.head()

Unnamed: 0,ID,target,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,...,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
0,000d6aaf2,38000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
1,000fbd867,600000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
2,0027d6b71,10000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
3,0028cbf45,2000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
4,002a68644,14400000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0


In [11]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4459 entries, 0 to 4458
Columns: 4993 entries, ID to 9fc776466
dtypes: float64(1845), int64(3147), object(1)
memory usage: 169.9+ MB


In [12]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49342 entries, 0 to 49341
Columns: 4992 entries, ID to 9fc776466
dtypes: float64(4991), object(1)
memory usage: 1.8+ GB


In [13]:
# Checcking for missing values
train.columns[train.isnull().sum()!=0].size

0

In [3]:
# Removing constant columns 
# cols to remove 
cols_remove =[]

for col in train.columns:
  if col!='ID' and col!='target':
    if train[col].std()==0:
      cols_remove.append(col)

print(len(cols_remove))

# dropping the cols with std =0
train.drop(cols_remove,axis=1,inplace=True)
test.drop(cols_remove,axis=1,inplace=True)

256


In [0]:
# dropping sparse columns
for col in train.columns:
  if col!='ID' and col!='target':
    # less than 2 unique values
    if len(np.unique(train[col]))<2:
      train.drop(col,axis=1,inplace=True)
      test.drop(col,axis=1,inplace=True)

In [10]:
train.shape,test.shape

((4459, 4737), (49342, 4736))

In [0]:
X = train.drop(['ID','target'],axis=1)
# calculate log(1+x)
y = np.log1p(train.target.values)

Xt = test.drop(['ID'],axis=1)

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=79)

###LightGBM

In [8]:
params={
    'objective':'regression',
    'metric':'rmse',
    'num_leaves':50,
    'learning_rate':0.01,
    'bagging_fraction':0.5,
    'feature_fraction':0.5,
    'bagging_frequency':7,
    'bagging_seed':33,
    'verbosity':-1,
    'seed':33
}

lgtrain = lgb.Dataset(X_train,label=y_train)
lgval = lgb.Dataset(X_test,label=y_test)

evals_result={}

model = lgb.train(params,lgtrain,5000,valid_sets=lgval,
                 early_stopping_rounds=100,verbose_eval=150,
                 evals_result=evals_result)

# exp(x) - 1
lgb_predy = np.expm1(model.predict(Xt,num_iterations=model.best_iteration))

Training until validation scores don't improve for 100 rounds.
[150]	valid_0's rmse: 1.47003
[300]	valid_0's rmse: 1.41794
[450]	valid_0's rmse: 1.41307
[600]	valid_0's rmse: 1.41422
Early stopping, best iteration is:
[514]	valid_0's rmse: 1.41269


### XGBoost

In [9]:
params ={
    'objective':'reg:linear',
    'eval_metric':'rmse',
    'eta':0.001,
    'max_depth':12,
    'subsample':0.5,
    'colsample_bytree':0.6,
    'alpha':0.001,
    'random_state':33,
    'silent':True
}

train_data = xgb.DMatrix(X_train,y_train)
valid_data = xgb.DMatrix(X_test,y_test)

watchlist =[(train_data,'train'),(valid_data,'valid')]

xgb_model = xgb.train(params,train_data,2000,watchlist,maximize=False,
                     early_stopping_rounds=100,verbose_eval=100)


test_data = xgb.DMatrix(Xt)

xgb_predy = np.expm1(xgb_model.predict(test_data,ntree_limit=xgb_model.best_ntree_limit))

[0]	train-rmse:14.0798	valid-rmse:14.1027
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 100 rounds.
[100]	train-rmse:12.7611	valid-rmse:12.7835
[200]	train-rmse:11.5699	valid-rmse:11.5917
[300]	train-rmse:10.4945	valid-rmse:10.5155
[400]	train-rmse:9.52331	valid-rmse:9.54394
[500]	train-rmse:8.6471	valid-rmse:8.66816
[600]	train-rmse:7.85669	valid-rmse:7.8788
[700]	train-rmse:7.14389	valid-rmse:7.16762
[800]	train-rmse:6.50017	valid-rmse:6.52616
[900]	train-rmse:5.91978	valid-rmse:5.9495
[1000]	train-rmse:5.39687	valid-rmse:5.43078
[1100]	train-rmse:4.92525	valid-rmse:4.96432
[1200]	train-rmse:4.49992	valid-rmse:4.54533
[1300]	train-rmse:4.11687	valid-rmse:4.16938
[1400]	train-rmse:3.77225	valid-rmse:3.83292
[1500]	train-rmse:3.46198	valid-rmse:3.53209
[1600]	train-rmse:3.18317	valid-rmse:3.26383
[1700]	train-rmse:2.93254	valid-rmse:3.02578
[1800]	train-rmse:2.70808	valid-rmse:2.81413
[1900]	train-r

### CatBoost

In [7]:
cb_model = CatBoostRegressor(iterations=500,
                           learning_rate=0.05,
                           depth=6,
                           eval_metric='RMSE',
                           random_seed=33,
                           bagging_temperature=0.4,
                           )


cb_model.fit(X_train,y_train,eval_set=(X_test,y_test),
             use_best_model=True,verbose=True)



0:	learn: 13.8755844	test: 13.9001098	best: 13.9001098 (0)	total: 1.34s	remaining: 11m 10s
1:	learn: 13.1989019	test: 13.2241997	best: 13.2241997 (1)	total: 2.26s	remaining: 9m 22s
2:	learn: 12.5550489	test: 12.5794542	best: 12.5794542 (2)	total: 3.12s	remaining: 8m 36s
3:	learn: 11.9443517	test: 11.9701256	best: 11.9701256 (3)	total: 4.02s	remaining: 8m 18s
4:	learn: 11.3626692	test: 11.3881177	best: 11.3881177 (4)	total: 4.92s	remaining: 8m 7s
5:	learn: 10.8120496	test: 10.8379742	best: 10.8379742 (5)	total: 5.81s	remaining: 7m 58s
6:	learn: 10.2899964	test: 10.3154183	best: 10.3154183 (6)	total: 6.71s	remaining: 7m 52s
7:	learn: 9.7937226	test: 9.8200811	best: 9.8200811 (7)	total: 7.62s	remaining: 7m 48s
8:	learn: 9.3228094	test: 9.3500760	best: 9.3500760 (8)	total: 8.54s	remaining: 7m 46s
9:	learn: 8.8763664	test: 8.9034277	best: 8.9034277 (9)	total: 9.46s	remaining: 7m 43s
10:	learn: 8.4531743	test: 8.4798557	best: 8.4798557 (10)	total: 10.4s	remaining: 7m 41s
11:	learn: 8.0519420

In [0]:
cb_predy=np.expm1(cb_model.predict(Xt))

### Combining Results

In [13]:
len(cb_predy),len(lgb_predy),len(xgb_predy)

(49342, 49342, 49342)

In [11]:
sample.head()

Unnamed: 0,ID,target
0,000137c73,5944923.0
1,00021489f,5944923.0
2,0004d7953,5944923.0
3,00056a333,5944923.0
4,00056d8eb,5944923.0


In [0]:
sub = pd.DataFrame(columns=['ID','target'])
sub['ID']=test.ID
sub['target']=(cb_predy*0.4)+(lgb_predy*0.3)+(xgb_predy*0.3)

In [17]:
sub.head()

Unnamed: 0,ID,target
0,000137c73,996156.0
1,00021489f,1302391.0
2,0004d7953,1354613.0
3,00056a333,3440653.0
4,00056d8eb,1270674.0


In [0]:
sub.to_csv('submission1.csv',index=False)

In [0]:
from google.colab import files
files.download('submission1.csv')
                                  