### Load data

In [1]:
# %load "../scripts/load_porto.py"
import pandas as pd
import numpy as np

#Import data
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

#Create train and test variables
train_id = train['id']
train_target = train['target']
train = train.drop(['id','target'],axis=1)

test_id = test['id']
test = test.drop('id',axis=1)

### Columns suggest to drop

In [2]:
col_drop = train.columns[train.columns.str.startswith('ps_calc')]

train = train.drop(col_drop,axis=1)
test = test.drop(col_drop,axis=1)

### Deal with missing data

In [3]:
# %load "../scripts/handle_missing_data.py"
# Replace -1 values (missing values) with nan
train = train.replace(to_replace=-1,value=np.nan)
test = test.replace(to_replace=-1,value=np.nan)

# Missing data 1: drop columns with 20%+ missing data
incomplete_columns = list(train.isnull().sum()[train.isnull().sum()>len(train)*.2].index)

train = train.drop(incomplete_columns,axis=1)
test = test.drop(incomplete_columns, axis=1)

# Missing data 2: fill binary and categorical columns with mode
fill_with_mode = [col for col in train.columns if col.endswith('cat')] +[col for col in train.columns if col.endswith('bin')]
train[fill_with_mode].fillna(train[fill_with_mode].mode())
test[fill_with_mode].fillna(test[fill_with_mode].mode())

# Missing data 3: impute median values to remaining columns
train = train.fillna(train.median())
test = test.fillna(test.median())

### Scaling data

In [4]:
"""from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()   #Initalize scaler estimator

scaler.fit(train) #Remember to only fit scaler to training data

train = pd.DataFrame(scaler.transform(train),columns=train.columns)
test = pd.DataFrame(scaler.transform(test),columns=test.columns)"""

'from sklearn.preprocessing import StandardScaler\n\nscaler = StandardScaler()   #Initalize scaler estimator\n\nscaler.fit(train) #Remember to only fit scaler to training data\n\ntrain = pd.DataFrame(scaler.transform(train),columns=train.columns)\ntest = pd.DataFrame(scaler.transform(test),columns=test.columns)'

### Convert categorical features to binary values

In [5]:
# %load "../scripts/convert_categorical_features_to_binary.py"
cat_feat = [col for col in train.columns if col.endswith('cat')]

for df in [train,test]:
    for column in cat_feat:
        dummies = pd.get_dummies(df[column],drop_first=True)
        df = pd.concat([df,dummies],axis=1)
        df = df.drop([column],axis=1)

### Gini calculators

In [6]:
# %load "../scripts/gini_calculator.py"
#From: https://www.kaggle.com/batzner/gini-coefficient-an-intuitive-explanation

def gini(actual, pred):
    assert (len(actual) == len(pred))
    all = np.asarray(np.c_[actual, pred, np.arange(len(actual))], dtype=np.float)
    all = all[np.lexsort((all[:, 2], -1 * all[:, 1]))]
    totalLosses = all[:, 0].sum()
    giniSum = all[:, 0].cumsum().sum() / totalLosses

    giniSum -= (len(actual) + 1) / 2.
    return giniSum / len(actual)


def gini_normalized(actual, pred):
    return gini(actual, pred) / gini(actual, actual)

In [7]:
#LightGBM
#From: https://www.kaggle.com/the1owl/forza-baseline/code
def gini_lgb(preds, dtrain):
    y = list(dtrain.get_label())
    score = gini(y, preds) / gini(y, y)
    return 'gini', score, True

### Light GBM Classifier

In [9]:
from sklearn.model_selection import RandomizedSearchCV
from lightgbm import LGBMClassifier

In [None]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(train.values,train_target.values, test_size=0.3, random_state=42)

train_set = lgb.Dataset(X_train, label=y_train)
valid_set = lgb.Dataset(X_valid, label=y_valid)

### Randomized Search

In [10]:
lgb_class = LGBMClassifier()

In [61]:
lgb_params = {}
lgb_params['learning_rate']=[.01,.02,.05,.10]
lgb_params['n_estimators']=[20]
lgb_params['num_leaves']=[10,20,25,31,40]
lgb_params['max_depth']=[-1,3,4,5,6,10,30,50]
lgb_params['objective']='binary'
#lgb_params['boosting_type']=['gbdt','dart','goss','rf']

lgb_params

{'learning_rate': [0.01, 0.02, 0.05, 0.1],
 'max_depth': [-1, 3, 4, 5, 6, 10, 30, 50],
 'n_estimators': [20],
 'num_leaves': [10, 20, 25, 31, 40],
 'objective': 'binary'}

In [62]:
rand = RandomizedSearchCV(lgb_class,lgb_params,n_iter=30,verbose=3)

rand.fit(train,train_target)

Fitting 3 folds for each of 30 candidates, totalling 90 fits
[CV] objective=b, num_leaves=31, n_estimators=20, max_depth=3, learning_rate=0.02 
[CV]  objective=b, num_leaves=31, n_estimators=20, max_depth=3, learning_rate=0.02, score=0.9635493057130616, total=   1.2s
[CV] objective=b, num_leaves=31, n_estimators=20, max_depth=3, learning_rate=0.02 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.8s remaining:    0.0s


[CV]  objective=b, num_leaves=31, n_estimators=20, max_depth=3, learning_rate=0.02, score=0.9635541622144714, total=   1.3s
[CV] objective=b, num_leaves=31, n_estimators=20, max_depth=3, learning_rate=0.02 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.8s remaining:    0.0s


[CV]  objective=b, num_leaves=31, n_estimators=20, max_depth=3, learning_rate=0.02, score=0.96355397851847, total=   1.3s
[CV] objective=i, num_leaves=20, n_estimators=20, max_depth=30, learning_rate=0.02 
[CV]  objective=i, num_leaves=20, n_estimators=20, max_depth=30, learning_rate=0.02, score=0.9635493057130616, total=   1.5s
[CV] objective=i, num_leaves=20, n_estimators=20, max_depth=30, learning_rate=0.02 
[CV]  objective=i, num_leaves=20, n_estimators=20, max_depth=30, learning_rate=0.02, score=0.9635541622144714, total=   1.6s
[CV] objective=i, num_leaves=20, n_estimators=20, max_depth=30, learning_rate=0.02 
[CV]  objective=i, num_leaves=20, n_estimators=20, max_depth=30, learning_rate=0.02, score=0.96355397851847, total=   1.5s
[CV] objective=n, num_leaves=20, n_estimators=20, max_depth=50, learning_rate=0.1 
[CV]  objective=n, num_leaves=20, n_estimators=20, max_depth=50, learning_rate=0.1, score=0.9635493057130616, total=   1.6s
[CV] objective=n, num_leaves=20, n_estimators=

[CV]  objective=y, num_leaves=20, n_estimators=20, max_depth=10, learning_rate=0.1, score=0.9635493057130616, total=   1.6s
[CV] objective=y, num_leaves=20, n_estimators=20, max_depth=10, learning_rate=0.1 
[CV]  objective=y, num_leaves=20, n_estimators=20, max_depth=10, learning_rate=0.1, score=0.9635541622144714, total=   1.6s
[CV] objective=y, num_leaves=20, n_estimators=20, max_depth=10, learning_rate=0.1 
[CV]  objective=y, num_leaves=20, n_estimators=20, max_depth=10, learning_rate=0.1, score=0.96355397851847, total=   1.6s
[CV] objective=r, num_leaves=10, n_estimators=20, max_depth=4, learning_rate=0.05 
[CV]  objective=r, num_leaves=10, n_estimators=20, max_depth=4, learning_rate=0.05, score=0.9635493057130616, total=   1.4s
[CV] objective=r, num_leaves=10, n_estimators=20, max_depth=4, learning_rate=0.05 
[CV]  objective=r, num_leaves=10, n_estimators=20, max_depth=4, learning_rate=0.05, score=0.9635541622144714, total=   1.4s
[CV] objective=r, num_leaves=10, n_estimators=20, 

[CV]  objective=i, num_leaves=25, n_estimators=20, max_depth=10, learning_rate=0.02, score=0.9635541622144714, total=   1.6s
[CV] objective=i, num_leaves=25, n_estimators=20, max_depth=10, learning_rate=0.02 
[CV]  objective=i, num_leaves=25, n_estimators=20, max_depth=10, learning_rate=0.02, score=0.96355397851847, total=   1.6s
[CV] objective=a, num_leaves=25, n_estimators=20, max_depth=5, learning_rate=0.05 
[CV]  objective=a, num_leaves=25, n_estimators=20, max_depth=5, learning_rate=0.05, score=0.9635493057130616, total=   1.5s
[CV] objective=a, num_leaves=25, n_estimators=20, max_depth=5, learning_rate=0.05 
[CV]  objective=a, num_leaves=25, n_estimators=20, max_depth=5, learning_rate=0.05, score=0.9635541622144714, total=   1.4s
[CV] objective=a, num_leaves=25, n_estimators=20, max_depth=5, learning_rate=0.05 
[CV]  objective=a, num_leaves=25, n_estimators=20, max_depth=5, learning_rate=0.05, score=0.96355397851847, total=   1.5s
[CV] objective=r, num_leaves=10, n_estimators=20,

[Parallel(n_jobs=1)]: Done  90 out of  90 | elapsed:  3.5min finished


RandomizedSearchCV(cv=None, error_score='raise',
          estimator=LGBMClassifier(boosting_type='gbdt', colsample_bytree=1.0, learning_rate=0.1,
        max_bin=255, max_depth=-1, min_child_samples=10,
        min_child_weight=5, min_split_gain=0.0, n_estimators=10, n_jobs=-1,
        num_leaves=31, objective=None, random_state=0, reg_alpha=0.0,
        reg_lambda=0.0, silent=True, subsample=1.0,
        subsample_for_bin=50000, subsample_freq=1),
          fit_params=None, iid=True, n_iter=30, n_jobs=1,
          param_distributions={'learning_rate': [0.01, 0.02, 0.05, 0.1], 'n_estimators': [20], 'num_leaves': [10, 20, 25, 31, 40], 'max_depth': [-1, 3, 4, 5, 6, 10, 30, 50], 'objective': 'binary'},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=3)

In [69]:
rand.best_estimator_

LGBMClassifier(boosting_type='gbdt', colsample_bytree=1.0, learning_rate=0.02,
        max_bin=255, max_depth=3, min_child_samples=10, min_child_weight=5,
        min_split_gain=0.0, n_estimators=20, n_jobs=-1, num_leaves=31,
        objective='b', random_state=0, reg_alpha=0.0, reg_lambda=0.0,
        silent=True, subsample=1.0, subsample_for_bin=50000,
        subsample_freq=1)

In [87]:
rand.best_params_

{'learning_rate': 0.02,
 'max_depth': 3,
 'n_estimators': 20,
 'num_leaves': 31,
 'objective': 'b'}

In [90]:
model_params = rand.best_params_
model_params['n_estimators']=100
model_params['max_bin']=255
model_params['objective']='binary'

In [28]:
"""model_params = {'learning_rate': 0.02,
 'max_bin': 10,
 'min_child_samples': 200,
 'n_estimators': 100,
 'objective': 'binary'}
 """

### Training and Predictions

In [91]:
lgb_model = lgb.train(params=model_params, train_set=train_set, num_boost_round=500, valid_sets=valid_set,verbose_eval=50, feval=gini_lgb, early_stopping_rounds=200)

Training until validation scores don't improve for 200 rounds.
[50]	valid_0's gini: 0.219986
[100]	valid_0's gini: 0.227547
[150]	valid_0's gini: 0.239065
[200]	valid_0's gini: 0.247652
[250]	valid_0's gini: 0.255276
[300]	valid_0's gini: 0.260702
[350]	valid_0's gini: 0.264862
[400]	valid_0's gini: 0.267693
[450]	valid_0's gini: 0.269861
[500]	valid_0's gini: 0.271891


In [57]:
lgb_pred = lgb_model.predict(test,num_iteration=lgb_model.best_iteration)

### Ensembling LGB Models

Train 3 models on different training set and then take average prediction for each value in the test set.

In [92]:
model_params

{'learning_rate': 0.02,
 'max_depth': 3,
 'n_estimators': 100,
 'num_leaves': 31,
 'objective': 'binary',
 'verbose': 1}

In [93]:
model_params2 = {'learning_rate': 0.02,
 'max_depth': 6,
 'n_estimators': 100,
 'num_leaves': 25,
 'objective': 'binary',
 'verbose': 1}

In [94]:
model_param3={'learning_rate': 0.02,
 'max_depth': 4,
 'n_estimators': 100,
 'num_leaves': 35,
 'objective': 'binary',
 'verbose': 1}

In [95]:
param_list = [model_params,model_params2,model_param3]

In [98]:
train_sets=[]
valid_sets=[]

model_list=[]

for i in range(3):
    X_train, X_valid, y_train, y_valid = train_test_split(train.values,train_target.values, test_size=0.3,random_state=i)

    train_sets.append(lgb.Dataset(X_train, label=y_train))
    valid_sets.append(lgb.Dataset(X_valid, label=y_valid))
    
    new_model = lgb.train(params=param_list[i], train_set=train_sets[i], num_boost_round=500, valid_sets=valid_sets[i],verbose_eval=50, feval=gini_lgb, early_stopping_rounds=200)
    model_list.append(new_model)

Training until validation scores don't improve for 200 rounds.
[50]	valid_0's gini: 0.210733
[100]	valid_0's gini: 0.219012
[150]	valid_0's gini: 0.234409
[200]	valid_0's gini: 0.245806
[250]	valid_0's gini: 0.255507
[300]	valid_0's gini: 0.262545
[350]	valid_0's gini: 0.266946
[400]	valid_0's gini: 0.270225
[450]	valid_0's gini: 0.273031
[500]	valid_0's gini: 0.275221
Training until validation scores don't improve for 200 rounds.
[50]	valid_0's gini: 0.243049
[100]	valid_0's gini: 0.248832
[150]	valid_0's gini: 0.259102
[200]	valid_0's gini: 0.266101
[250]	valid_0's gini: 0.271082
[300]	valid_0's gini: 0.276011
[350]	valid_0's gini: 0.279434
[400]	valid_0's gini: 0.281775
[450]	valid_0's gini: 0.282562
[500]	valid_0's gini: 0.282467
Training until validation scores don't improve for 200 rounds.
[50]	valid_0's gini: 0.216542
[100]	valid_0's gini: 0.22265
[150]	valid_0's gini: 0.236664
[200]	valid_0's gini: 0.246875
[250]	valid_0's gini: 0.254269
[300]	valid_0's gini: 0.258761
[350]	val

In [99]:
preds=[]

for model in model_list:
    preds.append(model.predict(test))
    
preds_mean=np.mean(preds,axis=0)

## 7. Create output csv

In [100]:
sub = pd.DataFrame({'id':test_id,'target':preds_mean})

sub.to_csv('../predictions/LGBM Mean of 3 Models with diff training sets.csv',index=False)