In [3]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [5]:
import lightgbm as lgbm
from scipy import sparse as ssp
from sklearn.model_selection import StratifiedKFold
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [6]:
def Gini(y_true, y_pred):
    # check and get number of samples
    assert y_true.shape == y_pred.shape
    n_samples = y_true.shape[0]

    # sort rows on prediction column
    # (from largest to smallest)
    arr = np.array([y_true, y_pred]).transpose()
    true_order = arr[arr[:, 0].argsort()][::-1, 0]
    pred_order = arr[arr[:, 1].argsort()][::-1, 0]

    # get Lorenz curves
    L_true = np.cumsum(true_order) * 1. / np.sum(true_order)
    L_pred = np.cumsum(pred_order) * 1. / np.sum(pred_order)
    L_ones = np.linspace(1 / n_samples, 1, n_samples)

    # get Gini coefficients (area between curves)
    G_true = np.sum(L_ones - L_true)
    G_pred = np.sum(L_ones - L_pred)

    # normalize to true Gini coefficient
    return G_pred * 1. / G_true

In [7]:
cv_only = True
save_cv = True
full_train = False

In [8]:
def evalerror(preds, dtrain):
    labels = dtrain.get_label()
    return 'gini', Gini(labels, preds), True

In [9]:
path = "../data/"

train = pd.read_csv(path+'train.csv')
train_label = train['target']
train_id = train['id']
test = pd.read_csv(path+'test.csv')
test_id = test['id']

In [12]:
NFOLDS = 5
kfold = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=218)

In [14]:
y = train['target'].values
drop_feature = [
    'id',
    'target'
]
X = train.drop(drop_feature,axis=1)

In [15]:
feature_names = X.columns.tolist()

In [16]:
feature_names

['ps_ind_01',
 'ps_ind_02_cat',
 'ps_ind_03',
 'ps_ind_04_cat',
 'ps_ind_05_cat',
 'ps_ind_06_bin',
 'ps_ind_07_bin',
 'ps_ind_08_bin',
 'ps_ind_09_bin',
 'ps_ind_10_bin',
 'ps_ind_11_bin',
 'ps_ind_12_bin',
 'ps_ind_13_bin',
 'ps_ind_14',
 'ps_ind_15',
 'ps_ind_16_bin',
 'ps_ind_17_bin',
 'ps_ind_18_bin',
 'ps_reg_01',
 'ps_reg_02',
 'ps_reg_03',
 'ps_car_01_cat',
 'ps_car_02_cat',
 'ps_car_03_cat',
 'ps_car_04_cat',
 'ps_car_05_cat',
 'ps_car_06_cat',
 'ps_car_07_cat',
 'ps_car_08_cat',
 'ps_car_09_cat',
 'ps_car_10_cat',
 'ps_car_11_cat',
 'ps_car_11',
 'ps_car_12',
 'ps_car_13',
 'ps_car_14',
 'ps_car_15',
 'ps_calc_01',
 'ps_calc_02',
 'ps_calc_03',
 'ps_calc_04',
 'ps_calc_05',
 'ps_calc_06',
 'ps_calc_07',
 'ps_calc_08',
 'ps_calc_09',
 'ps_calc_10',
 'ps_calc_11',
 'ps_calc_12',
 'ps_calc_13',
 'ps_calc_14',
 'ps_calc_15_bin',
 'ps_calc_16_bin',
 'ps_calc_17_bin',
 'ps_calc_18_bin',
 'ps_calc_19_bin',
 'ps_calc_20_bin']

In [18]:
cat_features = [c for c in feature_names if ('cat' in c and 'count' not in c)]
num_features = [c for c in feature_names if ('cat' not in c and 'calc' not in c)]

In [19]:
train['missing'] = (train==-1).sum(axis=1).astype(float)
test['missing'] = (test==-1).sum(axis=1).astype(float)
num_features.append('missing')

In [20]:
for c in cat_features:
    le = LabelEncoder()
    le.fit(train[c])
    train[c] = le.transform(train[c])
    test[c] = le.transform(test[c])

In [26]:
enc = OneHotEncoder()
enc.fit(train[cat_features])

OneHotEncoder(categorical_features='all', dtype=<class 'numpy.float64'>,
       handle_unknown='error', n_values='auto', sparse=True)

In [27]:
X_cat = enc.transform(train[cat_features])
X_t_cat = enc.transform(test[cat_features])

In [28]:
ind_features = [c for c in feature_names if 'ind' in c]

In [32]:
count=0
for c in ind_features:
    if count==0:
        train['new_ind'] = train[c].astype(str)+'_'
        test['new_ind'] = test[c].astype(str)+'_'
        count+=1
    else:
        train['new_ind'] += train[c].astype(str)+'_'
        test['new_ind'] += test[c].astype(str)+'_'

In [33]:
train['new_ind'].head()

0    2_2_5_2_1_0_1_0_0_0_0_0_0_0_11_0_1_0_
1     1_1_7_1_1_0_0_1_0_0_0_0_0_0_3_0_0_1_
2    5_4_9_2_1_0_0_1_0_0_0_0_0_0_12_1_0_0_
3     0_1_2_1_1_1_0_0_0_0_0_0_0_0_8_1_0_0_
4     0_2_0_2_1_1_0_0_0_0_0_0_0_0_9_1_0_0_
Name: new_ind, dtype: object

In [37]:
cat_count_features = []
for c in cat_features+['new_ind']:
    d = pd.concat([train[c],test[c]]).value_counts().to_dict()
    train['%s_count'%c] = train[c].apply(lambda x:d.get(x,0))
    test['%s_count'%c] = test[c].apply(lambda x:d.get(x,0))
    cat_count_features.append('%s_count'%c)

In [39]:
train.head()

Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,...,ps_car_03_cat_count,ps_car_04_cat_count,ps_car_05_cat_count,ps_car_06_cat_count,ps_car_07_cat_count,ps_car_08_cat_count,ps_car_09_cat_count,ps_car_10_cat_count,ps_car_11_cat_count,new_ind_count
0,7,0,2,2,5,2,1,0,1,0,...,1028142,1241334,431560,77845,1383070,249663,486510,1475460,18326,6
1,9,0,1,1,7,1,1,0,0,1,...,1028142,1241334,666910,329890,1383070,1238365,883326,1475460,12535,36
2,13,0,5,4,9,2,1,0,0,1,...,1028142,1241334,666910,147714,1383070,1238365,883326,1475460,19943,24
3,16,0,0,1,2,1,1,1,0,0,...,183044,1241334,431560,329890,1383070,1238365,36798,1475460,212989,2784
4,17,0,0,2,0,2,1,1,0,0,...,1028142,1241334,666910,147714,1383070,1238365,883326,1475460,26161,258


In [40]:
train_list = [train[num_features+cat_count_features].values,X_cat,]
test_list = [test[num_features+cat_count_features].values,X_t_cat,]

In [41]:
X = ssp.hstack(train_list).tocsr()
X_test = ssp.hstack(test_list).tocsr()

In [43]:
learning_rate = 0.1
num_leaves = 15
min_data_in_leaf = 2000
feature_fraction = 0.6
num_boost_round = 10000
params = {"objective": "binary",
          "boosting_type": "gbdt",
          "learning_rate": learning_rate,
          "num_leaves": num_leaves,
           "max_bin": 256,
          "feature_fraction": feature_fraction,
          "verbosity": 0,
          "drop_rate": 0.1,
          "is_unbalance": False,
          "max_drop": 50,
          "min_child_samples": 10,
          "min_child_weight": 150,
          "min_split_gain": 0,
          "subsample": 0.9
          }


In [44]:
x_score = []
final_cv_train = np.zeros(len(train_label))
final_cv_pred = np.zeros(len(test_id))

In [50]:
for s in range(16):
    
    cv_train = np.zeros(len(train_label))
    cv_pred = np.zeros(len(test_id))
    
    params['seed'] = s

    if cv_only:
        kf = kfold.split(X, train_label)
        best_trees = []
        fold_scores = []

        for i, (train_fold, validate) in enumerate(kf):
                X_train, X_validate, label_train, label_validate = \
                    X[train_fold, :], X[validate, :], train_label[train_fold], train_label[validate]
                dtrain = lgbm.Dataset(X_train, label_train)
                dvalid = lgbm.Dataset(X_validate, label_validate, reference=dtrain)
                bst = lgbm.train(params, dtrain, num_boost_round, valid_sets=dvalid, feval=evalerror, verbose_eval=100,
                                early_stopping_rounds=100)
                best_trees.append(bst.best_iteration)
                cv_pred += bst.predict(X_test, num_iteration=bst.best_iteration)
                cv_train[validate] += bst.predict(X_validate)

                score = Gini(label_validate, cv_train[validate])
                print(score)
                fold_scores.append(score)
            
        cv_pred /= NFOLDS
        final_cv_train += cv_train
        final_cv_pred += cv_pred
        print("cv score:")
        print(Gini(train_label, cv_train))
        print("current score:", Gini(train_label, final_cv_train / (s + 1.)), s+1)
        print(fold_scores)
        print(best_trees, np.mean(best_trees))
    

Training until validation scores don't improve for 100 rounds.
[100]	valid_0's gini: 0.290721
[200]	valid_0's gini: 0.294838
[300]	valid_0's gini: 0.294927
Early stopping, best iteration is:
[207]	valid_0's gini: 0.295615
0.295614539033
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's gini: 0.273505
[200]	valid_0's gini: 0.275315
[300]	valid_0's gini: 0.274964
Early stopping, best iteration is:
[233]	valid_0's gini: 0.276035
0.276034560822
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's gini: 0.278521
[200]	valid_0's gini: 0.283447
Early stopping, best iteration is:
[193]	valid_0's gini: 0.283716
0.283716182203
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's gini: 0.278669
[200]	valid_0's gini: 0.283511
[300]	valid_0's gini: 0.283751
Early stopping, best iteration is:
[254]	valid_0's gini: 0.285254
0.285253573616
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's gini

[300]	valid_0's gini: 0.275037
Early stopping, best iteration is:
[261]	valid_0's gini: 0.275521
0.275521396942
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's gini: 0.278297
[200]	valid_0's gini: 0.282414
[300]	valid_0's gini: 0.282122
Early stopping, best iteration is:
[204]	valid_0's gini: 0.282807
0.282807129754
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's gini: 0.279938
[200]	valid_0's gini: 0.284817
[300]	valid_0's gini: 0.28332
Early stopping, best iteration is:
[200]	valid_0's gini: 0.284817
0.284816821339
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's gini: 0.292336
[200]	valid_0's gini: 0.295123
[300]	valid_0's gini: 0.294082
Early stopping, best iteration is:
[202]	valid_0's gini: 0.29518
0.295180110895
cv score:
0.286959850648
current score: 0.289308238369 7
[0.29714082595060914, 0.27552139694185973, 0.2828071297539137, 0.28481682133886471, 0.29518011089533547]
[172, 261, 204, 2

[100]	valid_0's gini: 0.277087
[200]	valid_0's gini: 0.280784
[300]	valid_0's gini: 0.280142
Early stopping, best iteration is:
[235]	valid_0's gini: 0.281374
0.281374453322
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's gini: 0.28085
[200]	valid_0's gini: 0.286285
[300]	valid_0's gini: 0.287108
[400]	valid_0's gini: 0.284026
Early stopping, best iteration is:
[314]	valid_0's gini: 0.287347
0.287347404662
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's gini: 0.292452
[200]	valid_0's gini: 0.294391
Early stopping, best iteration is:
[165]	valid_0's gini: 0.295216
0.295215952252
cv score:
0.286943418591
current score: 0.289709357829 13
[0.29750475956776706, 0.27388011720844502, 0.2813744533219984, 0.28734740466242986, 0.29521595225180486]
[259, 205, 235, 314, 165] 235.6
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's gini: 0.291129
[200]	valid_0's gini: 0.296695
[300]	valid_0's gini: 0.295786
Ea

In [51]:
print(x_score)
pd.DataFrame({'id': test_id, 'target': final_cv_pred / 16.}).to_csv('../model/lgbm3_pred_avg.csv', index=False)
pd.DataFrame({'id': train_id, 'target': final_cv_train / 16.}).to_csv('../model/lgbm3_cv_avg.csv', index=False)

[]


FileNotFoundError: [Errno 2] No such file or directory: '../model/lgbm3_pred_avg.csv'

In [52]:
def interaction_features(train, test, fea1, fea2, prefix):
    train['inter_{}*'.format(prefix)] = train[fea1] * train[fea2]
    train['inter_{}/'.format(prefix)] = train[fea1] / train[fea2]

    test['inter_{}*'.format(prefix)] = test[fea1] * test[fea2]
    test['inter_{}/'.format(prefix)] = test[fea1] / test[fea2]

    return train, test