In [1]:
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from scipy.stats import skew
import time
import gc


In [2]:
# Make sure automatic garbage collection is switched on for the rest of the notebook.
gc.enable()

In [3]:
# Show which LightGBM release produced the results below.
print(lgb.__version__)

2.1.0


In [4]:
def get_data(train_path='data/train.csv', test_path='data/test.csv', nrows=None):
    """Load the training and test tables from CSV.

    Parameters
    ----------
    train_path : str, optional
        Location of the training CSV. Defaults to 'data/train.csv'.
    test_path : str, optional
        Location of the test CSV. Defaults to 'data/test.csv'.
    nrows : int or None, optional
        Maximum number of rows to read from each file; None reads everything.
        Handy for a quick smoke run on a subset.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The training frame followed by the test frame, exactly as parsed.
    """
    print('Reading data')
    data = pd.read_csv(train_path, nrows=nrows)
    test = pd.read_csv(test_path, nrows=nrows)
    print('Train shape ', data.shape, ' Test shape ', test.shape)
    return data, test


In [5]:
def get_selected_features():
    """Return a fresh list of the 32 hand-picked raw column names used as model inputs."""
    chosen = (
        'f190486d6', '58e2e02e6', 'eeb9cd3aa', '9fd594eec',
        '6eef030c1', '15ace8c9f', 'fb0f5dbfe', '58e056e12',
        '20aa07010', '024c577b9', 'd6bb78916', 'b43a7cfd5',
        '58232a6fb', '1702b5bf0', '324921c7b', '62e59a501',
        '2ec5b290f', '241f0f867', 'fb49e4212', '66ace2992',
        'f74e8f13d', '5c6487af1', '963a49cdc', '26fc93eb7',
        '1931ccfdd', '703885424', '70feb1494', '491b9ee45',
        '23310aa6f', 'e176a204a', '6619d81fc', '1db387535',
    )
    # Hand back a new list each call so callers may extend it freely.
    return list(chosen)


In [6]:
def add_statistics(train, test):
    """Add row-wise summary statistics computed over two groups of columns.

    Columns are split by how often they are exactly zero in ``train``:
    "low" holds columns that are zero in at least 70% of training rows,
    "high" holds the rest. Zeros are then replaced by NaN in both frames so
    the statistics only consider non-zero entries, and for each group ten
    per-row statistics are appended to both frames.

    Parameters
    ----------
    train : pandas.DataFrame
        Training features. Not modified; a zero-masked copy is returned.
    test : pandas.DataFrame
        Test features with the same columns. Not modified either.

    Returns
    -------
    train : pandas.DataFrame
        Zero-masked training frame with the statistic columns appended.
    test : pandas.DataFrame
        Zero-masked test frame with the same statistic columns appended.
    colnames : list
        Names of the columns of the returned ``train`` that belong to
        neither group, i.e. the appended statistic columns, in insertion order.
    """
    # Fraction of exact zeros per training column; the split is decided on
    # train only and then applied unchanged to test.
    zero_share = (train.values == 0).mean(axis=0)
    high_vol_columns = train.columns[zero_share < 0.70].values
    low_vol_columns = train.columns[zero_share >= 0.70].values

    # replace() returns new frames, so the caller's inputs stay untouched.
    train = train.replace({0: np.nan})
    test = test.replace({0: np.nan})

    cluster_sets = {"low": low_vol_columns, "high": high_vol_columns}
    for cluster_key, cluster_columns in cluster_sets.items():
        for df in (train, test):
            # Slice the (potentially very wide) group once and reuse it for
            # every statistic instead of re-slicing per statistic.
            group = df[cluster_columns]
            df["count_not0_" + cluster_key] = group.count(axis=1)
            df["sum_" + cluster_key] = group.sum(axis=1)
            df["var_" + cluster_key] = group.var(axis=1)
            df["median_" + cluster_key] = group.median(axis=1)
            df["mean_" + cluster_key] = group.mean(axis=1)
            df["std_" + cluster_key] = group.std(axis=1)
            df["max_" + cluster_key] = group.max(axis=1)
            df["min_" + cluster_key] = group.min(axis=1)
            df["skew_" + cluster_key] = group.skew(axis=1)
            df["kurtosis_" + cluster_key] = group.kurtosis(axis=1)

    # Read the leftover names straight off the column index rather than
    # copying the whole frame twice with drop() just to list its columns.
    grouped = set(high_vol_columns) | set(low_vol_columns)
    colnames = [name for name in train.columns if name not in grouped]
    return train, test, colnames

In [7]:
def fit_predict(data, y, test, colnames):
    """Cross-validate a LightGBM regressor and average its test predictions.

    The model is fitted on ``log1p`` of the target, so both returned arrays
    are on that log scale; callers must apply ``expm1`` to get back to the
    original units.

    Parameters
    ----------
    data : pandas.DataFrame
        Training features; must contain the selected raw features and
        every name in ``colnames``.
    y : pandas.DataFrame
        Frame with a 'target' column aligned row-for-row with ``data``.
    test : pandas.DataFrame
        Test features with the same feature columns as ``data``.
    colnames : list
        Extra (engineered) column names to train on in addition to the
        list returned by ``get_selected_features()``.

    Returns
    -------
    oof_preds : numpy.ndarray
        Out-of-fold predictions for every training row (log1p scale).
    sub_preds : numpy.ndarray
        Test predictions averaged over the folds (log1p scale).
    """
    # Get the features we're going to train on
    features = get_selected_features() + colnames
    # Create folds
    folds = KFold(n_splits=8, shuffle=True, random_state=1)
    # Loop-invariant inputs: build them once instead of once per fold
    log_target = np.log1p(y['target'])
    test_features = test[features]
    # Convert to lightgbm Dataset
    dtrain = lgb.Dataset(data=data[features], label=log_target, free_raw_data=False)
    # Construct dataset so that we can use slice()
    dtrain.construct()
    # Init predictions
    sub_preds = np.zeros(test.shape[0])
    oof_preds = np.zeros(data.shape[0])
    # Lightgbm parameters
    # Optimized version scores 0.40
    # NOTE(review): the row below is kept from a tuning log; its values do not
    # match the parameters actually set in lgb_params.
    # Step |   Time |      Score |      Stdev |   p1_leaf |   p2_subsamp |   p3_colsamp |   p4_gain |   p5_alph |   p6_lamb |   p7_weight |
    #   41 | 00m04s |   -1.36098 |    0.02917 |    9.2508 |       0.7554 |       0.7995 |   -3.3108 |   -0.1635 |   -0.9460 |      0.6485 |
    # NOTE(review): 'subsample' is set without a bagging frequency
    # ('subsample_freq' / 'bagging_freq'); LightGBM's parameter docs say row
    # bagging needs a non-zero frequency to take effect - confirm the intent.
    # NOTE(review): early stopping tracks 'l1' while the printed scores are
    # RMSE on the log target - confirm this mismatch is deliberate.
    lgb_params = {
        'objective': 'regression',
        'num_leaves': 60,
        'subsample': 0.6143,
        'colsample_bytree': 0.6453,
        'min_split_gain': np.power(10, -2.5988),
        'reg_alpha': np.power(10, -2.2887),
        'reg_lambda': np.power(10, 1.7570),
        'min_child_weight': np.power(10, -0.1477),
        'verbose': -1,
        'seed': 11,
        'boosting_type': 'gbdt',
        'max_depth': -1,
        'learning_rate': 0.03,
        'metric': 'l1',
    }
    # Run KFold
    for trn_idx, val_idx in folds.split(data):
        # Train lightgbm
        clf = lgb.train(
            params=lgb_params,
            train_set=dtrain.subset(trn_idx),
            valid_sets=dtrain.subset(val_idx),
            num_boost_round=10000,
            early_stopping_rounds=100,
            verbose_eval=50
        )
        # Predict Out Of Fold and Test targets
        # Using lgb.train, predict will automatically select the best round for prediction
        oof_preds[val_idx] = clf.predict(dtrain.data.iloc[val_idx])
        sub_preds += clf.predict(test_features) / folds.n_splits
        # Display current fold score
        print(mean_squared_error(log_target.iloc[val_idx],
                                 oof_preds[val_idx]) ** .5)
    # Display Full OOF score (square root of a sum is not the sum of square roots)
    print('Full Out-Of-Fold score : %9.6f'
          % (mean_squared_error(log_target, oof_preds) ** .5))

    return oof_preds, sub_preds


In [8]:
def main():
    """Run the whole pipeline: load, engineer features, train, write the submission file."""
    train_df, test_df = get_data()

    # Set the identifiers and the target aside before they are stripped
    # from the feature frames.
    y = train_df[['ID', 'target']].copy()
    sub = test_df[['ID']].copy()
    for frame, unwanted in ((train_df, ('target', 'ID')), (test_df, ('ID',))):
        for column in unwanted:
            del frame[column]

    # Reclaim whatever the column removal released
    gc.collect()

    # Engineered row statistics; stat_columns lists the newly added names
    train_df, test_df, stat_columns = add_statistics(train_df, test_df)

    # Out-of-fold predictions are returned as well but are not persisted here
    _, test_log_preds = fit_predict(train_df, y, test_df, stat_columns)

    # The model works on log1p(target), so undo that before writing
    sub['target'] = np.expm1(test_log_preds)
    sub[['ID', 'target']].to_csv('leak.csv', index=False)


In [9]:
# Entry point: run the full pipeline when this cell/script is executed directly.
if __name__ == '__main__':
    main()

Reading data
Train shape  (4459, 4993)  Test shape  (49342, 4992)
Training until validation scores don't improve for 100 rounds.
[50]	valid_0's l1: 1.11016
[100]	valid_0's l1: 1.04463
[150]	valid_0's l1: 1.03243
[200]	valid_0's l1: 1.03307
[250]	valid_0's l1: 1.03574
Early stopping, best iteration is:
[170]	valid_0's l1: 1.03088
1.3186253236388124
Training until validation scores don't improve for 100 rounds.
[50]	valid_0's l1: 1.13962
[100]	valid_0's l1: 1.10367
[150]	valid_0's l1: 1.09976
[200]	valid_0's l1: 1.09651
[250]	valid_0's l1: 1.10168
[300]	valid_0's l1: 1.10297
Early stopping, best iteration is:
[202]	valid_0's l1: 1.09628
1.4166634974687327
Training until validation scores don't improve for 100 rounds.
[50]	valid_0's l1: 1.11337
[100]	valid_0's l1: 1.05778
[150]	valid_0's l1: 1.04528
[200]	valid_0's l1: 1.04347
[250]	valid_0's l1: 1.04567
Early stopping, best iteration is:
[183]	valid_0's l1: 1.04188
1.3273981554583312
Training until validation scores don't improve for 100