In [15]:
import pandas as pd
import numpy  as np
import gc
import os
from tqdm import tqdm
import lightgbm as lgb
from sklearn.model_selection import ParameterGrid

In [16]:
def get_data():
    """Load the train and test frames from their feather files.

    Reads ``../input/train.ftr`` and ``../input/test.ftr``, prints both
    shapes and hands the two frames back.

    Returns:
        tuple: ``(train_frame, test_frame)`` as produced by ``pd.read_feather``.
    """
    print('Reading data')
    train_frame, test_frame = [
        pd.read_feather(path)
        for path in ('../input/train.ftr', '../input/test.ftr')
    ]
    print('Train shape ', train_frame.shape, ' Test shape ', test_frame.shape)
    return train_frame, test_frame

In [17]:
# Load the train and test frames (get_data() also prints their shapes)
data, test = get_data()

Reading data
Train shape  (4459, 4993)  Test shape  (49342, 4992)


# Add the leak test rows to the training data

In [18]:
# Rows read from leak_test.ftr; only those with a non-zero `compiled_leak`
# are used as extra training rows.
leak_test = pd.read_feather('../input/leak_test.ftr')

# .copy() makes the filtered result an independent frame, so the column
# assignment below does not trigger SettingWithCopyWarning.
leak_test = leak_test[leak_test.compiled_leak != 0].copy()

# The compiled leak value plays the role of the target; then align the
# columns (names and order) with the training frame.
leak_test['target'] = leak_test['compiled_leak']
leak_test = leak_test[data.columns]

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat with ignore_index=True is the equivalent call.
# NOTE: this cell is not idempotent - re-running it appends the rows again.
data = pd.concat([data, leak_test], ignore_index=True)

In [19]:
# Move the identifier/target columns out of the feature frames.
# pop() returns the column and removes it from the frame in one step.
y = pd.concat([data.pop('ID'), data.pop('target')], axis=1)
sub = test.pop('ID').to_frame()
# Release the memory held by the removed columns
gc.collect()

35

In [21]:
# Apply log(1 + x) element-wise to every feature of both frames
data = data.pipe(np.log1p)
test = test.pipe(np.log1p)

In [22]:
# Files in the column-set directory, sorted because os.listdir returns
# entries in arbitrary order.
colset_filename = sorted(os.listdir('../input/continue_col/'))
# Keep the files whose stem has '40' as its second '_'-separated token.
# The [1:2] slice never raises, so names without an underscore
# (e.g. '.ipynb_checkpoints') are skipped instead of causing an IndexError.
colset_filename_40 = [
    fn for fn in colset_filename
    if fn.split('.')[0].split('_')[1:2] == ['40']
]

In [23]:
# Number of column-set files carrying the '40' tag
len(colset_filename_40)

90

In [24]:
# Files in the column-set directory, sorted because os.listdir returns
# entries in arbitrary order.
colset_filename = sorted(os.listdir('../input/continue_col/'))
# Keep the files whose stem has '40' as its second '_'-separated token.
# The [1:2] slice never raises, so names without an underscore
# (e.g. '.ipynb_checkpoints') are skipped instead of causing an IndexError.
colset_filename_40 = [
    fn for fn in colset_filename
    if fn.split('.')[0].split('_')[1:2] == ['40']
]
# One list of column names per selected file, in file-name order
col_set = [list(np.load('../input/continue_col/' + c)) for c in colset_filename_40]

In [25]:
def get_selected_features():
    """Return the fixed list of 40 selected column names.

    The names always come back in the same order, and a fresh list is
    built on every call.
    """
    selected = (
        "f190486d6", "c47340d97", "eeb9cd3aa", "66ace2992",
        "e176a204a", "491b9ee45", "1db387535", "c5a231d81",
        "0572565c2", "024c577b9", "15ace8c9f", "23310aa6f",
        "9fd594eec", "58e2e02e6", "91f701ba2", "adb64ff71",
        "2ec5b290f", "703885424", "26fc93eb7", "6619d81fc",
        "0ff32eb98", "70feb1494", "58e056e12", "1931ccfdd",
        "1702b5bf0", "58232a6fb", "963a49cdc", "fc99f9426",
        "241f0f867", "5c6487af1", "62e59a501", "f74e8f13d",
        "fb49e4212", "190db8488", "324921c7b", "b43a7cfd5",
        "9306da53f", "d6bb78916", "fb0f5dbfe", "6eef030c1",
    )
    return list(selected)

def _row_statistics(df, columns, cluster_key):
    """Return the ten row-wise statistics of ``df[columns]``.

    Args:
        df: frame to aggregate (zeros are expected to be NaN already).
        columns: column names forming the cluster.
        cluster_key: suffix appended to every output column name.

    Returns:
        dict: ``"<stat>_<cluster_key>"`` -> Series indexed like ``df``, in the
        order count_not0, sum, median, mean, std, max, min, skew, kurtosis,
        nunique.
    """
    # Slice the cluster once; every statistic reuses the same sub-frame
    # instead of re-selecting (and re-copying) the columns ten times.
    subset = df[columns]
    stats = [
        ("count_not0", subset.count(axis=1)),
        ("sum", subset.sum(axis=1)),
        ("median", subset.median(axis=1)),
        ("mean", subset.mean(axis=1)),
        ("std", subset.std(axis=1)),
        ("max", subset.max(axis=1)),
        ("min", subset.min(axis=1)),
        ("skew", subset.skew(axis=1)),
        ("kurtosis", subset.kurtosis(axis=1)),
        ("nunique", subset.nunique(axis=1)),
    ]
    return {name + "_" + cluster_key: values for name, values in stats}


def _collect_statistics(train, test, cluster_sets, progress=None):
    """Build the statistics frames for ``train`` and ``test``.

    Args:
        train, test: frames to aggregate.
        cluster_sets: mapping of cluster key -> column names; iteration order
            fixes the output column order.
        progress: optional wrapper (e.g. ``tqdm``) applied to the cluster
            iteration.

    Returns:
        tuple: ``(train_stats, test_stats)`` with identical column names.
    """
    train_parts, test_parts = {}, {}
    keys = cluster_sets if progress is None else progress(cluster_sets)
    for cluster_key in keys:
        for df, parts in ((train, train_parts), (test, test_parts)):
            parts.update(_row_statistics(df, cluster_sets[cluster_key], cluster_key))
    # Assemble each frame with a single concat rather than inserting columns
    # one by one, which fragments the frame. Explicit keys keep the insertion
    # order on every pandas version.
    return tuple(
        pd.concat(list(parts.values()), axis=1, keys=list(parts.keys()))
        for parts in (train_parts, test_parts)
    )


def add_statistics(train, test):
    """Row-wise statistics over three column clusters.

    Clusters, in output order:
      * ``low``  - columns whose share of zeros in ``train`` is >= 0.70
      * ``high`` - columns whose share of zeros in ``train`` is < 0.70
      * ``gsf``  - the columns returned by ``get_selected_features()``

    Zeros are replaced by NaN before aggregating, so they are ignored by the
    statistics. The input frames themselves are not modified.

    Args:
        train: training feature frame; also defines the low/high split.
        test: test feature frame with the same columns.

    Returns:
        tuple: ``(tmp_train, tmp_test)``, each with ten statistic columns per
        cluster, named ``"<stat>_<cluster>"``.
    """
    zero_share = (train.values == 0).mean(axis=0)
    high_vol_columns = train.columns[zero_share < 0.70].values
    low_vol_columns = train.columns[zero_share >= 0.70].values
    # This is part of the trick I think, plus lightgbm has a special process for NaNs
    train = train.replace({0: np.nan})
    test = test.replace({0: np.nan})

    cluster_sets = {"low": low_vol_columns, "high": high_vol_columns, 'gsf': get_selected_features()}
    return _collect_statistics(train, test, cluster_sets, progress=tqdm)


def add_statistics2(train, test, col_dir='../input/continue_col/'):
    """Row-wise statistics over the column sets stored as files in ``col_dir``.

    Every file in ``col_dir`` whose stem has ``'40'`` as its second
    ``'_'``-separated token is loaded with ``np.load`` and treated as a list of
    column names. File ``i`` (in sorted file-name order) becomes cluster
    ``set_col_<i>``. A final cluster ``40columns_flatten`` is the concatenation
    of all loaded lists, with repeated names kept.

    Zeros are replaced by NaN before aggregating. The input frames themselves
    are not modified.

    Args:
        train: training feature frame.
        test: test feature frame with the same columns.
        col_dir: directory holding the column-set files.

    Returns:
        tuple: ``(tmp_train, tmp_test)``, each with ten statistic columns per
        cluster, named ``"<stat>_<cluster>"``.
    """
    # This is part of the trick I think, plus lightgbm has a special process for NaNs
    train = train.replace({0: np.nan})
    test = test.replace({0: np.nan})

    # sorted(): os.listdir order is arbitrary, and the index `i` ends up in the
    # feature names, so the numbering must not depend on it.
    # The [1:2] slice skips names without an underscore (e.g.
    # '.ipynb_checkpoints') instead of raising IndexError.
    colset_filename_40 = sorted(
        fn for fn in os.listdir(col_dir)
        if fn.split('.')[0].split('_')[1:2] == ['40']
    )
    # Each file is read once and reused for both the per-set and flattened clusters
    col_set = [list(np.load(os.path.join(col_dir, fn))) for fn in colset_filename_40]

    cluster_sets = {'set_col_' + str(i): cols for i, cols in enumerate(col_set)}
    cluster_sets['40columns_flatten'] = [col for cols in col_set for col in cols]
    return _collect_statistics(train, test, cluster_sets)

In [26]:
# Row-wise statistics over the low/high zero-share clusters and the selected features
tmp_data, tmp_test = add_statistics(data, test)

100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [01:23<00:00, 27.72s/it]


In [27]:
# Row-wise statistics per column set loaded from ../input/continue_col/, plus all sets concatenated
tmp_data2, tmp_test2 = add_statistics2(data, test)

In [28]:
# Attach both generated statistic frames to the raw features, column-wise.
# NOTE: re-running this cell adds the statistic columns a second time.
data = pd.concat((data, tmp_data, tmp_data2), axis='columns')
test = pd.concat((test, tmp_test, tmp_test2), axis='columns')

In [29]:
def fit_cv(data, y, test):
    """Cross-validate the LightGBM parameter grid and keep the best setting per seed.

    For every seed, each combination in the parameter grid is run through
    ``lgb.cv`` (4 folds, up to 10000 rounds, early stopping after 200 rounds).
    A combination is scored by the last entry of the ``rmse-mean`` history
    returned by ``lgb.cv``.

    Args:
        data: training feature frame; every column is used as a feature.
        y: frame with a ``target`` column; the label is ``log1p(target)``.
        test: test feature frame; only its shape is printed here.

    Returns:
        list: one ``[best_score, best_params, best_iteration]`` entry per seed,
        where ``best_iteration`` is the length of the winning ``rmse-mean``
        history.
    """
    features = data.columns.tolist()
    print(data.shape, test.shape, len(features))
    # Every column is a feature, so the frame is passed as-is; selecting
    # data[features] would only create a full copy of it.
    dtrain = lgb.Dataset(data=data, label=np.log1p(y['target']), free_raw_data=False)
    dtrain.construct()
    res_lis = []
    for seeds in [57]:

        # Each value is a list so ParameterGrid can expand the combinations
        all_params = {
            'objective': ['regression'],
            'num_leaves': [90],
            'subsample': [0.1],
            'colsample_bytree': [0.05],
            'min_split_gain': [0.1],
            'reg_alpha':  [10],
            'reg_lambda': [1],
            'min_child_weight': [10],
            'verbose': [-1],
            'seed': [seeds],
            'boosting_type': ['gbdt'],
            'max_depth': [-1],
            'learning_rate': [0.01],
            'metric': ['root_mean_squared_error'],
            'nthread': [-1]
        }

        # Run KFold
        # Start from +inf so the first evaluated combination always becomes
        # the incumbent; a finite sentinel could leave best_params as None.
        best_score = float('inf')
        best_params = None
        best_iteration = None
        for params in tqdm(list(ParameterGrid(all_params))):
            print(params)
            clf = lgb.cv(params,
                         dtrain,
                         num_boost_round=10000,
                         nfold=4,
                         stratified=False,
                         early_stopping_rounds=200,
                         verbose_eval=100,
                         )
            rmse_history = clf['rmse-mean']
            full_score = rmse_history[-1]
            print(full_score)
            if best_score > full_score:
                best_score = full_score
                best_iteration = len(rmse_history)
                best_params = params.copy()
                print('update best score %.6f' % best_score)
                print('update best iteration : ', best_iteration)
                print('update best param : ', best_params)
        res_lis.append([best_score, best_params, best_iteration])

    return res_lis


In [30]:
# Run the CV search; each entry of the result is [best_score, best_params, best_iteration]
res_result = fit_cv(data, y, test)

(12310, 5931) (49342, 5931) 5931


  0%|                                                                                            | 0/1 [00:00<?, ?it/s]

{'boosting_type': 'gbdt', 'colsample_bytree': 0.05, 'learning_rate': 0.01, 'max_depth': -1, 'metric': 'root_mean_squared_error', 'min_child_weight': 10, 'min_split_gain': 0.1, 'nthread': -1, 'num_leaves': 90, 'objective': 'regression', 'reg_alpha': 10, 'reg_lambda': 1, 'seed': 57, 'subsample': 0.1, 'verbose': -1}
[100]	cv_agg's rmse: 1.43136 + 0.0161511
[200]	cv_agg's rmse: 1.34164 + 0.0126086
[300]	cv_agg's rmse: 1.32147 + 0.0117232
[400]	cv_agg's rmse: 1.31654 + 0.0115015
[500]	cv_agg's rmse: 1.31585 + 0.0117695
[600]	cv_agg's rmse: 1.31608 + 0.0118481
1.3157823323484557
update best score 1.315782
update best iteration :  473
update best param :  {'boosting_type': 'gbdt', 'colsample_bytree': 0.05, 'learning_rate': 0.01, 'max_depth': -1, 'metric': 'root_mean_squared_error', 'min_child_weight': 10, 'min_split_gain': 0.1, 'nthread': -1, 'num_leaves': 90, 'objective': 'regression', 'reg_alpha': 10, 'reg_lambda': 1, 'seed': 57, 'subsample': 0.1, 'verbose': -1}


100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:28<00:00, 28.50s/it]


In [31]:
# First CV result: [best_score, best_params, best_iteration]
res = res_result[0]

In [32]:
# One model is trained per seed; the final prediction is their plain average.
model_seeds = [57, 114, 2018, 1, 11, 27, 536, 79, 6165, 119]
sub_preds = np.zeros(len(test))
features = data.columns.tolist()
# Convert to lightgbm Dataset (labels on the log1p scale)
dtrain = lgb.Dataset(data=data, label=np.log1p(y['target']), free_raw_data=False)
# Build the dataset once up front; it is reused by every model below
dtrain.construct()
for seed in tqdm(model_seeds):
    # Fresh dict per model so the CV result stored in `res` is left untouched
    params = dict(res[1], seed=seed)
    clf = lgb.train(params,
                    dtrain,
                    # 10% more rounds than the iteration count found by CV
                    num_boost_round=int(res[2] * 1.1)
                    )
    # The weight comes from the seed list, so the average stays correct if
    # seeds are added or removed.
    pred = clf.predict(test) / len(model_seeds)

    sub_preds += pred

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [03:52<00:00, 23.27s/it]


In [33]:
# Models were fit on log1p(target); expm1 maps the averaged predictions back
sub['target'] = np.expm1(sub_preds)

# Create the output directory if needed, so to_csv cannot fail on a missing
# folder after the long training run.
os.makedirs('../submit', exist_ok=True)
sub.to_csv('../submit/LateSub_SA10_log.csv', index=False)