In [None]:
# Tag used in the names of everything this notebook writes
# (OOF/TTA prediction pickles and the submission CSVs).
fname = 'lgb_1181'
# Version tags of the precomputed feature pickles that later cells read from ../data.
fname_base = 'base_006'
fname_bazin = 'bazin_003'
fname_newling = 'newling_003'

In [None]:
# Number of feature-set copies used for training: the loader cell keeps
# ttas[0] as the base training frame and ttas[1:n_tta] as the augmented copies.
# It is also the default block count of the custom LightGBM eval metric.
n_tta = 6

# Seed passed to init_seeds() and used to derive LightGBM's feature_fraction_seed.
seed = 0

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix
from scipy.optimize import curve_fit
import gc
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import xgboost as xgb
import logging
from tqdm import tqdm_notebook
import itertools
import pickle as pkl

# Let pandas display up to 400 columns so wide feature frames are not truncated.
pd.options.display.max_columns = 400

In [None]:
import random as rn
def init_seeds(seed):
    """Put the stdlib and NumPy global random generators in a known state.

    seed: value handed to both ``random.seed`` and ``numpy.random.seed``.
    """
    # The two generators are independent of each other, so the order in which
    # they are seeded does not matter.
    rn.seed(seed)
    np.random.seed(seed)


# Seed the global random generators once, before any data is loaded.
init_seeds(seed)

In [None]:
def create_logger():
    """Configure the 'main' logger with a file handler and a console handler.

    Both handlers log at DEBUG level with the format
    ``[LEVEL]time:name:message``; the file handler writes to
    ``simple_lightgbm.log`` in the current working directory.

    The function is safe to call repeatedly (for example when the notebook
    cell is re-run): handlers are attached only once, so messages are never
    emitted twice.

    Returns:
        The configured ``logging.Logger``. Callers that ignore the return
        value are unaffected.
    """
    logger_ = logging.getLogger('main')
    logger_.setLevel(logging.DEBUG)
    if logger_.handlers:
        # Already configured by an earlier call; adding the handlers again
        # would duplicate every log line.
        return logger_

    formatter = logging.Formatter('[%(levelname)s]%(asctime)s:%(name)s:%(message)s')
    fh = logging.FileHandler('simple_lightgbm.log')
    ch = logging.StreamHandler()
    for handler in (fh, ch):
        handler.setLevel(logging.DEBUG)
        handler.setFormatter(formatter)
        # add the handlers to the logger
        logger_.addHandler(handler)
    return logger_


def get_logger():
    """Return the shared 'main' logger, configuring it on first use.

    No visible cell calls create_logger(), and an unconfigured logger has no
    handlers and inherits the root WARNING level, so ``.info(...)`` messages
    would be silently dropped. Configuring lazily here makes those messages
    reach the console and ``simple_lightgbm.log``.
    """
    logger_ = logging.getLogger('main')
    if not logger_.handlers:
        create_logger()
    return logger_

In [None]:
def xgb_multi_weighted_logloss(preds, dtrain):
    """XGBoost-style custom metric built on lgb_multi_weighted_logloss.

    preds: predictions handed over by the booster.
    dtrain: dataset object exposing ``get_label()``.

    Returns:
        ``(metric_name, metric_value)`` with a scalar metric value.
    """
    labels = dtrain.get_label()
    # The LightGBM helper returns (name, value, is_higher_better); only the
    # value belongs in the result. Returning the whole tuple, as before,
    # handed the caller a tuple where a number is expected.
    _, loss, _ = lgb_multi_weighted_logloss(labels, preds)
    return 'xgb_multi_weighted_loss', loss

def eval_lgb_multi_weighted_logloss(preds, train_data, n_tta=n_tta):
    """LightGBM ``feval`` wrapper for labels encoded as 0..13.

    preds: flat prediction array passed in by LightGBM.
    train_data: dataset exposing ``get_label()``.
    n_tta: number of stacked copies the rows of ``train_data`` consist of.

    Returns whatever lgb_multi_weighted_logloss returns for these inputs.
    """
    # Encoded classes 1 and 7 carry weight 2, every other class weight 1.
    weight_by_class = {k: (2 if k in (1, 7) else 1) for k in range(14)}
    encoded_classes = list(range(14))
    return lgb_multi_weighted_logloss(
        train_data.get_label(), preds, encoded_classes, weight_by_class, n_tta)
    
def lgb_multi_weighted_logloss(y_true, y_preds,
                               classes=[6, 15, 16, 42, 52, 53, 62, 64, 65, 67, 88, 90, 92, 95],
                               class_weight={6: 1, 15: 2, 16: 1, 42: 1, 52: 1, 53: 1, 62: 1, 64: 2, 
                                             65: 1, 67: 1, 88: 1, 90: 1, 92: 1, 95: 1},
                               n_tta = n_tta,
):
    """Weighted multi-class log loss over predictions averaged across stacked copies.

    The rows of ``y_true`` / ``y_preds`` are treated as ``n_tta`` consecutive
    blocks of equal size. Predictions are averaged block-wise and scored
    against the labels of the first block.

    y_true: 1-D label array (sliced and read via ``.shape``).
    y_preds: flat predictions, reshaped column-major to (n_rows, n_classes).
    classes: class labels; only its length is used, for the reshape.
    class_weight: mapping class -> weight, applied in sorted-key order.
    n_tta: number of stacked blocks.

    Returns:
        ``('wloss', loss, False)``, the (name, value, is_higher_better) triple.
    """
    # Work on private copies: the defaults are shared between calls, and a
    # caller's own containers must not be changed either. Appending class 99
    # in place used to leak into every later call.
    classes = list(classes)
    class_weight = dict(class_weight)
    if len(np.unique(y_true)) > 14:
        classes.append(99)
        class_weight[99] = 2
    y_preds = y_preds.reshape(y_true.shape[0], len(classes), order='F')

    # Average the n_tta consecutive row blocks into one prediction per object.
    size = y_true.shape[0] // n_tta
    y_true = y_true[:size]
    y_p = np.zeros((size, len(classes)))
    for i in range(n_tta):
        y_p += y_preds[i * size : (i+1) * size]
    y_p /= n_tta

    # Transform y_true into dummies
    y_ohe = pd.get_dummies(y_true)
    # Limit predictions to [1e-15, 1 - 1e-15] so the log stays finite
    y_p = np.clip(a=y_p, a_min=1e-15, a_max=1 - 1e-15)
    y_p_log = np.log(y_p)
    # Per-class sum of log-probabilities of the true class
    y_log_ones = np.sum(y_ohe.values * y_p_log, axis=0)
    # Number of positives for each class
    nb_pos = y_ohe.sum(axis=0).values.astype(float)
    # Weighted average, each class normalised by its number of positives
    class_arr = np.array([class_weight[k] for k in sorted(class_weight.keys())])
    y_w = y_log_ones * class_arr / nb_pos

    loss = - np.sum(y_w) / np.sum(class_arr)
    return 'wloss', loss, False


def multi_weighted_logloss(y_true, y_preds,
                           classes = [6, 15, 16, 42, 52, 53, 62, 64, 65, 67, 88, 90, 92, 95],
                           class_weight = {6: 1, 15: 2, 16: 1, 42: 1, 52: 1, 53: 1, 62: 1, 
                                           64: 2, 65: 1, 67: 1, 88: 1, 90: 1, 92: 1, 95: 1}
                           ):
    """
    @author olivier https://www.kaggle.com/ogrellier
    multi logloss for PLAsTiCC challenge

    y_true: labels, one per row of ``y_preds``.
    y_preds: array of class probabilities, one column per class, in the same
        order as the sorted distinct values of ``y_true``.
    classes: class labels (kept for interface compatibility).
    class_weight: mapping class -> weight, applied in sorted-key order.

    Returns the weighted log loss as a float. When ``y_true`` has more than
    14 distinct values, class 99 is added with weight 2 for this call only.
    """
    # class_weights taken from Giba's topic : https://www.kaggle.com/titericz
    # https://www.kaggle.com/c/PLAsTiCC-2018/discussion/67194
    # with Kyle Boone's post https://www.kaggle.com/kyleboone

    # Work on private copies: the defaults are shared between calls, so
    # appending class 99 in place used to make every later 14-class call fail
    # on a 15-vs-14 shape mismatch. It also changed caller-supplied containers.
    classes = list(classes)
    class_weight = dict(class_weight)
    if len(np.unique(y_true)) > 14:
        classes.append(99)
        class_weight[99] = 2
    y_p = y_preds
    # Transform y_true into dummies
    y_ohe = pd.get_dummies(y_true)
    # Limit predictions to [1e-15, 1 - 1e-15] so the log stays finite
    y_p = np.clip(a=y_p, a_min=1e-15, a_max=1 - 1e-15)
    # Transform to log
    y_p_log = np.log(y_p)
    # Per-class sum of log-probabilities of the true class
    y_log_ones = np.sum(y_ohe.values * y_p_log, axis=0)
    # Get the number of positives for each class
    nb_pos = y_ohe.sum(axis=0).values.astype(float)
    # Weight average and divide by the number of positives
    class_arr = np.array([class_weight[k] for k in sorted(class_weight.keys())])
    y_w = y_log_ones * class_arr / nb_pos

    loss = - np.sum(y_w) / np.sum(class_arr)
    return loss

def save_importances(importances_):
    """Plot gain per feature, ordered by mean gain, and save it to importances.png.

    importances_: DataFrame with 'gain' and 'feature' columns. A 'mean_gain'
        column (mean gain of each row's feature) is added to it in place.
    """
    gain_by_feature = importances_.groupby('feature')['gain'].mean()
    importances_['mean_gain'] = importances_['feature'].map(gain_by_feature)

    ordered = importances_.sort_values('mean_gain', ascending=False)
    plt.figure(figsize=(8, 12))
    sns.barplot(x='gain', y='feature', data=ordered)
    plt.tight_layout()
    plt.savefig('importances.png')

In [None]:
def get_importances(clfs):
    """Average the 'gain' feature importance over boosters, plot it and return it.

    clfs: boosters exposing ``feature_importance('gain')`` and
        ``feature_name()``; feature names are taken from the first one.

    Returns a DataFrame with columns 'gain' (mean over boosters) and
    'feature'. A bar plot sorted by gain is saved to importances.png.
    """
    gain_matrix = np.vstack([booster.feature_importance('gain') for booster in clfs])
    data = pd.DataFrame({
        'gain': np.mean(gain_matrix, axis=0),
        'feature': clfs[0].feature_name(),
    })

    ranked = data.sort_values('gain', ascending=False)
    plt.figure(figsize=(8, 30))
    sns.barplot(x='gain', y='feature', data=ranked)
    plt.tight_layout()
    plt.savefig('importances.png')
    return data

def train_classifiers(lgb_params, full_train=None, y=None, w=None, verbose=2000, 
                      folds=5, ttas=None):
    """Train one LightGBM booster per stratified fold on base + augmented copies.

    For every fold the training and validation rows of ``full_train`` and of
    each frame in ``ttas`` are stacked (base first, then each augmentation in
    order); labels and weights are repeated to match.

    Args:
        lgb_params: parameter dict passed to ``lgb.train``.
        full_train: base feature frame (positional row access via ``.iloc``).
        y: label Series aligned row-wise with ``full_train``.
        w: sample-weight Series aligned row-wise with ``full_train``.
        verbose: forwarded as ``verbose_eval``.
        folds: number of stratified folds.
        ttas: augmented copies of ``full_train`` with the same row order;
            ``None`` or empty means no augmentation.

    Returns:
        ``(clfs, importances, oof_preds, tta_preds)``: the fitted boosters,
        the frame from get_importances(), out-of-fold predictions on the base
        frame, and out-of-fold predictions averaged over the augmented frames
        (equal to ``oof_preds`` when there are no augmentations).
    """
    # Accept ttas=None (the declared default) instead of failing in the loop.
    ttas = [] if ttas is None else list(ttas)
    copies = [full_train] + ttas
    n_copies = len(copies)
    n_classes = np.unique(y).shape[0]

    def _feval(preds, dataset):
        # Tell the metric how many stacked blocks the datasets really contain,
        # rather than relying on the global n_tta matching len(ttas) + 1.
        return eval_lgb_multi_weighted_logloss(preds, dataset, n_tta=n_copies)

    print(full_train.shape[1], 'features')
    kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=1)
    clfs = []
    oof_preds = np.zeros((len(full_train), n_classes))
    tta_preds = np.zeros((len(full_train), n_classes))
    # Only the labels matter for stratification, hence kf.split(y, y).
    for fold_, (trn_, val_) in tqdm_notebook(enumerate(kf.split(y, y)), total=folds):
        print()
        print('fold %2d' % fold_)

        # Stack base + augmented rows with a single concat per array.
        trn_x = pd.concat([frame.iloc[trn_] for frame in copies], axis=0)
        trn_y = pd.concat([y.iloc[trn_]] * n_copies, axis=0)
        trn_w = pd.concat([w.iloc[trn_]] * n_copies, axis=0)
        val_x = pd.concat([frame.iloc[val_] for frame in copies], axis=0)
        val_y = pd.concat([y.iloc[val_]] * n_copies, axis=0)
        val_w = pd.concat([w.iloc[val_]] * n_copies, axis=0)

        trn_set = lgb.Dataset(trn_x, label=trn_y, weight=trn_w)
        val_set = lgb.Dataset(val_x, label=val_y, weight=val_w)
        # NOTE(review): verbose_eval / early_stopping_rounds are keyword
        # arguments of older lightgbm releases; newer releases expect
        # callbacks instead. Confirm against the installed version.
        clf = lgb.train(
            lgb_params,
            trn_set,
            num_boost_round = 4000,
            valid_sets=[trn_set, val_set],
            valid_names = ['train', 'val'],
            feval=_feval,
            verbose_eval=verbose,
            early_stopping_rounds=100
        )

        # Out-of-fold predictions on the un-augmented validation rows.
        oof_pred = clf.predict(full_train.iloc[val_])
        oof_preds[val_, :] = oof_pred

        # Average of the predictions on each augmented copy of the fold.
        if ttas:
            tta_pred = np.zeros(oof_pred.shape)
            for tta in ttas:
                tta_pred += clf.predict(tta.iloc[val_])
            tta_pred /= len(ttas)
        else:
            # No augmentations: avoid dividing by zero and reuse the base predictions.
            tta_pred = oof_pred.copy()
        tta_preds[val_, :] = tta_pred
        print('val mwloss: %0.3f' % multi_weighted_logloss( y.iloc[val_], oof_pred),
              'tta mwloss: %0.3f' % multi_weighted_logloss( y.iloc[val_], tta_pred),
             )

        clfs.append(clf)

    get_logger().info('MULTI WEIGHTED LOG LOSS : %.5f ' % multi_weighted_logloss(y_true=y, y_preds=oof_preds))

    importances = get_importances(clfs)
    return clfs, importances, oof_preds, tta_preds

In [None]:
def get_data(full_train, train_bazin, newling):
    """Merge the Bazin and Newling fit features and rescale their amplitudes.

    full_train: frame with 'object_id', 'hostgal_photoz' and 'scale_mean'.
    train_bazin: frame with 'object_id' and 'bazin_A_0' .. 'bazin_A_5'.
    newling: frame with 'object_id' and 'newling_A_0' .. 'newling_A_5'.

    Returns a new frame (the inputs are left untouched) in which
      * every amplitude is multiplied by hostgal_photoz**2 * scale_mean,
      * 'bazin_magnitude' / 'newling_magnitude' hold the row-wise maximum of
        the scaled amplitudes, and the amplitudes are divided by it,
      * 'hostgal_photoz' is replaced by a 0/1 flag (1 when it was > 0).
    """
    bazin_cols = ['bazin_A_%d' % band for band in range(6)]
    newling_cols = ['newling_A_%d' % band for band in range(6)]

    # Left joins keep exactly the object_ids (and row order) of full_train.
    merged = full_train.merge(train_bazin, how='left', on='object_id')
    merged = merged.merge(newling, how='left', on='object_id')

    scale = merged['hostgal_photoz'] ** 2 * merged['scale_mean']
    for col in bazin_cols + newling_cols:
        merged[col] = merged[col] * scale

    merged['bazin_magnitude'] = merged[bazin_cols].max(axis=1)
    merged['newling_magnitude'] = merged[newling_cols].max(axis=1)

    for col in bazin_cols:
        merged[col] = merged[col] / merged['bazin_magnitude']
    for col in newling_cols:
        merged[col] = merged[col] / merged['newling_magnitude']

    merged['hostgal_photoz'] = 1 * (merged['hostgal_photoz'] > 0)
    return merged

In [None]:
# Load the pickled sequence of feature frames for this base version.
# Element 0 becomes the training frame; elements 1 .. n_tta-1 are kept as
# the augmented copies.
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open('../data/ttas_%s.pkl' % fname_base, 'rb') as file:
    ttas = pkl.load(file)
full_train = ttas[0]
ttas = ttas[1 : n_tta]
full_train.head()

In [None]:
# Training metadata: keep only the id, the two metadata features and the label.
meta_cols = ['object_id', 'hostgal_photoz', 'mwebv', 'target']
meta_train = pd.read_csv('../data/train_meta.csv')[meta_cols]
# Test metadata has no 'target' column, so meta_cols is redefined without it.
meta_cols = ['object_id', 'hostgal_photoz', 'mwebv']
meta_test = pd.read_csv('../input/test_set_metadata.csv')[meta_cols]

meta_train.head()

In [None]:
# Bazin fit features for index 0, later merged into full_train (the base frame).
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open('../data/tta_0_%s.pkl' % fname_bazin, 'rb') as file:
    train_bazin = pkl.load(file)
train_bazin.head()
    

In [None]:
# Bazin fit features for indices 1 .. n_tta-1, in the same order as `ttas`
# so that the two lists can be zipped together later.
ttas_bazin = []
for i in range(1, n_tta):
    with open('../data/tta_%d_%s.pkl' % (i, fname_bazin), 'rb') as file:
        ttas_bazin.append(pkl.load(file))

In [None]:
# Peek at the first augmented Bazin frame.
ttas_bazin[0].head()

In [None]:
# Newling fit features for index 0, later merged into full_train (the base frame).
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open('../data/tta_0_%s.pkl' % fname_newling, 'rb') as file:
    train_newling = pkl.load(file)
train_newling.head()

In [None]:
# Newling fit features for indices 1 .. n_tta-1, in the same order as `ttas`
# so that the two lists can be zipped together later.
ttas_newling = []
for i in range(1, n_tta):
    with open('../data/tta_%d_%s.pkl' % (i, fname_newling ), 'rb') as file:
        ttas_newling.append(pkl.load(file))

In [None]:
# Merge the fit features into the base frame and apply get_data's rescaling.
# NOTE(review): full_train is overwritten with the merged frame, so this cell
# is meant to run once per kernel session; re-running feeds the already
# merged frame back into get_data.
full_train = get_data(full_train, train_bazin, train_newling)
full_train.head()

In [None]:
n_tta = 6
init_seeds(seed)

# Give every augmented frame the same merge/rescaling as the base frame,
# pairing it with the Bazin and Newling fits loaded for the same index.
# The comprehension uses its own variable names so it no longer reads as if
# it touched the notebook-level full_train / train_bazin / train_newling.
ttas = [
    get_data(tta_base, tta_bazin, tta_newling)
    for tta_base, tta_bazin, tta_newling
    in tqdm_notebook(zip(ttas, ttas_bazin, ttas_newling))
]
#for tta in ttas:
#    tta.fillna(train_mean, inplace=True)

In [None]:
# Distinct target labels present in the training metadata, in ascending order.
classes = sorted(np.unique(meta_train.target))
classes

# Column names used for the per-class prediction columns.
class_names = ['class_%d' % c for c in classes]

# Metric weight per class: 1/9 for classes 15, 64 and 99, 1/18 otherwise.
weights = [1/18  if i not in [15, 64, 99] else 1/9 for i in classes]
weights

# freq   = share of training objects in each class
# weight = metric weight from above
# adjust = weight / freq, the per-object factor used as sample weight below
df = meta_train.groupby('target').object_id.count().to_frame('freq')
df.freq /= df.freq.sum()
df['weight'] = weights
df['adjust'] = df.weight / df.freq
df

In [None]:
y = meta_train['target']

# Per-object sample weight: the 'adjust' factor of the object's class.
# Mapping builds a float Series directly. Writing float weights into a copy
# of the integer label Series relied on pandas silently upcasting the dtype,
# which newer pandas versions warn about.
class_adjust = dict(zip(classes, df.adjust.values))
for c, w in class_adjust.items():
    print(c, w)
ws = y.map(class_adjust)

In [None]:
# Encode the original class labels as 0 .. len(classes)-1 (position in the
# sorted `classes` list), as required by LightGBM's multiclass objective.
# A single map over the original labels cannot confuse an already-encoded
# value with a not-yet-encoded class label, which the previous in-place
# sequential rewrite only avoided because of the particular label values.
y_lgb = y.map({c: i for i, c in enumerate(classes)})

In [None]:
# List every available column before choosing which ones to drop.
list(full_train.columns)

In [None]:
def _banded(prefix, bands):
    """Return ['<prefix>_<band>', ...] for the given passband indices."""
    return ['%s_%d' % (prefix, band) for band in bands]


# Columns excluded from the model input: identifiers/label, the Newling
# amplitude and k parameters of bands 1-5 (band 0 is kept), several per-band
# Bazin columns, and two metadata columns.
# Deliberately NOT removed: 'bazin_A_0' .. 'bazin_A_5' and 'bazin_trise'.
removed = (
    ['object_id', 'target']
    + _banded('newling_A', range(1, 6))
    + _banded('newling_k', range(1, 6))
    + _banded('bazin_pcov', range(6))
    + _banded('bazin_max', range(6))
    + _banded('bazin_before', range(6))
    + _banded('bazin_after', range(6))
    + ['mwebv', 'num_obs']
)

features = [col for col in full_train.columns if col not in removed]

# Feature-only copies of the base frame and of every augmented frame.
full_train1 = full_train[features].copy()
ttas1 = [tta[features].copy() for tta in ttas]

In [None]:
# Display the final list of model features.
features

In [None]:
# LightGBM parameters. 'metric' is 'None' because evaluation is done by the
# custom feval passed inside train_classifiers.
lgb_params = {
    'boosting_type': 'goss',
    'objective': 'multiclass',
    'num_class': 14,
    'metric': 'None',
    'learning_rate': 0.015,
    'colsample_bytree': .5,
    'feature_fraction_seed':seed+2,
    'reg_alpha': .01,
    'reg_lambda': .1,
    'min_split_gain': 0.1,
    # Scaled by the number of stacked copies (base + augmentations), since
    # every object appears that many times in the training data.
    'min_child_weight': 20 * (1 + len(ttas)),
    #'n_estimators': 4000,
    #'silent': -1,
    'verbose': -1,
    #'max_depth': 4,
    'num_leaves' : 7,
    #'num_threads': 10,
}
# Train 10 fold models on the encoded labels (y_lgb) with per-object weights ws.
clfs, importances, oof_preds, tta_preds = train_classifiers(lgb_params, full_train1, y_lgb, ws, 
                                                            folds=10, ttas=ttas1)

#save_importances(importances_=importances)
# Overall out-of-fold loss on the base frame and on the averaged augmented
# frames, scored against the original (un-encoded) labels.
print('%0.5f' % multi_weighted_logloss(y, oof_preds), 
      '%0.5f' % multi_weighted_logloss(y, tta_preds))

In [None]:
# Persist both out-of-fold prediction arrays, tagged with this run's name.
outputs = (
    ('../data/oof_preds_%s.pkl', oof_preds),
    ('../data/tta_preds_%s.pkl', tta_preds),
)
for path_template, array in outputs:
    with open(path_template % fname, 'wb') as file:
        pkl.dump(array, file)

In [None]:
def predict_chunk(clfs_, features, class_names, chunk_id, fname_base, 
                  fname_bazin=fname_bazin, fname_newling=fname_newling, 
                  ):
    """Predict class probabilities for one pickled chunk of the test set.

    Args:
        clfs_: fitted boosters; their predictions are averaged.
        features: feature column names fed to the boosters.
        class_names: column names for the per-class predictions.
        chunk_id: integer id of the chunk; selects the three pickle files.
        fname_base, fname_bazin, fname_newling: version tags in the file names.

    Returns:
        DataFrame with one column per entry of ``class_names``, plus
        'object_id' and 'class_99'. 'class_99' is the product over classes
        of ``1 - p_class``.

    Raises:
        ValueError: if ``clfs_`` is empty.
    """
    if not clfs_:
        raise ValueError('predict_chunk needs at least one classifier')

    # NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
    with open('../data/full_test_chunk_%s_%d.pkl' % (fname_base, chunk_id), 'rb') as file:
        full_test = pkl.load(file)

    with open('../data/bazin_test_%d_%s.pkl' % (chunk_id, fname_bazin), 'rb') as file:
        test_bazin = pkl.load(file)

    with open('../data/test_%d_%s.pkl' % (chunk_id, fname_newling), 'rb') as file:
        test_newling = pkl.load(file)

    # Some chunks lack this column; add it as missing so the feature set is
    # complete. np.nan is used because the np.NaN alias no longer exists in
    # NumPy 2.
    if 'newling_sigma_1' not in test_newling.columns:
        test_newling['newling_sigma_1'] = np.nan

    full_test = get_data(full_test, test_bazin, test_newling)
    #full_test = full_test.fillna(train_mean)

    # Average the predictions of all fold models.
    test_x = full_test[features]
    preds_ = None
    for clf in clfs_:
        fold_pred = clf.predict(test_x) / len(clfs_)
        preds_ = fold_pred if preds_ is None else preds_ + fold_pred

    # Compute preds_99 as the proba of class not being any of the others
    # preds_99 = 0.1 gives 1.769
    preds_99 = np.ones(preds_.shape[0])
    for i in range(preds_.shape[1]):
        preds_99 *= (1 - preds_[:, i])

    # Create DataFrame from predictions
    preds_df_ = pd.DataFrame(preds_, columns=class_names)
    # Assign by position: the prediction rows follow full_test's row order,
    # so no index alignment should be involved.
    preds_df_['object_id'] = full_test['object_id'].values
    preds_df_['class_99'] = preds_99

    print(preds_df_['class_99'].mean())

    del full_test, test_x, preds_
    gc.collect()

    return preds_df_


In [None]:
import time

start = time.time()
# Nominal chunk size; only used for the progress message below.
chunks = 5000000
submission_path = '../submissions/%s.csv' % fname

# Predict the numbered test chunks 0..90. The first chunk creates the CSV
# with a header; every later chunk is appended.
for i_c in tqdm_notebook(range(91)):

    preds_df = predict_chunk(clfs_=clfs,
                             features=features,
                             class_names=class_names,
                             chunk_id=i_c,
                             fname_base=fname_base,
                            )

    if i_c == 0:
        print(preds_df.mean(axis=0))
        preds_df.to_csv(submission_path, header=True, index=False, float_format='%.6f')
    else:
        preds_df.to_csv(submission_path, header=False, mode='a', index=False, float_format='%.6f')

    del preds_df
    gc.collect()

    if (i_c + 1) % 10 == 0:
        get_logger().info('%15d done in %5.1f' % (chunks * (i_c + 1), (time.time() - start) / 60))
        print('%15d done in %5.1f' % (chunks * (i_c + 1), (time.time() - start) / 60))

# Final chunk, stored under id 100 (presumably the leftover objects - verify
# against the code that wrote the chunk files).
# The previous call also passed `rnn_test=rnn_test`: that name is not defined
# in any visible cell and predict_chunk has no such parameter, so the call
# could not run.
preds_df = predict_chunk(clfs_=clfs,
                         features=features,
                         class_names=class_names,
                         chunk_id=100,
                         fname_base=fname_base,
                        )

preds_df.to_csv(submission_path,
                header=False, mode='a', index=False, float_format='%.6f')

In [None]:
# Reload the raw submission written chunk by chunk above.
z = pd.read_csv('../submissions/%s.csv' %fname)

# Collapse to one row per object by averaging any repeated object_id rows;
# object_id becomes the index.
z = z.groupby('object_id').mean()

z.shape

In [None]:
# Split the classes by mean host-galaxy photo-z in the training metadata:
# classes whose mean is exactly 0 are labelled galactic, the rest extragalactic.
meta_cols = ['hostgal_photoz', 'target']
meta_train2 = pd.read_csv('../input/training_set_metadata.csv')[meta_cols]
meta_train2.head()

# NOTE: `df` is rebound here; it previously held the class-weight table.
df = meta_train2.groupby('target').hostgal_photoz.mean()

galactic = ['class_%d' % c for c in df[df == 0].index]
extragal = ['class_%d' % c for c in df[df > 0].index]
galactic, extragal

In [None]:
# Bring object_id back as a column. The guard makes the cell re-runnable
# (a second reset_index would add a spurious 'index' column).
if 'object_id' not in z.columns:
    z = z.reset_index()

# Rescale class_99 so that its mean is 0.18.
z['class_99'] *= (0.18 / z['class_99'].mean())

# Look up each object's photo-z by object_id. A boolean mask taken straight
# from meta_test is aligned by row index, which is only correct if both
# frames have exactly the same rows in the same order.
photoz = z['object_id'].map(
    meta_test.drop_duplicates('object_id').set_index('object_id')['hostgal_photoz'])

# photo-z == 0: zero out the extragalactic classes.
z.loc[photoz == 0, extragal] = 0

# photo-z > 0: zero out the galactic classes.
z.loc[photoz > 0, galactic] = 0

z.mean(axis=0)

In [None]:
#z.to_csv('../submissions/gal_%s.csv' %fname, index=False, float_format='%.6f')

# Recompute class_99 from the masked class probabilities as the product of
# (1 - p) over every column between the first and the last one.
class_cols = z.columns[1:-1]
z['class_99'] = z[class_cols].rsub(1.).prod(axis=1)

z.mean(axis=0)

In [None]:
# Rescale the recomputed class_99 so that its mean is 0.18 again.
z['class_99'] *= (0.18 / z['class_99'].mean())

# Write the final, masked submission under a separate file name.
z.to_csv('../submissions/gal_2_%s.csv' %fname, index=False, float_format='%.6f')