In [1]:
# Tag embedded in the names of the pickle files written by this notebook
# ('../data/ttas_<fname>.pkl' and '../data/full_test_chunk_<fname>_<id>.pkl').
fname='base_006'

# Presumably the number of noise-augmented copies of the training features.
# NOTE(review): this value is reassigned to 11 in the cell that builds `ttas`.
n_tta = 6

# Seed passed to init_seeds() for NumPy and the built-in `random` module.
seed = 0

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix
from scipy.optimize import curve_fit
import gc
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import xgboost as xgb
import logging
from tqdm import tqdm_notebook
import itertools
import pickle as pkl

from multiprocessing import Pool

In [3]:
import random as rn
def init_seeds(seed):
    """Reset the global random generators to a well-defined state.

    Seeds NumPy's legacy global generator first, then the generator of
    the built-in ``random`` module (imported here as ``rn``), both with
    the same ``seed``. Returns ``None``.
    """
    for reseed in (np.random.seed, rn.seed):
        reseed(seed)


# Put both global generators into a known state before anything random runs.
init_seeds(seed)

In [5]:
def get_aggregations():
    """Return the ``{column: [aggregation, ...]}`` spec for the per-object groupby.

    The dict starts with the all-passband columns and is then extended
    with six entries for each passband 0..5. Insertion order matters:
    ``get_new_columns`` flattens the dict in this order to name the
    aggregated columns.
    """
    aggs = dict(
        flux=['min', 'max', 'mean', 'std', 'skew'],
        flux_delta=['mean', 'median', 'std'],
        flux_err=['min', 'max', 'mean', 'median', 'std'],
        detected=['mean'],
        flux_ratio_sq=['sum'],
        flux_by_flux_ratio_sq=['sum'],
        mjd_detected=['min', 'max'],
        mjd_detected_std=['min', 'max'],
        flux_detected=['mean'],
        flux_slope_change=['mean'],
        scale=['mean'],
        magnitude=['mean'],
    )

    # (column name pattern, aggregations) added once per passband, in this order.
    per_passband = (
        ('flux_%d', ['min', 'max', 'mean', 'median', 'std', 'skew']),
        ('flux_delta_%d', ['std']),
        ('detected_%d', ['mean']),
        ('flux_%d_detected', ['mean']),
        ('flux_ratio_sq_%d', ['sum']),
        ('flux_by_flux_ratio_sq_%d', ['sum']),
    )
    for pb in range(6):
        for pattern, funcs in per_passband:
            # Fresh list per entry so no two keys share the same list object.
            aggs[pattern % pb] = list(funcs)
    return aggs


def get_new_columns(aggs):
    """Flatten an aggregation spec into ``'<column>_<aggregation>'`` names.

    Names are produced in dict order, and within each column in the
    order its aggregations are listed.
    """
    names = []
    for column, funcs in aggs.items():
        for func in funcs:
            names.append(column + '_' + func)
    return names

def apply_kurt(df):
    """Per-object kurtosis of ``flux`` and of each ``flux_<pb>`` column (pb 0..5).

    Returns a frame indexed by ``object_id`` whose columns are the
    source column names with ``'_kurt'`` appended.
    """
    flux_columns = ['flux']
    flux_columns.extend('flux_%d' % pb for pb in range(6))

    per_object = df.groupby('object_id')[flux_columns]
    kurt = per_object.apply(pd.DataFrame.kurt)
    kurt.columns = ['%s_kurt' % name for name in kurt.columns]
    return kurt

def apply_kurt_delta(df):
    """Per-object kurtosis of the ``flux_delta`` column.

    Returns a one-column frame indexed by ``object_id``; the column is
    named ``'flux_delta_kurt'``.
    """
    per_object = df.groupby('object_id')[['flux_delta']]
    kurt = per_object.apply(pd.DataFrame.kurt)
    kurt.columns = ['%s_kurt' % name for name in kurt.columns]
    return kurt

def add_features_to_agg(df):
    """Derive difference and ratio features from the per-object aggregates.

    ``df`` is the aggregated frame (one row per object) whose columns
    are named ``'<column>_<aggregation>'``. It is modified in place and
    also returned.

    Columns consumed and then removed: ``mjd_detected_{min,max}``,
    ``mjd_detected_std_{min,max}``, ``flux_by_flux_ratio_sq_sum`` and,
    for each passband, ``flux_delta_<pb>_std`` and
    ``flux_by_flux_ratio_sq_<pb>_sum``.

    Columns rescaled in place, per passband: ``flux_<pb>_mean`` (divided
    by ``flux_mean``) and ``flux_<pb>_detected_mean`` (divided by
    ``flux_detected_mean``).

    Fix over the previous version: the per-passband ``dif3`` feature is
    now stored as ``flux_<pb>_dif3``. Before, the ``% pb`` was missing,
    so every passband overwrote one column literally named
    ``'flux_%d_dif3'``.
    """
    # Time span between first and last detection (plain and "std" flavour).
    df['mjd_detected_diff'] = df['mjd_detected_max'] - df['mjd_detected_min']
    del df['mjd_detected_max'], df['mjd_detected_min']
    df['mjd_detected_std_diff'] = df['mjd_detected_std_max'] - df['mjd_detected_std_min']
    del df['mjd_detected_std_max'], df['mjd_detected_std_min']

    # Flux range, raw and normalised by the plain / weighted mean flux.
    df['flux_diff'] = df['flux_max'] - df['flux_min']
    df['flux_dif2'] = (df['flux_max'] - df['flux_min']) / df['flux_mean']
    df['flux_w_mean'] = df['flux_by_flux_ratio_sq_sum'] / df['flux_ratio_sq_sum']
    df['flux_dif3'] = (df['flux_max'] - df['flux_min']) / df['flux_w_mean']
    df['flux_detected_ratio'] = df['flux_detected_mean'] / df['flux_mean']
    df['flux_delta_mean_ratio'] = df['flux_delta_mean'] / df['flux_mean']
    df['flux_delta_std_ratio'] = df['flux_delta_std'] / df['flux_std']

    for pb in range(6):
        # Must be computed before the two in-place rescalings just below,
        # because it uses the raw per-passband means.
        df['flux_%d_detected_ratio' % pb] = df['flux_%d_detected_mean' % pb] / df['flux_%d_mean' % pb]
        df['flux_%d_mean' % pb] /= df.flux_mean
        df['flux_%d_detected_mean' % pb] /= df.flux_detected_mean
        df['flux_%d_max_ratio' % pb] = df['flux_%d_max' % pb] / df['flux_max']
        df['flux_delta_%d_std_ratio_2' % pb] = df['flux_delta_%d_std' % pb] / df['flux_std']

        # dif3 needs the raw per-passband weighted mean, so it is computed
        # before that weighted mean is normalised by the global one.
        df['flux_%d_w_mean' % pb] = df['flux_by_flux_ratio_sq_%d_sum' % pb] / df['flux_ratio_sq_%d_sum' % pb]
        df['flux_%d_dif3' % pb] = (df['flux_%d_max' % pb] - df['flux_%d_min' % pb]) / df['flux_%d_w_mean' % pb]
        df['flux_%d_w_mean' % pb] /= df['flux_w_mean']

        del df['flux_delta_%d_std' % pb], df['flux_by_flux_ratio_sq_%d_sum' % pb]
    del df['flux_by_flux_ratio_sq_sum']
    return df

In [6]:
def add_features_before_agg(df):
    """Add the row-level columns that the per-object aggregation consumes.

    ``df`` is modified in place and nothing is returned. It must already
    contain ``object_id``, ``passband``, ``mjd``, ``flux``, ``flux_err``,
    ``detected``, ``detected_std`` and ``scale``.

    Columns added:
      * ``flux_ratio_sq``, ``flux_by_flux_ratio_sq``
      * ``mjd_detected``, ``flux_detected``: mjd / flux where
        ``detected == 1``, NaN elsewhere
      * ``mjd_detected_std``: mjd where ``detected_std == 1``, NaN elsewhere
      * ``flux_delta``, ``flux_delta_abs``, ``flux_slope_change``
      * for each passband 0..5: ``flux_<pb>``, ``flux_delta_<pb>``,
        ``detected_<pb>``, ``flux_<pb>_detected``, ``flux_ratio_sq_<pb>``,
        ``flux_by_flux_ratio_sq_<pb>``

    Changes over the previous version (results are unchanged):
      * uses ``np.nan``; the ``np.NaN`` alias was removed in NumPy 2.0
      * no ``'prev'`` string sentinel in the slope comparison, so the
        column stays float instead of becoming object dtype
      * the previous slope comes from a fresh groupby rather than from a
        groupby object built before the slope column existed
    """
    df['flux_ratio_sq'] = np.power(df['flux'] / df['flux_err'], 2.0)
    df['flux_by_flux_ratio_sq'] = df['flux'] * df['flux_ratio_sq']

    is_detected = df['detected'] == 1
    df['mjd_detected'] = df['mjd'].where(is_detected)
    df['flux_detected'] = df['flux'].where(is_detected)
    df['mjd_detected_std'] = df['mjd'].where(df['detected_std'] == 1)

    # Previous observation of the same object in the same passband, in row order.
    band_keys = ['object_id', 'passband']
    per_band = df.groupby(band_keys)
    flux_prev = per_band['flux'].shift(1)
    mjd_prev = per_band['mjd'].shift(1)

    df['flux_delta'] = df['flux'] - flux_prev
    df['flux_delta_abs'] = np.abs(df['flux_delta'])

    # Discard deltas that are small once multiplied by `scale` (threshold 10),
    # and deltas spanning more than 100 mjd units between observations.
    # flux_delta_abs itself is left unmasked, as before.
    df.loc[df['flux_delta_abs'] * df['scale'] < 10, 'flux_delta'] = np.nan
    df.loc[(df['mjd'] - mjd_prev) > 100, 'flux_delta'] = np.nan

    # 1 where the sign of the delta differs from the previous row of the
    # same (object, passband). NaN compares unequal to everything, so rows
    # where either slope is missing also count as a change.
    df['flux_slope'] = np.sign(df['flux_delta'])
    flux_slope_prev = df.groupby(band_keys)['flux_slope'].shift(1)
    df['flux_slope_change'] = 1 * (df['flux_slope'] != flux_slope_prev)
    del df['flux_slope']

    # (new column pattern, source column, value used outside the passband)
    per_passband_columns = (
        ('flux_%d', 'flux', np.nan),
        ('flux_delta_%d', 'flux_delta', np.nan),
        ('detected_%d', 'detected', 0),
        ('flux_%d_detected', 'flux_detected', np.nan),
        ('flux_ratio_sq_%d', 'flux_ratio_sq', np.nan),
        ('flux_by_flux_ratio_sq_%d', 'flux_by_flux_ratio_sq', np.nan),
    )
    for pb in range(6):
        in_band = df['passband'] == pb
        for pattern, source, outside in per_passband_columns:
            df[pattern % pb] = df[source].where(in_band, outside)

   

In [8]:
def add_features(df_, meta_, throughputs=None):
    """Build the per-object feature table from raw light-curve rows.

    Parameters
    ----------
    df_ : DataFrame
        Observation rows with ``object_id``, ``mjd``, ``passband``,
        ``flux``, ``flux_err`` and ``detected``. The caller's frame is
        not modified; all work happens on a copy.
    meta_ : DataFrame
        Per-object metadata, left-merged on ``object_id``. It must
        contain ``hostgal_photoz``.
    throughputs : optional
        Not used by this function; kept so existing callers that pass it
        keep working. The default used to be a module-level
        ``throughputs`` variable that is not defined anywhere in this
        notebook, so the ``def`` itself raised ``NameError`` on a fresh
        kernel. It now defaults to ``None``.

    Returns
    -------
    DataFrame
        One row per ``object_id``: the aggregated and derived features
        plus the columns of ``meta_``.
    """
    df_ = df_.copy()

    # Normalise each object's flux and flux_err by that object's maximum flux.
    gr = df_.groupby('object_id')
    df_['scale'] = gr.flux.transform('max')
    df_['magnitude'] = df_['scale'] - gr.flux.transform('min')

    df_.flux /= df_.scale
    df_.flux_err /= df_.scale

    # Drop rows whose flux_err exceeds mean + 6*std within their
    # (object, passband) group. NOTE(review): for a group with a single
    # row the std is NaN, the comparison is False, and the row is dropped.
    gr = df_.groupby(['object_id', 'passband'])
    flux_err_mean = gr.flux_err.transform('mean')
    flux_err_std = gr.flux_err.transform('std')
    df_ = df_[df_.flux_err <= flux_err_mean + 6*flux_err_std].copy()

    # 'detected_std': detected rows whose flux is more than one standard
    # deviation above the mean of their (object, passband) group.
    gr = df_.groupby(['object_id', 'passband'])
    flux_std = gr.flux.transform('std')
    flux_mean = gr.flux.transform('mean')
    df_['detected_std'] = df_.detected * (df_.flux > flux_mean + 1*flux_std)

    # Row-level helper columns (adds them to df_ in place).
    add_features_before_agg(df_)

    # Aggregate per object. The flat column names rely on the aggregation
    # output following the order of the spec dict.
    aggs = get_aggregations()
    new_columns = get_new_columns(aggs)

    agg_ = df_.groupby('object_id').agg(aggs)
    agg_.columns = new_columns

    agg_ = add_features_to_agg(df=agg_)

    agg_kurt = apply_kurt(df_)

    # Both frames are indexed by object_id; reset_index turns it into a
    # column so it can serve as the merge key.
    agg_ = pd.concat([agg_, agg_kurt], axis=1).reset_index()

    # Merge with meta data
    full_df = agg_.merge(
        right=meta_,
        how='left',
        on='object_id'
    )
    full_df['magnitude_mean'] *= (full_df.hostgal_photoz ** 2)

    return full_df

In [9]:
# training time data augmentation

def get_tta(train, meta_train, i):
    """Return the feature table for the ``i``-th augmented copy of ``train``.

    The random generators are re-seeded with ``i``. For ``i > 0`` each
    flux value is perturbed by ``flux_err`` times a standard-normal
    draw; for ``i == 0`` the fluxes are used unchanged. ``train`` itself
    is never modified.
    """
    init_seeds(i)
    augmented = train.copy()
    if i > 0:
        noise = np.random.randn(*augmented['flux_err'].shape)
        augmented['flux'] += augmented['flux_err'] * noise
    return add_features(augmented, meta_train)

In [10]:
# Load the raw training observations and preview the first rows.
train = pd.read_csv('../input/training_set.csv')
train.head()

Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected
0,615,59750.4229,2,-544.810303,3.622952,1
1,615,59750.4306,1,-816.434326,5.55337,1
2,615,59750.4383,3,-471.385529,3.801213,1
3,615,59750.445,4,-388.984985,11.395031,1
4,615,59752.407,2,-681.858887,4.041204,1


In [11]:
# Load the training metadata restricted to these columns: `object_id` is the
# merge key used by add_features and `hostgal_photoz` is read there too.
meta_cols = ['object_id', 'ddf', 'hostgal_photoz', 'target']
meta_train = pd.read_csv('../input/training_set_metadata.csv')[meta_cols]
meta_train.head()

Unnamed: 0,object_id,ddf,hostgal_photoz,target
0,615,1,0.0,92
1,713,1,1.6267,88
2,730,1,0.2262,42
3,745,1,0.2813,90
4,1124,1,0.2415,90


In [12]:
# Number of feature tables to build: index 0 is the unperturbed training set,
# indices 1..n_tta-1 are noise-augmented copies (see get_tta).
n_tta = 11

# Drive the loop from n_tta so the count and the constant cannot drift apart
# (the bound used to be a separate hard-coded 11).
ttas = [get_tta(train, meta_train, i) for i in tqdm_notebook(range(n_tta))]

#for tta in ttas:
#    tta.fillna(train_mean, inplace=True)

HBox(children=(IntProgress(value=0, max=11), HTML(value='')))




In [13]:
# Persist the list of augmented training feature tables, tagged with `fname`.
with open('../data/ttas_%s.pkl' % fname, 'wb') as file:
    pkl.dump(ttas, file)


In [14]:
def work_test(param):
    """Build and pickle the feature table for one test chunk.

    ``param`` is a ``(chunk_id, fname)`` pair. The function reads the
    test metadata and the chunk, runs ``add_features`` on them, writes
    the result to ``'../data/full_test_chunk_<fname>_<chunk_id>.pkl'``
    and returns the string ``'done'``. Progress is printed at start and
    end.
    """
    chunk_id, fname = param
    print('starting worker', chunk_id)

    meta_test = pd.read_csv('../input/test_set_metadata.csv')

    # NOTE(review): the chunk file has a .csv extension but is read with
    # pickle, so it is presumably a pickled DataFrame - confirm. Only
    # unpickle files you produced yourself; pickle.load can execute
    # arbitrary code.
    chunk_path = '../input/test_chunk_%d.csv' % chunk_id
    with open(chunk_path, 'rb') as src:
        test_chunk = pkl.load(src)

    features = add_features(test_chunk, meta_test)

    out_path = '../data/full_test_chunk_%s_%d.pkl' % (fname, chunk_id)
    with open(out_path, 'wb') as dst:
        pkl.dump(features, dst)

    print('ending worker', chunk_id)
    return 'done'

In [15]:
# One (chunk_id, fname) task per test chunk: ids 0..90 plus id 100.
# NOTE(review): why the last chunk is numbered 100 is not visible here - confirm.
params = [(i, fname) for i in range(91)]
params.append((100, fname))

# Set to False to process the chunks serially in this process (easier to debug).
USE_POOL = True

if USE_POOL:
    # maxtasksperchild=1 gives every chunk a fresh worker process, and
    # chunksize=1 hands tasks out one at a time. The `with` block ensures
    # the pool is torn down even if a worker raises; map() has already
    # collected every result by the time the block exits.
    with Pool(processes=5, maxtasksperchild=1) as pool:
        ls = pool.map(work_test, params, chunksize=1)
else:
    ls = [work_test(param) for param in params]

starting worker 0
starting worker 1
starting worker 3
starting worker 2
starting worker 4
ending worker 0
starting worker 5
ending worker 1
starting worker 6
ending worker 2
starting worker 7
ending worker 3
starting worker 8
ending worker 4
starting worker 9
ending worker 5
starting worker 10
ending worker 6
starting worker 11
ending worker 7
starting worker 12
ending worker 9
starting worker 13
ending worker 8
starting worker 14
ending worker 10
starting worker 15
ending worker 11
starting worker 16
ending worker 12
starting worker 17
ending worker 13
starting worker 18
ending worker 14
starting worker 19
ending worker 15
starting worker 20
ending worker 16
starting worker 21
ending worker 17
starting worker 22
ending worker 18
starting worker 23
ending worker 19
starting worker 24
ending worker 20
starting worker 25
ending worker 21
starting worker 26
ending worker 22
starting worker 27
ending worker 23
starting worker 28
ending worker 24
starting worker 29
ending worker 25
starting