In [2]:
from itertools import product

import numpy as np
import pandas as pd
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
from dask.array import stats

In [3]:
series_df = pd.read_csv('../data/sets/base/train.csv')

In [4]:
series_df.head()

Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected
0,615,59750.4229,2,-544.810303,3.622952,1
1,615,59750.4306,1,-816.434326,5.55337,1
2,615,59750.4383,3,-471.385529,3.801213,1
3,615,59750.445,4,-388.984985,11.395031,1
4,615,59752.407,2,-681.858887,4.041204,1


In [4]:
gbo = series_df.groupby(['object_id', 'passband'])

### Generating time series features on the series
As some people pointed out, there are some useful features that can be extracted from the series.

In [5]:
series_df['flux_ratio_sq'] = np.power(series_df['flux'] / series_df['flux_err'], 2.0)
series_df['flux_by_flux_ratio_sq'] = series_df['flux'] * series_df['flux_ratio_sq']

We want to calculate following aggregations for every column that were not included in the group_by (not only flux):

In [6]:
aggrs = {
    'mjd': ['min', 'max', 'mean', 'count'],
    'flux': ['min', 'max', 'mean', 'median', 'std', 'skew'],  # all relevant pandas aggregations except count (which is same for all columns and calculated for mjd)
    'flux_err': ['min', 'max', 'mean', 'median', 'std', 'skew'],  # keep these same as for flux - might be useful for future transformations
    'detected': ['mean'],  # this is binary so knowing mean and count translates to knowing how many actual samples were marked as detected
    'flux_ratio_sq': ['min', 'max', 'sum', 'skew'],
    'flux_by_flux_ratio_sq': ['min', 'max', 'sum', 'skew'],
}

In [7]:
%%time
aggr_df = series_df.groupby(['passband', 'object_id']).agg(aggrs)

CPU times: user 1min 9s, sys: 608 ms, total: 1min 9s
Wall time: 44.8 s


In [8]:
aggr_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,mjd,mjd,mjd,mjd,flux,flux,flux,flux,flux,flux,...,flux_err,detected,flux_ratio_sq,flux_ratio_sq,flux_ratio_sq,flux_ratio_sq,flux_by_flux_ratio_sq,flux_by_flux_ratio_sq,flux_by_flux_ratio_sq,flux_by_flux_ratio_sq
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean,count,min,max,mean,median,std,skew,...,skew,mean,min,max,sum,skew,min,max,sum,skew
passband,object_id,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
0,615,59819.1532,60617.0295,60278.94073,63,-116.913223,125.182808,-3.254554,-10.015225,83.944735,0.128917,...,-0.038907,0.857143,2.628774,1513.497011,32478.06881,0.438299,-167468.250525,140443.469449,-626381.909593,-0.213985
0,713,59851.2006,60674.0798,60315.87707,70,-14.735178,14.509829,-2.720398,-3.096804,7.113509,0.260052,...,0.42536,0.142857,0.01904,54.24831,831.646485,1.434431,-626.861507,318.265094,-5916.693412,-1.228117
0,730,59818.274,60648.0642,60247.701419,72,-3.45996,5.942166,-0.04808,0.024093,1.828872,0.35691,...,0.23511,0.0,0.000744,4.193923,52.306159,1.859868,-12.102754,24.920987,2.274455,2.788126
0,745,59818.2219,60620.1257,60247.740832,72,-3.874349,18.014029,1.797523,1.056714,4.374445,2.023211,...,0.311095,0.027778,0.00388,91.957529,336.127785,4.863112,-19.610791,1656.5256,4016.30749,5.519682
0,1124,59819.1532,60617.0295,60278.94073,63,-6.804703,5.330927,0.660948,0.581027,2.360084,-0.332169,...,0.255342,0.0,4e-06,5.046377,69.056825,1.145677,-34.339094,20.571364,104.843171,-1.166071


In [9]:
def flatten_columns(df, col_prefix):
    df.columns = [col_prefix + '_'.join(col).strip() for col in df.columns.values]
    return df

In [10]:
out_dfs = [flatten_columns(aggr_df.xs(passband), f"passband_{passband}_") for passband in range(1,6)]

In [11]:
# out_df = pd.DataFrame(index=series_df['object_id'].unique())
out_df = pd.concat(out_dfs, axis=1)

In [16]:
for passband in range(1,6):
    # flux-related features
    for differential_colname in ['flux', 'flux_err']:
        colname_base = f'passband_{passband}_{differential_colname}'
        out_df[f'{colname_base}_diff'] = out_df[f'{colname_base}_max'] - out_df[f'{colname_base}_min']
        out_df[f'{colname_base}_diff2'] = out_df[f'{colname_base}_diff'] / out_df[f'{colname_base}_mean']
    out_df[f'passband_{passband}_flux_w_mean'] = out_df[f'passband_{passband}_flux_by_flux_ratio_sq_sum'] / out_df[f'passband_{passband}_flux_ratio_sq_sum']
    out_df[f'passband_{passband}_flux_dif3'] = (out_df[f'passband_{passband}_flux_max'] - out_df[f'passband_{passband}_flux_min']) / out_df[f'passband_{passband}_flux_w_mean']
    # other features
    out_df[f'passband_{passband}_detected_count'] = out_df[f'passband_{passband}_detected_mean']*out_df[f'passband_{passband}_mjd_count']
    out_df[f'passband_{passband}_detected_mjd_diff'] = out_df[f'passband_{passband}_mjd_max'] - out_df[f'passband_{passband}_mjd_mean']

In [22]:
[col for col in out_df.columns if 'passband_1' in col]

['passband_1_mjd_min',
 'passband_1_mjd_max',
 'passband_1_mjd_mean',
 'passband_1_mjd_count',
 'passband_1_flux_min',
 'passband_1_flux_max',
 'passband_1_flux_mean',
 'passband_1_flux_median',
 'passband_1_flux_std',
 'passband_1_flux_skew',
 'passband_1_flux_err_min',
 'passband_1_flux_err_max',
 'passband_1_flux_err_mean',
 'passband_1_flux_err_median',
 'passband_1_flux_err_std',
 'passband_1_flux_err_skew',
 'passband_1_detected_mean',
 'passband_1_flux_ratio_sq_min',
 'passband_1_flux_ratio_sq_max',
 'passband_1_flux_ratio_sq_sum',
 'passband_1_flux_ratio_sq_skew',
 'passband_1_flux_by_flux_ratio_sq_min',
 'passband_1_flux_by_flux_ratio_sq_max',
 'passband_1_flux_by_flux_ratio_sq_sum',
 'passband_1_flux_by_flux_ratio_sq_skew',
 'passband_1_flux_diff',
 'passband_1_flux_diff2',
 'passband_1_flux_err_diff',
 'passband_1_flux_err_diff2',
 'passband_1_flux_w_mean',
 'passband_1_flux_dif3']

In [18]:
out_df.head()

Unnamed: 0_level_0,passband_1_mjd_min,passband_1_mjd_max,passband_1_mjd_mean,passband_1_mjd_count,passband_1_flux_min,passband_1_flux_max,passband_1_flux_mean,passband_1_flux_median,passband_1_flux_std,passband_1_flux_skew,...,passband_4_flux_err_diff,passband_4_flux_err_diff2,passband_4_flux_w_mean,passband_4_flux_dif3,passband_5_flux_diff,passband_5_flux_diff2,passband_5_flux_err_diff,passband_5_flux_err_diff2,passband_5_flux_w_mean,passband_5_flux_dif3
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
615,59750.4306,60624.1836,60175.754255,58,-1100.440063,660.626343,-385.699911,-488.057969,601.787302,0.41558,...,8.495519,2.236392,-111.843477,-7.189854,801.003235,-16.88105,6.772019,0.886852,-126.326197,-6.340753
713,59825.2676,60668.0723,60241.982255,56,-11.715749,9.129021,-1.019804,-0.561735,5.712334,-0.087865,...,2.125249,0.936706,-4.179287,-5.291509,28.98205,-16.153417,5.592603,1.05234,-4.90486,-5.908843
730,59798.3281,60652.1365,60214.495175,52,-3.39308,5.693109,0.141057,0.171336,1.807229,0.471342,...,2.379797,0.951035,33.303042,1.411171,66.46987,14.64289,7.106016,1.182095,32.979248,2.015506
745,59770.374,60624.0425,60188.647621,56,-3.61841,192.244293,5.717394,0.888115,25.964659,7.007099,...,23.164925,8.313364,121.77942,1.63515,151.762677,14.122532,52.155043,7.786257,74.94827,2.024899
1124,59750.4306,60624.1836,60175.754255,58,-2.622109,37.170177,4.634637,1.154596,8.107525,2.377222,...,8.575437,3.064312,115.267128,1.389329,120.018125,17.402149,6.792831,1.041269,86.915714,1.380856


In [13]:
len(out_df)

7848

In [5]:
from plasticc.features.simple import _extract_features



In [6]:
meta_df = pd.read_csv('../data/sets/base/meta/train.csv')

In [8]:
%%time
out_df = _extract_features(series_df, meta_df)

CPU times: user 1min 4s, sys: 500 ms, total: 1min 4s
Wall time: 38.4 s


In [10]:
out_df[[col for col in out_df.columns if '_mjd_count' in col]].head(10)

Unnamed: 0_level_0,passband_1_mjd_count,passband_2_mjd_count,passband_3_mjd_count,passband_4_mjd_count,passband_5_mjd_count
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
615,58,58,58,58,57
713,56,56,56,56,56
730,52,52,52,51,51
745,56,56,56,56,55
1124,58,58,58,58,57
1227,52,52,52,51,51
1598,58,58,58,58,57
1632,58,58,58,58,57
1920,45,45,45,45,45
1926,45,45,45,45,45
