In [49]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from IPython.display import display
from scipy import stats
from tqdm import tnrange, tqdm_notebook, tqdm
from collections import OrderedDict
from cesium.time_series import TimeSeries
import cesium.featurize as featurize

### データの読み込み

In [2]:
# Read the per-observation tables (object_id, mjd, passband, flux, flux_err,
# detected). The test file loaded here is the "test_set_sample" CSV.
train_ts, test_ts = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/input/training_set.csv", "../data/input/test_set_sample.csv")
)
# Quick sanity check: shapes first, then the first rows of each table.
print(train_ts.shape, test_ts.shape)
display(train_ts.head(), test_ts.head())

(1421705, 6) (1000000, 6)


Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected
0,615,59750.4229,2,-544.810303,3.622952,1
1,615,59750.4306,1,-816.434326,5.55337,1
2,615,59750.4383,3,-471.385529,3.801213,1
3,615,59750.445,4,-388.984985,11.395031,1
4,615,59752.407,2,-681.858887,4.041204,1


Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected
0,13,59798.3205,2,-1.299735,1.357315,0
1,13,59798.3281,1,-2.095392,1.148654,0
2,13,59798.3357,3,-0.923794,1.763655,0
3,13,59798.3466,4,-4.009815,2.602911,0
4,13,59798.3576,5,-3.403503,5.367328,0


In [3]:
# Read the per-object metadata tables. Only the training metadata carries the
# `target` column; the test metadata has the same columns without it.
train_meta, test_meta = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/input/training_set_metadata.csv", "../data/input/test_set_metadata.csv")
)
# Quick sanity check: shapes first, then the first rows of each table.
print(train_meta.shape, test_meta.shape)
display(train_meta.head(), test_meta.head())

(7848, 12) (3492890, 11)


Unnamed: 0,object_id,ra,decl,gal_l,gal_b,ddf,hostgal_specz,hostgal_photoz,hostgal_photoz_err,distmod,mwebv,target
0,615,349.046051,-61.943836,320.79653,-51.753706,1,0.0,0.0,0.0,,0.017,92
1,713,53.085938,-27.784405,223.525509,-54.460748,1,1.8181,1.6267,0.2552,45.4063,0.007,88
2,730,33.574219,-6.579593,170.455585,-61.548219,1,0.232,0.2262,0.0157,40.2561,0.021,42
3,745,0.189873,-45.586655,328.254458,-68.969298,1,0.3037,0.2813,1.1523,40.7951,0.007,90
4,1124,352.711273,-63.823658,316.922299,-51.059403,1,0.1934,0.2415,0.0176,40.4166,0.024,90


Unnamed: 0,object_id,ra,decl,gal_l,gal_b,ddf,hostgal_specz,hostgal_photoz,hostgal_photoz_err,distmod,mwebv
0,13,34.453125,-5.229529,169.987075,-59.956185,1,0.3048,0.3193,0.0542,41.1123,0.019
1,14,33.398438,-4.331149,167.226341,-59.936551,1,,0.6323,0.0179,42.8774,0.018
2,17,348.529419,-61.75544,321.29398,-51.763351,1,,0.8297,0.0605,43.6,0.016
3,23,34.804688,-5.829153,171.307861,-60.174401,1,,0.6533,0.1479,42.964,0.023
4,34,351.321442,-64.198746,317.458993,-50.429931,1,0.4557,0.4617,0.0122,42.054,0.023


### cesiumを試してみる

In [102]:
# Spot-check the photo-z error of a single training object.
# The value is looked up directly in train_meta instead of through the
# loop variable `row`, which is only assigned by the TimeSeries-building cell
# further down and therefore does not exist on a fresh Restart & Run All.
# Object 1632 is the object at which that loop stops.
train_meta.loc[train_meta['object_id'] == 1632, 'hostgal_photoz_err'].values[0]

0.01

In [129]:
# Print every TimeSeries stored in ts_dict.
# NOTE: ts_dict is created by the TimeSeries-building cell further down, so
# that cell has to be executed before this one.
for series in ts_dict.values():
    print(series)

<cesium.time_series.TimeSeries object at 0x7f7aefa74630>
<cesium.time_series.TimeSeries object at 0x7f7aefa5c2b0>
<cesium.time_series.TimeSeries object at 0x7f7aefa5cac8>
<cesium.time_series.TimeSeries object at 0x7f7aefa5c588>
<cesium.time_series.TimeSeries object at 0x7f7aefa5c9b0>
<cesium.time_series.TimeSeries object at 0x7f7aefa5cb00>
<cesium.time_series.TimeSeries object at 0x7f7aefa5ca20>
<cesium.time_series.TimeSeries object at 0x7f7aefa5ce10>


In [103]:
# Build one cesium TimeSeries per training object, keyed by object_id.
ts_dict = OrderedDict()   # object_id -> TimeSeries, in train_meta order
object_list = train_meta['object_id']
# Passband index -> band letter; the letters are used as the channel names.
pbmap = OrderedDict([(0,'u'), (1,'g'), (2,'r'), (3,'i'), (4, 'z'), (5, 'y')])
pbnames = list(pbmap.values())

for object_id in tqdm(object_list, ascii=True):
    row = train_meta.query('object_id == @object_id')
    # Use the scalar class label, extracted the same way as the metadata
    # values below; row['target'] on its own is a one-element Series.
    target = row['target'].values[0]

    # Per-object metadata attached to the TimeSeries as meta features.
    meta = {
        'z':row['hostgal_photoz'].values[0],
        'zerr':row['hostgal_photoz_err'].values[0],
        'mwebv':row['mwebv'].values[0]
    }

    # Split this object's observations into one array per passband, in the
    # same order as pbmap / pbnames.
    extract_ts = train_ts.query('object_id == @object_id')
    pbind = [(extract_ts['passband'] == pb) for pb in pbmap]
    t = [extract_ts['mjd'][mask].values for mask in pbind]
    m = [extract_ts['flux'][mask].values for mask in pbind]
    e = [extract_ts['flux_err'][mask].values for mask in pbind]

    ts_dict[object_id] = TimeSeries(
        t=t, m=m, e=e, label=target, name=object_id,
        meta_features=meta, channel_names=pbnames
    )

    # NOTE: processing stops after object 1632, so only the objects up to and
    # including that id are converted (presumably a small subset for trying
    # cesium out). Remove the break to convert every training object.
    if object_id == 1632:
        break


  0%|          | 0/7848 [00:00<?, ?it/s][A
  0%|          | 5/7848 [00:00<02:38, 49.48it/s][A
[A

In [104]:
# Names of the features requested from cesium's featurize_single_ts.
features_to_use = [
    "amplitude",
    "percent_beyond_1_std",
    "maximum",
    "max_slope",
    "median",
    "median_absolute_deviation",
    "percent_close_to_median",
    "minimum",
    "skew",
    "std",
    "weighted_average",
]

In [105]:
# Compute the selected features for every TimeSeries stored in ts_dict.
# Iterating ts_dict itself (instead of object_list with a second hard-coded
# break on object 1632) keeps features_list aligned one-to-one with
# ts_dict.values(), which is what assemble_featureset receives as
# `time_series`, and avoids a KeyError for objects that were never built.
features_list = [
    featurize.featurize_single_ts(ts, features_to_use=features_to_use)
    for ts in ts_dict.values()
]

In [116]:
# Combine the per-object feature results into a single table. The TimeSeries
# objects are passed along as `time_series`; the resulting table (displayed in
# the next cell) has one row per object and also contains the meta features.
featuretable = featurize.assemble_featureset(
    time_series=ts_dict.values(),
    features_list=features_list,
)

In [117]:
# Display the assembled feature table. Its columns are (feature, channel)
# pairs; the meta features (mwebv, z, zerr) have no channel.
featuretable

feature,amplitude,amplitude,amplitude,amplitude,amplitude,amplitude,percent_beyond_1_std,percent_beyond_1_std,percent_beyond_1_std,percent_beyond_1_std,...,std,weighted_average,weighted_average,weighted_average,weighted_average,weighted_average,weighted_average,mwebv,z,zerr
channel,0,1,2,3,4,5,0,1,2,3,...,5,0,1,2,3,4,5,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
615,121.048016,880.533203,646.921722,488.190827,402.069122,400.501618,0.47619,0.586207,0.568966,0.551724,...,292.182295,-17.061118,-212.397193,-102.220639,-101.206639,-54.744845,-59.688379,0.017,0.0,0.0
713,14.622504,10.422385,10.29848,11.862454,11.057367,14.491025,0.385714,0.392857,0.410714,0.410714,...,7.030448,-3.500958,-1.322397,-1.030469,-1.382941,-1.407879,-1.876399,0.007,1.6267,0.2552
730,4.701063,4.543095,11.921774,19.503951,23.498145,33.234935,0.333333,0.442308,0.134615,0.115385,...,13.201397,-0.016423,-0.03417,2.059833,2.988513,4.486335,5.05769,0.021,0.2262,0.0157
745,10.944189,97.931352,111.477483,104.097369,99.56379,75.881338,0.180556,0.035714,0.071429,0.089286,...,25.822133,1.176322,3.652226,6.716857,12.514694,12.247387,8.760515,0.007,0.2813,1.1523
1124,6.067815,19.896143,54.378113,71.309338,80.071971,60.009063,0.333333,0.172414,0.137931,0.103448,...,21.245772,0.82438,3.617169,7.842645,8.830427,8.463856,5.602845,0.024,0.2415,0.0176
1227,37.748134,5.33723,2.909254,3.841772,10.914535,15.231738,0.027778,0.307692,0.326923,0.423077,...,5.538789,0.836955,0.35661,-0.044129,0.264363,0.688027,-0.878473,0.02,0.0,0.0
1598,141.604315,726.110041,646.552765,502.997459,371.318081,333.449232,0.095238,0.034483,0.034483,0.034483,...,113.293962,10.668888,2.024489,2.944821,5.016078,8.315349,13.931464,0.019,0.182,0.0304
1632,4.496465,5.0916,3.522365,6.789302,11.43584,24.542787,0.349206,0.431034,0.344828,0.362069,...,10.122379,-0.01592,0.151905,0.245103,0.948476,2.863544,6.151543,0.021,0.7014,0.01


In [118]:
# Flatten the (feature, channel) column pairs into single names such as
# "amplitude_u": the channel is translated to its band letter through pbmap,
# and columns whose channel is not a pbmap key (the meta features) get the
# suffix "meta".
# The MultiIndex guard makes the cell safe to re-run: once the columns are
# flat strings, unpacking them as (feature, channel) pairs would raise.
if isinstance(featuretable.columns, pd.MultiIndex):
    old_names = featuretable.columns.values
    new_names = ['{}_{}'.format(x, pbmap.get(y, 'meta')) for x, y in old_names]
    featuretable.columns = new_names
featuretable

Unnamed: 0,amplitude_u,amplitude_g,amplitude_r,amplitude_i,amplitude_z,amplitude_y,percent_beyond_1_std_u,percent_beyond_1_std_g,percent_beyond_1_std_r,percent_beyond_1_std_i,...,std_y,weighted_average_u,weighted_average_g,weighted_average_r,weighted_average_i,weighted_average_z,weighted_average_y,mwebv_meta,z_meta,zerr_meta
615,121.048016,880.533203,646.921722,488.190827,402.069122,400.501618,0.47619,0.586207,0.568966,0.551724,...,292.182295,-17.061118,-212.397193,-102.220639,-101.206639,-54.744845,-59.688379,0.017,0.0,0.0
713,14.622504,10.422385,10.29848,11.862454,11.057367,14.491025,0.385714,0.392857,0.410714,0.410714,...,7.030448,-3.500958,-1.322397,-1.030469,-1.382941,-1.407879,-1.876399,0.007,1.6267,0.2552
730,4.701063,4.543095,11.921774,19.503951,23.498145,33.234935,0.333333,0.442308,0.134615,0.115385,...,13.201397,-0.016423,-0.03417,2.059833,2.988513,4.486335,5.05769,0.021,0.2262,0.0157
745,10.944189,97.931352,111.477483,104.097369,99.56379,75.881338,0.180556,0.035714,0.071429,0.089286,...,25.822133,1.176322,3.652226,6.716857,12.514694,12.247387,8.760515,0.007,0.2813,1.1523
1124,6.067815,19.896143,54.378113,71.309338,80.071971,60.009063,0.333333,0.172414,0.137931,0.103448,...,21.245772,0.82438,3.617169,7.842645,8.830427,8.463856,5.602845,0.024,0.2415,0.0176
1227,37.748134,5.33723,2.909254,3.841772,10.914535,15.231738,0.027778,0.307692,0.326923,0.423077,...,5.538789,0.836955,0.35661,-0.044129,0.264363,0.688027,-0.878473,0.02,0.0,0.0
1598,141.604315,726.110041,646.552765,502.997459,371.318081,333.449232,0.095238,0.034483,0.034483,0.034483,...,113.293962,10.668888,2.024489,2.944821,5.016078,8.315349,13.931464,0.019,0.182,0.0304
1632,4.496465,5.0916,3.522365,6.789302,11.43584,24.542787,0.349206,0.431034,0.344828,0.362069,...,10.122379,-0.01592,0.151905,0.245103,0.948476,2.863544,6.151543,0.021,0.7014,0.01


In [123]:
# Collect the columns whose name contains "_meta" and display the feature
# table without them, with the object_id index replaced by a 0..n-1 range.
# The result is only displayed; featuretable itself is left unchanged.
meta_feature = [name for name in featuretable.columns if "_meta" in name]
featuretable.drop(meta_feature, axis=1).reset_index(drop=True)

Unnamed: 0,amplitude_u,amplitude_g,amplitude_r,amplitude_i,amplitude_z,amplitude_y,percent_beyond_1_std_u,percent_beyond_1_std_g,percent_beyond_1_std_r,percent_beyond_1_std_i,...,std_r,std_i,std_z,std_y,weighted_average_u,weighted_average_g,weighted_average_r,weighted_average_i,weighted_average_z,weighted_average_y
0,121.048016,880.533203,646.921722,488.190827,402.069122,400.501618,0.47619,0.586207,0.568966,0.551724,...,451.180827,332.520885,289.276965,292.182295,-17.061118,-212.397193,-102.220639,-101.206639,-54.744845,-59.688379
1,14.622504,10.422385,10.29848,11.862454,11.057367,14.491025,0.385714,0.392857,0.410714,0.410714,...,5.718981,6.392561,6.349526,7.030448,-3.500958,-1.322397,-1.030469,-1.382941,-1.407879,-1.876399
2,4.701063,4.543095,11.921774,19.503951,23.498145,33.234935,0.333333,0.442308,0.134615,0.115385,...,5.505767,8.112835,10.604821,13.201397,-0.016423,-0.03417,2.059833,2.988513,4.486335,5.05769
3,10.944189,97.931352,111.477483,104.097369,99.56379,75.881338,0.180556,0.035714,0.071429,0.089286,...,31.671373,34.65408,32.772464,25.822133,1.176322,3.652226,6.716857,12.514694,12.247387,8.760515
4,6.067815,19.896143,54.378113,71.309338,80.071971,60.009063,0.333333,0.172414,0.137931,0.103448,...,21.135263,26.043193,26.633303,21.245772,0.82438,3.617169,7.842645,8.830427,8.463856,5.602845
5,37.748134,5.33723,2.909254,3.841772,10.914535,15.231738,0.027778,0.307692,0.326923,0.423077,...,1.325623,1.605822,3.563588,5.538789,0.836955,0.35661,-0.044129,0.264363,0.688027,-0.878473
6,141.604315,726.110041,646.552765,502.997459,371.318081,333.449232,0.095238,0.034483,0.034483,0.034483,...,215.250206,173.640682,129.250487,113.293962,10.668888,2.024489,2.944821,5.016078,8.315349,13.931464
7,4.496465,5.0916,3.522365,6.789302,11.43584,24.542787,0.349206,0.431034,0.344828,0.362069,...,1.35487,2.630942,4.82871,10.122379,-0.01592,0.151905,0.245103,0.948476,2.863544,6.151543
