In [1]:
import sys
import itertools
import numpy as np
import pandas as pd
import lightgbm as lgb
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import StratifiedKFold

from tqdm import tqdm
from tsfresh.feature_extraction import extract_features
tqdm.pandas(desc="apply progress")
# from google.colab import drive

  from pandas.core import datetools


In [2]:
data_dir = "/home/hidehisa/.kaggle/competitions/plasticc"
#train = pd.read_csv(data_dir + "/train_with_cluster.csv")
#meta = pd.read_csv(data_dir + "/training_set_metadata.csv")

In [3]:
from multiprocessing import Pool

In [4]:
def elbow(d):
    data = d.mjd.values.reshape([-1, 1])
    kms = [KMeans(n_clusters=i).fit(data) for i in range(2, 6)]
    inertias = [km.inertia_ for km in kms]
    diff1 = inertias[0] - inertias[1]
    diff2 = inertias[1] - inertias[2]
    diff3 = inertias[2] - inertias[3]
    if diff1 / diff2 > diff2 / diff3:
        return kms[1].predict(data)
    else:
        return kms[2].predict(data)

def add_cluster(df):
    new_df = (df.groupby("object_id").progress_apply(lambda x: elbow(x))
                .to_frame("cluster")
                .apply(lambda x: x.apply(pd.Series).stack())
                .reset_index()
                .drop("level_1", axis=1)
             )
    new_df = new_df.astype({"cluster": int})
    df = pd.concat([df, new_df.drop("object_id", axis=1)], axis=1)
    return df


def add_cluster_multi(d):
    n_record = d.shape[0]
    default_chunk = n_record // 8
    head = 0
    df_pool = []
    for _ in range(7):
        new_df = d.loc[head:head+default_chunk, :]
        last_id = new_df.object_id.unique()[-1]
        len_last = new_df.query("object_id == @last_id").shape[0]
        new_df = new_df.loc[head:head+default_chunk-len_last, :]
        df_pool.append(new_df)
        head = head + default_chunk - len_last+1
    df_pool.append(d.loc[head:, :])
    pool = Pool(8)
    dfs = pool.map(add_cluster, df_pool)
    pool.close()
    return pd.concat(dfs)

In [7]:
import gc
gc.enable()
import time

start = time.time()
test = pd.read_csv(data_dir + "/test_set.csv", nrows=80000000, skiprows=49999911)
test.columns = pd.Index(["object_id", "mjd", "passband", "flux", "flux_err", "detected"])
last_id = test.object_id.unique()[-1]
len_last = test.query("object_id == @last_id").shape[0]

In [8]:
test.head()

Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected
0,116295367,59638.3997,4,106.338104,17.677881,1
1,116295367,59652.2258,5,-0.092378,22.527664,0
2,116295367,59653.1986,5,34.854744,26.550417,0
3,116295367,59655.3781,5,45.486931,34.392929,0
4,116295367,59660.2444,5,32.464844,23.456642,0


In [9]:
test.tail()

Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected
79999995,9961118,60100.3136,5,136.355408,17.892918,1
79999996,9961118,60113.2619,0,5.346103,8.11661,0
79999997,9961118,60118.2337,0,-7.174642,10.757877,0
79999998,9961118,60119.2818,2,23.029713,3.885554,0
79999999,9961118,60120.2236,1,5.018671,2.827637,0


In [10]:
len_last

59

In [11]:
test = test.loc[:50000000-len_last-1, :]

In [12]:
from joblib import Parallel, delayed

In [29]:
n_record = test.shape[0]
default_chunk = n_record // 8
head = 0
df_pool = []

In [30]:
default_chunk

6249992

In [31]:
for _ in range(7):
    new_df = test.loc[head:head+default_chunk, :]
    last_id = new_df.object_id.unique()[-1]
    len_last = new_df.query("object_id == @last_id").shape[0]
    new_df = new_df.loc[:head+default_chunk-len_last, :]
    df_pool.append(new_df)
    head = head + default_chunk - len_last +1
df_pool.append(test.loc[head:, :])

In [None]:
test_cluster = add_cluster(test)


apply progress:   0%|          | 0/390805 [00:00<?, ?it/s][A
apply progress:   0%|          | 1/390805 [00:01<167:14:08,  1.54s/it][A
apply progress:   0%|          | 5/390805 [00:01<118:09:09,  1.09s/it][A
apply progress:   0%|          | 8/390805 [00:01<84:00:05,  1.29it/s] [A
apply progress:   0%|          | 11/390805 [00:01<59:59:21,  1.81it/s][A
apply progress:   0%|          | 14/390805 [00:02<43:12:57,  2.51it/s][A
apply progress:   0%|          | 17/390805 [00:02<31:24:52,  3.46it/s][A
apply progress:   0%|          | 20/390805 [00:02<23:08:23,  4.69it/s][A
apply progress:   0%|          | 23/390805 [00:02<17:20:37,  6.26it/s][A
apply progress:   0%|          | 26/390805 [00:02<13:24:13,  8.10it/s][A
apply progress:   0%|          | 29/390805 [00:02<10:40:02, 10.18it/s][A
apply progress:   0%|          | 32/390805 [00:02<8:37:41, 12.58it/s] [A
apply progress:   0%|          | 35/390805 [00:02<7:13:03, 15.04it/s][A
apply progress:   0%|          | 38/390805 [00:02<

In [46]:
test_cluster.to_csv("test_with_cluster_0_49999910.csv", index=False, mode="a", header=False)

In [47]:
test_cluster.tail()

Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected,cluster
49999936,1171834,60356.0376,5,20.973017,26.246626,0,0
49999937,1171834,60362.0649,5,-26.85944,38.530792,0,0
49999938,1171834,60370.0168,4,7.192351,16.337788,0,0
49999939,1171834,60486.4107,5,8.694453,26.434843,0,0
49999940,1171834,60496.395,2,17.79748,1.947458,1,0


In [48]:
49999910 + 49999940

99999850

In [49]:
!mv test_with_cluster_0_49999910.csv test_with_cluster_0_99999850.csv

In [50]:
!zip test_with_cluster_0_99999850.csv.zip test_with_cluster_0_99999850.csv
!mv test_with_cluster_0_99999850.csv.zip /home/hidehisa/.kaggle/competitions/plasticc

  adding: test_with_cluster_0_99999850.csv (deflated 66%)


In [52]:
test_cluster.to_csv("test_with_cluster_4999911_99999850.csv", index=False)

In [56]:
!mv test_with_cluster_4999911_99999850.csv test_with_cluster_49999911_99999850.csv

In [57]:
!zip test_with_cluster_49999911_99999850.csv.zip test_with_cluster_49999911_99999850.csv

  adding: test_with_cluster_49999911_99999850.csv (deflated 66%)


In [58]:
!mv test_with_cluster_49999911_99999850.csv.zip /home/hidehisa/.kaggle/competitions/plasticc/

In [59]:
del test
del test_cluster

## 99999850から

In [60]:
gc.collect()

2741456

In [61]:
del df_pool

In [62]:
gc.collect()

1581

In [4]:
import gc
gc.enable()
import time

start = time.time()
test = pd.read_csv(data_dir + "/test_set.csv", nrows=50000000, skiprows=99999851)
test.columns = pd.Index(["object_id", "mjd", "passband", "flux", "flux_err", "detected"])
last_id = test.object_id.unique()[-1]
len_last = test.query("object_id == @last_id").shape[0]

In [5]:
test.head()

Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected
0,1171834,60496.395,2,17.79748,1.947458,1
1,1171834,60499.3847,0,11.280182,8.344309,0
2,1171834,60501.4,1,13.954027,2.067783,1
3,1171834,60502.3737,2,13.885885,2.614949,1
4,1171834,60515.4373,5,59.090069,27.069122,0


In [6]:
test_with_cluster = pd.read_csv("test_with_cluster_0_99999850.csv")
len_last = test_with_cluster.query("object_id == 1171834").shape[0]

In [7]:
len_last

98

In [8]:
test_with_cluster.head()

Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected,cluster
0,13,59798.3205,2,-1.299735,1.357315,0,0
1,13,59798.3281,1,-2.095392,1.148654,0,0
2,13,59798.3357,3,-0.923794,1.763655,0,0
3,13,59798.3466,4,-4.009815,2.602911,0,0
4,13,59798.3576,5,-3.403503,5.367328,0,0


In [9]:
test_with_cluster.tail()

Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected,cluster
99999847,1171834,60356.0376,5,20.973017,26.246626,0,0
99999848,1171834,60362.0649,5,-26.85944,38.530792,0,0
99999849,1171834,60370.0168,4,7.192351,16.337788,0,0
99999850,1171834,60486.4107,5,8.694453,26.434843,0,0
99999851,1171834,60496.395,2,17.79748,1.947458,1,0


In [11]:
test_with_cluster.loc[:test_with_cluster.shape[0]-len_last-1, :].tail()

Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected,cluster
99999749,1171831,60559.9902,4,-1.706096,13.895929,0,1
99999750,1171831,60561.0608,2,-4.865917,2.39546,0,1
99999751,1171831,60588.0855,1,1.811535,1.670206,0,1
99999752,1171831,60610.0822,1,0.494893,1.794319,0,1
99999753,1171831,60612.0371,0,2.683231,6.093516,0,1


In [13]:
test_with_cluster.loc[test_with_cluster.shape[0]-len_last:, :].head()

Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected,cluster
99999754,1171834,59813.421,5,9.938117,17.855148,0,1
99999755,1171834,59817.3233,1,-3.006077,1.61261,0,2
99999756,1171834,59818.3524,0,-1.674294,4.821917,0,2
99999757,1171834,59818.4088,3,-4.209298,4.842067,0,2
99999758,1171834,59818.4228,4,-6.420411,13.399401,0,2


In [19]:
test = pd.concat([test_with_cluster.loc[test_with_cluster.shape[0]-len_last:, :].drop("cluster", axis=1), test])

In [24]:
test.reset_index(inplace=True)

In [25]:
test_with_cluster = test_with_cluster.loc[:test_with_cluster.shape[0]-len_last-1, :]

In [26]:
test_with_cluster.to_csv("test_with_cluster_0_99999850.csv", index=False)

In [27]:
del test_with_cluster

In [28]:
!mv test_with_cluster_0_99999850.csv test_with_cluster_0_99999753.csv
!zip test_with_cluster_0_99999753.csv.zip test_with_cluster_0_99999753.csv
!mv test_with_cluster_0_99999753.csv.zip /home/hidehisa/.kaggle/competitions/plasticc/

  adding: test_with_cluster_0_99999753.csv (deflated 66%)


In [29]:
test.head()

Unnamed: 0,index,object_id,mjd,passband,flux,flux_err,detected
0,99999754,1171834,59813.421,5,9.938117,17.855148,0
1,99999755,1171834,59817.3233,1,-3.006077,1.61261,0
2,99999756,1171834,59818.3524,0,-1.674294,4.821917,0
3,99999757,1171834,59818.4088,3,-4.209298,4.842067,0
4,99999758,1171834,59818.4228,4,-6.420411,13.399401,0


In [30]:
test.shape

(50000098, 7)

## 仕切り直し

In [5]:
import gc
gc.enable()
import time

start = time.time()
test = pd.read_csv(data_dir + "/test_set.csv", nrows=80000000, skiprows=99999754)
test.columns = pd.Index(["object_id", "mjd", "passband", "flux", "flux_err", "detected"])
last_id = test.object_id.unique()[-1]
len_last = test.query("object_id == @last_id").shape[0]

In [6]:
test.head()

Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected
0,1171834,59813.421,5,9.938117,17.855148,0
1,1171834,59817.3233,1,-3.006077,1.61261,0
2,1171834,59818.3524,0,-1.674294,4.821917,0
3,1171834,59818.4088,3,-4.209298,4.842067,0
4,1171834,59818.4228,4,-6.420411,13.399401,0


In [9]:
test.loc[:80000000-len_last-1, :]

Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected
0,1171834,59813.4210,5,9.938117,17.855148,0
1,1171834,59817.3233,1,-3.006077,1.612610,0
2,1171834,59818.3524,0,-1.674294,4.821917,0
3,1171834,59818.4088,3,-4.209298,4.842067,0
4,1171834,59818.4228,4,-6.420411,13.399401,0
5,1171834,59819.2758,0,-5.168740,12.078398,0
6,1171834,59820.2587,0,-2.698601,8.029207,0
7,1171834,59823.3007,1,0.828275,1.603460,0
8,1171834,59824.2449,3,-4.604712,4.752285,0
9,1171834,59827.2787,4,-8.635461,6.301585,0


In [8]:
test.tail(15)

Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected
79999985,24628047,60670.3297,1,39.272068,32.073841,0
79999986,24628047,60671.1954,0,-160.361694,166.74733,0
79999987,24628047,60673.1856,1,45.761223,35.498524,0
79999988,24628047,60674.1867,1,22.662331,25.106251,0
79999989,24628088,59583.0974,0,4.57044,13.780951,0
79999990,24628088,59588.1153,5,27.427721,31.85298,0
79999991,24628088,59590.0493,5,-24.217974,26.709337,0
79999992,24628088,59591.1338,4,4.503709,11.824526,0
79999993,24628088,59594.053,5,-9.076132,32.737686,0
79999994,24628088,59598.0455,4,0.958214,17.955286,0


In [10]:
test = test.loc[:80000000-len_last-1, :]


In [11]:
test_with_cluster = add_cluster(test)

apply progress: 100%|██████████| 625163/625163 [6:13:51<00:00, 27.87it/s]


In [13]:
test_with_cluster.tail()

Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected,cluster
79999984,24628047,60665.3233,3,-25.415995,16.718096,0,3
79999985,24628047,60670.3297,1,39.272068,32.073841,0,3
79999986,24628047,60671.1954,0,-160.361694,166.74733,0,3
79999987,24628047,60673.1856,1,45.761223,35.498524,0,3
79999988,24628047,60674.1867,1,22.662331,25.106251,0,3


In [14]:
99999753 + 79999988

179999741

In [15]:
test_with_cluster.to_csv("test_with_cluster_0_99999753.csv", index=False, mode="a", header=False)

In [16]:
!mv test_with_cluster_0_99999753.csv test_with_cluster_0_179999741.csv
!zip test_with_cluster_0_179999741.csv.zip test_with_cluster_0_179999741.csv
!mv test_with_cluster_0_179999741.csv.zip /home/hidehisa/.kaggle/competitions/plasticc

  adding: test_with_cluster_0_179999741.csv (deflated 66%)


In [17]:
test_with_cluster.to_csv("test_with_cluster_99999754_179999741.csv", index=False)

In [18]:
!zip test_with_cluster_99999754_179999741.csv.zip test_with_cluster_99999754_179999741.csv
!mv test_with_cluster_99999754_179999741.csv.zip /home/hidehisa/.kaggle/competitions/plasticc/
!rm -f test_with_cluster_99999754_179999741.csv

  adding: test_with_cluster_99999754_179999741.csv (deflated 65%)


In [19]:
del test_with_cluster
del test

## 179999742~

In [7]:
import gc
gc.enable()
import time

start = time.time()
test = pd.read_csv(data_dir + "/test_set.csv", nrows=50000000, skiprows=179999743)
test.columns = pd.Index(["object_id", "mjd", "passband", "flux", "flux_err", "detected"])
last_id = test.object_id.unique()[-1]
len_last = test.query("object_id == @last_id").shape[0]

In [8]:
test.head()

Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected
0,24628088,59583.0974,0,4.57044,13.780951,0
1,24628088,59588.1153,5,27.427721,31.85298,0
2,24628088,59590.0493,5,-24.217974,26.709337,0
3,24628088,59591.1338,4,4.503709,11.824526,0
4,24628088,59594.053,5,-9.076132,32.737686,0


In [9]:
test.loc[:50000000-len_last-1, :].tail()

Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected
49999944,39281452,60594.0505,2,2.127341,3.392093,0
49999945,39281452,60597.0811,5,24.883595,26.544952,0
49999946,39281452,60602.0696,3,-3.78965,3.678388,0
49999947,39281452,60625.0313,4,-0.130972,6.443676,0
49999948,39281452,60670.0377,5,-16.273022,32.130787,0


In [10]:
last_id

39281488

In [11]:
test = test.loc[:50000000-len_last-1, :]


In [12]:
test_with_cluster = add_cluster(test)

apply progress: 100%|██████████| 390500/390500 [3:53:14<00:00, 27.90it/s]


In [13]:
test_with_cluster.head()

Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected,cluster
0,24628088,59583.0974,0,4.57044,13.780951,0,0
1,24628088,59588.1153,5,27.427721,31.85298,0,0
2,24628088,59590.0493,5,-24.217974,26.709337,0,0
3,24628088,59591.1338,4,4.503709,11.824526,0,0
4,24628088,59594.053,5,-9.076132,32.737686,0,0


In [14]:
test_with_cluster.to_csv("test_with_cluster_0_179999742.csv", index=False, mode="a", header=False)

In [16]:
test_with_cluster.shape

(49999949, 7)

In [17]:
179999742 + 49999949

229999691

In [18]:
!mv test_with_cluster_0_179999742.csv test_with_cluster_0_229999691.csv
!zip test_with_cluster_0_229999691.csv.zip test_with_cluster_0_229999691.csv
!mv test_with_cluster_0_229999691.csv.zip /home/hidehisa/.kaggle/competitions/plasticc/

  adding: test_with_cluster_0_229999691.csv (deflated 66%)


In [19]:
test_with_cluster.to_csv("test_with_cluster_179999743_229999691.csv", index=False)

In [20]:
!zip test_with_cluster_179999743_229999691.csv.zip test_with_cluster_179999743_229999691.csv
!mv test_with_cluster_179999743_229999691.csv.zip /home/hidehisa/.kaggle/competitions/plasticc/
!rm -f test_with_cluster_179999743_229999691.csv

  adding: test_with_cluster_179999743_229999691.csv (deflated 66%)


## 229999692~

In [5]:
import gc
gc.enable()
import time

start = time.time()
test = pd.read_csv(data_dir + "/test_set.csv", nrows=80000000, skiprows=229999692)
test.columns = pd.Index(["object_id", "mjd", "passband", "flux", "flux_err", "detected"])
last_id = test.object_id.unique()[-1]
len_last = test.query("object_id == @last_id").shape[0]

In [6]:
test.tail()

Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected
79999995,62741233,60329.0574,5,100.459549,45.646461,0
79999996,62741233,60338.0441,5,23.939013,37.877529,0
79999997,62741233,60458.4376,5,-19.269091,28.597336,0
79999998,62741233,60460.4297,5,-63.811634,33.788696,0
79999999,62741233,60487.3599,4,-9.367855,12.313096,0


In [7]:
test.head()

Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected
0,39281488,59621.2363,5,-24.042404,23.049286,0
1,39281488,59652.1981,5,33.427605,29.960974,0
2,39281488,59653.1695,5,-18.306482,25.311556,0
3,39281488,59655.2768,5,24.107056,30.240215,0
4,39281488,59660.2447,5,-4.923117,23.614109,0


In [8]:
len_last

86

In [9]:
test = test.loc[:80000000-len_last-1, :]

In [10]:
test_with_cluster = add_cluster(test)

apply progress: 100%|██████████| 625185/625185 [6:09:39<00:00, 28.19it/s]


In [11]:
test_with_cluster.to_csv("test_with_cluster_0_229999691.csv", index=False, mode="a", header=False)

In [12]:
test_with_cluster.shape

(79999914, 7)

In [13]:
229999691 + 79999914

309999605

In [14]:
!mv test_with_cluster_0_229999691.csv test_with_cluster_0_309999605.csv
!zip test_with_cluster_0_309999605.csv.zip test_with_cluster_0_309999605.csv
!mv test_with_cluster_0_309999605.csv.zip /home/hidehisa/.kaggle/competitions/plasticc/

  adding: test_with_cluster_0_309999605.csv (deflated 66%)


In [15]:
test_with_cluster.to_csv("test_with_cluster_229999691_309999605.csv", index=False)

In [16]:
!zip test_with_cluster_229999691_309999605.csv.zip test_with_cluster_229999691_309999605.csv
!mv test_with_cluster_229999691_309999605.csv.zip /home/hidehisa/.kaggle/competitions/plasticc/
!rm -f test_with_cluster_229999691_309999605.csv

  adding: test_with_cluster_229999691_309999605.csv (deflated 66%)
