In [1]:
import itertools
import os
import sys

import numpy as np
import pandas as pd
import lightgbm as lgb
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import StratifiedKFold

from tqdm import tqdm
from tsfresh.feature_extraction import extract_features
tqdm.pandas(desc="apply progress")

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
# Directory holding the competition CSV files. Set the PLASTICC_DATA_DIR
# environment variable to point at a different copy of the data; when it is
# unset the original machine-specific location is used unchanged.
# Joining with "" guarantees a trailing path separator, so plain
# `data_dir + "<file name>"` concatenation keeps working.
data_dir = os.path.join(
    os.environ.get(
        "PLASTICC_DATA_DIR", "/Users/hidehisa/.kaggle/competitions/plasticc/"
    ),
    "",
)
train = pd.read_csv(data_dir + "training_set.csv")

In [4]:
from multiprocessing import Pool

In [5]:
def elbow(d, random_state=None):
    """Cluster one object's observation times with KMeans, using k = 3 or k = 4.

    KMeans is fitted on the ``mjd`` column for k = 2, 3, 4 and 5, and the
    drop in inertia between consecutive k values is computed. If the
    (k=2 -> k=3) drop relative to the (k=3 -> k=4) drop is larger than the
    (k=3 -> k=4) drop relative to the (k=4 -> k=5) drop, the 3-cluster model
    labels the data; otherwise the 4-cluster model does.

    Parameters
    ----------
    d : pandas.DataFrame
        Rows to cluster; only the ``mjd`` column is read.
    random_state : int, RandomState instance or None, default None
        Forwarded to every ``KMeans`` so the labelling can be made
        reproducible. ``None`` keeps the original unseeded behaviour.

    Returns
    -------
    numpy.ndarray
        One cluster label per row of ``d``, in row order.
    """

    def _ratio(numerator, denominator):
        # An inertia drop of exactly zero would otherwise raise
        # ZeroDivisionError; treat x/0 as +/-inf and 0/0 as "no elbow" (0).
        if denominator != 0:
            return numerator / denominator
        if numerator == 0:
            return 0.0
        return float("inf") if numerator > 0 else float("-inf")

    # KMeans expects a 2-D (n_samples, n_features) array.
    data = d.mjd.values.reshape([-1, 1])
    kms = [
        KMeans(n_clusters=i, random_state=random_state).fit(data)
        for i in range(2, 6)
    ]
    inertias = [km.inertia_ for km in kms]
    diff1 = inertias[0] - inertias[1]
    diff2 = inertias[1] - inertias[2]
    diff3 = inertias[2] - inertias[3]
    # kms[1] is the 3-cluster model, kms[2] the 4-cluster model.
    if _ratio(diff1, diff2) > _ratio(diff2, diff3):
        return kms[1].predict(data)
    return kms[2].predict(data)
    
    
def add_cluster(df):
    """Return a copy of ``df`` with an integer ``cluster`` column added.

    Rows are grouped by ``object_id`` and each group is labelled
    independently by ``elbow``. Labels are written back by row position, so
    the result is correct whatever the index of ``df`` is and whether or not
    its rows are ordered by ``object_id`` (the previous positional
    ``pd.concat`` silently misaligned labels in those cases).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``object_id`` and the ``mjd`` column read by ``elbow``.

    Returns
    -------
    pandas.DataFrame
        New frame with the columns of ``df`` plus ``cluster``. Rows that
        belong to no group (missing ``object_id``) keep the sentinel ``-1``.
    """
    cluster = np.full(len(df), -1, dtype=int)
    # Maps each object_id to the integer row positions of its observations.
    groups = df.groupby("object_id").indices
    for positions in tqdm(groups.values(), total=len(groups), desc="apply progress"):
        cluster[positions] = elbow(df.iloc[positions])
    return df.assign(cluster=cluster)


def get_inertia(km, data):
    """Fit ``km`` on ``data`` (mutating ``km``) and return its ``inertia_``.

    Parameters
    ----------
    km : estimator exposing ``fit`` and, once fitted, ``inertia_``
    data : the samples handed to ``km.fit``

    Returns
    -------
    The ``inertia_`` attribute of ``km`` after fitting.
    """
    km.fit(data)
    inertia = km.inertia_
    return inertia


def elbow_multi(d, kms):
    """Label one object's observation times using caller-supplied estimators.

    Each estimator in ``kms`` is refitted in place on the ``mjd`` column and
    its inertia recorded. The drops in inertia between consecutive
    estimators decide whether ``kms[1]`` or ``kms[2]`` labels the data:
    ``kms[1]`` when the first drop relative to the second is larger than the
    second relative to the third, ``kms[2]`` otherwise.

    Parameters
    ----------
    d : pandas.DataFrame
        Rows to cluster; only the ``mjd`` column is read.
    kms : sequence of at least four estimators
        Each must provide ``fit``, ``inertia_`` and ``predict``. They are
        mutated by this call (refitted on ``d``).

    Returns
    -------
    The result of ``predict`` from the selected estimator: one label per row
    of ``d``, in row order.
    """

    def _ratio(numerator, denominator):
        # An inertia drop of exactly zero would otherwise raise
        # ZeroDivisionError; treat x/0 as +/-inf and 0/0 as "no elbow" (0).
        if denominator != 0:
            return numerator / denominator
        if numerator == 0:
            return 0.0
        return float("inf") if numerator > 0 else float("-inf")

    # The estimators are fitted on a 2-D (n_samples, 1) array.
    data = d.mjd.values.reshape([-1, 1])
    inertias = []
    for km in kms:
        km.fit(data)  # refit the shared estimator on this object's data
        inertias.append(km.inertia_)
    diff1 = inertias[0] - inertias[1]
    diff2 = inertias[1] - inertias[2]
    diff3 = inertias[2] - inertias[3]
    if _ratio(diff1, diff2) > _ratio(diff2, diff3):
        return kms[1].predict(data)
    return kms[2].predict(data)
    
    
def add_cluster_multi(df):
    """Return a copy of ``df`` with an integer ``cluster`` column added.

    Works like a per-object KMeans elbow labelling, but builds the four
    ``KMeans`` estimators (k = 2..5) once and lets ``elbow_multi`` refit them
    for every ``object_id`` group. Labels are written back by row position,
    so the result is correct whatever the index of ``df`` is and whether or
    not its rows are ordered by ``object_id`` (the previous positional
    ``pd.concat`` silently misaligned labels in those cases).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``object_id`` and the ``mjd`` column read by
        ``elbow_multi``.

    Returns
    -------
    pandas.DataFrame
        New frame with the columns of ``df`` plus ``cluster``. Rows that
        belong to no group (missing ``object_id``) keep the sentinel ``-1``.
    """
    kms = [KMeans(n_clusters=i) for i in range(2, 6)]
    cluster = np.full(len(df), -1, dtype=int)
    # Maps each object_id to the integer row positions of its observations.
    groups = df.groupby("object_id").indices
    for positions in tqdm(groups.values(), total=len(groups), desc="apply progress"):
        cluster[positions] = elbow_multi(df.iloc[positions], kms)
    return df.assign(cluster=cluster)

In [6]:
# Label every observation in `train` with a per-object time cluster via the
# KMeans elbow heuristic in `add_cluster`. The recorded progress bar for this
# cell shows the run taking roughly 8 minutes.
train_with_cluster = add_cluster(train)

apply progress: 100%|██████████| 7848/7848 [08:12<00:00, 15.27it/s]


In [16]:
# NOTE(review): no cell visible in this part of the notebook creates `pool`
# (only the `Pool` class is imported), so this presumably depends on a cell
# that was removed or run out of order — confirm, or create the pool first.
pool.close()

In [6]:
%%time
# Same labelling, but through `add_cluster_multi`, which builds its KMeans
# estimators once and reuses them for every object. This overwrites the
# `train_with_cluster` produced by the `add_cluster` run.
train_with_cluster = add_cluster_multi(train)

apply progress: 100%|██████████| 7848/7848 [08:25<00:00, 15.07it/s]


CPU times: user 31min 13s, sys: 1min 4s, total: 32min 18s
Wall time: 8min 29s


In [7]:
from sklearn.cluster import MeanShift

In [8]:
# Preview the first rows of the training light curves to check the columns.
train.head()

Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected
0,615,59750.4229,2,-544.810303,3.622952,1
1,615,59750.4306,1,-816.434326,5.55337,1
2,615,59750.4383,3,-471.385529,3.801213,1
3,615,59750.445,4,-388.984985,11.395031,1
4,615,59752.407,2,-681.858887,4.041204,1


In [9]:
# Size of the sample object (object_id 615) used for the clustering
# comparisons in the following cells.
train.query("object_id == 615").shape

(352, 6)

In [12]:
# MeanShift with its default settings, tried as an alternative to the KMeans
# elbow heuristic.
ms = MeanShift()

In [16]:
%%time
# Fit MeanShift on object 615's observation times (reshaped into a single
# feature column) and display the label assigned to each observation.
ms.fit(train.query("object_id == 615").mjd.values.reshape([-1, 1]))
ms.labels_

CPU times: user 893 ms, sys: 11.4 ms, total: 904 ms
Wall time: 332 ms


In [15]:
%%time
# Time the KMeans elbow heuristic on the same object for comparison with the
# MeanShift timing.
elbow(train.query("object_id == 615"))

CPU times: user 329 ms, sys: 8.49 ms, total: 337 ms
Wall time: 84.3 ms


array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [17]:
from sklearn.cluster import AffinityPropagation

In [18]:
# AffinityPropagation capped at 100 iterations, tried as another alternative
# to the KMeans elbow heuristic.
ap = AffinityPropagation(max_iter=100)

In [19]:
%%time
# Cluster object 615's observation times with AffinityPropagation.
# NOTE(review): the recorded output lists labels 0, 1, 2, ... in sequence,
# which suggests far more clusters than the KMeans elbow produced — confirm
# before relying on this approach.
ap.fit_predict(train.query("object_id == 615").mjd.values.reshape([-1, 1]))

CPU times: user 671 ms, sys: 14.5 ms, total: 686 ms
Wall time: 177 ms


array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
       143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
       156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
       169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 18