## Libraries

In [2]:
!pip install tsfresh

Collecting tsfresh
[?25l  Downloading https://files.pythonhosted.org/packages/2f/32/265c651f4fd70751f5ada348af0f9e322b058eddcda6a6f9bb305c8d270a/tsfresh-0.11.1-py2.py3-none-any.whl (1.2MB)
[K    100% |████████████████████████████████| 1.2MB 646kB/s 
Installing collected packages: tsfresh
Successfully installed tsfresh-0.11.1


In [3]:
!pip install ipdb



In [4]:
!pip install lightgbm



In [5]:
!pip install --upgrade pandas

Requirement already up-to-date: pandas in /home/hidehisa/anaconda3/lib/python3.6/site-packages (0.23.4)


In [6]:
import sys
import itertools
import numpy as np
import pandas as pd
import lightgbm as lgb
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import StratifiedKFold

from tqdm import tqdm
from tsfresh.feature_extraction import extract_features
# from google.colab import drive

  from pandas.core import datetools


In [6]:
# drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
# !cp /content/drive/My\ Drive/plasticc/* ./

^C


In [None]:
# !unzip -q sample_submission.csv.zip
# !unzip -q test_set.csv.zip
# !unzip -q test_set_metadata.csv.zip
# !unzip -q training_set.csv.zip

## Open training set

In [42]:
data_dir = "/home/hidehisa/.kaggle/competitions/plasticc"
train = pd.read_csv(data_dir + "/training_set.csv")
meta = pd.read_csv(data_dir + "/training_set_metadata.csv")

## Add Cluster to training set

In [8]:
from multiprocessing import Pool

In [41]:
def elbow(d):
    data = d.mjd.values.reshape([-1, 1])
    kms = [KMeans(n_clusters=i).fit(data) for i in range(2, 6)]
    inertias = [km.inertia_ for km in kms]
    diff1 = inertias[0] - inertias[1]
    diff2 = inertias[1] - inertias[2]
    diff3 = inertias[2] - inertias[3]
    if diff1 / diff2 > diff2 / diff3:
        return kms[1].predict(data)
    else:
        return kms[2].predict(data)

def add_cluster(df):
    new_df = (df.groupby("object_id").apply(lambda x: elbow(x))
                .to_frame("cluster")
                .apply(lambda x: x.apply(pd.Series).stack())
                .reset_index()
                .drop("level_1", axis=1)
             )
    new_df = new_df.astype({"cluster": int})
    df = pd.concat([df, new_df.drop("object_id", axis=1)], axis=1)
    return df


def add_cluster_multi(d):
    n_record = d.shape[0]
    default_chunk = n_record // 8
    head = 0
    df_pool = []
    for _ in range(7):
        new_df = d.loc[head:head+default_chunk, :]
        last_id = new_df.object_id.unique()[-1]
        len_last = new_df.query("object_id == @last_id").shape[0]
        new_df = new_df.loc[head:head+default_chunk-len_last, :]
        df_pool.append(new_df)
        head = head + default_chunk - len_last+1
    df_pool.append(d.loc[head:, :])
    pool = Pool(8)
    dfs = pool.map(add_cluster, df_pool)
    pool.close()
    return pd.concat(dfs)

In [44]:
%%time
train = add_cluster(train)

CPU times: user 4min 28s, sys: 0 ns, total: 4min 28s
Wall time: 4min 28s


In [23]:
train.query("object_id == 615").cluster.nunique()

3

In [None]:
# train.to_csv("train_with_cluster.csv", index=False)

In [33]:
# !zip train_with_cluster.csv.zip train_with_cluster.csv

  adding: train_with_cluster.csv (deflated 67%)


In [None]:
# !cp train_with_cluster.csv.zip /content/drive/My\ Drive/plasticc/

## Train Features

In [24]:
def basic(d):
    df = d.copy()
    df["flux_ratio_sq"] = np.power(df["flux"] / df["flux_err"], 2)
    df["flux_by_flux_ratio_sq"] = df["flux"] * df["flux_ratio_sq"]

    aggs = {
        'mjd': ['min', 'max', 'size'],
        'flux': ['min', 'max', 'mean', 'median', 'std', 'skew'],
        'flux_err': ['min', 'max', 'mean', 'median', 'std', 'skew'],
        'detected': ['mean'],
        'flux_ratio_sq': ['sum', 'skew'],
        'flux_by_flux_ratio_sq': ['sum', 'skew'],
    }
    agg_df = df.groupby('object_id').agg(aggs)
    new_columns = [k + '_' + agg for k in aggs.keys() for agg in aggs[k]]
    agg_df.columns = new_columns
    agg_df['mjd_diff'] = agg_df['mjd_max'] - agg_df['mjd_min']
    agg_df['flux_diff'] = agg_df['flux_max'] - agg_df['flux_min']
    agg_df['flux_dif2'] = (
        agg_df['flux_max'] - agg_df['flux_min']) / agg_df['flux_mean']
    agg_df['flux_w_mean'] = agg_df['flux_by_flux_ratio_sq_sum'] / agg_df[
        'flux_ratio_sq_sum']
    agg_df['flux_dif3'] = (
        agg_df['flux_max'] - agg_df['flux_min']) / agg_df['flux_w_mean']

    del agg_df['mjd_max'], agg_df['mjd_min']

    fcp = {
        'fft_coefficient': [{
            'coeff': 0,
            'attr': 'abs'
        }, {
            'coeff': 1,
            'attr': 'abs'
        }],
        'kurtosis':
        None,
        'skewness':
        None
    }
    agg_df_ts = extract_features(
        df,
        column_id='object_id',
        column_sort='mjd',
        column_kind='passband',
        column_value='flux',
        default_fc_parameters=fcp,
        n_jobs=8)
    df_det = df[df['detected'] == 1].copy()

    agg_df_mjd = extract_features(
        df_det,
        column_id='object_id',
        column_value='mjd',
        default_fc_parameters={
            'maximum': None,
            'minimum': None
        },
        n_jobs=8)
    agg_df_mjd['mjd_diff_det'] = agg_df_mjd['mjd__maximum'] - agg_df_mjd[
        'mjd__minimum']
    del agg_df_mjd['mjd__maximum'], agg_df_mjd['mjd__minimum']
    agg_df_ts = pd.merge(agg_df_ts, agg_df_mjd, on='id')
    # tsfresh returns a dataframe with an index name='id'
    agg_df_ts.index.rename('object_id', inplace=True)
    agg_df = pd.merge(agg_df, agg_df_ts, on='object_id')
    return agg_df


def with_cluster(d):
    df = d.copy()
    df["flux_ratio_sq"] = np.power(df["flux"] / df["flux_err"], 2)
    df["flux_by_flux_ratio_sq"] = df["flux"] * df["flux_ratio_sq"]
    aggs = {
        'mjd': ['min', 'max', 'size'],
        'flux': ['min', 'max', 'mean', 'median', 'std', 'skew'],
        'flux_err': ['min', 'max', 'mean', 'median', 'std', 'skew'],
        'detected': ['mean'],
        'flux_ratio_sq': ['sum', 'skew'],
        'flux_by_flux_ratio_sq': ['sum', 'skew'],
    }
    agg_df = df.groupby(['object_id', "cluster"]).agg(aggs)
    new_columns = [k + '_' + agg for k in aggs.keys() for agg in aggs[k]]
    agg_df.columns = new_columns
    agg_df['mjd_diff'] = agg_df['mjd_max'] - agg_df['mjd_min']
    agg_df['flux_diff'] = agg_df['flux_max'] - agg_df['flux_min']
    agg_df['flux_dif2'] = (
        agg_df['flux_max'] - agg_df['flux_min']) / agg_df['flux_mean']
    agg_df['flux_w_mean'] = agg_df['flux_by_flux_ratio_sq_sum'] / agg_df[
        'flux_ratio_sq_sum']
    agg_df['flux_dif3'] = (
        agg_df['flux_max'] - agg_df['flux_min']) / agg_df['flux_w_mean']
    agg_df.reset_index(inplace=True)
    del agg_df['mjd_max'], agg_df['mjd_min']
    agg_df.drop("cluster", axis=1, inplace=True)
    agg_df = agg_df.groupby("object_id").agg(["min", "max", "std", "skew"])
    agg_df.columns = pd.Index([e[0] + "_" + e[1] for e in agg_df.columns])

    return agg_df


def cluster_mean_diff(df):
    new_df = df.groupby(["object_id", "cluster"]).agg({
        "flux": ["mean", "max", "min"]
    })
    new_df.columns = pd.Index(
        [e[0] + "_" + e[1] for e in new_df.columns.tolist()])
    new_df["normalized_mean"] = new_df["flux_mean"] / (
        new_df["flux_max"] - new_df["flux_min"])
    new_df.reset_index(inplace=True)
    return new_df.groupby("object_id").agg({"normalized_mean": "std"})


def passband_std_difference(df):
    std_df = df.groupby(["object_id", "cluster", "passband"]).agg({
        "flux": "std"
    }).reset_index().groupby(["object_id",
                              "passband"])["flux"].mean().reset_index()
    std_df_max = std_df.groupby("object_id")["flux"].max()
    std_df_min = std_df.groupby("object_id")["flux"].min()
    return (std_df_max / std_df_min).reset_index()


def linear_slope(df):
    new_df = df.groupby(["object_id", "cluster", "passband"]).agg({
        "flux": ["max", "min"]
    })
    new_df.columns = pd.Index([e[0] + "_" + e[1] for e in new_df.columns])
    new_df.reset_index(inplace=True)
    new_df["flux_range"] = new_df["flux_max"] - new_df["flux_min"]
    new_df = pd.merge(
        df, new_df, how="left", on=["object_id", "cluster", "passband"])
    new_df["flux_normalized"] = new_df["flux"] / new_df["flux_range"]
    lr = LinearRegression()
    template = pd.DataFrame({
        "object_id": new_df.object_id.unique(),
        "passband0": 0,
        "passband1": 0,
        "passband2": 0,
        "passband3": 0,
        "passband4": 0,
        "passband5": 0
    })
    for objid in new_df.object_id.unique():
        obj_df = new_df.query("object_id == @objid")[[
            "mjd", "cluster", "passband", "flux_normalized"
        ]]
        passbands = [[] for _ in range(6)]
        for cl in obj_df.cluster.unique():
            cluster_df = obj_df.query("cluster == @cl")
            for ps in cluster_df.passband.unique():
                ps_df = cluster_df.query("passband == @ps")
                if ps_df.shape[0] <= 1:
                    passbands[ps].append(0)
                    continue
                lr.fit(ps_df["mjd"].values.reshape([-1, 1]),
                       ps_df["flux_normalized"].values.reshape([-1, 1]))
                passbands[ps].append(np.abs(lr.coef_)[0][0])
        passbands = [np.mean(p) for p in passbands]
        for i, ps in enumerate(passbands):
            template.loc[template.query("object_id == @objid").
                         index, f"passband{i}"] = ps
    return template


def linear_slope_multi(d):
    n_record = d.shape[0]
    default_chunk = n_record // 8
    head = 0
    df_pool = []
    for _ in range(7):
        new_df = d.loc[head:head+default_chunk, :]
        last_id = new_df.object_id.unique()[-1]
        len_last = new_df.query("object_id == @last_id").shape[0]
        new_df = new_df.loc[head:head+default_chunk-len_last, :]
        df_pool.append(new_df)
        head = head + default_chunk - len_last+1
    df_pool.append(d.loc[head:, :])
    pool = Pool(8)
    dfs = pool.map(linear_slope, df_pool)
    pool.close()
    return pd.concat(dfs)


def num_outliers(df):
    new_df = df.groupby("object_id").agg({"flux": ["mean", "std"]})
    new_df.columns = pd.Index([e[0] + "_" + e[1] for e in new_df.columns])
    new_df["upper_sigma"] = new_df["flux_mean"] + new_df["flux_std"]
    new_df["upper_2sigma"] = new_df["flux_mean"] + 2 * new_df["flux_std"]
    new_df["lower_sigma"] = new_df["flux_mean"] - new_df["flux_std"]
    new_df["lower_2sigma"] = new_df["flux_mean"] - 2 * new_df["flux_std"]
    new_df.drop(["flux_mean", "flux_std"], axis=1, inplace=True)
    new_df = pd.merge(df, new_df, how="left", on="object_id")
    new_df["outside_sigma"] = (
        (new_df["flux"] > new_df["upper_sigma"]) |
        (new_df["flux"] < new_df["lower_sigma"])).astype(int)
    new_df["outside_2sigma"] = (
        (new_df["flux"] > new_df["upper_2sigma"]) |
        (new_df["flux"] < new_df["lower_2sigma"])).astype(int)

    return_df = new_df.groupby("object_id").agg({
        "outside_sigma": "sum",
        "outside_2sigma": "sum"
    })
    return_df.reset_index(inplace=True)
    return return_df

In [25]:
def get_full(df, meta):
    agg_basic = basic(df)
    agg_cluster = with_cluster(df)

    cl_mean_diff = cluster_mean_diff(df)
    ps_std_diff = passband_std_difference(df)
    lin_sl = linear_slope(df)
    num_out = num_outliers(df)

    full = pd.merge(agg_basic, agg_cluster, how="left", on="object_id")
    full = pd.merge(full, cl_mean_diff, how="left", on="object_id")
    full = pd.merge(full, ps_std_diff, how="left", on="object_id")
    full = pd.merge(full, lin_sl, how="left", on="object_id")
    full = pd.merge(full, num_out, how="left", on="object_id")

    full = pd.merge(full, meta, how="left", on="object_id")
    if "target" in full.columns:
        full.drop("target", axis=1, inplace=True)
    return full


def train_data(df, meta):
    full = get_full(df, meta)
    y = meta.target
    classes = sorted(y.unique())
    class_weight = {c: 1 for c in classes}

    for c in [64, 15]:
        class_weight[c] = 2
    oof_df = full[["object_id"]]
    del full['object_id'], full['distmod'], full['hostgal_specz']
    del full['ra'], full['decl'], full['gal_l'], full['gal_b'], full['ddf']
    return full, y, classes, class_weight, oof_df

In [26]:
%%time
full, y, classes, class_weight, oof_df = train_data(train, meta)


Feature Extraction: 100%|██████████| 40/40 [00:04<00:00,  8.40it/s]
Feature Extraction: 100%|██████████| 40/40 [00:00<00:00, 92.25it/s]


CPU times: user 7min 3s, sys: 4.57 s, total: 7min 7s
Wall time: 6min 39s


In [27]:
train_mean = full.mean(axis=0)
full.fillna(0, inplace=True)

In [None]:
# full.to_csv("full_train.csv", index= False)

In [37]:
# !zip full_train.csv.zip full_train.csv
# !cp full_train.csv.zip /content/drive/My\ Drive/plasticc/

  adding: full_train.csv (deflated 55%)


## Loss Functions

In [28]:
def multi_weighted_logloss(y_true, y_preds):
    """
    @author olivier https://www.kaggle.com/ogrellier
    multi logloss for PLAsTiCC challenge
    """
    # class_weights taken from Giba's topic : https://www.kaggle.com/titericz
    # https://www.kaggle.com/c/PLAsTiCC-2018/discussion/67194
    # with Kyle Boone's post https://www.kaggle.com/kyleboone
    classes = [6, 15, 16, 42, 52, 53, 62, 64, 65, 67, 88, 90, 92, 95]
    class_weight = {
        6: 1,
        15: 2,
        16: 1,
        42: 1,
        52: 1,
        53: 1,
        62: 1,
        64: 2,
        65: 1,
        67: 1,
        88: 1,
        90: 1,
        92: 1,
        95: 1
    }
    if len(np.unique(y_true)) > 14:
        classes.append(99)
        class_weight[99] = 2
    y_p = y_preds
    # Trasform y_true in dummies
    y_ohe = pd.get_dummies(y_true)
    # Normalize rows and limit y_preds to 1e-15, 1-1e-15
    y_p = np.clip(a=y_p, a_min=1e-15, a_max=1 - 1e-15)
    # Transform to log
    y_p_log = np.log(y_p)
    # Get the log for ones, .values is used to drop the index of DataFrames
    # Exclude class 99 for now, since there is no class99 in the training set
    # we gave a special process for that class
    y_log_ones = np.sum(y_ohe.values * y_p_log, axis=0)
    # Get the number of positives for each class
    nb_pos = y_ohe.sum(axis=0).values.astype(float)
    # Weight average and divide by the number of positives
    class_arr = np.array(
        [class_weight[k] for k in sorted(class_weight.keys())])
    y_w = y_log_ones * class_arr / nb_pos

    loss = -np.sum(y_w) / np.sum(class_arr)
    return loss


def lgb_multi_weighted_logloss(y_true, y_preds):
    """
    @author olivier https://www.kaggle.com/ogrellier
    multi logloss for PLAsTiCC challenge
    """
    # class_weights taken from Giba's topic : https://www.kaggle.com/titericz
    # https://www.kaggle.com/c/PLAsTiCC-2018/discussion/67194
    # with Kyle Boone's post https://www.kaggle.com/kyleboone
    classes = [6, 15, 16, 42, 52, 53, 62, 64, 65, 67, 88, 90, 92, 95]
    class_weight = {
        6: 1,
        15: 2,
        16: 1,
        42: 1,
        52: 1,
        53: 1,
        62: 1,
        64: 2,
        65: 1,
        67: 1,
        88: 1,
        90: 1,
        92: 1,
        95: 1
    }
    if len(np.unique(y_true)) > 14:
        classes.append(99)
        class_weight[99] = 2
    y_p = y_preds.reshape(y_true.shape[0], len(classes), order='F')

    # Trasform y_true in dummies
    y_ohe = pd.get_dummies(y_true)
    # Normalize rows and limit y_preds to 1e-15, 1-1e-15
    y_p = np.clip(a=y_p, a_min=1e-15, a_max=1 - 1e-15)
    # Transform to log
    y_p_log = np.log(y_p)
    # Get the log for ones, .values is used to drop the index of DataFrames
    # Exclude class 99 for now, since there is no class99 in the training set
    # we gave a special process for that class
    y_log_ones = np.sum(y_ohe.values * y_p_log, axis=0)
    # Get the number of positives for each class
    nb_pos = y_ohe.sum(axis=0).values.astype(float)
    # Weight average and divide by the number of positives
    class_arr = np.array(
        [class_weight[k] for k in sorted(class_weight.keys())])
    y_w = y_log_ones * class_arr / nb_pos

    loss = -np.sum(y_w) / np.sum(class_arr)
    return 'wloss', loss, False


## Plotting

In [33]:
def save_importances(importances_):
    mean_gain = importances_[['gain', 'feature']].groupby('feature').mean()
    importances_['mean_gain'] = importances_['feature'].map(mean_gain['gain'])
    plt.figure(figsize=(8, 12))
    sns.barplot(
        x='gain',
        y='feature',
        data=importances_.sort_values('mean_gain', ascending=False)[:250])
    plt.tight_layout()
    plt.savefig('importances.png')


def plot_confusion_matrix(cm,
                          classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(
            j,
            i,
            format(cm[i, j], fmt),
            horizontalalignment="center",
            color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()


def save_cm(y, oof_preds, path):
    unique_y = np.unique(y)
    class_map = dict()
    for i, val in enumerate(unique_y):
        class_map[val] = i

    y_map = np.zeros((y.shape[0], ))
    y_map = np.array([class_map[val] for val in y])

    # Compute confusion matrix
    cnf_matrix = confusion_matrix(y_map, np.argmax(oof_preds, axis=-1))
    np.set_printoptions(precision=2)

    sample_sub = pd.read_csv(path)
    class_names = list(sample_sub.columns[1:-1])
    del sample_sub

    # Plot non-normalized confusion matrix
    plt.figure(figsize=(12, 12))
    plot_confusion_matrix(
        cnf_matrix,
        classes=class_names,
        normalize=True,
        title='Confusion matrix')
    plt.savefig("confusion_matrix.png")

## Train

In [30]:
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
clfs = []
importances = pd.DataFrame()

lgb_params = {
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'num_class': 14,
    'metric': 'multi_logloss',
    'learning_rate': 0.03,
    'subsample': .9,
    'colsample_bytree': 0.5,
    'reg_alpha': .01,
    'reg_lambda': .01,
    'min_split_gain': 0.01,
    'min_child_weight': 10,
    'n_estimators': 1000,
    'silent': -1,
    'verbose': -1,
    'max_depth': 3
}

# Compute weights
w = y.value_counts()
weights = {i: np.sum(w) / w[i] for i in w.index}
oof_preds = np.zeros((len(full), np.unique(y).shape[0]))

for fold_, (trn_, val_) in enumerate(folds.split(y, y)):
    trn_x, trn_y = full.iloc[trn_], y.iloc[trn_]
    val_x, val_y = full.iloc[val_], y.iloc[val_]

    clf = lgb.LGBMClassifier(**lgb_params)
    clf.fit(
        trn_x,
        trn_y,
        eval_set=[(trn_x, trn_y), (val_x, val_y)],
        eval_metric=lgb_multi_weighted_logloss,
        verbose=100,
        early_stopping_rounds=50,
        sample_weight=trn_y.map(weights))
    oof_preds[val_, :] = clf.predict_proba(
        val_x, num_iteration=clf.best_iteration_)
    print(multi_weighted_logloss(val_y, oof_preds[val_, :]))

    imp_df = pd.DataFrame()
    imp_df['feature'] = full.columns
    imp_df['gain'] = clf.feature_importances_
    imp_df['fold'] = fold_ + 1
    importances = pd.concat([importances, imp_df], axis=0, sort=False)

    clfs.append(clf)

print('MULTI WEIGHTED LOG LOSS : %.5f ' % multi_weighted_logloss(
    y_true=y, y_preds=oof_preds))
save_importances(importances_=importances)

Training until validation scores don't improve for 50 rounds.
[100]	training's multi_logloss: 0.790666	training's wloss: 0.782197	valid_1's multi_logloss: 1.15701	valid_1's wloss: 0.963602
[200]	training's multi_logloss: 0.518249	training's wloss: 0.506779	valid_1's multi_logloss: 0.929172	valid_1's wloss: 0.780153
[300]	training's multi_logloss: 0.39523	training's wloss: 0.383376	valid_1's multi_logloss: 0.840354	valid_1's wloss: 0.73581
[400]	training's multi_logloss: 0.32126	training's wloss: 0.309444	valid_1's multi_logloss: 0.79401	valid_1's wloss: 0.727857
Early stopping, best iteration is:
[402]	training's multi_logloss: 0.320092	training's wloss: 0.308276	valid_1's multi_logloss: 0.793086	valid_1's wloss: 0.727502
0.7275017494599382
Training until validation scores don't improve for 50 rounds.
[100]	training's multi_logloss: 0.794842	training's wloss: 0.788567	valid_1's multi_logloss: 1.14184	valid_1's wloss: 0.966297
[200]	training's multi_logloss: 0.519131	training's wloss: 0

In [34]:
save_importances(importances_=importances)
save_cm(y, oof_preds, data_dir + "/sample_submission.csv")

Normalized confusion matrix


In [36]:
full.head()

Unnamed: 0,mjd_size,flux_min,flux_max,flux_mean,flux_median,flux_std,flux_skew,flux_err_min,flux_err_max,flux_err_mean,...,passband1,passband2,passband3,passband4,passband5,outside_sigma,outside_2sigma,hostgal_photoz,hostgal_photoz_err,mwebv
0,352,-1100.440063,660.626343,-123.096998,-89.477524,394.109851,-0.34954,2.13051,12.845472,4.482743,...,0.003049,0.003407,0.003532,0.003279,0.002969,115,17,0.0,0.0,0.017
1,350,-14.735178,14.770886,-1.423351,-0.873033,6.471144,0.014989,0.639458,9.115748,2.35962,...,0.002676,0.003259,0.002563,0.00422,0.001639,136,4,1.6267,0.2552,0.007
2,330,-19.159811,47.310059,2.267434,0.409172,8.022239,3.177854,0.695106,11.281384,2.471061,...,0.002277,0.004382,0.003062,0.003725,0.003291,31,18,0.2262,0.0157,0.021
3,351,-15.494463,220.795212,8.909206,1.035895,27.558208,4.979826,0.56717,55.892746,2.555576,...,0.002489,0.002071,0.002362,0.002687,0.003098,21,15,0.2813,1.1523,0.007
4,352,-16.543753,143.600189,7.145702,1.141288,20.051722,4.406298,0.695277,11.38369,2.753004,...,0.003188,0.002722,0.002255,0.002364,0.002554,27,12,0.2415,0.0176,0.024


## Test

In [39]:
def predict_chunk(df_, clfs_, meta_, features, train_mean):
    # Group by object id
    df_ = add_cluster_multi(df_)
    if i_c == 0:
        df_.to_csv('test_with_cluster.csv', header=True, mode='a', index=False)
    else:
        df_.to_csv('test_with_cluster.csv', header=False, mode='a', index=False)
    agg_ = get_full(df_, meta_)

    full_test = agg_.fillna(0)
    
    if i_c == 0:
        full_test.to_csv('full_test.csv', header=True, mode='a', index=False)
    else:
        full_test.to_csv('full_test.csv', header=False, mode='a', index=False)
    # Make predictions
    preds_ = None
    for clf in clfs_:
        if preds_ is None:
            preds_ = clf.predict_proba(full_test[features]) / len(clfs_)
        else:
            preds_ += clf.predict_proba(full_test[features]) / len(clfs_)

    # Compute preds_99 as the proba of class not being any of the others
    # preds_99 = 0.1 gives 1.769
    preds_99 = np.ones(preds_.shape[0])
    for i in range(preds_.shape[1]):
        preds_99 *= (1 - preds_[:, i])

    # Create DataFrame from predictions
    preds_df_ = pd.DataFrame(preds_, columns=['class_' + str(s) for s in clfs_[0].classes_])
    preds_df_['object_id'] = full_test['object_id']
    preds_df_['class_99'] = 0.14 * preds_99 / np.mean(preds_99) 
    return preds_df_

In [40]:
import gc
gc.enable()
meta_test = pd.read_csv(data_dir + '/test_set_metadata.csv')
# meta_test.set_index('object_id',inplace=True)
import time

start = time.time()
chunks = 10000000
remain_df = None

for i_c, df in enumerate(pd.read_csv(data_dir + '/test_set.csv', chunksize=chunks, iterator=True)):
    # Check object_ids
    # I believe np.unique keeps the order of group_ids as they appear in the file
    unique_ids = np.unique(df['object_id'])
    new_remain_df = df.loc[df['object_id'] == unique_ids[-1]].copy()
    if remain_df is None:
        df = df.loc[df['object_id'].isin(unique_ids[:-1])]
    else:
        df = pd.concat([remain_df, df.loc[df['object_id'].isin(unique_ids[:-1])]], axis=0)
    # Create remaining samples df
    remain_df = new_remain_df
    preds_df = predict_chunk(df_=df,
                             clfs_=clfs,
                             meta_=meta_test,
                             features=full.columns,
                             train_mean=train_mean)

    if i_c == 0:
        preds_df.to_csv('predictions.csv', header=True, mode='a', index=False)
    else:
        preds_df.to_csv('predictions.csv', header=False, mode='a', index=False)

    del preds_df
    gc.collect()
    
    print('%15d done in %5.1f minutes' % (chunks * (i_c + 1), (time.time() - start) / 60), flush=True)

# Compute last object in remain_df
preds_df = predict_chunk(df_=remain_df,
                         clfs_=clfs,
                         meta_=meta_test,
                         features=full_train.columns,
                         train_mean=train_mean)

preds_df.to_csv('predictions.csv', header=False, mode='a', index=False)
#!zip test_with_cluster.csv.zip test_with_cluster.csv
#!zip full_test.csv.zip full_test.csv
#!cp test_with_cluster.csv.zip /content/drive/My\ Drive/plasticc/
#!cp full_test.csv.zip /content/drive/My\ Drive/plasticc

Feature Extraction: 100%|██████████| 40/40 [00:16<00:00,  2.42it/s]
Feature Extraction: 100%|██████████| 40/40 [00:01<00:00, 24.69it/s]


KeyboardInterrupt: 