In [1]:
import sys
import itertools
import numpy as np
import pandas as pd
import lightgbm as lgb
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import StratifiedKFold

from tqdm import tqdm
from tsfresh.feature_extraction import extract_features
from multiprocessing import Pool
tqdm.pandas(desc="apply progress")

  from pandas.core import datetools


In [2]:
# Directory holding the PLAsTiCC competition files on the local machine.
data_dir = "/home/hidehisa/.kaggle/competitions/plasticc"
# Training light-curve rows (file name suggests a precomputed "cluster" column)
# and the per-object training metadata.
train = pd.read_csv(f"{data_dir}/train_with_cluster.csv")
meta = pd.read_csv(f"{data_dir}/training_set_metadata.csv")

In [3]:
def basic(d):
    """Build per-object aggregate and tsfresh features from light-curve rows.

    Parameters
    ----------
    d : pandas.DataFrame
        Observation rows. The code reads the columns ``object_id``, ``mjd``,
        ``passband``, ``flux``, ``flux_err`` and ``detected``. The frame is
        copied first, so the caller's object is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per ``object_id`` holding the pandas aggregates, the derived
        ``flux_diff`` / ``flux_dif2`` / ``flux_w_mean`` / ``flux_dif3``
        columns and every tsfresh feature table merged in.

    Notes
    -----
    All ``pd.merge`` calls that combine the feature tables use the pandas
    default (inner join), so an object that is missing from any one table is
    dropped from the result. NOTE(review): objects without a single
    ``detected == 1`` row are presumably absent from the ``mjd`` table and
    would therefore be dropped -- confirm this is intended.
    """
    df = d.copy()
    # Squared signal-to-noise ratio, and the flux weighted by it.
    df["flux_ratio_sq"] = np.power(df["flux"] / df["flux_err"], 2)
    df["flux_by_flux_ratio_sq"] = df["flux"] * df["flux_ratio_sq"]

    aggs = {
        'flux': ['min', 'max', 'mean', 'median', 'std', 'skew'],
        'flux_err': ['min', 'max', 'mean', 'median', 'std', 'skew'],
        'detected': ['mean'],
        'flux_ratio_sq': ['sum', 'skew'],
        'flux_by_flux_ratio_sq': ['sum', 'skew'],
    }
    agg_df = df.groupby('object_id').agg(aggs)
    # Flatten the (column, statistic) MultiIndex into "<column>_<statistic>".
    new_columns = [k + '_' + agg for k in aggs.keys() for agg in aggs[k]]
    agg_df.columns = new_columns
    agg_df['flux_diff'] = agg_df['flux_max'] - agg_df['flux_min']
    agg_df['flux_dif2'] = (
        agg_df['flux_max'] - agg_df['flux_min']) / agg_df['flux_mean']
    # Mean flux weighted by the squared signal-to-noise ratio.
    agg_df['flux_w_mean'] = agg_df['flux_by_flux_ratio_sq_sum'] / agg_df[
        'flux_ratio_sq_sum']
    agg_df['flux_dif3'] = (
        agg_df['flux_max'] - agg_df['flux_min']) / agg_df['flux_w_mean']

    # Second working frame: flux rescaled by each object's flux range.
    df2 = pd.merge(df, agg_df, how="left", on="object_id")
    df2["flux_norm"] = df2.flux / df2.flux_diff
    del df2["flux"]

    # tsfresh settings: per-passband features on raw flux.
    fcp = {
        'fft_coefficient': [{
            'coeff': 0,
            'attr': 'abs'
        }, {
            'coeff': 1,
            'attr': 'abs'
        }],
        'kurtosis':
        None,
        'skewness':
        None,
        "cid_ce": [{"normalize": True}]
    }
    # tsfresh settings: per-passband features on range-normalised flux.
    fcp2 = {
        "fft_coefficient": [{
            "coeff": 0,
            "attr": "abs"
        }, {
            "coeff": 1,
            "attr": "abs"
        }],
        "abs_energy": None,
        "sample_entropy": None
    }
    # tsfresh settings: features on flux with no passband split.
    fcp_flux = {
        "longest_strike_above_mean": None,
        "longest_strike_below_mean": None,
        "mean_change": None,
        "mean_abs_change": None,
        "cid_ce": [{"normalize": True}]
    }
    # tsfresh settings: features on the SNR-weighted flux.
    fcp_flux_by_flux_ratio_sq = {
        "longest_strike_above_mean": None,
        "longest_strike_below_mean": None
    }
    agg_df_ts = extract_features(
        df,
        column_id='object_id',
        column_sort='mjd',
        column_kind='passband',
        column_value='flux',
        default_fc_parameters=fcp,
        n_jobs=6)
    agg_df_ts2 = extract_features(
        df2,
        column_id="object_id",
        column_sort="mjd",
        column_kind="passband",
        column_value="flux_norm",
        default_fc_parameters=fcp2,
        n_jobs=4
    )
    agg_df_flux = extract_features(
        df,
        column_id="object_id",
        column_value="flux",
        default_fc_parameters=fcp_flux,
        n_jobs=4
    )
    agg_df_ffrs = extract_features(
        df,
        column_id="object_id",
        column_value="flux_by_flux_ratio_sq",
        default_fc_parameters=fcp_flux_by_flux_ratio_sq,
        n_jobs=4
    )

    # Time span between the first and last row flagged as detected.
    df_det = df[df['detected'] == 1].copy()
    agg_df_mjd = extract_features(
        df_det,
        column_id='object_id',
        column_value='mjd',
        default_fc_parameters={
            'maximum': None,
            'minimum': None
        },
        n_jobs=8)
    agg_df_mjd['mjd_diff_det'] = agg_df_mjd['mjd__maximum'] - agg_df_mjd[
        'mjd__minimum']
    del agg_df_mjd['mjd__maximum'], agg_df_mjd['mjd__minimum']

    # Suffix the normalised-flux features so they do not collide with the
    # raw-flux features that share the same tsfresh names.
    agg_df_ts2.columns = pd.Index([e + "_norm" for e in agg_df_ts2.columns])
    agg_df_ts = pd.merge(agg_df_ts, agg_df_mjd, on='id')
    agg_df_ts = pd.merge(agg_df_ts, agg_df_ts2, on="id")
    agg_df_ts = pd.merge(agg_df_ts, agg_df_flux, on="id")
    agg_df_ts = pd.merge(agg_df_ts, agg_df_ffrs, on="id")
    # tsfresh returns a dataframe with an index name='id'
    agg_df_ts.index.rename('object_id', inplace=True)
    agg_df = pd.merge(agg_df, agg_df_ts, on='object_id')
    return agg_df


def cluster_mean_diff(df):
    """Spread of the range-normalised mean flux across an object's clusters.

    For every (object_id, cluster) pair the mean flux is divided by
    (max flux - min flux); the standard deviation of those ratios is then
    taken per object.

    Returns a DataFrame indexed by ``object_id`` with the single column
    ``normalized_mean``.
    """
    per_cluster = df.groupby(["object_id", "cluster"])["flux"].agg(
        ["mean", "max", "min"])
    flux_range = per_cluster["max"] - per_cluster["min"]
    ratio = (per_cluster["mean"] / flux_range).rename("normalized_mean")
    return ratio.reset_index().groupby("object_id").agg(
        {"normalized_mean": "std"})


def passband_std_difference(df):
    """Ratio of the largest to the smallest per-passband flux scatter.

    The flux standard deviation is computed for each
    (object_id, cluster, passband) group, averaged over clusters for each
    (object_id, passband), and finally the maximum passband value is divided
    by the minimum passband value for each object.

    Returns a DataFrame with the columns ``object_id`` and ``flux`` (the
    ratio keeps the name ``flux``).
    """
    group_std = df.groupby(
        ["object_id", "cluster", "passband"])["flux"].std().reset_index()
    passband_std = group_std.groupby(
        ["object_id", "passband"])["flux"].mean().reset_index()
    by_object = passband_std.groupby("object_id")["flux"]
    ratio = by_object.max() / by_object.min()
    return ratio.reset_index()


def num_outliers(df):
    """Count, per object, the flux points beyond 1 and 2 standard deviations.

    A point counts as "outside" when it is strictly above mean + k*std or
    strictly below mean - k*std of its own object's flux (k = 1 and k = 2).

    Returns a DataFrame with the columns ``object_id``, ``outside_sigma``
    and ``outside_2sigma``.
    """
    flux_stats = df.groupby("object_id")["flux"].agg(["mean", "std"])
    # Broadcast each object's statistics back onto its observation rows.
    center = df["object_id"].map(flux_stats["mean"])
    spread = df["object_id"].map(flux_stats["std"])

    flags = df[["object_id"]].copy()
    for label, k in (("outside_sigma", 1), ("outside_2sigma", 2)):
        too_high = df["flux"] > center + k * spread
        too_low = df["flux"] < center - k * spread
        flags[label] = (too_high | too_low).astype(int)

    counts = flags.groupby("object_id")[["outside_sigma",
                                         "outside_2sigma"]].sum()
    return counts.reset_index()

In [4]:
def get_full(df, meta):
    """Assemble the complete per-object feature table.

    Runs every feature builder on the observation rows ``df``, left-joins
    their outputs and the metadata ``meta`` on ``object_id``, and removes the
    ``target`` column when the metadata carries one.
    """
    merged = basic(df)
    extra_tables = (
        cluster_mean_diff(df),
        passband_std_difference(df),
        num_outliers(df),
        meta,
    )
    for table in extra_tables:
        merged = pd.merge(merged, table, how="left", on="object_id")

    if "target" not in merged.columns:
        return merged
    return merged.drop("target", axis=1)


def train_data(df, meta):
    """Prepare the training matrix, labels and class bookkeeping.

    Parameters
    ----------
    df : pandas.DataFrame
        Training observation rows, forwarded to ``get_full``.
    meta : pandas.DataFrame
        Training metadata; must contain ``object_id`` and ``target``.

    Returns
    -------
    tuple
        ``(full, y, classes, class_weight, oof_df)`` where ``full`` is the
        feature matrix without identifier/metadata columns, ``y`` the labels
        in the same row order as ``full``, ``classes`` the sorted distinct
        labels, ``class_weight`` a label -> weight dict (2 for classes 64 and
        15, 1 otherwise) and ``oof_df`` a one-column frame of ``object_id``.

    Raises
    ------
    ValueError
        If a feature row has no matching label in ``meta``.
    """
    full = get_full(df, meta)

    # Look the labels up by object_id instead of pairing ``meta.target`` with
    # the feature rows by position: the two only line up positionally when
    # both are in the same order and no object was dropped while building
    # the features.
    target_by_id = meta.set_index("object_id")["target"]
    y = full["object_id"].map(target_by_id).reset_index(drop=True)
    y.name = "target"
    if y.isnull().any():
        raise ValueError("some feature rows have no target in meta")

    classes = sorted(y.unique())
    class_weight = {c: 1 for c in classes}
    for c in [64, 15]:
        class_weight[c] = 2

    oof_df = full[["object_id"]]
    # Identifier and metadata columns that are not used as model inputs.
    full = full.drop(columns=[
        'object_id', 'distmod', 'hostgal_specz', 'ra', 'decl', 'gal_l',
        'gal_b', 'ddf'
    ])
    return full, y, classes, class_weight, oof_df

In [5]:
%%time
# Build the training feature matrix, labels, class list, class weights and
# the object-id frame in one pass (tsfresh progress bars are printed below).
full, y, classes, class_weight, oof_df = train_data(train, meta)

Feature Extraction: 100%|██████████| 30/30 [00:05<00:00,  6.97it/s]
Feature Extraction: 100%|██████████| 20/20 [00:15<00:00,  2.66it/s]
Feature Extraction: 100%|██████████| 20/20 [00:01<00:00, 17.48it/s]
Feature Extraction: 100%|██████████| 20/20 [00:00<00:00, 30.27it/s]
Feature Extraction: 100%|██████████| 40/40 [00:00<00:00, 99.47it/s] 


CPU times: user 21.4 s, sys: 1.36 s, total: 22.8 s
Wall time: 36.9 s


In [6]:
# Column means of the un-imputed training features; passed to predict_chunk
# in the prediction loop.
train_mean = full.mean()
# Missing feature values are replaced with 0.
full = full.fillna(0)

In [7]:
# The raw training rows and metadata are no longer referenced below.
del train, meta

In [8]:
def multi_weighted_logloss(y_true, y_preds):
    """
    @author olivier https://www.kaggle.com/ogrellier
    multi logloss for PLAsTiCC challenge

    Parameters
    ----------
    y_true : array-like
        True class label of every sample.
    y_preds : array-like of shape (n_samples, n_classes)
        Predicted probabilities. Columns must follow the sorted order of the
        class labels, because they are multiplied element-wise with the
        one-hot encoding produced by ``pd.get_dummies(y_true)``.

    Returns
    -------
    float
        Class-weighted mean of the per-class average negative log-likelihood.

    Notes
    -----
    Every class in the weight table must occur in ``y_true``; otherwise the
    one-hot matrix has fewer columns than ``y_preds`` and the element-wise
    product fails.
    """
    # class_weights taken from Giba's topic : https://www.kaggle.com/titericz
    # https://www.kaggle.com/c/PLAsTiCC-2018/discussion/67194
    # with Kyle Boone's post https://www.kaggle.com/kyleboone
    class_weight = {
        6: 1,
        15: 2,
        16: 1,
        42: 1,
        52: 1,
        53: 1,
        62: 1,
        64: 2,
        65: 1,
        67: 1,
        88: 1,
        90: 1,
        92: 1,
        95: 1
    }
    # More than 14 distinct labels means class 99 is present as well.
    if len(np.unique(y_true)) > 14:
        class_weight[99] = 2

    # One-hot encode the labels (columns come out in sorted label order).
    y_ohe = pd.get_dummies(y_true)
    # Clip probabilities to [1e-15, 1 - 1e-15] so the log stays finite.
    y_p_log = np.log(np.clip(a=y_preds, a_min=1e-15, a_max=1 - 1e-15))
    # Per class: summed log-probability given to the true class,
    # .values is used to drop the index of DataFrames.
    y_log_ones = np.sum(y_ohe.values * y_p_log, axis=0)
    # Number of positives for each class.
    nb_pos = y_ohe.sum(axis=0).values.astype(float)
    # Weights in sorted label order, matching the one-hot columns.
    class_arr = np.array([class_weight[k] for k in sorted(class_weight)])
    # Weighted per-class average log-likelihood.
    y_w = y_log_ones * class_arr / nb_pos

    return -np.sum(y_w) / np.sum(class_arr)


def lgb_multi_weighted_logloss(y_true, y_preds):
    """
    @author olivier https://www.kaggle.com/ogrellier
    multi logloss for PLAsTiCC challenge

    LightGBM-style evaluation wrapper around ``multi_weighted_logloss``.

    Parameters
    ----------
    y_true : numpy.ndarray
        True class label of every sample.
    y_preds : numpy.ndarray
        Flat prediction array; it is reshaped to (n_samples, n_classes) in
        column-major ('F') order before scoring.

    Returns
    -------
    tuple
        ``('wloss', loss, False)`` -- metric name, value, and a flag saying
        that higher is NOT better.
    """
    # 14 known classes, plus class 99 when more than 14 distinct labels occur
    # (same rule as in multi_weighted_logloss).
    n_classes = 15 if len(np.unique(y_true)) > 14 else 14
    y_p = y_preds.reshape(y_true.shape[0], n_classes, order='F')
    # The weighting and averaging live in one place only.
    return 'wloss', multi_weighted_logloss(y_true, y_p), False

In [9]:
def save_importances(importances_, out_path='importances_2.png'):
    """Plot feature gains as a horizontal bar chart and save it as an image.

    Parameters
    ----------
    importances_ : pandas.DataFrame
        Long-format table with at least the columns ``feature`` and ``gain``
        (one row per feature and fold). A ``mean_gain`` column is added to
        this frame in place.
    out_path : str, optional
        Where the figure is written. Defaults to the file name that was
        previously hard-coded.

    Notes
    -----
    The chart uses the first 300 *rows* after sorting by ``mean_gain``, not
    the first 300 distinct features.
    """
    mean_gain = importances_.groupby('feature')['gain'].mean()
    importances_['mean_gain'] = importances_['feature'].map(mean_gain)
    top_rows = importances_.sort_values('mean_gain', ascending=False)[:300]

    plt.figure(figsize=(8, 12))
    sns.barplot(x='gain', y='feature', data=top_rows)
    plt.tight_layout()
    plt.savefig(out_path)


def plot_confusion_matrix(cm,
                          classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Draw the confusion matrix ``cm`` on the current matplotlib figure and
    print which variant is shown.

    With ``normalize=True`` every row is divided by its row sum first.
    ``classes`` supplies the tick labels for both axes.
    """
    if not normalize:
        print('Confusion matrix, without normalization')
    else:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = cm.astype('float') / row_totals
        print("Normalized confusion matrix")

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    cell_format = '.2f' if normalize else 'd'
    # Cells darker than half the maximum get white text for contrast.
    midpoint = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            value = cm[row, col]
            text_color = "white" if value > midpoint else "black"
            plt.text(
                col,
                row,
                format(value, cell_format),
                horizontalalignment="center",
                color=text_color)

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()


def save_cm(y, oof_preds, path):
    """Plot the row-normalised confusion matrix of the predictions and save it.

    Parameters
    ----------
    y : array-like
        True class labels.
    oof_preds : numpy.ndarray of shape (n_samples, n_classes)
        Predicted probabilities; the arg-max over the last axis is taken as
        the predicted class position.
    path : str
        CSV file whose header provides the class names: every column except
        the first and the last is used as a tick label.

    The figure is written to ``confusion_matrix_2.png``. NumPy's print
    precision is set to 2 as a side effect.
    """
    # Encode the labels as positions 0..K-1 in sorted label order.
    _, y_map = np.unique(np.asarray(y), return_inverse=True)

    # Compute confusion matrix
    cnf_matrix = confusion_matrix(y_map, np.argmax(oof_preds, axis=-1))
    np.set_printoptions(precision=2)

    # Only the header row is needed, so do not parse the data rows.
    class_names = list(pd.read_csv(path, nrows=0).columns[1:-1])

    # Plot normalized confusion matrix
    plt.figure(figsize=(12, 12))
    plot_confusion_matrix(
        cnf_matrix,
        classes=class_names,
        normalize=True,
        title='Confusion matrix')
    plt.savefig("confusion_matrix_2.png")

In [10]:
# 5-fold stratified cross-validation; one LightGBM classifier is kept per fold.
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
clfs = []
# Long-format table of per-fold feature importances (grown inside the loop).
importances = pd.DataFrame()

# Hyper-parameters passed unchanged to every fold's LGBMClassifier.
lgb_params = {
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'num_class': 14,
    'metric': 'multi_logloss',
    'learning_rate': 0.03,
    'subsample': .9,
    'colsample_bytree': 0.5,
    'reg_alpha': .01,
    'reg_lambda': .01,
    'min_split_gain': 0.01,
    'min_child_weight': 10,
    'n_estimators': 1000,
    'silent': -1,
    'verbose': -1,
    'max_depth': 3
}

# Compute inverse-frequency sample weights: total count / count of the class
w = y.value_counts()
weights = {i: np.sum(w) / w[i] for i in w.index}
# Out-of-fold class probabilities, one row per sample and one column per class.
oof_preds = np.zeros((len(full), np.unique(y).shape[0]))

for fold_, (trn_, val_) in enumerate(folds.split(y, y)):
    # Positional split of features and labels into train / validation parts.
    trn_x, trn_y = full.iloc[trn_], y.iloc[trn_]
    val_x, val_y = full.iloc[val_], y.iloc[val_]

    clf = lgb.LGBMClassifier(**lgb_params)
    # Both sets are evaluated every round with the custom weighted log loss;
    # training stops after 50 rounds without improvement.
    clf.fit(
        trn_x,
        trn_y,
        eval_set=[(trn_x, trn_y), (val_x, val_y)],
        eval_metric=lgb_multi_weighted_logloss,
        verbose=100,
        early_stopping_rounds=50,
        sample_weight=trn_y.map(weights))
    # Fill this fold's validation rows using the best iteration found.
    oof_preds[val_, :] = clf.predict_proba(
        val_x, num_iteration=clf.best_iteration_)
    print(multi_weighted_logloss(val_y, oof_preds[val_, :]))

    # Record this fold's feature importances (stored under the name 'gain').
    imp_df = pd.DataFrame()
    imp_df['feature'] = full.columns
    imp_df['gain'] = clf.feature_importances_
    imp_df['fold'] = fold_ + 1
    importances = pd.concat([importances, imp_df], axis=0, sort=False)

    clfs.append(clf)

# Overall score on the out-of-fold predictions of all samples.
print('MULTI WEIGHTED LOG LOSS : %.5f ' % multi_weighted_logloss(
    y_true=y, y_preds=oof_preds))

Training until validation scores don't improve for 50 rounds.
[100]	training's multi_logloss: 0.758303	training's wloss: 0.744405	valid_1's multi_logloss: 1.1007	valid_1's wloss: 0.893931
[200]	training's multi_logloss: 0.493922	training's wloss: 0.479925	valid_1's multi_logloss: 0.864227	valid_1's wloss: 0.692595
[300]	training's multi_logloss: 0.378858	training's wloss: 0.365139	valid_1's multi_logloss: 0.774035	valid_1's wloss: 0.637011
[400]	training's multi_logloss: 0.310429	training's wloss: 0.297079	valid_1's multi_logloss: 0.725361	valid_1's wloss: 0.617738
[500]	training's multi_logloss: 0.262275	training's wloss: 0.249548	valid_1's multi_logloss: 0.693754	valid_1's wloss: 0.611711
Early stopping, best iteration is:
[485]	training's multi_logloss: 0.268942	training's wloss: 0.256063	valid_1's multi_logloss: 0.697398	valid_1's wloss: 0.611094
0.611093606570106
Training until validation scores don't improve for 50 rounds.
[100]	training's multi_logloss: 0.761706	training's wloss

In [11]:
# Write the feature-importance chart and the out-of-fold confusion matrix
# to PNG files.
save_importances(importances)
save_cm(y, oof_preds, f"{data_dir}/sample_submission.csv")

Normalized confusion matrix


In [13]:
def predict_chunk(df_, clfs_, meta_, features, train_mean, i_c):
    """Build features for one chunk of test rows and predict class probabilities.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Observation rows of the chunk, forwarded to ``get_full``.
    clfs_ : list
        Fitted classifiers exposing ``predict_proba`` and ``classes_``; their
        probabilities are averaged with equal weight.
    meta_ : pandas.DataFrame
        Metadata, left-joined onto the features by ``get_full``.
    features : sequence of str
        Feature columns, in the order the classifiers were trained with.
    train_mean : object
        Accepted for interface compatibility; not used.
    i_c : int
        Zero-based chunk number. Chunk 0 starts ``full_test4.csv`` afresh
        (with a header); later chunks append to it.

    Returns
    -------
    pandas.DataFrame
        One ``class_<label>`` column per known class, plus ``object_id`` and
        ``class_99``.

    Raises
    ------
    ValueError
        If ``clfs_`` is empty.
    """
    if len(clfs_) == 0:
        raise ValueError("clfs_ must contain at least one fitted classifier")

    # Per-object features; missing values are replaced with 0, as in training.
    full_test = get_full(df_, meta_).fillna(0)

    # The first chunk overwrites any file left by an earlier run, so that
    # re-running the loop does not append a second header and duplicate rows.
    first_chunk = i_c == 0
    full_test.to_csv(
        'full_test4.csv',
        header=first_chunk,
        mode='w' if first_chunk else 'a',
        index=False)

    # Make predictions: equal-weight average over the fold models.
    test_x = full_test[features]
    preds_ = sum(clf.predict_proba(test_x) / len(clfs_) for clf in clfs_)

    # Compute preds_99 as the proba of class not being any of the others
    # preds_99 = 0.1 gives 1.769
    preds_99 = np.prod(1 - preds_, axis=1)

    # Create DataFrame from predictions
    preds_df_ = pd.DataFrame(
        preds_, columns=['class_' + str(s) for s in clfs_[0].classes_])
    # Assign positionally (.values) so the ids cannot be misaligned with the
    # prediction rows through index matching.
    preds_df_['object_id'] = full_test['object_id'].values
    # Rescale so that class_99 averages 0.14 over the chunk.
    preds_df_['class_99'] = 0.14 * preds_99 / np.mean(preds_99)
    return preds_df_

In [14]:
import gc
import time

gc.enable()
start = time.time()
# (first object_id, last object_id) of each pre-split test file; the pairs
# are only used to build the file names below.
indices = [(2, 28321978), (28321979, 49999913), (49999914, 74995602), (74995603, 99999902),
           (99999903, 125006991), (125006992, 149999831), (149999832, 174999676),
           (174999677, 199999735), (199999736, 225008807), (225008808, 249999654),
           (249999655, 275002650), (275002651, 299999649), (299999650, 325002148),
           (325002149, 349999619), (349999620, 374998640), (374998641, 399999534),
           (399999535, 424994349), (424994350, 449999411), (449999412, 451826374),
           (451826375, "end")]
test_files = [f"test_{e[0]}_{e[1]}.csv" for e in indices]
meta_test = pd.read_csv(data_dir + '/test_set_metadata.csv')
for i_c, f in enumerate(test_files):
    test = pd.read_csv(f)
    preds_df = predict_chunk(
        df_=test,
        clfs_=clfs,
        meta_=meta_test,
        features=full.columns,
        train_mean=train_mean,
        i_c=i_c
    )
    # The first chunk overwrites any predictions file left by an earlier run;
    # later chunks append without repeating the header.
    first_chunk = i_c == 0
    preds_df.to_csv(
        'predictions4.csv',
        header=first_chunk,
        mode='w' if first_chunk else 'a',
        index=False)

    # Release the chunk before the next file is read, so two chunks are never
    # held in memory at the same time.
    del test, preds_df
    gc.collect()
    print(f'{f} done in {(time.time() - start) / 60} minutes', flush=True)

Feature Extraction: 100%|██████████| 30/30 [01:56<00:00,  2.90s/it]
Feature Extraction: 100%|██████████| 20/20 [03:45<00:00,  5.24s/it]
Feature Extraction: 100%|██████████| 20/20 [00:26<00:00,  1.01s/it]
Feature Extraction: 100%|██████████| 20/20 [00:13<00:00,  2.06it/s]
Feature Extraction: 100%|██████████| 40/40 [00:09<00:00,  4.25it/s]


test_2_28321978.csv done in 12.434533886114757 minutes


Feature Extraction: 100%|██████████| 30/30 [01:57<00:00,  3.31s/it]
Feature Extraction: 100%|██████████| 20/20 [02:28<00:00,  7.51s/it]
Feature Extraction: 100%|██████████| 20/20 [00:22<00:00,  1.14it/s]
Feature Extraction: 100%|██████████| 20/20 [00:11<00:00,  2.06it/s]
Feature Extraction: 100%|██████████| 40/40 [00:09<00:00,  3.35it/s]


test_28321979_49999913.csv done in 23.10989718437195 minutes


Feature Extraction: 100%|██████████| 30/30 [02:11<00:00,  3.55s/it]
Feature Extraction: 100%|██████████| 20/20 [02:50<00:00,  8.58s/it]
Feature Extraction: 100%|██████████| 20/20 [00:27<00:00,  1.18s/it]
Feature Extraction: 100%|██████████| 20/20 [00:14<00:00,  1.55it/s]
Feature Extraction: 100%|██████████| 40/40 [00:10<00:00,  4.07it/s]


test_49999914_74995602.csv done in 35.397408219178516 minutes


Feature Extraction: 100%|██████████| 30/30 [02:12<00:00,  3.77s/it]
Feature Extraction: 100%|██████████| 20/20 [02:46<00:00,  6.62s/it]
Feature Extraction: 100%|██████████| 20/20 [00:25<00:00,  1.06it/s]
Feature Extraction: 100%|██████████| 20/20 [00:14<00:00,  1.65it/s]
Feature Extraction: 100%|██████████| 40/40 [00:11<00:00,  3.40it/s]


test_74995603_99999902.csv done in 47.58093222379684 minutes


Feature Extraction: 100%|██████████| 30/30 [02:13<00:00,  3.59s/it]
Feature Extraction: 100%|██████████| 20/20 [02:43<00:00,  6.03s/it]
Feature Extraction: 100%|██████████| 20/20 [00:26<00:00,  1.03it/s]
Feature Extraction: 100%|██████████| 20/20 [00:14<00:00,  1.66it/s]
Feature Extraction: 100%|██████████| 40/40 [00:11<00:00,  3.03it/s]


test_99999903_125006991.csv done in 59.70303700764974 minutes


Feature Extraction: 100%|██████████| 30/30 [02:08<00:00,  3.27s/it]
Feature Extraction: 100%|██████████| 20/20 [02:44<00:00,  6.46s/it]
Feature Extraction: 100%|██████████| 20/20 [00:26<00:00,  1.24s/it]
Feature Extraction: 100%|██████████| 20/20 [00:14<00:00,  1.86it/s]
Feature Extraction: 100%|██████████| 40/40 [00:11<00:00,  3.81it/s]


test_125006992_149999831.csv done in 71.80921949545542 minutes


Feature Extraction: 100%|██████████| 30/30 [02:13<00:00,  3.62s/it]
Feature Extraction: 100%|██████████| 20/20 [02:44<00:00,  6.70s/it]
Feature Extraction: 100%|██████████| 20/20 [00:26<00:00,  1.12s/it]
Feature Extraction: 100%|██████████| 20/20 [00:13<00:00,  1.76it/s]
Feature Extraction: 100%|██████████| 40/40 [00:10<00:00,  4.16it/s]


test_149999832_174999676.csv done in 83.9979857524236 minutes


Feature Extraction: 100%|██████████| 30/30 [02:11<00:00,  3.51s/it]
Feature Extraction: 100%|██████████| 20/20 [02:44<00:00,  6.09s/it]
Feature Extraction: 100%|██████████| 20/20 [00:25<00:00,  1.04it/s]
Feature Extraction: 100%|██████████| 20/20 [00:14<00:00,  1.72it/s]
Feature Extraction: 100%|██████████| 40/40 [00:11<00:00,  3.07it/s]


test_174999677_199999735.csv done in 96.13206797043482 minutes


Feature Extraction: 100%|██████████| 30/30 [02:14<00:00,  3.61s/it]
Feature Extraction: 100%|██████████| 20/20 [02:45<00:00,  6.47s/it]
Feature Extraction: 100%|██████████| 20/20 [00:26<00:00,  1.25s/it]
Feature Extraction: 100%|██████████| 20/20 [00:13<00:00,  1.73it/s]
Feature Extraction: 100%|██████████| 40/40 [00:11<00:00,  3.78it/s]


test_199999736_225008807.csv done in 108.36638201475144 minutes


Feature Extraction: 100%|██████████| 30/30 [02:15<00:00,  3.53s/it]
Feature Extraction: 100%|██████████| 20/20 [02:47<00:00,  7.30s/it]
Feature Extraction: 100%|██████████| 20/20 [00:26<00:00,  1.11s/it]
Feature Extraction: 100%|██████████| 20/20 [00:13<00:00,  1.73it/s]
Feature Extraction: 100%|██████████| 40/40 [00:10<00:00,  4.29it/s]


test_225008808_249999654.csv done in 120.63337833881378 minutes


Feature Extraction: 100%|██████████| 30/30 [02:14<00:00,  3.22s/it]
Feature Extraction: 100%|██████████| 20/20 [02:51<00:00,  8.39s/it]
Feature Extraction: 100%|██████████| 20/20 [00:26<00:00,  1.03s/it]
Feature Extraction: 100%|██████████| 20/20 [00:13<00:00,  1.85it/s]
Feature Extraction: 100%|██████████| 40/40 [00:10<00:00,  4.18it/s]


test_249999655_275002650.csv done in 132.92960915962854 minutes


Feature Extraction: 100%|██████████| 30/30 [02:13<00:00,  2.67s/it]
Feature Extraction: 100%|██████████| 20/20 [02:45<00:00,  6.33s/it]
Feature Extraction: 100%|██████████| 20/20 [00:25<00:00,  1.04it/s]
Feature Extraction: 100%|██████████| 20/20 [00:14<00:00,  1.66it/s]
Feature Extraction: 100%|██████████| 40/40 [00:11<00:00,  3.14it/s]


test_275002651_299999649.csv done in 145.1011205037435 minutes


Feature Extraction: 100%|██████████| 30/30 [02:14<00:00,  3.56s/it]
Feature Extraction: 100%|██████████| 20/20 [02:45<00:00,  7.31s/it]
Feature Extraction: 100%|██████████| 20/20 [00:26<00:00,  1.02s/it]
Feature Extraction: 100%|██████████| 20/20 [00:13<00:00,  1.96it/s]
Feature Extraction: 100%|██████████| 40/40 [00:10<00:00,  4.23it/s]


test_299999650_325002148.csv done in 157.28565887610117 minutes


Feature Extraction: 100%|██████████| 30/30 [02:08<00:00,  2.64s/it]
Feature Extraction: 100%|██████████| 20/20 [02:40<00:00,  6.05s/it]
Feature Extraction: 100%|██████████| 20/20 [00:26<00:00,  1.08s/it]
Feature Extraction: 100%|██████████| 20/20 [00:14<00:00,  1.74it/s]
Feature Extraction: 100%|██████████| 40/40 [00:11<00:00,  3.44it/s]


test_325002149_349999619.csv done in 169.3471800963084 minutes


Feature Extraction: 100%|██████████| 30/30 [02:14<00:00,  3.47s/it]
Feature Extraction: 100%|██████████| 20/20 [02:43<00:00,  6.47s/it]
Feature Extraction: 100%|██████████| 20/20 [00:26<00:00,  1.01it/s]
Feature Extraction: 100%|██████████| 20/20 [00:13<00:00,  1.91it/s]
Feature Extraction: 100%|██████████| 40/40 [00:11<00:00,  3.76it/s]


test_349999620_374998640.csv done in 181.49672244787217 minutes


Feature Extraction: 100%|██████████| 30/30 [02:11<00:00,  3.24s/it]
Feature Extraction: 100%|██████████| 20/20 [02:43<00:00,  5.93s/it]
Feature Extraction: 100%|██████████| 20/20 [00:26<00:00,  1.03s/it]
Feature Extraction: 100%|██████████| 20/20 [00:14<00:00,  1.74it/s]
Feature Extraction: 100%|██████████| 40/40 [00:11<00:00,  3.43it/s]


test_374998641_399999534.csv done in 193.57399430672328 minutes


Feature Extraction: 100%|██████████| 30/30 [02:16<00:00,  3.72s/it]
Feature Extraction: 100%|██████████| 20/20 [02:44<00:00,  6.75s/it]
Feature Extraction: 100%|██████████| 20/20 [00:26<00:00,  1.01s/it]
Feature Extraction: 100%|██████████| 20/20 [00:14<00:00,  1.68it/s]
Feature Extraction: 100%|██████████| 40/40 [00:10<00:00,  4.20it/s]


test_399999535_424994349.csv done in 205.7602188785871 minutes


Feature Extraction: 100%|██████████| 30/30 [02:09<00:00,  3.12s/it]
Feature Extraction: 100%|██████████| 20/20 [02:46<00:00,  7.56s/it]
Feature Extraction: 100%|██████████| 20/20 [00:26<00:00,  1.13s/it]
Feature Extraction: 100%|██████████| 20/20 [00:13<00:00,  1.72it/s]
Feature Extraction: 100%|██████████| 40/40 [00:10<00:00,  3.66it/s]


test_424994350_449999411.csv done in 217.92167537212373 minutes


Feature Extraction: 100%|██████████| 30/30 [00:09<00:00,  3.87it/s]
Feature Extraction: 100%|██████████| 20/20 [00:11<00:00,  2.33it/s]
Feature Extraction: 100%|██████████| 20/20 [00:02<00:00, 10.08it/s]
Feature Extraction: 100%|██████████| 20/20 [00:01<00:00, 15.71it/s]
Feature Extraction: 100%|██████████| 40/40 [00:00<00:00, 48.47it/s]


test_449999412_451826374.csv done in 218.82148482402167 minutes


Feature Extraction: 100%|██████████| 30/30 [00:09<00:00,  3.82it/s]
Feature Extraction: 100%|██████████| 20/20 [00:12<00:00,  2.02it/s]
Feature Extraction: 100%|██████████| 20/20 [00:01<00:00, 10.80it/s]
Feature Extraction: 100%|██████████| 20/20 [00:01<00:00, 17.65it/s]
Feature Extraction: 100%|██████████| 40/40 [00:00<00:00, 56.47it/s]


test_451826375_end.csv done in 219.71768608093262 minutes


In [15]:
# Compress the predictions file into predictions4.csv.zip for upload.
!zip predictions4.csv.zip predictions4.csv


  adding: predictions4.csv (deflated 56%)
 90%|██████████████████████████████████▎   | 434M/481M [1:27:18<09:27, 86.9kB/s]
HTTPSConnectionPool(host='www.googleapis.com', port=443): Max retries exceeded with url: /upload/storage/v1/b/kaggle-competitions-submissions/o?uploadType=resumable&upload_id=AEnB2Up7oZAwAtkE361rnOddcBWqsppXBao06xp4f3jA9BpLyhKGz9Gmepcp6uTmnrnt_VVMYb-21_6LZN31fUwju6UtxTsF7g (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x7f20dc702ac8>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution',))
Could not submit to competition

In [16]:
# Submit the zipped predictions to the PLAsTiCC-2018 competition with the
# Kaggle command-line client.
!kaggle competitions submit -c PLAsTiCC-2018 -f predictions4.csv.zip -m "Fourth"

494MB [1:52:20, 76.9kB/s]
Successfully submitted to PLAsTiCC Astronomical Classification