In [1]:
import auxiliary as aux

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor

import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Input, BatchNormalization


# Hold-out settings forwarded to aux.train_valid_split.
VALID_FRAC = 0.0125
VALID_SEED = 19

# Presumably `valid_true` is the untouched reference sample and `valid` is the
# copy that the imputation steps have to fill in -- TODO confirm in auxiliary.py.
valid_true, valid = aux.train_valid_split(
    aux.data,
    frac=VALID_FRAC,
    seed=VALID_SEED,
)

## tests

In [7]:
%%time
predicted = aux.ImputeHelper(
    aux.Step(aux.predictor, aux.subcol[4], estimator=LGBMRegressor(random_state=7, n_jobs=-1, n_estimators=1500, num_leaves=29)),
    aux.Step(aux.transformer, 'all', imputer=SimpleImputer()),
).run(valid, validate_on=valid_true)

100%|██████████| 15/15 [13:22<00:00, 53.49s/it, avg. score=0.555]
Final validation: 100%|██████████| 55/55 [00:00<00:00, 476.17it/s]


Final validation score: 0.8783267769946155
Overall final score: 0.10180127772818637
CPU times: user 1h 25min, sys: 2min 2s, total: 1h 27min 3s
Wall time: 13min 24s


In [9]:
# %%time
# predicted = aux.ImputeHelper(
#     aux.Step(aux.predictor, aux.subcol[4], estimator=LGBMRegressor(random_state=7, n_jobs=-1, n_estimators=1500, num_leaves=29)),
#     aux.Step(aux.cosine_stats, aux.subcol[1], threshold=0.7, subsample=0.05, backend='threading', seed=17),
#     aux.Step(aux.transformer, 'all', imputer=SimpleImputer()),
# ).run(valid, validate_on=valid_true)
#
# NOTE: experiment kept for reference only (disabled). The saved output of its
# last run shows that the extra aux.cosine_stats step printed a deprecation
# warning, left some values NaN, and ended with a final validation score of
# 0.8807 versus 0.8783 for the same pipeline without that step.

100%|██████████| 15/15 [12:57<00:00, 51.86s/it, avg. score=0.555]


Using `cosine_stats()` is deprecated! It may take a VERY long time on large data, work unstable or don't work at all.
87877 rows in 1758 chunks


100%|██████████| 1758/1758 [07:41<00:00,  3.81it/s]


Some values are NaN. Try decrease threshold.


Final validation: 100%|██████████| 55/55 [00:00<00:00, 461.12it/s]


Final validation score: 0.8806708728036292
Overall final score: 0.10207657602880504
CPU times: user 1h 40min 55s, sys: 10min 55s, total: 1h 51min 51s
Wall time: 20min 48s


In [11]:
# Extra F_1 / F_3 columns handed to the LGBM predictor together with aux.subcol[4].
add_cols = ['F_1_7', 'F_1_12', 'F_1_13', 'F_3_19', 'F_3_21']

lgbm = LGBMRegressor(random_state=7, n_jobs=-1, n_estimators=1500, num_leaves=29)
predictor_step = aux.Step(aux.predictor, aux.subcol[4] + add_cols, estimator=lgbm)

# `prepared` is the result of this single-step run; the NN cell passes it to
# its own `.run(prepared, ...)` call.
prepared = aux.ImputeHelper(predictor_step).run(valid, validate_on=valid_true)

100%|██████████| 20/20 [13:08<00:00, 39.43s/it, avg. score=0.581]
Final validation: 100%|██████████| 20/20 [00:00<00:00, 470.17it/s]


Final validation score: 0.6688705070162656
Overall final score: 0.06564987610368235


In [21]:
# NN approach: fill the columns listed in `pred_cols` with one dense network.
# NOTE: this cell reads `prepared`, which is produced by the LGBM-only cell
# (`prepared = aux.ImputeHelper(...).run(...)`); run that cell first.
tf.random.set_seed(11)

model = Sequential([
    # 45 input features -- presumably every column that is not in `pred_cols`;
    # TODO confirm against what aux.onestep_neural feeds the pipeline.
    Input(shape=(45, )),
    # Dense(512, activation='selu'),
    Dense(512, activation='selu'),
    Dense(512, activation='selu'),
    # 35 outputs == len(pred_cols).
    # NOTE(review): sigmoid restricts predictions to (0, 1); confirm the target
    # columns are scaled to that range, otherwise a linear output is needed.
    Dense(35, activation='sigmoid'),
])

# Optimizer candidates: SGD / RMSprop / Adam / Adadelta / Adagrad / Adamax / Nadam / Ftrl
# `metrics` is given as a list, the form documented for Model.compile;
# Keras 3 raises ValueError when it receives a bare string here.
model.compile(optimizer='adam', loss='mse', metrics=['mse'])
pipeline = make_pipeline(
    SimpleImputer(),
    model
)

# Columns already handled by the LGBM predictor step (only referenced by the
# commented-out step below).
add_cols = ['F_1_7', 'F_1_12', 'F_1_13'] + ['F_3_19', 'F_3_21']
# Remaining F_1 / F_3 columns: 12 + 23 = 35 names passed as `fill_columns`.
pred_cols = [*[f'F_1_{n}' for n in range(15) if n not in (7, 12, 13)],
             *[f'F_3_{n}' for n in range(25) if n not in (19, 21)]]

predicted = aux.ImputeHelper(
    # aux.Step(aux.predictor, aux.subcol[4] + add_cols, estimator=LGBMRegressor(random_state=7, n_jobs=-1, n_estimators=1500, num_leaves=29)),
    # `sequential__epochs` presumably reaches the Keras step, which make_pipeline
    # names 'sequential' -- TODO confirm how aux.onestep_neural forwards it.
    aux.Step(aux.onestep_neural, 'all', pipeline, fill_columns=pred_cols, sequential__epochs=3),
).run(prepared, validate_on=valid_true)
# This run is an improvement only if it beats the baseline:
# Final validation score: 0.8783267769946155
# Overall final score: 0.10180127772818637

Epoch 1/3
Epoch 2/3
Epoch 3/3


Final validation: 100%|██████████| 35/35 [00:00<00:00, 380.44it/s]


Final validation score: 0.9988624634525739
Overall final score: 0.1018428030899562


In [17]:
# One-hot encode subset 2 (cast to categorical first) and concatenate it
# column-wise with subsets 1, 3 and 4.
df = pd.concat(
    [
        aux.subset[1],
        pd.get_dummies(aux.subset[2].astype('category')),
        aux.subset[3],
        aux.subset[4],
    ],
    axis=1,
)
# Same split settings as the main validation split (frac=0.0125, seed=19).
df_valid_true, df_valid = aux.train_valid_split(df, frac=0.0125, seed=19)

tf.random.set_seed(11)
model = Sequential([
    # 388 input features -- presumably the width of `df` minus the 35 columns
    # in `pred_cols`; TODO confirm instead of hard-coding.
    Input(shape=(388, )),
    # Dense(512, activation='selu'),
    Dense(512, activation='tanh'),
    Dense(512, activation='tanh'),
    # 35 outputs == len(pred_cols).
    # NOTE(review): sigmoid restricts predictions to (0, 1); confirm the target
    # columns are scaled to that range, otherwise a linear output is needed.
    Dense(35, activation='sigmoid'),
])

# Optimizer candidates: SGD / RMSprop / Adam / Adadelta / Adagrad / Adamax / Nadam / Ftrl
# `metrics` is given as a list, the form documented for Model.compile;
# Keras 3 raises ValueError when it receives a bare string here.
model.compile(optimizer='adam', loss='mse', metrics=['mse'])
pipeline = make_pipeline(
    SimpleImputer(),
    model
)

# Extra F_1 / F_3 columns handed to the LGBM predictor together with aux.subcol[4].
add_cols = ['F_1_7', 'F_1_12', 'F_1_13'] + ['F_3_19', 'F_3_21']
# Remaining F_1 / F_3 columns: 12 + 23 = 35 names passed as `fill_columns`.
pred_cols = [*[f'F_1_{n}' for n in range(15) if n not in (7, 12, 13)],
             *[f'F_3_{n}' for n in range(25) if n not in (19, 21)]]

predicted = aux.ImputeHelper(
    aux.Step(aux.predictor, aux.subcol[4] + add_cols, estimator=LGBMRegressor(random_state=7, n_jobs=-1, n_estimators=1500, num_leaves=29)),
    # `sequential__epochs` presumably reaches the Keras step, which make_pipeline
    # names 'sequential' -- TODO confirm how aux.onestep_neural forwards it.
    aux.Step(aux.onestep_neural, 'all', pipeline, fill_columns=pred_cols, sequential__epochs=3),
).run(df_valid, validate_on=df_valid_true)

100%|██████████| 20/20 [12:37<00:00, 37.89s/it, avg. score=0.797]


Epoch 1/3
Epoch 2/3
Epoch 3/3
   97/11071 [..............................] - ETA: 17s

2022-06-25 00:25:21.865953: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 549817728 exceeds 10% of free system memory.




Final validation: 100%|██████████| 55/55 [00:00<00:00, 242.98it/s]


Final validation score: 0.9662174870419447
Overall final score: 0.11153378677407988


## temp

In [12]:
# Sanity check: True if any cell of the imputed result is still missing.
predicted.isna().to_numpy().any()

False

In [None]:
# Baseline to beat (LGBM predictor + SimpleImputer, cell In [7]):
# Final validation score: 0.8783267769946155
# Overall final score: 0.10180127772818637