In [1]:
import sys
import pathlib
sys.path.append(pathlib.Path().cwd().parent.as_posix())

import auxiliary as aux

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer  # noqa: F401 (must precede the IterativeImputer import)
from sklearn.impute import IterativeImputer
from sklearn.linear_model import SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor


# Split the project data with a fixed fraction and seed.
# NOTE(review): judging by how they are used in the cells below, `valid` is the
# frame that gets imputed and `valid_true` the reference it is scored against --
# confirm against aux.train_valid_split.
valid_true, valid = aux.train_valid_split(
    aux.data,
    frac=0.0125,
    seed=19,
)

# Subset number: index into aux.subcol used by the experiments below.
sn = 1

## statistics

In [13]:
# Baseline: a single aux.transformer step over subset `sn`, backed by
# scikit-learn's SimpleImputer with the 'mean' strategy.
mean_step = aux.Step(
    aux.transformer,
    aux.subcol[sn],
    imputer=SimpleImputer(strategy='mean'),
)
predicted = aux.ImputeHelper(mean_step).run(valid, validate_on=valid_true)

Final validation: 100%|██████████| 15/15 [00:00<00:00, 422.15it/s]


Final validation score: 0.9458409606286409
Overall final score: 0.07871724952961066


In [16]:
# Same single-step setup as the mean baseline, but with SimpleImputer's
# 'median' strategy.
median_step = aux.Step(
    aux.transformer,
    aux.subcol[sn],
    imputer=SimpleImputer(strategy='median'),
)
predicted = aux.ImputeHelper(median_step).run(valid, validate_on=valid_true)

Final validation: 100%|██████████| 15/15 [00:00<00:00, 380.51it/s]


Final validation score: 0.946407242477418
Overall final score: 0.07875267965525354


In [40]:
# Stage 1: aux.groupstat over subsets 2 and `sn` combined, with F_2_4 and F_2_8
# passed as `gcol` (presumably the grouping columns -- confirm in aux.groupstat).
group_step = aux.Step(
    aux.groupstat,
    aux.subcol[2] + aux.subcol[sn],
    gcol=['F_2_4', 'F_2_8'],
)

# Stage 2: default SimpleImputer over subset `sn`. The recorded output warns that
# the group stats contain NaN, so this step presumably fills what stage 1 left.
fallback_step = aux.Step(
    aux.transformer,
    aux.subcol[sn],
    imputer=SimpleImputer(),
)

predicted = aux.ImputeHelper(group_step, fallback_step).run(valid, validate_on=valid_true)

Stats contain NaN, so data was not filled in completely!


Final validation: 100%|██████████| 15/15 [00:00<00:00, 409.07it/s]


Final validation score: 0.9459950582305936
Overall final score: 0.07872988528282197


## predictor

In [4]:
# Model-based run: one aux.predictor step over subset `sn`, driven by a seeded
# LightGBM regressor (n_estimators=200, num_leaves=29).
lgbm_estimator = LGBMRegressor(
    random_state=7,
    n_jobs=-1,
    n_estimators=200,
    num_leaves=29,
)
predictor_step = aux.Step(aux.predictor, aux.subcol[sn], estimator=lgbm_estimator)
predicted = aux.ImputeHelper(predictor_step).run(valid, validate_on=valid_true)

100%|██████████| 15/15 [00:54<00:00,  3.61s/it, avg. score=0.933]
Final validation: 100%|██████████| 15/15 [00:00<00:00, 376.01it/s]


Final validation score: 0.9468399368083408
Overall final score: 0.07880115193845771


## mean matching

In [5]:
# Model handed to mean matching as `init`: default SimpleImputer followed by a
# seeded decision tree capped at 50 leaf nodes.
pipeline = make_pipeline(
    SimpleImputer(),
    DecisionTreeRegressor(random_state=7, max_leaf_nodes=50),
)

# One aux.mean_matching step over subset `sn` with N=2500 and the threading backend.
matching_step = aux.Step(
    aux.mean_matching,
    aux.subcol[sn],
    N=2500,
    init=pipeline,
    backend='threading',
)
predicted = aux.ImputeHelper(matching_step).run(valid, validate_on=valid_true)

Initiate values: 100%|██████████| 15/15 [01:33<00:00,  6.23s/it, avg. score=0.945]
Collect remapper: 100%|██████████| 15/15 [00:00<00:00, 3233.35it/s]
Final validation: 100%|██████████| 15/15 [00:00<00:00, 483.31it/s]

Final validation score: 0.9459761479273461
Overall final score: 0.07872871604323141





In [6]:
# Same mean-matching experiment with a larger tree: the decision tree is now
# allowed up to 100 leaf nodes.
pipeline = make_pipeline(
    SimpleImputer(),
    DecisionTreeRegressor(random_state=7, max_leaf_nodes=100),
)

# One aux.mean_matching step over subset `sn` with N=2500 and the threading backend.
matching_step = aux.Step(
    aux.mean_matching,
    aux.subcol[sn],
    N=2500,
    init=pipeline,
    backend='threading',
)
predicted = aux.ImputeHelper(matching_step).run(valid, validate_on=valid_true)

Initiate values: 100%|██████████| 15/15 [01:54<00:00,  7.64s/it, avg. score=0.944]
Collect remapper: 100%|██████████| 15/15 [00:00<00:00, 5871.09it/s]
Final validation: 100%|██████████| 15/15 [00:00<00:00, 402.23it/s]


Final validation score: 0.9459910474734456
Overall final score: 0.07872953099368132


In [7]:
# Mean matching again, this time with a small seeded LightGBM regressor
# (n_estimators=5, num_leaves=13) passed as `init` instead of a tree pipeline.
lgbm_init = LGBMRegressor(
    random_state=7,
    n_jobs=-1,
    n_estimators=5,
    num_leaves=13,
)
matching_step = aux.Step(
    aux.mean_matching,
    aux.subcol[sn],
    N=2500,
    init=lgbm_init,
    backend='threading',
)
predicted = aux.ImputeHelper(matching_step).run(valid, validate_on=valid_true)

Initiate values: 100%|██████████| 15/15 [00:06<00:00,  2.20it/s, avg. score=0.945]
Collect remapper: 100%|██████████| 15/15 [00:00<00:00, 254.37it/s]
Final validation: 100%|██████████| 15/15 [00:00<00:00, 423.72it/s]

Final validation score: 0.9460813265472827
Overall final score: 0.07873790926574317





## MICE

In [8]:
# MICE run: one aux.mice step over subset `sn`, 5 epochs, with an SGD regressor
# as the estimator.
# NOTE(review): SGDRegressor is created without random_state and scikit-learn
# shuffles the training data by default, so scores may vary between runs unless
# aux.mice applies `seed` to the estimator -- confirm.
sgd_estimator = SGDRegressor(max_iter=1000)
mice_step = aux.Step(
    aux.mice,
    aux.subcol[sn],
    estimator=sgd_estimator,
    epochs=5,
    seed=11,
    autosplit=False,
)
predicted = aux.ImputeHelper(mice_step).run(valid, validate_on=valid_true)

Epoch 1 / 5: 100%|██████████| 15/15 [00:06<00:00,  2.31it/s, avg. score=0.946]
Epoch 2 / 5: 100%|██████████| 15/15 [00:06<00:00,  2.29it/s, avg. score=0.946]
Epoch 3 / 5: 100%|██████████| 15/15 [00:06<00:00,  2.42it/s, avg. score=0.946]
Epoch 4 / 5: 100%|██████████| 15/15 [00:06<00:00,  2.41it/s, avg. score=0.946]
Epoch 5 / 5: 100%|██████████| 15/15 [00:06<00:00,  2.33it/s, avg. score=0.946]
Final validation: 100%|██████████| 15/15 [00:00<00:00, 456.87it/s]

Final validation score: 0.9467771309784687
Overall final score: 0.07879497630442307





In [9]:
# MICE run with a tuned SGD regressor: adaptive learning rate starting at
# eta0=0.1 and regularisation strength alpha=0.001.
# NOTE(review): the estimator has no random_state; unless aux.mice applies `seed`
# to it, repeated runs may not reproduce the recorded score -- confirm.
sgd_adaptive = SGDRegressor(
    learning_rate='adaptive',
    eta0=0.1,
    alpha=0.001,
    max_iter=1000,
)
mice_step = aux.Step(
    aux.mice,
    aux.subcol[sn],
    estimator=sgd_adaptive,
    epochs=5,
    seed=11,
    autosplit=False,
)
predicted = aux.ImputeHelper(mice_step).run(valid, validate_on=valid_true)

Epoch 1 / 5: 100%|██████████| 15/15 [00:45<00:00,  3.04s/it, avg. score=0.946]
Epoch 2 / 5: 100%|██████████| 15/15 [00:46<00:00,  3.08s/it, avg. score=0.946]
Epoch 3 / 5: 100%|██████████| 15/15 [00:43<00:00,  2.93s/it, avg. score=0.946]
Epoch 4 / 5: 100%|██████████| 15/15 [00:44<00:00,  3.00s/it, avg. score=0.946]
Epoch 5 / 5: 100%|██████████| 15/15 [00:46<00:00,  3.13s/it, avg. score=0.946]
Final validation: 100%|██████████| 15/15 [00:00<00:00, 529.50it/s]

Final validation score: 0.9458632858361525
Overall final score: 0.07871894861898743





## test

In [6]:
# `pipeline` is only referenced by the commented-out mean_matching step below;
# the active steps in this cell do not use it.
pipeline = make_pipeline(
    SimpleImputer(),
    DecisionTreeRegressor(min_samples_leaf=111),
)

# Commented-out alternative steps, kept for reference:
# aux.Step(aux.mean_matching, aux.subcol[sn], N=5, init=pipeline, backend='threading', max_fill_nan_count=1),
# aux.Step(aux.mice, aux.subcol[sn], estimator=SGDRegressor(max_iter=1000), epochs=5, seed=11, autosplit=False),
# aux.Step(aux.predictor, aux.subcol[sn], estimator=LGBMRegressor(random_state=7, n_jobs=-1, n_estimators=2000, num_leaves=29), max_fill_nan_count=1),
# aux.Step(aux.transformer, aux.subcol[sn], imputer=SimpleImputer(strategy='mean')),

# Active configuration, step 1: aux.cosine_stats over subset `sn`.
cosine_step = aux.Step(
    aux.cosine_stats,
    aux.subcol[sn],
    threshold=0.7,
    subsample=0.05,
    backend='threading',
    seed=17,
)
# Step 2: mean SimpleImputer over the same subset. The recorded output reports
# that some values are still NaN after cosine_stats, so this presumably fills them.
mean_fill_step = aux.Step(
    aux.transformer,
    aux.subcol[sn],
    imputer=SimpleImputer(strategy='mean'),
)

predicted = aux.ImputeHelper(cosine_step, mean_fill_step).run(valid, validate_on=valid_true)

Using `cosine_stats()` is deprecated! It may take a VERY long time on large data, work unstable or don't work at all.
87877 rows in 1758 chunks


100%|██████████| 1758/1758 [07:39<00:00,  3.82it/s]


Some values are NaN. Try decrease threshold.


Final validation: 100%|██████████| 15/15 [00:00<00:00, 483.22it/s]

Final validation score: 0.9543703617457886
Overall final score: 0.07942118458080594





In [17]:
# TRY THIS
# Stricter cosine_stats configuration (threshold=0.9, subsample=0.1,
# max_fill_nan_count=1) followed by a mean SimpleImputer step.
cosine_step = aux.Step(
    aux.cosine_stats,
    aux.subcol[sn],
    threshold=0.9,
    subsample=0.1,
    backend='threading',
    seed=17,
    max_fill_nan_count=1,
)
# Commented-out alternative middle step:
# aux.Step(aux.predictor, aux.subcol[sn], estimator=LGBMRegressor(random_state=7, n_jobs=-1, n_estimators=100, num_leaves=29)),
mean_fill_step = aux.Step(
    aux.transformer,
    aux.subcol[sn],
    imputer=SimpleImputer(strategy='mean'),
)

predicted = aux.ImputeHelper(cosine_step, mean_fill_step).run(valid, validate_on=valid_true)

In [6]:
# IterativeImputer (imported with the other scikit-learn names in the first
# cell, so every cell that uses it works after a fresh Restart & Run All)
# wrapped around a LightGBM regressor and run as one aux.transformer step over
# subset `sn`.
# NOTE(review): the LGBMRegressor here carries no random_state; only the
# IterativeImputer is seeded -- confirm whether that is intended.
predicted = aux.ImputeHelper(
    aux.Step(
        aux.transformer,
        aux.subcol[sn],
        imputer=IterativeImputer(
            LGBMRegressor(n_jobs=-1, n_estimators=100, num_leaves=29),
            max_iter=50,
            random_state=11,
        ),
    ),
).run(valid, validate_on=valid_true)

Final validation: 100%|██████████| 15/15 [00:00<00:00, 326.25it/s]


Final validation score: 0.9464343787314623
Overall final score: 0.07876676098004146


In [5]:
%%time
# IterativeImputer around a default SGD regressor (up to 50 imputation rounds),
# run as one aux.transformer step over subset `sn`.
# NOTE(review): SGDRegressor() has no random_state of its own; IterativeImputer's
# random_state may not cover the wrapped estimator -- confirm reproducibility.
iterative_sgd = IterativeImputer(
    SGDRegressor(),
    max_iter=50,
    random_state=11,
)
iterative_step = aux.Step(aux.transformer, aux.subcol[sn], imputer=iterative_sgd)
predicted = aux.ImputeHelper(iterative_step).run(valid, validate_on=valid_true)

Final validation: 100%|██████████| 15/15 [00:00<00:00, 453.97it/s]

Final validation score: 0.9465468724349905
Overall final score: 0.07877631765432132
CPU times: user 12min 31s, sys: 24.8 s, total: 12min 56s
Wall time: 5min 32s





In [2]:
# cosine_stats (threshold=0.8, subsample=0.5, max_fill_nan_count=1) followed by a
# default SimpleImputer step. The subset is selected through the shared `sn`
# setting (1 in the setup cell) instead of a hard-coded index, so this cell
# follows the same configuration as every other experiment in the notebook.
# The commented-out lines are alternative steps, spelled with the `aux.` prefix
# so they can be re-enabled as-is.
predicted = aux.ImputeHelper(
    # aux.Step(aux.transformer, aux.subcol[sn], imputer=SimpleImputer()),
    # aux.Step(aux.transformer, aux.subcol[sn], imputer=IterativeImputer(SGDRegressor(), max_iter=50, random_state=11)),
    # aux.Step(aux.transformer, aux.subcol[sn], imputer=SimpleImputer(), max_fill_nan_count=1),
    aux.Step(aux.cosine_stats, aux.subcol[sn], threshold=0.8, subsample=0.5, backend='threading', seed=17, max_fill_nan_count=1),
    # aux.Step(aux.predictor, aux.subcol[sn], estimator=LGBMRegressor(random_state=7, n_jobs=-1, n_estimators=100, num_leaves=29)),
    # aux.Step(aux.mice, aux.subcol[sn], estimator=SGDRegressor(), epochs=10, seed=11, autosplit=False),
    aux.Step(aux.transformer, aux.subcol[sn], imputer=SimpleImputer()),
).run(valid, validate_on=valid_true)

Using `cosine_stats()` is deprecated! It may take a VERY long time on large data, work unstable or don't work at all.
36834 rows in 737 chunks


100%|██████████| 737/737 [16:36<00:00,  1.35s/it]


Some values are NaN. Try decrease threshold.


Final validation: 100%|██████████| 15/15 [00:00<00:00, 375.66it/s]


Final validation score: 0.948455967517788
Overall final score: 0.07893368806759937


In [None]:
# Reference: 0.9458409606286409 is the final validation score of the SimpleImputer(strategy='mean') run above.