In [1]:
import sys
import pathlib
# Make the parent of the current working directory importable so that the
# project-local `auxiliary` module imported below can be found.
sys.path.append(pathlib.Path.cwd().parent.as_posix())

import auxiliary as aux

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor


# Hold-out split shared by every experiment cell below; both returned objects
# are passed to `.run(valid, validate_on=valid_true)`.
# NOTE(review): presumably `valid` has values masked out and `valid_true` holds
# the ground truth -- confirm against aux.train_valid_split.
split_options = dict(frac=0.0125, seed=19)
valid_true, valid = aux.train_valid_split(aux.data, **split_options)

# Subset number: used as the index into `aux.subcol` by the experiment cells.
sn = 4

In [24]:
# Scratch check: do the two lists hold exactly the same distinct elements?
L1, L2 = ['a', 'b'], ['c']
set(L1) == set(L2)

False

## Statistics-based imputation (SimpleImputer and group statistics)

In [9]:
# Single-step run: `aux.transformer` applied to the `aux.subcol[sn]` columns
# with a scikit-learn SimpleImputer (its default strategy is the mean).
simple_impute_step = aux.Step(aux.transformer, aux.subcol[sn], imputer=SimpleImputer())
predicted = aux.ImputeHelper(simple_impute_step).run(valid, validate_on=valid_true)

Final validation: 100%|██████████| 15/15 [00:00<00:00, 384.83it/s]


Final validation score: 1.9085581040001287
Overall final score: 0.1824617746726912


In [4]:
# Single-step run: `aux.groupstat` over subsets 2 and `sn` combined.
# NOTE(review): `gcol` is presumably the grouping column(s) -- confirm in aux.
groupstat_columns = aux.subcol[2] + aux.subcol[sn]
groupstat_step = aux.Step(aux.groupstat, groupstat_columns, gcol=['F_2_10'])
predicted = aux.ImputeHelper(groupstat_step).run(valid, validate_on=valid_true)

Final validation: 100%|██████████| 15/15 [00:00<00:00, 421.08it/s]


Final validation score: 1.9087085139675106
Overall final score: 0.18248992329459227


## Predictor-based imputation (LightGBM regressor)

In [5]:
# Single-step run: `aux.predictor` on the `aux.subcol[sn]` columns with a
# large LightGBM regressor (1500 trees, 29 leaves, fixed seed).
lgbm_estimator = LGBMRegressor(random_state=7, n_jobs=-1, n_estimators=1500, num_leaves=29)
predictor_step = aux.Step(aux.predictor, aux.subcol[sn], estimator=lgbm_estimator)
predicted = aux.ImputeHelper(predictor_step).run(valid, validate_on=valid_true)

100%|██████████| 15/15 [08:59<00:00, 35.96s/it, avg. score=0.555]
Final validation: 100%|██████████| 15/15 [00:00<00:00, 442.11it/s]


Final validation score: 0.6478440903133111
Overall final score: 0.0603447382289857


## mean matching

In [6]:
# Initial model handed to mean matching via `init=`: fill NaNs with
# SimpleImputer, then fit a decision tree capped at 50 leaf nodes.
nan_filler = SimpleImputer()
small_tree = DecisionTreeRegressor(random_state=7, max_leaf_nodes=50)
pipeline = make_pipeline(nan_filler, small_tree)

# Single-step run of `aux.mean_matching` on the `aux.subcol[sn]` columns.
matching_step = aux.Step(
    aux.mean_matching,
    aux.subcol[sn],
    N=2500,
    init=pipeline,
    backend='threading',
)
predicted = aux.ImputeHelper(matching_step).run(valid, validate_on=valid_true)

Initiate values: 100%|██████████| 15/15 [01:20<00:00,  5.38s/it, avg. score=1.55]
Collect remapper: 100%|██████████| 15/15 [00:00<00:00, 2332.85it/s]
Final validation: 100%|██████████| 15/15 [00:00<00:00, 393.98it/s]


Final validation score: 1.9108438206748157
Overall final score: 0.1827651397711478


In [7]:
# Same mean-matching experiment with a larger initial tree (100 leaf nodes
# instead of 50); the tree is again preceded by a SimpleImputer.
nan_filler = SimpleImputer()
larger_tree = DecisionTreeRegressor(random_state=7, max_leaf_nodes=100)
pipeline = make_pipeline(nan_filler, larger_tree)

matching_step = aux.Step(
    aux.mean_matching,
    aux.subcol[sn],
    N=2500,
    init=pipeline,
    backend='threading',
)
predicted = aux.ImputeHelper(matching_step).run(valid, validate_on=valid_true)

Initiate values: 100%|██████████| 15/15 [01:26<00:00,  5.78s/it, avg. score=1.49]
Collect remapper: 100%|██████████| 15/15 [00:00<00:00, 6456.09it/s]
Final validation: 100%|██████████| 15/15 [00:00<00:00, 446.51it/s]

Final validation score: 1.909850799324801
Overall final score: 0.18259287260794207





In [8]:
# Mean matching with a deliberately small LightGBM model (5 trees, 13 leaves)
# as the `init=` estimator instead of a decision-tree pipeline.
light_init = LGBMRegressor(random_state=7, n_jobs=-1, n_estimators=5, num_leaves=13)
matching_step = aux.Step(
    aux.mean_matching,
    aux.subcol[sn],
    N=2500,
    init=light_init,
    backend='threading',
)
predicted = aux.ImputeHelper(matching_step).run(valid, validate_on=valid_true)

Initiate values: 100%|██████████| 15/15 [00:07<00:00,  1.88it/s, avg. score=1.73]
Collect remapper: 100%|██████████| 15/15 [00:00<00:00, 888.18it/s]
Final validation: 100%|██████████| 15/15 [00:00<00:00, 401.77it/s]


Final validation score: 1.9107521843225386
Overall final score: 0.1827374542731559


## MICE (Multivariate Imputation by Chained Equations)

In [None]:
# Single-step run: `aux.mice` with an SGDRegressor (max_iter=1000, other
# settings at their defaults), 5 epochs, fixed seed.
sgd_estimator = SGDRegressor(max_iter=1000)
mice_step = aux.Step(
    aux.mice,
    aux.subcol[sn],
    estimator=sgd_estimator,
    epochs=5,
    seed=11,
    autosplit=False,
)
predicted = aux.ImputeHelper(mice_step).run(valid, validate_on=valid_true)

In [None]:
# Single-step run: `aux.mice` with an SGDRegressor using the adaptive
# learning-rate schedule (eta0=0.1) and alpha=0.001; 5 epochs, fixed seed.
adaptive_sgd = SGDRegressor(learning_rate='adaptive', eta0=0.1, alpha=0.001, max_iter=1000)
mice_step = aux.Step(
    aux.mice,
    aux.subcol[sn],
    estimator=adaptive_sgd,
    epochs=5,
    seed=11,
    autosplit=False,
)
predicted = aux.ImputeHelper(mice_step).run(valid, validate_on=valid_true)

## Test: three MICE steps followed by a LightGBM predictor step

In [3]:
# Three identically configured MICE steps, each built with its own fresh
# SGDRegressor and `max_fill_nan_count=1`, followed by one LGBM predictor step.
# NOTE(review): in the recorded output only the first MICE step iterates over
# columns; the later steps report "0it" -- confirm whether that is intended.
mice_steps = [
    aux.Step(
        aux.mice,
        aux.subcol[sn],
        estimator=SGDRegressor(max_iter=1000),
        epochs=5,
        seed=11,
        autosplit=False,
        max_fill_nan_count=1,
    )
    for _ in range(3)
]
predictor_step = aux.Step(
    aux.predictor,
    aux.subcol[sn],
    estimator=LGBMRegressor(random_state=7, n_jobs=-1, n_estimators=200, num_leaves=29),
)
predicted = aux.ImputeHelper(*mice_steps, predictor_step).run(valid, validate_on=valid_true)

Epoch 1 / 5: 100%|██████████| 15/15 [00:08<00:00,  1.78it/s, avg. score=0.993]
Epoch 2 / 5: 100%|██████████| 15/15 [00:07<00:00,  1.89it/s, avg. score=0.686]
Epoch 3 / 5: 100%|██████████| 15/15 [00:08<00:00,  1.84it/s, avg. score=0.669]
Epoch 4 / 5: 100%|██████████| 15/15 [00:08<00:00,  1.84it/s, avg. score=0.668]
Epoch 5 / 5: 100%|██████████| 15/15 [00:08<00:00,  1.70it/s, avg. score=0.669]
Epoch 1 / 5: 0it [00:00, ?it/s]
Epoch 2 / 5: 0it [00:00, ?it/s]
Epoch 3 / 5: 0it [00:00, ?it/s]
Epoch 4 / 5: 0it [00:00, ?it/s]
Epoch 5 / 5: 0it [00:00, ?it/s]
Epoch 1 / 5: 0it [00:00, ?it/s]
Epoch 2 / 5: 0it [00:00, ?it/s]
Epoch 3 / 5: 0it [00:00, ?it/s]
Epoch 4 / 5: 0it [00:00, ?it/s]
Epoch 5 / 5: 0it [00:00, ?it/s]
0it [00:00, ?it/s]
Final validation: 100%|██████████| 15/15 [00:00<00:00, 451.12it/s]

Final validation score: 0.9322867447162698
Overall final score: 0.0864203314210984





In [None]:
# Reference numbers copied from the single-step LGBMRegressor predictor run
# (n_estimators=1500, num_leaves=29) recorded earlier in this notebook:
# Final validation score: 0.6478440903133111
# Overall final score: 0.0603447382289857