In [2]:
import sys
import pathlib
sys.path.append(pathlib.Path().cwd().parent.as_posix())

import auxiliary as aux

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor


# Validation setup shared by every experiment cell below.
# NOTE(review): presumably `valid_true` holds the ground truth and `valid` the
# copy to be imputed (they are later passed as run(valid, validate_on=valid_true)) — confirm in aux.
VALID_FRAC = 0.0125   # `frac` argument of aux.train_valid_split
SPLIT_SEED = 19       # `seed` argument of aux.train_valid_split
valid_true, valid = aux.train_valid_split(aux.data, frac=VALID_FRAC, seed=SPLIT_SEED)

# Index into aux.subcol: picks the column subset every experiment below works on.
sn = 4

## statistics

In [4]:
# Baseline: a single step over column subset `sn` that uses a default
# SimpleImputer (scikit-learn's default strategy is the column mean),
# validated against `valid_true`.
mean_impute_step = aux.Step(aux.transformer, aux.subcol[sn], imputer=SimpleImputer())
mean_impute_helper = aux.ImputeHelper(mean_impute_step)
predicted = mean_impute_helper.run(valid, validate_on=valid_true)

Final validation: 100%|██████████| 15/15 [00:00<00:00, 457.24it/s]


Final validation score: 1.9085581040001287
Overall final score: 0.1824617746726912


In [5]:
# Group-statistic step: operates on subset 2 combined (via `+`) with subset `sn`,
# with 'F_2_10' passed as the only `gcol` entry.
# NOTE(review): presumably `gcol` is the grouping column and the fill value is a
# per-group statistic — neither is visible here; confirm in aux.groupstat.
groupstat_columns = aux.subcol[2] + aux.subcol[sn]
groupstat_step = aux.Step(aux.groupstat, groupstat_columns, gcol=['F_2_10'])
predicted = aux.ImputeHelper(groupstat_step).run(valid, validate_on=valid_true)

Final validation: 100%|██████████| 15/15 [00:00<00:00, 460.15it/s]


Final validation score: 1.9087085139675106
Overall final score: 0.18248992329459227


## predictor

In [7]:
# Model-based step over subset `sn` using a LightGBM regressor (1500 trees, 29 leaves).
# The recorded progress bar for this cell stops at 10/15 after about nine minutes,
# so no final validation score was captured for this configuration.
lgbm_estimator = LGBMRegressor(random_state=7, n_jobs=-1, n_estimators=1500, num_leaves=29)
lgbm_step = aux.Step(aux.predictor, aux.subcol[sn], estimator=lgbm_estimator)
predicted = aux.ImputeHelper(lgbm_step).run(valid, validate_on=valid_true)

 67%|██████▋   | 10/15 [08:59<05:14, 62.87s/it, avg. score=0.618]

## mean matching

In [None]:
# Initial model handed to mean matching: default SimpleImputer followed by a
# decision tree capped at 50 leaf nodes.
tree_50 = DecisionTreeRegressor(random_state=7, max_leaf_nodes=50)
pipeline = make_pipeline(SimpleImputer(), tree_50)

# Mean-matching step over subset `sn` with N=2500 and the 'threading' backend.
# NOTE(review): the meaning of `N` is defined in aux.mean_matching — confirm there.
mean_matching_step = aux.Step(
    aux.mean_matching,
    aux.subcol[sn],
    N=2500,
    init=pipeline,
    backend='threading',
)
predicted = aux.ImputeHelper(mean_matching_step).run(valid, validate_on=valid_true)

In [None]:
# Initial model handed to mean matching: default SimpleImputer followed by a
# decision tree capped at 100 leaf nodes.
tree_100 = DecisionTreeRegressor(random_state=7, max_leaf_nodes=100)
pipeline = make_pipeline(SimpleImputer(), tree_100)

# Mean-matching step over subset `sn` with N=2500 and the 'threading' backend.
# NOTE(review): the meaning of `N` is defined in aux.mean_matching — confirm there.
mean_matching_step = aux.Step(
    aux.mean_matching,
    aux.subcol[sn],
    N=2500,
    init=pipeline,
    backend='threading',
)
predicted = aux.ImputeHelper(mean_matching_step).run(valid, validate_on=valid_true)

In [None]:
# Mean matching over subset `sn`, initialised with a very small LightGBM model
# (5 trees, 13 leaves) instead of an imputer + tree pipeline.
small_lgbm = LGBMRegressor(random_state=7, n_jobs=-1, n_estimators=5, num_leaves=13)
lgbm_matching_step = aux.Step(
    aux.mean_matching,
    aux.subcol[sn],
    N=2500,
    init=small_lgbm,
    backend='threading',
)
predicted = aux.ImputeHelper(lgbm_matching_step).run(valid, validate_on=valid_true)

## MICE

In [None]:
# MICE step over subset `sn`: SGDRegressor with max_iter=1000, 5 epochs, seed 11,
# autosplit disabled.
sgd_default = SGDRegressor(max_iter=1000)
mice_step = aux.Step(
    aux.mice,
    aux.subcol[sn],
    estimator=sgd_default,
    epochs=5,
    seed=11,
    autosplit=False,
)
predicted = aux.ImputeHelper(mice_step).run(valid, validate_on=valid_true)

In [None]:
# MICE step over subset `sn` with a tuned SGDRegressor: adaptive learning-rate
# schedule starting from eta0=0.1 and regularisation strength alpha=0.001.
sgd_adaptive = SGDRegressor(learning_rate='adaptive', eta0=0.1, alpha=0.001, max_iter=1000)
adaptive_mice_step = aux.Step(
    aux.mice,
    aux.subcol[sn],
    estimator=sgd_adaptive,
    epochs=5,
    seed=11,
    autosplit=False,
)
predicted = aux.ImputeHelper(adaptive_mice_step).run(valid, validate_on=valid_true)

## test

In [3]:
# Chained experiment: three identical MICE steps (max_fill_nan_count=1) followed by
# a LightGBM predictor step. Each MICE step gets its own fresh SGDRegressor instance.
# NOTE(review): in the recorded output only the first MICE step shows progress over
# 15 items; both repeats and the LightGBM step report `0it`. Presumably nothing is
# left to fill after the first pass — confirm whether the repeats were meant to use
# a different max_fill_nan_count.
mice_steps = [
    aux.Step(
        aux.mice,
        aux.subcol[sn],
        estimator=SGDRegressor(max_iter=1000),
        epochs=5,
        seed=11,
        autosplit=False,
        max_fill_nan_count=1,
    )
    for _ in range(3)
]
final_lgbm_step = aux.Step(
    aux.predictor,
    aux.subcol[sn],
    estimator=LGBMRegressor(random_state=7, n_jobs=-1, n_estimators=200, num_leaves=29),
)
predicted = aux.ImputeHelper(*mice_steps, final_lgbm_step).run(valid, validate_on=valid_true)

Epoch 1 / 5: 100%|██████████| 15/15 [00:08<00:00,  1.78it/s, avg. score=0.993]
Epoch 2 / 5: 100%|██████████| 15/15 [00:07<00:00,  1.89it/s, avg. score=0.686]
Epoch 3 / 5: 100%|██████████| 15/15 [00:08<00:00,  1.84it/s, avg. score=0.669]
Epoch 4 / 5: 100%|██████████| 15/15 [00:08<00:00,  1.84it/s, avg. score=0.668]
Epoch 5 / 5: 100%|██████████| 15/15 [00:08<00:00,  1.70it/s, avg. score=0.669]
Epoch 1 / 5: 0it [00:00, ?it/s]
Epoch 2 / 5: 0it [00:00, ?it/s]
Epoch 3 / 5: 0it [00:00, ?it/s]
Epoch 4 / 5: 0it [00:00, ?it/s]
Epoch 5 / 5: 0it [00:00, ?it/s]
Epoch 1 / 5: 0it [00:00, ?it/s]
Epoch 2 / 5: 0it [00:00, ?it/s]
Epoch 3 / 5: 0it [00:00, ?it/s]
Epoch 4 / 5: 0it [00:00, ?it/s]
Epoch 5 / 5: 0it [00:00, ?it/s]
0it [00:00, ?it/s]
Final validation: 100%|██████████| 15/15 [00:00<00:00, 451.12it/s]

Final validation score: 0.9322867447162698
Overall final score: 0.0864203314210984





In [None]:
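# NOTE(review): unlabeled scores kept from an earlier run (presumably final validation score / avg. score — confirm): 0.9054178709836844 / 0.444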