In [24]:
from knnimpute import (
    knn_impute_few_observed,
    knn_impute_with_argpartition,
    knn_impute_optimistic,
    knn_impute_reference,
)
import time
import numpy as np
import pandas as pd
from collections import OrderedDict

In [25]:
# Implementations to benchmark. Timing columns are later created in this
# order, and `knn_impute_reference` is the baseline the others are compared to.
fns = [
    knn_impute_few_observed,
    knn_impute_with_argpartition,
    knn_impute_optimistic,
    knn_impute_reference,
]

def time_implementations(X, k, implementations=None):
    """Time one call of each imputation implementation on ``X``.

    Parameters
    ----------
    X : np.ndarray
        Data matrix whose missing entries are encoded as NaN; the missing
        mask handed to every implementation is ``np.isnan(X)``.
    k : int
        Forwarded to every implementation as the ``k`` keyword argument.
    implementations : iterable of callables, optional
        Functions invoked as ``fn(X, mask, k=k)``. When omitted, the
        module-level ``fns`` list is used.

    Returns
    -------
    dict
        Maps each function's ``__name__`` to the elapsed seconds of its
        single call.
    """
    if implementations is None:
        implementations = fns
    mask = np.isnan(X)
    result_dict = {}
    for fn in implementations:
        # perf_counter() is a monotonic, high-resolution clock intended for
        # measuring short durations; time.time() follows the wall clock and
        # can jump if the system time is adjusted mid-benchmark.
        start_t = time.perf_counter()
        fn(X, mask, k=k)
        result_dict[fn.__name__] = time.perf_counter() - start_t
    return result_dict

In [26]:
def create_rank_k_dataset(
        n_rows=5,
        n_cols=5,
        k=3,
        fraction_missing=0.1,
        random_seed=0):
    """Build a random low-rank matrix with some entries replaced by NaN.

    The matrix is the product of an ``(n_rows, k)`` and a ``(k, n_cols)``
    standard-normal matrix, so its rank is at most ``k``. Each entry is then
    independently set to NaN when a uniform [0, 1) draw falls below
    ``fraction_missing``.

    Parameters
    ----------
    n_rows, n_cols : int
        Shape of the returned matrix.
    k : int
        Inner dimension of the two random factors.
    fraction_missing : float
        Per-entry probability of being replaced by NaN.
    random_seed : int or None
        Seed for the private random generator.

    Returns
    -------
    np.ndarray
        Float array of shape ``(n_rows, n_cols)`` containing NaNs.
    """
    # A private RandomState yields the same draws as np.random.seed(random_seed)
    # followed by the module-level functions, but does not reset the
    # process-wide generator as a hidden side effect.
    rng = np.random.RandomState(random_seed)
    x = rng.randn(n_rows, k)
    y = rng.randn(k, n_cols)
    XY = np.dot(x, y)
    missing_raw_values = rng.uniform(0, 1, (n_rows, n_cols))
    missing_mask = missing_raw_values < fraction_missing
    XY[missing_mask] = np.nan
    return XY

In [27]:
# Run every implementation over a grid of settings and collect one row of
# timings per configuration; column order is settings first, then one
# column per implementation in `fns` order.
setting_names = ["k", "n_rows", "n_cols", "frac_missing"]
columns = OrderedDict(
    (key, []) for key in setting_names + [fn.__name__ for fn in fns]
)

for k in [1, 3]:
    for n_rows in [250, 1000]:
        for n_cols in [25, 100, 400]:
            for frac_missing in [0.05, 0.25]:
                print(k, n_rows, n_cols, frac_missing)
                setting_values = [k, n_rows, n_cols, frac_missing]
                for key, value in zip(setting_names, setting_values):
                    columns[key].append(value)
                # `k` is forwarded to the imputers only; the dataset
                # generator keeps its own default `k`.
                X = create_rank_k_dataset(
                    n_rows=n_rows,
                    n_cols=n_cols,
                    fraction_missing=frac_missing)
                timings = time_implementations(X, k)
                for name in timings:
                    columns[name].append(timings[name])
df = pd.DataFrame(columns)

1 250 25 0.05
1 250 25 0.25
1 250 100 0.05
1 250 100 0.25
1 250 400 0.05
1 250 400 0.25
1 1000 25 0.05
1 1000 25 0.25
1 1000 100 0.05
1 1000 100 0.25
1 1000 400 0.05
1 1000 400 0.25
3 250 25 0.05
3 250 25 0.25
3 250 100 0.05
3 250 100 0.25
3 250 400 0.05
3 250 400 0.25
3 1000 25 0.05
3 1000 25 0.25
3 1000 100 0.05
3 1000 100 0.25
3 1000 400 0.05
3 1000 400 0.25


In [29]:
df

Unnamed: 0,k,n_rows,n_cols,frac_missing,knn_impute_few_observed,knn_impute_with_argpartition,knn_impute_optimistic,knn_impute_reference
0,1,250,25,0.05,0.041315,0.023684,0.026951,0.029649
1,1,250,25,0.25,0.039408,0.055361,0.049052,0.073835
2,1,250,100,0.05,0.057301,0.080033,0.063674,0.100215
3,1,250,100,0.25,0.140592,0.202042,0.175675,0.320111
4,1,250,400,0.05,0.235964,0.241643,0.173593,0.29966
5,1,250,400,0.25,0.549346,0.701175,0.38172,1.004569
6,1,1000,25,0.05,0.275119,0.240917,0.318955,0.298176
7,1,1000,25,0.25,0.448762,0.414425,0.439901,0.761481
8,1,1000,100,0.05,0.78688,0.723401,0.719038,1.002797
9,1,1000,100,0.25,1.475442,1.524288,1.343072,2.980739


In [32]:
# Natural log of (reference time / implementation time) for each alternative:
# positive means the alternative was faster than the reference.
reference_seconds = df["knn_impute_reference"]
for label, timing_column in [
        ("argpartition", "knn_impute_with_argpartition"),
        ("few_observed", "knn_impute_few_observed"),
        ("optimistic", "knn_impute_optimistic")]:
    df[label + "_log_ratio"] = np.log(reference_seconds / df[timing_column])

In [33]:
df

Unnamed: 0,k,n_rows,n_cols,frac_missing,knn_impute_few_observed,knn_impute_with_argpartition,knn_impute_optimistic,knn_impute_reference,argpartition_log_ratio,few_observed_log_ratio,optimistic_log_ratio
0,1,250,25,0.05,0.041315,0.023684,0.026951,0.029649,0.224628,-0.331798,0.095415
1,1,250,25,0.25,0.039408,0.055361,0.049052,0.073835,0.287956,0.627863,0.408951
2,1,250,100,0.05,0.057301,0.080033,0.063674,0.100215,0.224878,0.558999,0.453542
3,1,250,100,0.25,0.140592,0.202042,0.175675,0.320111,0.460193,0.822807,0.600033
4,1,250,400,0.05,0.235964,0.241643,0.173593,0.29966,0.215187,0.23897,0.545936
5,1,250,400,0.25,0.549346,0.701175,0.38172,1.004569,0.359556,0.603586,0.967626
6,1,1000,25,0.05,0.275119,0.240917,0.318955,0.298176,0.213232,0.08048,-0.067366
7,1,1000,25,0.25,0.448762,0.414425,0.439901,0.761481,0.608374,0.528773,0.548716
8,1,1000,100,0.05,0.78688,0.723401,0.719038,1.002797,0.326584,0.242472,0.332634
9,1,1000,100,0.25,1.475442,1.524288,1.343072,2.980739,0.670644,0.703213,0.797212


In [34]:
# Only the benchmarks run on 1000-row datasets.
df[df.n_rows == 1000]

Unnamed: 0,k,n_rows,n_cols,frac_missing,knn_impute_few_observed,knn_impute_with_argpartition,knn_impute_optimistic,knn_impute_reference,argpartition_log_ratio,few_observed_log_ratio,optimistic_log_ratio
6,1,1000,25,0.05,0.275119,0.240917,0.318955,0.298176,0.213232,0.08048,-0.067366
7,1,1000,25,0.25,0.448762,0.414425,0.439901,0.761481,0.608374,0.528773,0.548716
8,1,1000,100,0.05,0.78688,0.723401,0.719038,1.002797,0.326584,0.242472,0.332634
9,1,1000,100,0.25,1.475442,1.524288,1.343072,2.980739,0.670644,0.703213,0.797212
10,1,1000,400,0.05,3.359883,3.193343,2.835239,4.267282,0.289909,0.239071,0.408851
11,1,1000,400,0.25,6.40978,6.837735,6.888379,12.996844,0.64225,0.706882,0.634871
18,3,1000,25,0.05,0.321528,0.241566,0.30087,0.290265,0.183652,-0.10229,-0.035883
19,3,1000,25,0.25,0.468678,0.487511,0.49282,0.791085,0.484093,0.523489,0.473261
20,3,1000,100,0.05,0.718434,0.765385,0.820797,1.031683,0.298568,0.361873,0.228671
21,3,1000,100,0.25,1.427744,1.81215,1.430812,2.981968,0.498069,0.736488,0.734341


In [35]:
# 1000-row benchmarks with frac_missing == 0.05.
df[(df.n_rows == 1000) & (df.frac_missing == 0.05)]

Unnamed: 0,k,n_rows,n_cols,frac_missing,knn_impute_few_observed,knn_impute_with_argpartition,knn_impute_optimistic,knn_impute_reference,argpartition_log_ratio,few_observed_log_ratio,optimistic_log_ratio
6,1,1000,25,0.05,0.275119,0.240917,0.318955,0.298176,0.213232,0.08048,-0.067366
8,1,1000,100,0.05,0.78688,0.723401,0.719038,1.002797,0.326584,0.242472,0.332634
10,1,1000,400,0.05,3.359883,3.193343,2.835239,4.267282,0.289909,0.239071,0.408851
18,3,1000,25,0.05,0.321528,0.241566,0.30087,0.290265,0.183652,-0.10229,-0.035883
20,3,1000,100,0.05,0.718434,0.765385,0.820797,1.031683,0.298568,0.361873,0.228671
22,3,1000,400,0.05,3.075797,3.365653,2.996103,4.359769,0.258797,0.348855,0.375107


In [36]:
# 1000-row benchmarks with frac_missing == 0.25.
df[(df.n_rows == 1000) & (df.frac_missing == 0.25)]

Unnamed: 0,k,n_rows,n_cols,frac_missing,knn_impute_few_observed,knn_impute_with_argpartition,knn_impute_optimistic,knn_impute_reference,argpartition_log_ratio,few_observed_log_ratio,optimistic_log_ratio
7,1,1000,25,0.25,0.448762,0.414425,0.439901,0.761481,0.608374,0.528773,0.548716
9,1,1000,100,0.25,1.475442,1.524288,1.343072,2.980739,0.670644,0.703213,0.797212
11,1,1000,400,0.25,6.40978,6.837735,6.888379,12.996844,0.64225,0.706882,0.634871
19,3,1000,25,0.25,0.468678,0.487511,0.49282,0.791085,0.484093,0.523489,0.473261
21,3,1000,100,0.25,1.427744,1.81215,1.430812,2.981968,0.498069,0.736488,0.734341
23,3,1000,400,0.25,5.843878,7.423589,6.233972,12.336019,0.507861,0.747129,0.68251


In [37]:
# Column means over the 1000-row, frac_missing == 0.05 runs. The mean of a
# *_log_ratio column is the log of the geometric-mean speedup over the
# reference; the means of the setting columns (k, n_cols, ...) are incidental.
df[(df.n_rows == 1000) & (df.frac_missing == 0.05)].mean()

k                                  2.000000
n_rows                          1000.000000
n_cols                           175.000000
frac_missing                       0.050000
knn_impute_few_observed            1.422940
knn_impute_with_argpartition       1.421711
knn_impute_optimistic              1.331834
knn_impute_reference               1.874995
argpartition_log_ratio             0.261790
few_observed_log_ratio             0.195077
optimistic_log_ratio               0.207002
dtype: float64

In [38]:
# Column means over the 1000-row, frac_missing == 0.25 runs (mean log ratio =
# log of the geometric-mean speedup over the reference).
df[(df.n_rows == 1000) & (df.frac_missing == 0.25)].mean()

k                                  2.000000
n_rows                          1000.000000
n_cols                           175.000000
frac_missing                       0.250000
knn_impute_few_observed            2.679047
knn_impute_with_argpartition       3.083283
knn_impute_optimistic              2.804826
knn_impute_reference               5.474689
argpartition_log_ratio             0.568548
few_observed_log_ratio             0.657662
optimistic_log_ratio               0.645152
dtype: float64

In [42]:
# Column means over the 400-column, frac_missing == 0.05 runs (both row counts).
df[(df.frac_missing == 0.05) & (df.n_cols == 400)].mean()

k                                 2.000000
n_rows                          625.000000
n_cols                          400.000000
frac_missing                      0.050000
knn_impute_few_observed           1.720773
knn_impute_with_argpartition      1.770842
knn_impute_optimistic             1.551597
knn_impute_reference              2.318603
argpartition_log_ratio            0.242686
few_observed_log_ratio            0.331065
optimistic_log_ratio              0.468921
dtype: float64

In [56]:
# Column means over the 1000 x 100, frac_missing == 0.25 runs (averaged over k).
df[(df.frac_missing == 0.25) & (df.n_rows == 1000) & (df.n_cols == 100)].mean()

k                                  2.000000
n_rows                          1000.000000
n_cols                           100.000000
frac_missing                       0.250000
knn_impute_few_observed            1.451593
knn_impute_with_argpartition       1.668219
knn_impute_optimistic              1.386942
knn_impute_reference               2.981353
argpartition_log_ratio             0.584357
few_observed_log_ratio             0.719851
optimistic_log_ratio               0.765777
dtype: float64

In [57]:
# Column means over the 1000 x 400, frac_missing == 0.25 runs (averaged over k).
df[(df.frac_missing == 0.25) & (df.n_rows == 1000) & (df.n_cols == 400)].mean()

k                                  2.000000
n_rows                          1000.000000
n_cols                           400.000000
frac_missing                       0.250000
knn_impute_few_observed            6.126829
knn_impute_with_argpartition       7.130662
knn_impute_optimistic              6.561175
knn_impute_reference              12.666431
argpartition_log_ratio             0.575055
few_observed_log_ratio             0.727005
optimistic_log_ratio               0.658690
dtype: float64

In [58]:
# Feature matrix for the classifier: one row per benchmark configuration.
# Note that n_rows is not among the features.
feature_columns = ["k", "n_cols", "frac_missing"]
X_features = np.array(df.loc[:, feature_columns])

In [79]:
# Class label per benchmark row: position of the largest log ratio, i.e.
# 0 = argpartition, 1 = few_observed, 2 = optimistic.
# The row-wise argmax is taken positionally, so it stays correct even if
# `df` does not carry a default 0..n-1 index (indexing `y` with the labels
# yielded by `iterrows()` would not).
ratio_columns = [
    "argpartition_log_ratio",
    "few_observed_log_ratio",
    "optimistic_log_ratio",
]
y = np.argmax(df[ratio_columns].values, axis=1).astype(int)

In [80]:
import sklearn

In [81]:
from sklearn.linear_model import LogisticRegression

In [84]:
# Multinomial logistic regression predicting which implementation is fastest.
# NOTE(review): the features are on very different scales (n_cols up to 400,
# frac_missing at most 0.25) and the default L2 penalty is in effect, so the
# fitted weights are scale-dependent -- consider standardising the features.
lr = LogisticRegression(multi_class="multinomial", solver="lbfgs")

In [85]:
lr.fit(X_features, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [94]:
# Fitted weights: one row per class (0 = argpartition, 1 = few_observed,
# 2 = optimistic), one column per feature in X_features order
# (k, n_cols, frac_missing).
coef = lr.coef_
coef

array([[ 0.03521171, -0.02291925, -0.38305093],
       [ 0.11429702,  0.00810134,  0.48589078],
       [-0.14950873,  0.01481791, -0.10283985]])

In [87]:
lr.predict(X_features)

array([0, 0, 1, 1, 2, 2, 0, 0, 1, 1, 2, 2, 0, 0, 1, 1, 2, 2, 0, 0, 1, 1, 2,
       2])

In [88]:
y

array([0, 1, 1, 1, 2, 2, 0, 0, 2, 2, 2, 1, 0, 1, 0, 2, 2, 2, 0, 1, 1, 1, 2,
       1])

In [98]:
# Fitted per-class bias terms, in the same class order as the rows of lr.coef_.
intercept = lr.intercept_
intercept

array([ 1.49822931, -0.32685412, -1.17137519])

In [90]:
lr.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'max_iter': 100,
 'multi_class': 'multinomial',
 'n_jobs': 1,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [92]:
# NOTE(review): leftover IPython source introspection (`??` is not plain
# Python and leaves no output in the saved notebook) -- consider removing.
lr.decision_function??

In [96]:
# Per-class linear scores computed by hand from the fitted parameters:
# features times transposed weights, plus the per-class intercepts.
scores = X_features.dot(coef.T) + intercept

In [97]:
scores

array([[ 0.94130735,  0.01427082, -0.95557816],
       [ 0.86469716,  0.11144897, -0.97614613],
       [-0.77763603,  0.62187095,  0.15576508],
       [-0.85424622,  0.71904911,  0.13519711],
       [-7.65340955,  3.0522715 ,  4.60113805],
       [-7.73001974,  3.14944966,  4.58057008],
       [ 0.94130735,  0.01427082, -0.95557816],
       [ 0.86469716,  0.11144897, -0.97614613],
       [-0.77763603,  0.62187095,  0.15576508],
       [-0.85424622,  0.71904911,  0.13519711],
       [-7.65340955,  3.0522715 ,  4.60113805],
       [-7.73001974,  3.14944966,  4.58057008],
       [ 1.01173078,  0.24286485, -1.25459562],
       [ 0.93512059,  0.340043  , -1.27516359],
       [-0.70721261,  0.85046498, -0.14325238],
       [-0.78382279,  0.94764314, -0.16382035],
       [-7.58298613,  3.28086553,  4.30212059],
       [-7.65959631,  3.37804369,  4.28155262],
       [ 1.01173078,  0.24286485, -1.25459562],
       [ 0.93512059,  0.340043  , -1.27516359],
       [-0.70721261,  0.85046498, -0.143

In [116]:
# Hypothetical configuration in X_features order (k, n_cols, frac_missing).
# k=13 and frac_missing=0.7 lie outside the benchmarked grid, so scoring it
# is an extrapolation of the fitted model.
xx = np.array([13, 300, 0.7])

In [117]:
# NOTE: this rebinds `scores` -- previously the per-row score matrix for all
# of X_features -- to the length-3 score vector of the single point `xx`.
scores = np.dot(xx, coef.T) + intercept

In [118]:
scores

array([-5.18792759,  3.92953118,  1.25839641])

In [103]:
# NOTE(review): the saved output of this cell is stale. It was executed as
# In [103], before `scores` was recomputed in In [117]; for the scores shown
# by In [118] the largest entry is at index 1. Re-run the notebook top to
# bottom to refresh it.
np.argmax(scores)

2