In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import math
import scipy.stats as stats
from main import get_data

data, labels, continuous, discrete, dummy, categorical = get_data()


In [3]:
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline


class ModelBasedColImputer(BaseEstimator, TransformerMixin):
    """
    Uses colname_nan to indicate which rows were nan
    """
    def __init__(self, column, model):
        self.column = column
        self.model = clone(model)

    def fit(self, X, y=None, **fit_params):
        without_na = X[~X[self.column + '_nan'].astype('bool')]
        without_target_col = without_na.drop([self.column, self.column + '_nan'], axis=1)
        self.model.fit(without_target_col, without_na[self.column])
        return self

    def transform(self, X):
        with_na = X[X[self.column + '_nan'].astype('bool')]
        without_target_col = with_na.drop([self.column, self.column + '_nan'], axis=1)
        X.loc[X[self.column + '_nan'].astype('bool'), self.column] = self.model.predict(without_target_col)
        return X

class ModelBasedImputer(BaseEstimator, TransformerMixin):
    def __init__(self, columns, model):
        self.columns = columns
        self.model = model

    def fit(self, X, y=None, **fit_params):
        imputers = [(col + '_imputer', ModelBasedColImputer(column=col, model=self.model)) for col in self.columns if col + '_nan' in X.columns]
        self.pipe = Pipeline(imputers)
        return self.pipe.fit(X, y)

    def transform(self, X):
        return self.pipe.transform(X)


In [13]:
df = data.drop('Product_Info_2', axis = 1).iloc[:20, :7].set_index('Id')

In [100]:
df

Unnamed: 0_level_0,Product_Info_1,Product_Info_3,Product_Info_4,Product_Info_5,Product_Info_6,Product_Info_7
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
33308,,29,0.230769,2,3,1
74812,1.0,26,0.076923,2,3,1
23707,1.0,26,,2,3,1
668,1.0,26,,2,3,1
65826,,26,0.230769,2,3,1
21487,1.0,26,0.230769,2,3,1
66051,1.0,26,0.025641,2,3,1
51785,1.0,26,0.076923,2,3,3
73035,1.0,26,0.333333,2,3,1
48958,1.0,26,0.487179,2,3,1


In [None]:
kf = KnnFiller(5, dist_fn1)
kf.fit(df, df.iloc[:,5])


In [None]:
to_drop = data.columns[data.isnull().sum() > 20000]
dropped = data.drop(to_drop, axis=1)



In [28]:
# X = pd.DataFrame({'A': [0, 1, 4, 5, 0 ,1, 0], 'B': [0, 3, np.NaN, np.NaN, 1, 2, np.NaN], 'C': [0, 2, 3, np.NaN, 1, 3,np.NaN]})
X = pd.DataFrame({
    'A': [0, 1, 4, 5, 0 ,1, 0],
    'B': [0, 3, np.NaN, np.NaN, 1, 2, 0],
    'C': [0, 2, 3, 0, 1, 3, 0],
    'B_nan': [False, False, True, True, False, False, True],
    'C_nan': [True, True, True, False, False, False, True]
})
X2 = pd.DataFrame({
    'A': [0, 1, 2, 3, 4 ,5, 6, 7, 8],
    'B': [1, 3, 0, 0, 9, 11, 13, 15, 17],
    'C': [0, 0, 0, 6, 8, 10, 0, 14, 16],
    'B_nan': [False, False, True, True, False, False, True, False, True],
    'C_nan': [True, True, True, False, False, False, True, False, False]
})
with_nan = X[X[['B']].isnull().any(axis=1) |  (X['B'] < 1)]

In [7]:
# X.loc[X['B'].isnull(), 'B'] = with_nan['A'].map(a['Ht'])
X

Unnamed: 0,A,B,B_nan,C,C_nan
0,0,0.0,False,0,True
1,1,3.0,False,2,True
2,4,,True,3,True
3,5,,True,0,False
4,0,1.0,False,1,False
5,1,2.0,False,3,False
6,0,0.0,True,0,True


In [201]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np


class HotDeckImputer(BaseEstimator, TransformerMixin):
    def __init__(self, columns, k=8):
        self.columns = columns
        self.k = k

    def fit(self, X, y=None, **fit_params):
        self.clusterer = KMeans(n_clusters=self.k)
        without_na = X.dropna(axis=0)

        # fit KMeans to other columns
        without_target_cols = without_na.drop(self.columns, axis=1)
        self.clusterer.fit(without_target_cols)

        just_target_cols = without_na[self.columns]
        # get mean of specified attributes per each cluster
        just_target_cols['cluster'] = self.clusterer.predict(without_target_cols)
        self.values_per_cluster = just_target_cols.groupby('cluster').apply(np.mean)
        return self

    def transform(self, X):
        rows_with_nan = X[X[self.columns].isnull().any(axis=1)]
        without_target_cols = rows_with_nan.drop(self.columns, axis=1)
        rows_with_nan['cluster'] = self.clusterer.predict(without_target_cols)
        for col in self.columns:


In [30]:
from sklearn.linear_model import LinearRegression

regr = ModelBasedImputer(['B', 'C'], model=LinearRegression())
regr.fit(X2)

Pipeline(memory=None,
     steps=[('B_imputer', ModelBasedColImputer(column='B',
           model=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False))), ('C_imputer', ModelBasedColImputer(column='C',
           model=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)))])

In [31]:
regr.transform(X2)

Unnamed: 0,A,B,B_nan,C,C_nan
0,0,1.0,False,-3.885781e-15,True
1,1,3.0,False,2.0,True
2,2,5.0,True,4.0,True
3,3,7.0,True,6.0,False
4,4,9.0,False,8.0,False
5,5,11.0,False,10.0,False
6,6,13.0,True,12.0,True
7,7,15.0,False,14.0,False
8,8,17.0,True,16.0,False


In [11]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso

from transformers.one_hot_encoder import CustomOneHotEncoder

onehot = CustomOneHotEncoder(columns=categorical)
df = onehot.fit_transform(data).fillna(0).drop('Response', axis=1)
selector = SelectFromModel(Lasso())


In [12]:
selector.fit(df, labels)

SelectFromModel(estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False),
        norm_order=1, prefit=False, threshold=None)

In [None]:
selector.transform(df).shape

(10000, 4)