In [2]:
!pip install -U scikit-learn

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [17]:
import numpy as np
import pandas as pd
import json

In [4]:
import sklearn
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from sklearn import set_config

In [6]:
set_config(transform_output="pandas")

In [5]:
feats = pd.read_parquet('data/feats.parquet')
feats.head()

Unnamed: 0,true_false,one_hot,dates,floats,max_of_list,nunique_of_list,desc_stats,multi_label,random_col,other
0,,green,2022-05-29,8.5,59,"apple,apple,grapefruit",6280.0,,7.0,3
1,true,purple,2022-03-19,5.0,5,3594,825.0,"grape,apple,pineapple,blueberry,strawberry",,3
2,false,,2022-09-25,6.0,47,orange,6330.0,"blueberry,blueberry,blueberry,pineapple,grape",7.0,4
3,1,purple,2022-03-06,9.0,84,"blueberry,grapefruit,apple",,"pineapple,apple,apple,apple,apple,grapefruit",2.0,3
4,false,,2022-06-21,5.0,40551,0,29128.0,"orange,orange,grapefruit,blueberry",9.0,0


In [7]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder

class TrueFalseTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self._col_names = None

    def fit(self, X, y=None):
        self._col_names = list(X.columns)
        return self

    def transform(self, X, y=None):
        print('Running TrueFalseTransformer')
        X.fillna('-1', inplace=True)
        X = X.replace({'true':'1', 'false':'0'})
        X = X.apply(pd.to_numeric, args=('coerce',))
        return X

    def get_feature_names(self):
        return self._col_names

class OneHotTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self._filler = 'ml_empty'
        self._col_names = None
        self._encoder = None
        self._transformer = None
        self._transformed_feats = []

    def fit(self, X, y=None):
        self._col_names = X.dropna(axis=1, how='all').columns
        X = X[self._col_names].fillna(self._filler)
        self._encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        self._transformer = self._encoder.fit(X)
        self._transformed_feats = self._transformer.get_feature_names_out()
        return self

    def transform(self, X, y=None):
        print('Running OneHotTransformer')
        X = self._transformer.transform(X[self._col_names])
        return X

    def get_feature_names(self):
        return list(self._transformed_feats)

class DateTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self._col_names = None

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        print('Running DateTransformer')
        temp_df = pd.DataFrame(index=X.index.copy())

        for col in X.columns:
            X[col] = pd.to_datetime(X[col])
            temp_df[f'{col}-month'] = X[col].dt.month.astype(float)
            temp_df[f'{col}-day_of_week'] = X[col].dt.dayofweek.astype(float)
            temp_df[f'{col}-hour'] = X[col].dt.hour.astype(float)
            temp_df[f'{col}-day_of_month'] = X[col].dt.day.astype(float)
            temp_df[f'{col}-is_month_start'] = X[col].dt.is_month_start.astype(int)
            temp_df[f'{col}-is_month_end'] = X[col].dt.is_month_end.astype(int)
        self._col_names = list(temp_df.columns)
        temp_df = temp_df.fillna(-1)
        return temp_df

    def get_feature_names(self):
        return self._col_names

class FloatTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self._col_names = None

    def fit(self, X, y=None):
        self._col_names = list(X.columns)
        return self

    def transform(self, X, y=None):
        print('Running FloatTransformer')
        for col in self._col_names:
            if X[col].dtype != 'float':
                X[col] = X[col].astype(float)
        X = X.fillna(-1.0)
        return X

    def get_feature_names(self):
        return self._col_names

class ListMaxTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self._col_names = None

    def fit(self, X, y=None):
        self._col_names = list(X.columns)
        return self

    def transform(self, X, y=None):
        print('Running ListMaxTransformer')
        temp_df = pd.DataFrame(index=X.index)
        for col in self._col_names:
            if X[col].dtype == 'str':
                X[col].fillna('-1', inplace=True)
                X[col] = X[col].str.split(pat=',').apply(set).apply(list)
            temp_series = X[col].explode()
            temp_series = temp_series.replace({'true':'1', 'false':'0'}).fillna('-1').apply(pd.to_numeric, args=('coerce',))
            temp_series = temp_series.groupby(temp_series.index).max()
            temp_df = temp_df.merge(temp_series, left_index=True, right_index=True, how='outer')
        temp_df = temp_df.fillna(0)
        return temp_df

    def get_feature_names(self):
        return self._col_names

class ListNuniqueTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self._col_names = None

    def fit(self, X, y=None):
        self._col_names = list(X.columns)
        return self

    def transform(self, X, y=None):
        print('Running ListNuniqueTransformer')
        temp_df = pd.DataFrame(index=X.index)
        for col in self._col_names:
            if X[col].dtype == 'str':
                X[col] = X[col].dropna().str.split(pat=',').apply(set).apply(list)
            temp_series = X[col].explode()
            temp_series = temp_series.groupby(temp_series.index).nunique()
            temp_df = temp_df.merge(temp_series, left_index=True, right_index=True, how='outer')
        temp_df = temp_df.fillna(0)
        return temp_df

    def get_feature_names(self):
        return self._col_names

class DescStatTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self._col_names = None

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        print('Running DescStatTransformer')
        temp_df = pd.DataFrame(index=X.index)
        for col in X.columns:
            if X[col].dtype == 'str':
                X[col].fillna('-1', inplace=True)
                X[col] = X[col].str.split(pat=',').apply(set).apply(list)
            temp_series = X[col].explode()
            temp_series = temp_series.fillna('-1').apply(pd.to_numeric, args=('coerce',))
            temp_series = temp_series.groupby(temp_series.index).agg(['min', 'max', 'mean', 'std', 'nunique'])
            temp_series.columns = [f'{col}-{x}' for x in temp_series.columns]
            temp_df = temp_df.merge(temp_series, left_index=True, right_index=True, how='outer')
        temp_df = temp_df.fillna(0)
        self._col_names = list(temp_df.columns)
        return temp_df

    def get_feature_names(self):
        return self._col_names

class MultilabelTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self._filler = 'ml_empty'
        self._encoder = None
        self._transformer = None
        self._col_names = None

    def fit(self, X, y=None):
        X = X.fillna(self._filler).str.split(pat=',').apply(set).apply(list)
        self._encoder = MultiLabelBinarizer()
        self._encoder.fit(X)
        self._col_names = [X.name + '__' + x for x in self._encoder.classes_]
        return self

    def transform(self, X, y=None):
        print('Running MultilabelTransformer')
        X = X.fillna(self._filler).str.split(pat=',').apply(set).apply(list)
        trans_array = self._encoder.transform(X)
        df = pd.DataFrame(trans_array, columns=self._col_names, index=X.index)        
        return df

    def get_feature_names(self):
        return self._col_names

class DropSingleValueCols(BaseEstimator, TransformerMixin):
    def __init__(self):
        self._col_index = []
        self._col_names = []
        
    def fit(self, X, y=None):
        for i in range(len(X.columns)):
            if X.iloc[:,i].nunique() > 1:
                self._col_index.append(i)
        self._col_names = list(X.iloc[:,self._col_index].columns)
        return self
    
    def transform(self, X, y=None):
        print('Running DropSingleValueCols')
        X = X.iloc[:,self._col_index]
        return X
    
    def get_feature_names(self):
        return self._col_names
           
# class DropSingleValueCols(BaseEstimator, TransformerMixin):
#     def __init__(self):
#         self._col_names = []
        
#     def fit(self, X, y=None):
#         for col in X.columns:
#             if X[col].nunique() > 1:
#                 self._col_names.append(col)
#         return self
    
#     def transform(self, X, y=None):
#         print('Running DropSingleValueCols')
#         X = X[self._col_names]
#         return X
    
#     def get_feature_names(self):
#         return self._col_names

class RemoveCollinearity(BaseEstimator, TransformerMixin):
    def __init__(self):
        self._corr_dict = {}
        self._drop_cols = set()
        self._col_index = []
        self._col_names = []
        
    def fit(self, X, y=None):
        drop_list = []
        for i, col in enumerate(X.columns):
            sliced_col = abs(X.iloc[i+1:, i])
            corr_feats = sliced_col[sliced_col > .97].index.tolist()
            if len(corr_feats) > 0:
                self._corr_dict[col] = corr_feats
                drop_list += corr_feats
        self._drop_cols = set(drop_list)
        self._col_names = list(set(X.columns) - self._drop_cols)
        for i, col in enumerate(X.columns):
            if col in self._col_names:
                self._col_index.append(i)
        return self
    
    def transform(self, X, y=None):
        print('Running RemoveCollinearity')
        X =  X.iloc[:,self._col_index]
        return X
    
    def get_feature_names(self):
        return self._col_names
    
# class RemoveCollinearity(BaseEstimator, TransformerMixin):
#     def __init__(self):
#         self._corr_dict = {}
#         self._drop_cols = set()
#         self._col_names = []
        
#     def fit(self, X, y=None):
#         drop_list = []
#         for i, col in enumerate(X.columns):
#             sliced_col = abs(X.iloc[i+1:, i])
#             corr_feats = sliced_col[sliced_col > .97].index.tolist()
#             if len(corr_feats) > 0:
#                 self._corr_dict[col] = corr_feats
#                 drop_list += corr_feats
#         self._drop_cols = set(drop_list)
#         self._col_names = list(set(X.columns) - self._drop_cols)
#         return self
    
#     def transform(self, X, y=None):
#         print('Running RemoveCollinearity')
#         X = X[self._col_names]
#         return X
    
#     def get_feature_names(self):
#         return self._col_names


In [8]:
true_false = ['true_false']
one_hot = ['one_hot']
date_cols = ['dates']
float_cols = ['floats']
max_of_list = ['max_of_list']
count_unique = ['nunique_of_list']
desc_stat_cols = ['desc_stats']
list_to_labels = ['multi_label']
drop_cols = ['random_col']

In [9]:
preprocessor = ColumnTransformer([
    ('truefalse', TrueFalseTransformer(), true_false),
    ('onehot', OneHotTransformer(), one_hot),
    ('dates', DateTransformer(), date_cols),
    ('floats', FloatTransformer(), float_cols),
    ('listmax', ListMaxTransformer(), max_of_list),
    ('nunique', ListNuniqueTransformer(), count_unique),
    ('descstats', DescStatTransformer(), desc_stat_cols),
    ('multilabel', MultilabelTransformer(), 'multi_label')],
    verbose_feature_names_out=False)

extras = Pipeline([
    ('dropsingle', DropSingleValueCols()),
    ('removemulticollinear', RemoveCollinearity())])

processor = Pipeline([
    ('preprocess', preprocessor),
    ('additional', extras)])

In [10]:
processor.fit(feats)

Running TrueFalseTransformer
Running OneHotTransformer
Running DateTransformer
Running FloatTransformer
Running ListMaxTransformer
Running ListNuniqueTransformer
Running DescStatTransformer
Running MultilabelTransformer
Running DropSingleValueCols


In [11]:
processor.transform(feats)

Running TrueFalseTransformer
Running OneHotTransformer
Running DateTransformer
Running FloatTransformer
Running ListMaxTransformer
Running ListNuniqueTransformer
Running DescStatTransformer
Running MultilabelTransformer
Running DropSingleValueCols
Running RemoveCollinearity


Unnamed: 0,true_false,one_hot_blue,one_hot_green,one_hot_orange,one_hot_purple,one_hot_red,one_hot_yellow,dates-month,dates-day_of_week,dates-hour,...,desc_stats-mean,desc_stats-nunique,multi_label__apple,multi_label__blueberry,multi_label__grape,multi_label__grapefruit,multi_label__ml_empty,multi_label__orange,multi_label__pineapple,multi_label__strawberry
0,-1,0.0,1.0,0.0,0.0,0.0,0.0,5.0,6.0,0.0,...,0.0,0,0,0,0,0,1,0,0,0
1,1,0.0,0.0,0.0,1.0,0.0,0.0,3.0,5.0,0.0,...,0.0,0,1,1,1,0,0,0,1,1
2,0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,6.0,0.0,...,0.0,0,0,1,1,0,0,0,1,0
3,1,0.0,0.0,0.0,1.0,0.0,0.0,3.0,6.0,0.0,...,-1.0,1,1,0,0,1,0,0,1,0
4,0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,1.0,0.0,...,0.0,0,0,1,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1,1.0,0.0,0.0,0.0,0.0,0.0,7.0,2.0,0.0,...,0.0,0,0,0,0,1,0,0,1,0
9996,0,1.0,0.0,0.0,0.0,0.0,0.0,10.0,4.0,0.0,...,0.0,0,1,1,0,1,0,0,1,1
9997,-1,0.0,0.0,0.0,0.0,1.0,0.0,10.0,3.0,0.0,...,0.0,0,0,0,0,0,0,1,0,0
9998,0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,-1.0,1,0,0,1,1,0,1,1,1


In [12]:
set_config(transform_output="default")

In [None]:
payload = {"true_false":None,"one_hot":"yellow","dates":1671062400000,"floats":9.0,"max_of_list":"4,3,9,6,0","nunique_of_list":"4,9","desc_stats":"9,4","multi_label":"strawberry","random_col":1.0,"other":3}

In [22]:
json_str = feats.iloc[0,:].to_json()
json_str

'{"true_false":null,"one_hot":"green","dates":1653782400000,"floats":8.5,"max_of_list":"5,9","nunique_of_list":"apple,apple,grapefruit","desc_stats":"6,2,8,0","multi_label":null,"random_col":7.0,"other":3}'

In [26]:
input_data = pd.DataFrame(json.loads(json_str), index=[0,])
input_data

Unnamed: 0,true_false,one_hot,dates,floats,max_of_list,nunique_of_list,desc_stats,multi_label,random_col,other
0,,green,1653782400000,8.5,59,"apple,apple,grapefruit",6280,,7.0,3


In [27]:
processor.transform(input_data)

Running TrueFalseTransformer
Running OneHotTransformer
Running DateTransformer
Running FloatTransformer
Running ListMaxTransformer
Running ListNuniqueTransformer
Running DescStatTransformer
Running MultilabelTransformer
Running DropSingleValueCols


AttributeError: 'numpy.ndarray' object has no attribute 'iloc'

In [15]:
processor.transform(feats.iloc[0,:])

ValueError: X does not contain any features, but ColumnTransformer is expecting 10 features

In [5]:
len(feats.columns)

10

In [10]:
feats.iloc[:,1].nunique()

6

In [12]:
feats.iloc[:,[1]]

Unnamed: 0,one_hot
0,green
1,purple
2,
3,purple
4,
...,...
9995,blue
9996,blue
9997,red
9998,red


In [19]:
list(feats.columns)

['true_false',
 'one_hot',
 'dates',
 'floats',
 'max_of_list',
 'nunique_of_list',
 'desc_stats',
 'multi_label',
 'random_col',
 'other']

In [14]:
col_index = []
for i in range(len(feats.columns)):
    if feats.iloc[:,i].nunique() > 1:
        col_index.append(i)
feats.iloc[:,col_index].columns

Index(['true_false', 'one_hot', 'dates', 'floats', 'max_of_list',
       'nunique_of_list', 'desc_stats', 'multi_label', 'random_col', 'other'],
      dtype='object')

In [20]:
sub_list = ['one_hot','floats','nunique_of_list','multi_label','other']

In [23]:
col_index = []
for i, col in enumerate(feats.columns):
    if col in sub_list:
        col_index.append(i)

In [24]:
col_index

[1, 3, 5, 7, 9]

In [None]:
class DropSingleValueCols(BaseEstimator, TransformerMixin):
    def __init__(self):
        self._col_index = []
        self._col_names = []
        
    def fit(self, X, y=None):
        for i in range(len(X.columns)):
            if X.iloc[:,i].nunique() > 1:
                self._col_index.append(i)
        self._col_names = list(X.iloc[:,self._col_index].columns)
        return self
    
    def transform(self, X, y=None):
        print('Running DropSingleValueCols')
        X = X.iloc[:,self._col_index]
        return X
    
    def get_feature_names(self):
        return self._col_names
    
class RemoveCollinearity(BaseEstimator, TransformerMixin):
    def __init__(self):
        self._corr_dict = {}
        self._drop_cols = set()
        self._col_index = []
        self._col_names = []
        
    def fit(self, X, y=None):
        drop_list = []
        for i, col in enumerate(X.columns):
            sliced_col = abs(X.iloc[i+1:, i])
            corr_feats = sliced_col[sliced_col > .97].index.tolist()
            if len(corr_feats) > 0:
                self._corr_dict[col] = corr_feats
                drop_list += corr_feats
        self._drop_cols = set(drop_list)
        self._col_names = list(set(X.columns) - self._drop_cols)
        for i, col in enumerate(X.columns):
            if col in self._col_names:
                self._col_index.append(i)
        return self
    
    def transform(self, X, y=None):
        print('Running RemoveCollinearity')
        X =  X.iloc[:,self._col_index]
        return X
    
    def get_feature_names(self):
        return self._col_names

In [None]:
class DropSingleValueCols(BaseEstimator, TransformerMixin):
    def __init__(self):
        self._col_names = []
        
    def fit(self, X, y=None):
        for col in X.columns:
            if X[col].nunique() > 1:
                self._col_names.append(col)
        return self
    
    def transform(self, X, y=None):
        print('Running DropSingleValueCols')
        X = X[self._col_names]
        return X
    
    def get_feature_names(self):
        return self._col_names
    
class RemoveCollinearity(BaseEstimator, TransformerMixin):
    def __init__(self):
        self._corr_dict = {}
        self._drop_cols = set()
        self._col_names = []
        
    def fit(self, X, y=None):
        drop_list = []
        for i, col in enumerate(X.columns):
            sliced_col = abs(X.iloc[i+1:, i])
            corr_feats = sliced_col[sliced_col > .97].index.tolist()
            if len(corr_feats) > 0:
                self._corr_dict[col] = corr_feats
                drop_list += corr_feats
        self._drop_cols = set(drop_list)
        self._col_names = list(set(X.columns) - self._drop_cols)
        return self
    
    def transform(self, X, y=None):
        print('Running RemoveCollinearity')
        X = X[self._col_names]
        return X
    
    def get_feature_names(self):
        return self._col_names

In [None]:
import numpy as np
import pandas as pd

In [None]:
values = [[-1. 0. 0. 0. 0. 0. 0. 1. 1. 3. 0. 1. 1. 0. 9. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]

In [None]:
arr_vals = np.array([[-1, 0, 0, 0, 0, 0, 0, 1, 1, 3, 0, 1, 1, 0, 9, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]])
arr_vals

In [None]:
print(arr_vals)

In [None]:
type(arr_vals)

In [None]:
arr_vals[0][[1,3,5,7,9]]

In [None]:
feat_names = ['true_false', 'one_hot_blue', 'one_hot_green', 'one_hot_ml_empty', 'one_hot_orange', 'one_hot_purple', 'one_hot_red', 'one_hot_yellow', 'dates-month', 'dates-day_of_week', 'dates-hour', 'dates-day_of_month', 'dates-is_month_start', 'dates-is_month_end', 'floats', 'max_of_list', 'nunique_of_list', 'desc_stats-min', 'desc_stats-max', 'desc_stats-mean', 'desc_stats-std', 'desc_stats-nunique', 'multi_label__apple', 'multi_label__blueberry', 'multi_label__grape', 'multi_label__grapefruit', 'multi_label__ml_empty', 'multi_label__orange', 'multi_label__pineapple', 'multi_label__strawberry']

In [None]:
df = pd.DataFrame(arr_vals, columns=feat_names)
df

In [None]:
dict_output = df.to_dict(orient='records')
dict_output

In [None]:
import json

In [None]:
json.dumps(dict_output)

In [None]:
df.to_json(orient='records')