In [1]:
import pandas as pd
from functools import reduce

In [31]:
class GroupByTransformer():
    """Add group-wise aggregate features to a DataFrame.

    Each entry of ``param_dict`` is a dict with three items:

    * ``'key'`` -- grouping column(s). A string is one single-column key, a
      flat list of strings is ONE composite key, and a list containing lists
      (e.g. ``['a', ['a', 'b']]``) is several keys, each aggregated
      separately.
    * ``'var'`` -- column name, or list of column names, to aggregate.
    * ``'agg'`` -- aggregation, or list of aggregations (names or callables),
      handed to ``GroupBy.agg``.

    ``fit`` computes one aggregate table per key; ``transform`` left-joins
    those tables onto the frame it receives. Generated columns are named
    ``<agg>_<var>_by_<key1>[_<key2>...]``.
    """

    def __init__(self, param_dict=None):
        """Normalise and store the aggregation specs.

        Args:
            param_dict: list of spec dicts (see class docstring). ``None``
                means "no specs"; a single dict is treated as a one-element
                list.
        """
        if param_dict is None:
            param_dict = []
        elif isinstance(param_dict, dict):
            param_dict = [param_dict]
        self.param_dict = [{k: self._process_item(k, v)
                            for k, v in p_dict.items()}
                           for p_dict in param_dict]
        # One list of aggregate tables per spec, filled by fit().
        self.features = []

    def _process_item(self, k, v):
        """Normalise one spec value.

        ``'key'`` becomes a list of keys, each key a list of column names;
        every other item becomes a list.
        """
        if k == 'key':
            # convert key to List[List[str]].
            if isinstance(v, str):
                return [[v]]
            # Test each ELEMENT: a flat list of strings is one composite key.
            if isinstance(v, list) and all(isinstance(x, str) for x in v):
                return [v]
            return [x if isinstance(x, list) else [x] for x in v]
        return v if isinstance(v, list) else [v]

    def _get_params(self, p_dict):
        """Return ``(key, var, agg)`` from a normalised spec.

        Raises:
            KeyError: if the spec lacks one of the three items.
        """
        return p_dict['key'], p_dict['var'], p_dict['agg']

    def transform(self, df):
        """Left-join every fitted aggregate table onto ``df``.

        Returns the merged frame; ``df`` itself is not modified. If ``fit``
        has not been called there are no tables and ``df`` is returned as is.
        """
        for p_dict, features in zip(self.param_dict, self.features):
            for _key, agged in zip(p_dict['key'], features):
                df = df.merge(agged, on=_key, how='left')
        return df

    def fit(self, df):
        """Compute the aggregate tables from ``df`` and return ``self``."""
        # Start clean: transform() pairs specs with tables positionally, so
        # tables left over from an earlier fit would shadow the new ones.
        self.features = []
        for p_dict in self.param_dict:
            key, var, agg = self._get_params(p_dict)
            features = []

            for _key in key:
                # Order-preserving de-duplication of the columns we need.
                use_features = list(dict.fromkeys(_key + var))
                agged = df[use_features].groupby(_key)[var].agg(agg).reset_index()
                # Flatten the (var, agg) MultiIndex; the name order below
                # (var-major, agg-minor) matches the order agg() produces.
                agged.columns = _key + self._get_feature_names(_key, var, agg)
                features.append(agged)
            self.features.append(features)
        return self

    def _get_feature_names(self, key, var, agg):
        """Build the generated column names for ONE key (a list of columns)."""
        _agg = [a if isinstance(a, str) else a.__name__ for a in agg]

        return ['_'.join([a, v, 'by'] + key)
                for v in var
                for a in _agg]

    def get_feature_names(self):
        """Return the names of all generated columns, in transform() order."""
        feature_names = []
        for p_dict in self.param_dict:
            key, var, agg = self._get_params(p_dict)
            for _key in key:
                feature_names += self._get_feature_names(_key, var, agg)
        return feature_names

In [32]:
import numpy as np

N = 100
SEED = 0

# Seeded generator: the demo frame (and every aggregate derived from it) is
# identical on each Restart & Run All instead of changing per run.
rng = np.random.default_rng(SEED)

# Two small-cardinality integer columns to group by (values 0..4) and two
# uniform [0, 1) float columns to aggregate.
df = pd.DataFrame({
    'a': rng.integers(5, size=N),
    'b': rng.integers(5, size=N),
    'c': rng.random(N),
    'd': rng.random(N),
})

df.head()

Unnamed: 0,a,b,c,d
0,1,3,0.467377,0.744858
1,3,1,0.00568,0.412037
2,2,2,0.938342,0.973634
3,3,3,0.711459,0.550293
4,1,4,0.021161,0.549032


In [33]:
# Aggregate 'c' and 'd' twice -- once per value of 'a' and once per
# ('a', 'b') pair -- then join the results back onto the original rows.
# fit() returns the transformer itself, so the call can be chained.
gbt = GroupByTransformer(
    [
        {
            'key': ['a', ['a', 'b']],
            'var': ['c', 'd'],
            'agg': ['sum', 'mean', 'max', 'min', 'std', 'var'],
        },
    ]
).fit(df)

# NOTE(review): 'tranformed' is a misspelling of 'transformed'; the name is
# kept because other cells may refer to it.
tranformed = gbt.transform(df)
tranformed

Unnamed: 0,a,b,c,d,sum_c_by_a,mean_c_by_a,max_c_by_a,min_c_by_a,std_c_by_a,var_c_by_a,...,max_c_by_a_b,min_c_by_a_b,std_c_by_a_b,var_c_by_a_b,sum_d_by_a_b,mean_d_by_a_b,max_d_by_a_b,min_d_by_a_b,std_d_by_a_b,var_d_by_a_b
0,1,3,0.467377,0.744858,9.915396,0.550855,0.977075,0.021161,0.327048,0.106960,...,0.856657,0.022393,0.417442,0.174258,2.147482,0.715827,0.791182,0.611442,0.093321,0.008709
1,3,1,0.005680,0.412037,11.793259,0.536057,0.966267,0.005680,0.324605,0.105368,...,0.966267,0.005680,0.392506,0.154061,3.653189,0.405910,0.910240,0.021323,0.273898,0.075020
2,2,2,0.938342,0.973634,11.933304,0.542423,0.995043,0.015682,0.300276,0.090166,...,0.995043,0.650108,0.137869,0.019008,3.816157,0.763231,0.973634,0.483277,0.198663,0.039467
3,3,3,0.711459,0.550293,11.793259,0.536057,0.966267,0.005680,0.324605,0.105368,...,0.830679,0.682886,0.057568,0.003314,2.412397,0.482479,0.629614,0.250903,0.163462,0.026720
4,1,4,0.021161,0.549032,9.915396,0.550855,0.977075,0.021161,0.327048,0.106960,...,0.637125,0.021161,0.435552,0.189706,0.750135,0.375068,0.549032,0.201103,0.246023,0.060527
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,3,2,0.759155,0.947795,11.793259,0.536057,0.966267,0.005680,0.324605,0.105368,...,0.954832,0.490127,0.233315,0.054436,1.375087,0.458362,0.947795,0.056882,0.451922,0.204233
96,1,0,0.672434,0.439008,9.915396,0.550855,0.977075,0.021161,0.327048,0.106960,...,0.966552,0.328305,0.231852,0.053756,3.347657,0.669531,0.929360,0.307186,0.284759,0.081088
97,3,3,0.721999,0.250903,11.793259,0.536057,0.966267,0.005680,0.324605,0.105368,...,0.830679,0.682886,0.057568,0.003314,2.412397,0.482479,0.629614,0.250903,0.163462,0.026720
98,3,1,0.251637,0.534873,11.793259,0.536057,0.966267,0.005680,0.324605,0.105368,...,0.966267,0.005680,0.392506,0.154061,3.653189,0.405910,0.910240,0.021323,0.273898,0.075020


In [34]:
from functools import reduce
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder


class OneHotEncoder():
    """One-hot encode selected DataFrame columns via scikit-learn.

    Values are cast to ``str`` before encoding, so missing values are encoded
    as their string form rather than rejected.

    NOTE(review): the generated columns are named after the raw category
    values. Two encoded columns sharing a value, or a value equal to an
    existing column name, therefore produce duplicate column names.
    """

    def __init__(self, cols, drop=False):
        """
        Args:
            cols: column name, or list of column names, to encode.
            drop: if true, ``transform`` removes the source columns from the
                returned frame; otherwise they are kept next to the encoding.
        """
        self.cols = [cols] if isinstance(cols, str) else list(cols)
        self.drop = drop
        self.enc = None

    def fit(self, df):
        """Learn the categories of ``self.cols`` from ``df``; returns ``self``."""
        # Aliased local import: this class has the same name as scikit-learn's
        # encoder, so the bare name ``OneHotEncoder`` refers to this wrapper.
        from sklearn.preprocessing import OneHotEncoder as _SklearnOneHotEncoder

        self.enc = _SklearnOneHotEncoder()
        self.enc.fit(df[self.cols].astype(str))
        return self

    def transform(self, df):
        """Return ``df`` with the one-hot columns appended.

        Raises:
            RuntimeError: if called before ``fit``.
        """
        if self.enc is None:
            raise RuntimeError('OneHotEncoder must be fitted before transform')
        encoded = pd.DataFrame(
            self.enc.transform(df[self.cols].astype(str)).toarray(),
            index=df.index,
            columns=self.get_feature_names(),
        )
        if self.drop:
            return pd.concat([df.drop(self.cols, axis=1), encoded], axis=1)
        return pd.concat([df, encoded], axis=1)

    def get_feature_names(self):
        """Return the learned categories of all encoded columns, flattened.

        Raises:
            RuntimeError: if called before ``fit``.
        """
        if self.enc is None:
            raise RuntimeError('OneHotEncoder must be fitted before get_feature_names')
        return [cat for cats in self.enc.categories_ for cat in cats.tolist()]


In [35]:
# Tiny categorical column with one missing value, used to try out
# frequency (count) encoding.
df = pd.DataFrame({'a': ['a', 'b', 'b', np.nan]})

In [37]:
# Occurrences of each category; the missing value is left out of the counts
# (dropna=True is the default, spelled out here for the reader).
vc = df['a'].value_counts(dropna=True)
vc

b    2
a    1
Name: a, dtype: int64

In [39]:
# Frequency encoding: replace each category by its count from vc. The missing
# value has no entry in vc, so it stays NaN and the result is float64.
df['a'].map(vc)

0    1.0
1    2.0
2    2.0
3    NaN
Name: a, dtype: float64