/
preprocessing_transforms.py
83 lines (59 loc) · 2.39 KB
/
preprocessing_transforms.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
from sklearn.base import TransformerMixin, BaseEstimator
import pandas as pd
class BaseTransformer(BaseEstimator, TransformerMixin):
def fit(self, X, y=None, **fit_params):
return self
def transform(self, X, **transform_params):
return self
class ColumnSelector(BaseTransformer):
"""Selects columns from Pandas Dataframe"""
def __init__(self, columns, c_type=None):
self.columns = columns
self.c_type = c_type
def transform(self, X, **transform_params):
cs = X[self.columns]
if self.c_type is None:
return cs
else:
return cs.astype(self.c_type)
class SpreadBinary(BaseTransformer):
def transform(self, X, **transform_params):
return X.applymap(lambda x: 1 if x == 1 else -1)
class DfTransformerAdapter(BaseTransformer):
"""Adapts a scikit-learn Transformer to return a pandas DataFrame"""
def __init__(self, transformer):
self.transformer = transformer
def fit(self, X, y=None, **fit_params):
self.transformer.fit(X, y=y, **fit_params)
return self
def transform(self, X, **transform_params):
raw_result = self.transformer.transform(X, **transform_params)
return pd.DataFrame(raw_result, columns=X.columns, index=X.index)
class DfOneHot(BaseTransformer):
"""
Wraps helper method `get_dummies` making sure all columns get one-hot encoded.
"""
def __init__(self):
self.dummy_columns = []
def fit(self, X, y=None, **fit_params):
self.dummy_columns = pd.get_dummies(
X,
prefix=[c for c in X.columns],
columns=X.columns).columns
return self
def transform(self, X, **transform_params):
return pd.get_dummies(
X,
prefix=[c for c in X.columns],
columns=X.columns).reindex(columns=self.dummy_columns, fill_value=0)
class DfFeatureUnion(BaseTransformer):
"""A dataframe friendly implementation of `FeatureUnion`"""
def __init__(self, transformers):
self.transformers = transformers
def fit(self, X, y=None, **fit_params):
for l, t in self.transformers:
t.fit(X, y=y, **fit_params)
return self
def transform(self, X, **transform_params):
transform_results = [t.transform(X, **transform_params) for l, t in self.transformers]
return pd.concat(transform_results, axis=1)