In [15]:
import pandas as pd
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.base import TransformerMixin, BaseEstimator

from constants import ALL_COLS, LABEL, PCA_OUT_NUM


In [41]:
class DataTransformer(BaseEstimator, TransformerMixin):
    """Standard-scale the selected columns and append PCA components of them.

    The columns named by ``all_cols`` are fed to a ``FeatureUnion`` of two
    branches: a plain ``StandardScaler``, and a ``StandardScaler`` followed by
    a ``PCA`` with ``pca_out_num`` components. The union is wrapped in a
    ``ColumnTransformer`` so only ``all_cols`` are read from the input.

    Parameters
    ----------
    pca_out_num : passed unchanged to ``PCA(n_components=...)``.
    all_cols : column selector handed to ``ColumnTransformer``.

    Attributes
    ----------
    all_feats_transform_ : ColumnTransformer
        The fitted transformer; only present after ``fit`` has succeeded.
    """

    def __init__(self, pca_out_num, all_cols):
        # Only store hyper-parameters here. Building sub-estimators in
        # __init__ would freeze them to the constructor values, so a later
        # set_params(pca_out_num=...) would be silently ignored.
        self.pca_out_num = pca_out_num
        self.all_cols = all_cols

    def _build_transformer(self):
        """Return a new, unfitted ColumnTransformer from the current parameters."""
        pca_pipe = Pipeline(steps=[
            ('scaler', StandardScaler()),
            ('pca', PCA(n_components=self.pca_out_num)),
        ])
        feat_union = FeatureUnion(transformer_list=[
            ('scaler', StandardScaler()),
            ('pca', pca_pipe),
        ])
        return ColumnTransformer(
            transformers=[('feat_union', feat_union, self.all_cols)]
        )

    @property
    def all_feats_transform(self):
        """The underlying ColumnTransformer.

        Returns the fitted transformer once ``fit`` has run; before that it
        returns a freshly built, unfitted one reflecting the current
        parameters (useful for inspecting the pipeline layout).
        """
        fitted = getattr(self, 'all_feats_transform_', None)
        if fitted is not None:
            return fitted
        return self._build_transformer()

    def fit(self, X, y=None):
        """Build the transformer from the current parameters and fit it on X.

        Returns ``self`` so calls can be chained.
        """
        transformer = self._build_transformer()
        transformer.fit(X, y)
        # Publish only after a successful fit, so a failed refit does not
        # replace a previously fitted transformer.
        self.all_feats_transform_ = transformer
        return self

    def transform(self, X):
        """Apply the fitted transformer to X and return its output.

        If ``fit`` has not been called, the unfitted ColumnTransformer raises
        its own not-fitted error.
        """
        return self.all_feats_transform.transform(X)

In [43]:
# Instantiate the transformer with the project-wide settings from `constants`.
cl_instance = DataTransformer(pca_out_num=PCA_OUT_NUM, all_cols=ALL_COLS)
# Bare last expression: display the (not yet fitted) ColumnTransformer layout.
cl_instance.all_feats_transform

In [44]:
# Load the raw training data; the path is relative to the notebook's working directory.
raw_df = pd.read_csv('data_raw/train.csv')
# Fit on the full frame and display the transformed array as the cell output.
# The result is not assigned, so it is only available as this cell's display.
cl_instance.fit_transform(raw_df)

array([[-0.17861102, -0.34241451, -0.36880956, ...,  0.43153732,
        -0.2059403 , -0.34863841],
       [ 0.67460995,  0.79387665,  0.85279701, ..., -0.12713696,
         0.57769171,  0.3588663 ],
       [ 1.08580078,  0.93023159,  0.89673969, ..., -0.11323214,
        -0.38290128,  0.43460208],
       ...,
       [-0.09637285, -0.08764607, -0.113942  , ...,  0.60835837,
        -1.35919595,  0.36146473],
       [ 0.01670463, -0.19170642, -0.39517517, ..., -0.31331592,
        -1.17553652, -0.03410665],
       [-0.77483772, -0.95601173, -1.14220077, ..., -0.71312369,
        -0.17221056, -0.74585302]])