# In [1]:
import math
from typing import List, Union

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# In [None]:
class FeatureEng:
    """Create derived feature columns on a pandas DataFrame.

    The frame passed to the constructor is stored as ``self.data`` without
    copying, so the arithmetic helpers and label encoding write their result
    columns into the caller's frame. One-hot encoding rebinds ``self.data``
    to a new, concatenated frame, so results should always be read back from
    ``self.data``.
    """

    def __init__(self, data: pd.DataFrame):
        """Store ``data`` and classify its columns by dtype.

        Args:
            data: Frame to engineer features on. It is not copied.

        Raises:
            TypeError: If ``data`` is not a pandas DataFrame.
        """
        if not isinstance(data, pd.DataFrame):
            raise TypeError("Data type must be pandas DataFrame.")
        self.data = data
        # Column labels split by dtype, computed once at construction time;
        # columns added later by the feature methods are not appended.
        self.num_columns = []
        self.obj_columns = []
        for col, dtype in self.data.dtypes.items():
            # Classify on the column's dtype, not on its label. Booleans are
            # kept out of the numeric group so they remain encodable.
            is_number = (pd.api.types.is_numeric_dtype(dtype)
                         and not pd.api.types.is_bool_dtype(dtype))
            if is_number:
                self.num_columns.append(col)
            else:
                self.obj_columns.append(col)

    def _validated_pairs(self, features: List[List[str]]) -> List[tuple]:
        """Return ``features`` as a list of pairs after checking every name.

        Raises:
            ValueError: If any referenced column is absent from ``self.data``
                (all missing names are reported together), or if an item
                cannot be unpacked into exactly two names.
        """
        # Materialise first so one-shot iterables survive both passes.
        pairs = [tuple(pair) for pair in features]
        known = self.data.columns
        missing = []
        for feature1, feature2 in pairs:
            for name in (feature1, feature2):
                if name not in known and name not in missing:
                    missing.append(name)

        if missing:
            raise ValueError(f"These features are not in data. \n {missing}")
        return pairs

    def add_feature(self, features: List[List[str]]):
        """Add a ``{a}_add_{b}`` column holding ``a + b`` for each pair.

        Args:
            features: Pairs of existing column names.

        Raises:
            ValueError: If any named column is not in the data.
        """
        for feature1, feature2 in self._validated_pairs(features):
            self.data[f"{feature1}_add_{feature2}"] = (
                self.data[feature1] + self.data[feature2]
            )

    def cross_feature(self, features: List[List[str]]):
        """Add a ``{a}_times_{b}`` column holding ``a * b`` for each pair.

        Args:
            features: Pairs of existing column names.

        Raises:
            ValueError: If any named column is not in the data.
        """
        for feature1, feature2 in self._validated_pairs(features):
            self.data[f"{feature1}_times_{feature2}"] = (
                self.data[feature1] * self.data[feature2]
            )

    def sub_feature(self, features: List[List[str]]):
        """Add a ``{a}_sub_{b}`` column holding ``a - b`` for each pair.

        Args:
            features: Pairs of existing column names.

        Raises:
            ValueError: If any named column is not in the data.
        """
        for feature1, feature2 in self._validated_pairs(features):
            self.data[f"{feature1}_sub_{feature2}"] = (
                self.data[feature1] - self.data[feature2]
            )

    def div_feature(self, features: List[List[str]],
                    round_flag=False):
        """Add a ``{a}_div_{b}`` column holding ``a / b`` for each pair.

        Args:
            features: Pairs of existing column names (numerator, denominator).
            round_flag: When true, the quotient is rounded up element-wise
                to the next whole number (ceiling).

        Raises:
            ValueError: If any named column is not in the data.
        """
        for feature1, feature2 in self._validated_pairs(features):
            ratio = self.data[feature1] / self.data[feature2]
            if round_flag:
                # math.ceil only accepts scalars; np.ceil works per element.
                ratio = np.ceil(ratio)
            # Use a dedicated "_div_" name so sub_feature output for the same
            # pair is not overwritten.
            self.data[f"{feature1}_div_{feature2}"] = ratio

    def cat_encoding(self, features: List[str], method: str = 'one-hot'):
        """Encode non-numeric columns.

        Args:
            features: Names of columns to encode. Each must be one of the
                columns classified as non-numeric at construction time.
            method: ``'one-hot'`` appends dummy columns (first category
                dropped) and keeps the source column; ``'label'`` replaces
                the column with integer codes.

        Raises:
            TypeError: If a feature is not listed in ``obj_columns``.
            ValueError: If ``method`` is not ``'one-hot'`` or ``'label'``.
        """
        features = list(features)
        for feature in features:
            if feature not in self.obj_columns:
                raise TypeError("This feature type is numerical. Please check data type.")

        # Reject unknown methods up front instead of silently doing nothing.
        if method not in ('one-hot', 'label'):
            raise ValueError(
                f"Unknown encoding method {method!r}; use 'one-hot' or 'label'."
            )

        if method == 'one-hot':
            # NOTE: dummy columns are named after the category values only
            # (no feature prefix), so equal values in two features produce
            # duplicate column labels.
            dummies = [
                pd.get_dummies(self.data[feature], drop_first=True)
                for feature in features
            ]
            if dummies:
                # Single concat rather than re-copying the frame per feature.
                self.data = pd.concat([self.data, *dummies], axis=1)
        else:
            for feature in features:
                self.data[feature] = LabelEncoder().fit_transform(self.data[feature])