In [None]:
# imports
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder
import json

In [None]:
# Load the raw training set from disk.
training_csv = "../CEIP_csv/training_data.csv"
# NOTE: reading this file takes about one minute.
training_df = pd.read_csv(filepath_or_buffer=training_csv)

In [None]:
# Custom Transformer that removes duplicates
class RemoveDuplicatesTransformer(BaseEstimator, TransformerMixin):
    """Stateless pipeline step that discards fully duplicated rows."""

    def __init__(self):
        # Nothing to configure: the step has no hyper-parameters.
        pass

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself so calls can be chained."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with duplicate rows removed, keeping the first occurrence."""
        deduplicated = X.drop_duplicates(subset=None, keep='first')
        return deduplicated

# Custom Transformer that converts ixMaterial to Material
class ConvertMaterialTransformer(BaseEstimator, TransformerMixin):
    """Replace the ``ixMaterial`` code column with a ``Material`` name column.

    Parameters
    ----------
    material_types_path : str, default='../CEIP_csv/MaterialTypes.json'
        JSON file read during ``fit``. After ``pd.read_json`` each column is
        treated as one material code, and the value in the row labelled
        ``'sName'`` is used as that code's name.

    Attributes
    ----------
    material_dict : dict
        Column-oriented dict of the loaded JSON (set by ``fit``).
    sName_dict : dict
        Lookup from material code to its ``sName`` (set by ``fit``).
    """

    def __init__(self, material_types_path='../CEIP_csv/MaterialTypes.json'):
        # Stored unmodified under the same name so that get_params/clone work.
        self.material_types_path = material_types_path

    def fit(self, X, y=None):
        """Load the material lookup table; ``X`` and ``y`` are not inspected.

        Raises whatever ``pd.read_json`` raises if the file is missing or
        malformed, and ``KeyError`` if an entry has no ``'sName'`` field.
        """
        material_types_df = pd.read_json(self.material_types_path)
        # to_dict() is column-oriented: {code: {field: value}}
        self.material_dict = material_types_df.to_dict()
        # Keep only the name for each code.
        self.sName_dict = {k: v['sName'] for k, v in self.material_dict.items()}
        return self

    def transform(self, X, y=None):
        """Return a new frame where ``ixMaterial`` is mapped to names and renamed.

        Codes absent from the lookup become NaN. The input frame is left
        untouched, so the step can be re-run on the same data and still give
        the same result (overwriting the column in place would make a second
        run map already-converted names to NaN).
        """
        material_names = X['ixMaterial'].map(self.sName_dict)
        # assign() builds a new frame instead of writing into the caller's one;
        # the column keeps its position and is then renamed.
        return X.assign(ixMaterial=material_names).rename(
            columns={'ixMaterial': 'Material'}
        )

# Custom Transformer that keeps only the rows whose Material is 'ms', then drops the Material column
class FilterAndDropMaterialTransformer(BaseEstimator, TransformerMixin):
    """Keep only the rows whose ``Material`` is ``'ms'`` and remove that column."""

    def __init__(self):
        # Nothing to configure: the step has no hyper-parameters.
        pass

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself so calls can be chained."""
        return self

    def transform(self, X, y=None):
        """Return the ``'ms'`` rows of ``X`` without the ``Material`` column."""
        # Discard rows with no material first, then select the 'ms' ones.
        with_material = X.dropna(subset=['Material'])
        is_ms = with_material['Material'] == 'ms'
        return with_material[is_ms].drop(columns=['Material'])

# Custom Transformer that maps ixAutoNestStrategy values through the AutoNestStrategy.json lookup
class ConvertAutoNestStrategyTransformer(BaseEstimator, TransformerMixin):
    """Translate ``ixAutoNestStrategy`` values through a JSON lookup table.

    Parameters
    ----------
    strategy_path : str, default='../CEIP_csv/AutoNestStrategy.json'
        JSON file read during ``fit``; its top-level object is used directly
        as the lookup dictionary.

    Attributes
    ----------
    autoneststrategy_dict : dict
        The loaded lookup (set by ``fit``).
    """

    def __init__(self, strategy_path='../CEIP_csv/AutoNestStrategy.json'):
        # Stored unmodified under the same name so that get_params/clone work.
        self.strategy_path = strategy_path

    def fit(self, X, y=None):
        """Load the lookup table; ``X`` and ``y`` are not inspected.

        Raises ``OSError`` if the file cannot be opened and
        ``json.JSONDecodeError`` if it is not valid JSON.
        """
        # Explicit encoding so the result does not depend on the platform default.
        with open(self.strategy_path, 'r', encoding='utf-8') as f:
            self.autoneststrategy_dict = json.load(f)
        return self

    def transform(self, X, y=None):
        """Return a new frame with ``ixAutoNestStrategy`` replaced by its lookup value.

        Values are cast to ``str`` before the lookup because JSON object keys
        are always strings. Values with no entry become NaN. The input frame
        is not modified.
        """
        converted = X['ixAutoNestStrategy'].astype(str).map(self.autoneststrategy_dict)
        # assign() builds a new frame instead of writing into the caller's one.
        return X.assign(ixAutoNestStrategy=converted)

# Custom Transformer that one-hot encodes ixAutoNestStrategy
class OneHotEncodeAutoNestStrategyTransformer(BaseEstimator, TransformerMixin):
    """One-hot encode ``ixAutoNestStrategy`` and replace it with indicator columns.

    The first category is dropped (``drop='first'``). The indicator columns
    are named ``_<category>`` and appended after the remaining columns of the
    input frame.

    Attributes
    ----------
    one_hot : OneHotEncoder
        The wrapped encoder; fitted by ``fit``.
    """

    def __init__(self):
        self.one_hot = OneHotEncoder(drop='first')

    def fit(self, X, y=None):
        """Learn the categories present in ``X['ixAutoNestStrategy']``."""
        self.one_hot.fit(X[['ixAutoNestStrategy']])
        return self

    def _indicator_column_names(self):
        """Return the ``_<category>`` names for the encoder's output columns."""
        if hasattr(self.one_hot, 'get_feature_names_out'):
            # get_feature_names_out() prefixes each name with the fitted
            # column name; strip it so the names stay '_<category>', exactly
            # what the removed get_feature_names(['']) used to return.
            prefix = 'ixAutoNestStrategy'
            return [
                name[len(prefix):]
                for name in self.one_hot.get_feature_names_out()
            ]
        # scikit-learn < 1.0 only offers the legacy accessor.
        return list(self.one_hot.get_feature_names(['']))

    def transform(self, X, y=None):
        """Return a new frame with the indicator columns in place of the original.

        The input frame is not modified. Row labels of ``X`` are preserved.
        """
        one_hot_result = self.one_hot.transform(X[['ixAutoNestStrategy']]).toarray()
        one_hot_df = pd.DataFrame(
            one_hot_result,
            columns=self._indicator_column_names(),
            # Reuse X's index: concat(axis=1) aligns on labels, and X usually
            # has gaps in its index after rows were removed upstream. A fresh
            # 0..n-1 index would misalign rows and introduce NaN padding.
            index=X.index,
        )
        remaining = X.drop(columns=['ixAutoNestStrategy'])
        return pd.concat([remaining, one_hot_df], axis=1)

In [None]:
# Chain the transformers into a single pipeline. Order matters: each step
# consumes the frame (and columns) produced by the step before it.
preprocessing_pipeline = Pipeline([
    ('remove_duplicates', RemoveDuplicatesTransformer()),
    ('convert_material', ConvertMaterialTransformer()),  # adds 'Material'
    ('filter_and_drop_material', FilterAndDropMaterialTransformer()),  # needs 'Material'
    ('convert_autoneststrategy', ConvertAutoNestStrategyTransformer()),
    ('one_hot_encode_autoneststrategy', OneHotEncodeAutoNestStrategyTransformer()),
])

In [None]:
# Fit every pipeline step on the training data and return the transformed
# frame: each step is fitted and applied to the output of the previous one.
preprocessed_df = preprocessing_pipeline.fit_transform(training_df)