In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from scipy import stats
import pickle
from imblearn.over_sampling import SMOTE

In [None]:
class DataFrameImputer(SimpleImputer):
    """SimpleImputer variant that returns a DataFrame instead of an ndarray.

    ``fit`` records the column labels of the training frame; ``transform``
    re-attaches those labels together with the row index of its input, so the
    output stays label-aligned with any target Series that was split
    alongside it.
    """

    def fit(self, X, y=None):
        """Fit the imputer on DataFrame ``X`` and remember its column labels.

        Returns ``self`` so calls can be chained.
        """
        super().fit(X, y)
        # Storing the column names
        self.column_names = X.columns
        return self

    def transform(self, X):
        """Impute ``X`` and return the result as a DataFrame.

        The columns are the labels seen during ``fit``; the index is the index
        of ``X``. DataFrame construction raises if the imputer's output no
        longer has one column per recorded label.
        """
        data = super().transform(X)
        # Keep the caller's row labels. Rebuilding the frame without them would
        # silently replace the index with 0..n-1 and break label alignment with y.
        # Inputs without an ``index`` attribute fall back to the default index.
        row_index = getattr(X, "index", None)
        return pd.DataFrame(data, columns=self.column_names, index=row_index)

In [None]:
class CustomTransformer(BaseEstimator, TransformerMixin):
    """Apply ``log1p`` to some columns and standard-scale some columns of a DataFrame.

    Parameters
    ----------
    log_cols_idx : list of int
        Positional indices of the columns that receive ``np.log1p``.
    scale_cols_idx : list of int
        Positional indices of the columns that are standardized. A column may
        appear in both lists; it is then log-transformed first and scaled second.
    column_names : list of str, optional
        Column labels of the output frame. Overwritten by ``fit`` with the
        labels of the training frame.
    """

    def __init__(self, log_cols_idx, scale_cols_idx, column_names=None):
        self.log_cols_idx = log_cols_idx
        self.scale_cols_idx = scale_cols_idx
        self.column_names = column_names
        self.scaler = StandardScaler()

    def _apply_log(self, X):
        """Return a copy of ``X`` with ``log1p`` applied to the configured columns."""
        X_logged = X.copy()
        X_logged.iloc[:, self.log_cols_idx] = np.log1p(X_logged.iloc[:, self.log_cols_idx])
        return X_logged

    def fit(self, X, y=None):
        """Record the column labels of ``X`` and fit the scaler.

        The scaler is fitted on the log-transformed data, i.e. on exactly the
        values it will see in ``transform``. Fitting it on the raw values would
        scale log-transformed columns with the mean and standard deviation of
        the untransformed distribution.
        """
        self.column_names = X.columns.tolist()
        X_logged = self._apply_log(X)
        self.scaler.fit(X_logged.iloc[:, self.scale_cols_idx])
        return self

    def transform(self, X, y=None):
        """Return a transformed copy of ``X``; the input frame is not modified."""
        X_transformed = self._apply_log(X)
        X_transformed.iloc[:, self.scale_cols_idx] = self.scaler.transform(
            X_transformed.iloc[:, self.scale_cols_idx]
        )
        return pd.DataFrame(X_transformed, columns=self.column_names)

In [None]:
# Two-stage preprocessing: mean-impute missing values, then log-transform and
# scale selected columns. The transformer's column indices start out as None
# and are filled in by DataPipeline.log_sca_idx() before the pipeline is fitted.
preprocessor = Pipeline(
    steps=[
        ('imputer', DataFrameImputer(strategy='mean')),
        ('transformer', CustomTransformer(log_cols_idx=None, scale_cols_idx=None)),
    ]
)

In [None]:
class DataPipeline:
    """Load a CSV, derive the target, and build repeated preprocessed train/test splits.

    Parameters
    ----------
    file : str
        Path handed to ``pd.read_csv``.
    y_column : str
        Name of the target column in the CSV.
    test_size : float or int
        Forwarded to ``train_test_split``.
    task : str
        ``"binary-0"``, ``"binary-1"`` or ``"binary-2"`` recode the target to
        0/1 (see ``get_X_y``); any other value leaves the target untouched.
    preprocessor : pipeline-like
        Object offering ``fit_transform``, ``transform`` and
        ``named_steps['transformer']``.
    num_splits : int
        Number of train/test splits to create.
    random_state : int
        Base seed; split ``i`` is drawn with ``random_state + i``.
    """

    # Predicate marking the positive class (label 1) for each binary task.
    _POSITIVE_CLASS = {
        # severe & moderate = 1 vs. mild
        "binary-0": lambda y: y != 0,
        # mild = 1 vs. severe & moderate
        "binary-1": lambda y: y == 0,
        # severe = 1 vs. moderate & mild
        "binary-2": lambda y: ~y.isin([0, 1]),
    }

    def __init__(self, file, y_column, test_size, task, preprocessor, num_splits=10, random_state=2023):
        self.file = file
        self.y_column = y_column
        self.test_size = test_size
        self.task = task
        self.random_state = random_state
        self.data = None
        self.X = None
        self.y = None
        self.preprocessor = preprocessor
        self.num_splits = num_splits
        self.preprocessed_data_splits = []
        # Populated by later steps; defined here so reading them early yields
        # None instead of raising AttributeError.
        self.column_names = None
        self.prevalence = None
        self.log_cols_idx = None
        self.sca_cols_idx = None

    def load_data(self):
        """Read the CSV at ``self.file`` and remember its column names."""
        self.data = pd.read_csv(self.file)
        self.column_names = self.data.columns.tolist()

    def get_X_y(self):
        """Split the loaded data into features ``X`` and target ``y``.

        For the binary tasks the target is recoded to 0/1 and ``prevalence``
        is set to the share of positives. For any other task the target is
        kept as loaded and ``prevalence`` is ``None``.
        """
        df = self.data.copy()
        self.X = df.drop(self.y_column, axis=1)
        self.y = df[self.y_column]
        # Reset so a value from an earlier call cannot survive a task change.
        self.prevalence = None

        is_positive = self._POSITIVE_CLASS.get(self.task)
        if is_positive is not None:
            self.y = is_positive(self.y).astype("int64")
            self.prevalence = self.y.mean()

    def log_sca_idx(self, log_columns=None, scale_columns=None):
        """Translate column names into positions and push them into the transformer.

        Parameters
        ----------
        log_columns, scale_columns : list of str, optional
            Names of the columns to log-transform / scale. When omitted, the
            module-level ``log_cols`` / ``sca_cols`` lists are used.

        Raises
        ------
        KeyError
            If any requested column is absent from ``self.X``.
        """
        if log_columns is None:
            log_columns = log_cols
        if scale_columns is None:
            scale_columns = sca_cols

        # Report every missing column at once rather than only the first one.
        requested = dict.fromkeys([*log_columns, *scale_columns])
        missing = [col for col in requested if col not in self.X.columns]
        if missing:
            raise KeyError(f"Columns not found in X: {missing}")

        self.log_cols_idx = [self.X.columns.get_loc(col) for col in log_columns]
        self.sca_cols_idx = [self.X.columns.get_loc(col) for col in scale_columns]
        transformer = self.preprocessor.named_steps['transformer']
        transformer.log_cols_idx = self.log_cols_idx
        transformer.scale_cols_idx = self.sca_cols_idx

    def create_preprocessed_splits(self):
        """Build ``num_splits`` stratified splits, preprocess them and oversample the training part.

        The preprocessor is fitted on the training part only and then applied
        to the test part. SMOTE is applied to the training part only. The
        result list is rebuilt on every call.
        """
        # Start from an empty list: split i always uses seed random_state + i,
        # so appending on a second call would only store exact duplicates.
        self.preprocessed_data_splits = []

        for i in range(self.num_splits):

            X_train, X_test, y_train, y_test = train_test_split(
                self.X, self.y, test_size=self.test_size, stratify=self.y, random_state=self.random_state+i)

            X_train_preprocessed = self.preprocessor.fit_transform(X_train)
            X_test_preprocessed = self.preprocessor.transform(X_test)

            smote = SMOTE(random_state=self.random_state)
            X_train_smote, y_train_smote = smote.fit_resample(X_train_preprocessed.copy(), y_train.copy())

            self.preprocessed_data_splits.append({
                'X_train': X_train_smote,
                'X_test': X_test_preprocessed,
                'y_train': y_train_smote,
                'y_test': y_test
            })

In [None]:
# Feature columns that DataPipeline.log_sca_idx() marks for the log1p transform.
log_cols = [
    'Amylase', 'Lipase', 'R',
    'WBC', 'NEUT', 'LYMPH', 'PLT', 'HCT',
    'ALT', 'AST', 'ALB', 'GLU', 'BUN', 'CR',
    'TG', 'CHOL', 'LDH', 'HBDH', 'Ca',
]

# Feature columns that DataPipeline.log_sca_idx() marks for standard scaling.
# Every entry of log_cols also appears here.
sca_cols = [
    'Age', 'Amylase', 'Lipase', 'Abdominal pain onset time (h)',
    'T', 'R', 'HR',
    'WBC', 'NEUT', 'LYMPH', 'PLT', 'HCT', 'Hgb',
    'ALT', 'AST', 'ALB', 'GLB', 'GLU', 'BUN', 'CR',
    'TG', 'CHOL', 'HDL-C', 'LDL-C', 'LDH', 'HBDH', 'Ca', 'CRP',
]

In [None]:
# Dataset key -> CSV file name. The key is also used as the name of the target
# column in that file and, lower-cased, as part of the output pickle name.
file_name = {"RAC": "data_rac.csv", "DBC": "data_dbc.csv", "MDBC": "data_mdbc.csv"}
for key, value in file_name.items():

    # NOTE(review): file_path is not defined in the cells visible here — confirm
    # it is set before this cell runs. It is concatenated as-is, so it needs to
    # end with a path separator.
    file = f"{file_path}{value}"
    # 'multi' matches none of the binary task names, so DataPipeline keeps the
    # target labels as loaded.
    data_pipeline = DataPipeline(file=file, y_column=key, test_size=.25, task='multi', preprocessor=preprocessor)
    
    # Execute the data processing steps for the current dataset
    data_pipeline.load_data()
    data_pipeline.get_X_y()
    data_pipeline.log_sca_idx()
    data_pipeline.create_preprocessed_splits() 
    
    # Save the preprocessed data splits for the current dataset
    with open(f'b_{key.lower()}_smote_splits.pkl', 'wb') as f:
        pickle.dump(data_pipeline.preprocessed_data_splits, f)