In [None]:
def convert_to_categories(df, categorical_cols):
    """Return a copy of ``df`` with the given columns cast to ``category`` dtype.

    Args:
        df: Input DataFrame; it is left unmodified.
        categorical_cols: Column label(s) to convert.

    Returns:
        A new DataFrame whose ``categorical_cols`` have categorical dtype.
    """
    as_category = df[categorical_cols].astype('category')
    result = df.copy()
    result[categorical_cols] = as_category
    return result

In [None]:
def reduce_categories(df, cat_column, cats_to_keep):
    """Replace every value of ``cat_column`` not in ``cats_to_keep`` with "Other".

    If the column is categorical on input it is categorical on output too,
    so that later steps which select columns by ``category`` dtype still
    see it. Columns of any other dtype keep whatever dtype ``apply`` yields.

    Args:
        df: Input DataFrame; it is left unmodified.
        cat_column: Label of the column to reduce.
        cats_to_keep: Container of values to keep as-is (tested with ``in``).

    Returns:
        A new DataFrame with the reduced column.
    """
    out = df.copy()
    column = out[cat_column]
    was_categorical = getattr(column.dtype, "name", None) == "category"

    reduced = column.apply(lambda value: value if value in cats_to_keep else "Other")

    # Mapping several categories onto "Other" is not one-to-one, and pandas
    # then returns a non-categorical result; restore the dtype in that case.
    if was_categorical:
        reduced = reduced.astype("category")

    out[cat_column] = reduced
    return out

In [None]:
def normalize_numeric(df):
    """Return a copy of ``df`` with every numeric column min-max scaled.

    Numeric columns are those picked by ``select_dtypes('number')``; each is
    replaced by the output of a default-constructed ``MinMaxScaler``. All
    other columns are passed through untouched.

    Args:
        df: Input DataFrame; it is left unmodified.

    Returns:
        A new DataFrame. When there are no numeric columns, or the frame has
        no rows, the copy is returned unchanged.
    """
    out = df.copy()
    numeric_cols = list(out.select_dtypes('number'))

    # The scaler rejects input with zero features or zero samples, so there
    # is nothing to scale in those cases; return the copy as-is.
    if not numeric_cols or out.empty:
        return out

    out[numeric_cols] = MinMaxScaler().fit_transform(out[numeric_cols])
    return out

In [None]:
def one_hot_encoding(df):
    """Return a copy of ``df`` with its ``category``-dtype columns one-hot encoded.

    Only columns whose dtype is ``category`` are passed to ``pd.get_dummies``;
    every other column is carried over as-is.

    Args:
        df: Input DataFrame; it is left unmodified.

    Returns:
        A new DataFrame with dummy columns in place of the categorical ones.
    """
    categorical = df.select_dtypes(include='category').columns.tolist()
    return pd.get_dummies(df.copy(), columns=categorical)

In [None]:
def create_target_var(df, target_var, new_target_var):
    """Add a binary target column flagging values above the third quartile.

    Values of ``target_var`` less than or equal to its 0.75 quantile are
    labelled 0 and values strictly above it are labelled 1. The new column
    is the categorical produced by ``pd.cut`` with labels ``[0, 1]``.

    Args:
        df: Input DataFrame; it is left unmodified.
        target_var: Label of the numeric column to threshold.
        new_target_var: Label of the column to create.

    Returns:
        A new DataFrame containing the additional ``new_target_var`` column.
    """
    out = df.copy()
    q3 = out[target_var].quantile(0.75)

    # Open-ended outer edges instead of fixed -1 / 1: the labels are the same
    # for data already scaled into (-1, 1], but unscaled data (or a quartile
    # equal to 1) no longer produces NaN labels or a bin-edge error.
    edges = [float("-inf"), q3, float("inf")]
    out[new_target_var] = pd.cut(out[target_var], bins=edges, labels=[0, 1])
    return out

In [None]:
def drop_bad_columns(df, bad_cols):
    """Return a copy of ``df`` without the columns listed in ``bad_cols``.

    Args:
        df: Input DataFrame; it is left unmodified.
        bad_cols: Column label(s) to remove.

    Returns:
        A new DataFrame lacking ``bad_cols``.
    """
    return df.copy().drop(columns=bad_cols)

In [None]:
def train_tune_test(df, new_target_var, random_state=None):
    """Split ``df`` into stratified train, tune and test sets.

    The first split sends 5/7 of the rows to the training set; the remaining
    rows are then divided in half between the tune and test sets. Both splits
    are stratified on ``new_target_var``.

    Args:
        df: Input DataFrame; it is left unmodified.
        new_target_var: Label of the column used for stratification.
        random_state: Seed forwarded to both ``train_test_split`` calls so the
            split can be reproduced. ``None`` (the default) keeps the
            previous unseeded behaviour.

    Returns:
        A ``(train, tune, test)`` tuple of DataFrames.
    """
    out = df.copy()
    train, temp = train_test_split(
        out,
        train_size=5/7,
        stratify=out[new_target_var],
        random_state=random_state,
    )
    tune, test = train_test_split(
        temp,
        train_size=.5,
        stratify=temp[new_target_var],
        random_state=random_state,
    )
    return train, tune, test

In [None]:
def full_preprocess_pipeline(df, categorical_cols, cat_column_to_reduce, cats_to_keep, target_var, new_target_var, bad_cols):
    """Run every preprocessing step in order and return the three data splits.

    Steps, in order: cast ``categorical_cols`` to category dtype, collapse
    ``cat_column_to_reduce`` down to ``cats_to_keep`` plus "Other", scale the
    numeric columns, one-hot encode the categorical columns, derive
    ``new_target_var`` from ``target_var``, drop ``bad_cols``, then split.

    Args:
        df: Raw input DataFrame; it is left unmodified.
        categorical_cols: Columns to cast to category dtype.
        cat_column_to_reduce: Categorical column whose levels are collapsed.
        cats_to_keep: Levels of ``cat_column_to_reduce`` to keep.
        target_var: Column used to derive the binary target.
        new_target_var: Name of the derived target column.
        bad_cols: Columns dropped just before splitting.

    Returns:
        A ``(train, tune, test)`` tuple as produced by ``train_tune_test``.
    """
    categorized = convert_to_categories(df.copy(), categorical_cols)
    reduced = reduce_categories(categorized, cat_column_to_reduce, cats_to_keep)
    scaled = normalize_numeric(reduced)
    encoded = one_hot_encoding(scaled)
    with_target = create_target_var(encoded, target_var, new_target_var)
    cleaned = drop_bad_columns(with_target, bad_cols)

    train_set, tune_set, test_set = train_tune_test(cleaned, new_target_var)
    return train_set, tune_set, test_set