In [1]:
import json
import yaml
import os

In [2]:
# Load the notebook-wide settings from the YAML config file.
# yaml.safe_load is used, so only plain YAML data types are constructed.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale encoding.
with open('param_config.yaml', 'r', encoding='utf-8') as config_file:
    config_params = yaml.safe_load(config_file)

In [3]:
# Directory that receives the JSON files written further down in this notebook.
folder_dir = config_params["processor_dir"]

In [4]:
# Create the output directory (and any missing parents); a directory that
# already exists is left untouched because exist_ok=True.
os.makedirs(name=folder_dir, exist_ok=True)

In [5]:
# Per-classifier configuration, serialised to algo_dict.json below.
# Every entry holds:
#   "module" / "class" - where the estimator is imported from,
#   "defaults"         - optional fixed keyword values (only some entries),
#   "params"           - a list of candidate values per hyper-parameter.
# The code that consumes this mapping is not part of this notebook.
# Entries that are commented out are not written to the JSON file.
algo_dict = {
    # "LogisticRegression": {
    #     "module": "sklearn.linear_model",
    #     "class": "LogisticRegression",
    #     "defaults": {"max_iter": 10000},
    #     "params": {
    #         "solver": ["lbfgs", "liblinear", "newton-cg", "newton-cholesky",
    #                    "sag", "saga"],
    #         "penalty": ["l1", "l2", None],
    #         "max_iter": [10000],
    #     },
    # },
    "SGDClassifier": {
        "module": "sklearn.linear_model",
        "class": "SGDClassifier",
        "defaults": {"loss": "modified_huber"},
        "params": {
            "loss": ["log_loss", "modified_huber"],
            "penalty": ["l1", "l2", None],
            "alpha": [0.0001, 0.001, 0.01, 0.1, 1],
            "learning_rate": ["constant", "optimal", "invscaling", "adaptive"],
            "eta0": [10e-5],  # 10e-5 == 1e-4
        },
    },
    "KNeighborsClassifier": {
        "module": "sklearn.neighbors",
        "class": "KNeighborsClassifier",
        "params": {
            "n_neighbors": [5, 15, 25, 50],
            "weights": ["uniform", "distance"],
            # "algorithm": ["ball_tree", "kd_tree", "brute"],
            "metric": [
                "cityblock",
                "cosine",
                "euclidean",
                "l1",
                "l2",
                "manhattan",
                "nan_euclidean",
            ],
        },
    },
    "DecisionTreeClassifier": {
        "module": "sklearn.tree",
        "class": "DecisionTreeClassifier",
        "params": {
            "criterion": ["gini", "entropy", "log_loss"],
            "splitter": ["best", "random"],
            "max_depth": [None, 10, 100],
            "min_samples_split": [2, 100, 1000],
            "min_samples_leaf": [1, 10, 100],
            "max_features": [None, "sqrt", "log2", 0.25, 0.5, 0.75],
            "max_leaf_nodes": [None, 10, 100],
            "random_state": [12],
        },
    },
    "ExtraTreeClassifier": {
        "module": "sklearn.tree",
        "class": "ExtraTreeClassifier",
        "defaults": {"splitter": "best", "min_samples_leaf": 10},
        "params": {
            "criterion": ["gini", "entropy", "log_loss"],
            "splitter": ["best", "random"],
            "max_depth": [None, 10, 100],
            "min_samples_split": [2, 100, 1000],
            "min_samples_leaf": [1, 10, 100],
            "max_features": [None, "sqrt", "log2", 0.25, 0.5, 0.75],
            "max_leaf_nodes": [None, 10, 100],
            "random_state": [12],
        },
    },
    "AdaBoostClassifier": {
        "module": "sklearn.ensemble",
        "class": "AdaBoostClassifier",
        "params": {
            "n_estimators": [5, 25, 50],
            "learning_rate": [0.5, 1, 10],
            "algorithm": ["SAMME", "SAMME.R"],
            "random_state": [12],
        },
    },
    # "BaggingClassifier": {
    #     "module": "sklearn.ensemble",
    #     "class": "BaggingClassifier",
    #     "defaults": {
    #         "n_estimators": 50,
    #         "max_samples": 0.5,
    #         "max_features": 0.6,
    #         "bootstrap_features": True,
    #     },
    #     "params": {
    #         "n_estimators": [10, 25, 50, 100],
    #         "max_samples": [0.25, 0.5, 0.75, 1],
    #         "max_features": [0.25, 0.5, 0.75, 1],
    #         "bootstrap": [True, False],
    #         "bootstrap_features": [True, False],
    #         "warm_start": [True, False],
    #         "random_state": [12],
    #     },
    # },
    "ExtraTreesClassifier": {
        "module": "sklearn.ensemble",
        "class": "ExtraTreesClassifier",
        "params": {
            "n_estimators": [50, 100, 150],
            "criterion": ["gini", "entropy", "log_loss"],
            "max_depth": [None, 10, 100],
            "min_samples_split": [2, 100, 1000],
            "min_samples_leaf": [1, 10, 100],
            "max_features": [None, "sqrt", "log2", 0.25, 0.5, 0.75],
            "max_leaf_nodes": [None, 10, 100],
            "bootstrap": [True, False],
            "warm_start": [True, False],
            "random_state": [12],
        },
    },
    # "GradientBoostingClassifier": {
    #     "module": "sklearn.ensemble",
    #     "class": "GradientBoostingClassifier",
    #     "params": {
    #         "loss": ["log_loss", "exponential"],
    #         "learning_rate": [0.5, 1, 10],
    #         "n_estimators": [50, 100, 150],
    #         "subsample": [0.5, 1],
    #         "criterion": ["friedman_mse", "squared_error"],
    #         "min_samples_split": [2, 100, 1000],
    #         "min_samples_leaf": [1, 10, 100],
    #         "max_depth": [3, 15, 25],
    #         "max_features": [None, "sqrt", "log2", 0.25, 0.5, 0.75],
    #         "warm_start": [True, False],
    #         "validation_fraction": [0.4, 0.2],
    #         # "n_iter_no_change": [5, 10],
    #         "random_state": [12],
    #     },
    # },
    "RandomForestClassifier": {
        "module": "sklearn.ensemble",
        "class": "RandomForestClassifier",
        "params": {
            "n_estimators": [50, 100, 150],
            "criterion": ["gini", "entropy", "log_loss"],
            "max_depth": [None, 10, 100],
            "min_samples_split": [2, 100, 1000],
            "min_samples_leaf": [1, 10, 100],
            "max_features": [None, "sqrt", "log2", 0.25, 0.5, 0.75],
            "max_leaf_nodes": [None, 10, 100],
            "bootstrap": [True, False],
            "warm_start": [True, False],
            "random_state": [12],
        },
    },
    "HistGradientBoostingClassifier": {
        "module": "sklearn.ensemble",
        "class": "HistGradientBoostingClassifier",
        "params": {
            "learning_rate": [0.25, 0.5, 0.75, 1],
            "max_iter": [50, 100, 150],
            "max_leaf_nodes": [10, 30, 100],
            "max_depth": [None, 10, 25, 50],
            "min_samples_leaf": [1, 10, 100],
            "warm_start": [True, False],
            "validation_fraction": [0.4, 0.2],
            "random_state": [12],
        },
    },
    "XGBClassifier": {
        "module": "xgboost",
        "class": "XGBClassifier",
        "params": {
            "learning_rate": [0.25, 0.5, 0.75, 1],
            "gamma": [0, 0.5, 1, 100],
            "max_depth": [1, 10, 100],
            "min_child_weight": [1, 10, 100],
            "max_delta_step": [0, 2, 10],
            "subsample": [0.25, 0.5, 0.75, 1],
            "lambda": [0.25, 0.5, 0.75],
            "alpha": [0.25, 0.5, 0.75],
            "tree_method": ["auto", "exact", "approx"],
            "random_state": [12],
        },
    },
}

# Serialise the classifier configuration to <folder_dir>/algo_dict.json.
with open(folder_dir + "/algo_dict.json", "w") as algo_file:
    json.dump(algo_dict, algo_file)

In [6]:
# Column-group configuration, serialised to col_dict.json below.
# Each group names a 'transformer', a numeric 'multiplier', and the list of
# 'columns' the group applies to. The code that interprets these fields is
# not part of this notebook.
col_dict = {
    "true_false": {
        "transformer": "TrueFalseTransformer",
        "multiplier": 1,
        "columns": ["true_false"],
    },
    "one_hot": {
        "transformer": "OneHotTransformer",
        "multiplier": 10,
        "columns": ["one_hot"],
    },
    "date_cols": {
        "transformer": "DateTransformer",
        "multiplier": 1,
        "columns": ["dates"],
    },
    "float_cols": {
        "transformer": "FloatTransformer",
        "multiplier": 1,
        "columns": ["floats"],
    },
    "max_of_list": {
        "transformer": "ListMaxTransformer",
        "multiplier": 1,
        "columns": ["max_of_list"],
    },
    "count_unique": {
        "transformer": "ListNuniqueTransformer",
        "multiplier": 1,
        "columns": ["nunique_of_list"],
    },
    "desc_stat_cols": {
        "transformer": "DescStatTransformer",
        "multiplier": 5,
        "columns": ["desc_stats"],
    },
    "list_to_labels": {
        "transformer": "MultilabelTransformer",
        "multiplier": 10,
        "columns": ["multi_label"],
    },
    "drop_cols": {
        "transformer": "drop",
        "multiplier": 0,
        "columns": ["random_col", "other"],
    },
}

# Serialise the column-group configuration to <folder_dir>/col_dict.json.
with open(folder_dir + "/col_dict.json", "w") as col_file:
    json.dump(col_dict, col_file)

# Requirements

In [7]:
%%writefile requirements.txt
# Pinned Python dependencies; the Dockerfile written below installs this file.
# importlib and pathlib are deliberately NOT listed: both ship with the
# Python 3 standard library, and the PyPI packages of the same name are
# legacy backports for old interpreters that should not be installed on 3.12.
alibi==0.9.6
DateTime==5.5
joblib==1.4.2
numpy==1.26.4
pandas==2.2.3
platformdirs==4.3.6
pyarrow==18.0.0
PyYAML==6.0.2
sagemaker-training==4.8.1
scikit-explain==0.1.4
scikit-learn==1.5.2
shap==0.46.0
xgboost==2.1.2

Writing requirements.txt


In [8]:
%%writefile Dockerfile
# Image that installs the pinned requirements and ships the contents of
# source_dir under /opt/app, with python3 as the entrypoint.
FROM python:3.12-slim
COPY requirements.txt /opt/app/requirements.txt
# A wildcard COPY can match several sources, in which case the destination
# must be a directory path ending in "/".
COPY source_dir/* /opt/app/
WORKDIR /opt/app
# Update and install in one layer so a cached, stale package index is never
# reused for the install; drop the apt lists afterwards to keep the image small.
# gcc is presumably needed to compile requirements that ship no prebuilt wheel.
RUN apt-get -y update \
    && apt-get -y install gcc \
    && rm -rf /var/lib/apt/lists/*
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip3 install --no-cache-dir -r requirements.txt
ENV PYTHONUNBUFFERED=TRUE
ENV PYTHONPATH="/opt/app:${PYTHONPATH}"
ENTRYPOINT ["python3"]

Writing Dockerfile
