In [1]:
# Load (or reload, if already loaded) IPython's autoreload extension.
%reload_ext autoreload
# Mode 2: re-import all modules before each cell executes, so edits to local
# source files are picked up without restarting the kernel.
%autoreload 2

# Imports
Import the required classes and functions.

In [2]:
from mleko.dataset.convert import CSVToVaexConverter
from mleko.dataset.feature_select import (
    CompositeFeatureSelector,
    InvarianceFeatureSelector,
    MissingRateFeatureSelector,
    PearsonCorrelationFeatureSelector,
    VarianceFeatureSelector,
)
from mleko.dataset.ingest import KaggleIngester
from mleko.dataset.split import ExpressionSplitter, RandomSplitter
from mleko.pipeline import Pipeline
from mleko.pipeline.steps import ConvertStep, FeatureSelectStep, IngestStep, SplitStep


# Constants
Define configuration variables.

In [3]:
# Kaggle coordinates of the dataset; DATASET_NAME has the form "<owner>/<dataset>".
OWNER_SLUG = "mlg-ulb"
DATASET_SLUG = "creditcardfraud"
DATASET_NAME = "/".join((OWNER_SLUG, DATASET_SLUG))

# Columns handed to the feature selectors as `ignore_features`.
TIME_FEATURE = "Time"
TARGET_FEATURE = "Class"
META_FEATURES = [TIME_FEATURE, TARGET_FEATURE]

# Seed passed to the converter and the random splitter.
RANDOM_STATE = 1337


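# Pipeline Setup
Create the ingestion, conversion, splitting, and feature-selection components and chain them into a pipeline.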

In [8]:
# ---------------------------------------------------------------------------
# Components
# ---------------------------------------------------------------------------

# Ingests the Kaggle dataset OWNER_SLUG/DATASET_SLUG into the raw data directory.
kaggle_data_source = KaggleIngester(
    destination_directory=f"data/{DATASET_NAME}/raw",
    owner_slug=OWNER_SLUG,
    dataset_slug=DATASET_SLUG,
)

# Converts the raw CSV files; floats are downcast and RANDOM_STATE is forwarded.
csv_to_arrow_converter = CSVToVaexConverter(
    output_directory=f"data/{DATASET_NAME}/converted",
    downcast_float=True,
    random_state=RANDOM_STATE,
)

# Shuffled 80/20 split, stratified on the target column. The same instance is
# used for both splits in the pipeline below.
random_data_splitter = RandomSplitter(
    cache_directory=f"data/{DATASET_NAME}/split",
    data_split=(0.80, 0.20),
    shuffle=True,
    stratify=TARGET_FEATURE,
    random_state=RANDOM_STATE,
)

# NOTE(review): this splitter is constructed but not referenced by any step of
# the pipeline defined in this cell; kept in case later cells rely on it.
expression_data_splitter = ExpressionSplitter(
    cache_directory=f"data/{DATASET_NAME}/split",
    expression="Time > 100",
)

# All feature selectors (and the composite wrapping them) share one cache directory.
feature_selection_cache_dir = f"data/{DATASET_NAME}/feature_selection"

# Selectors are listed in this order; META_FEATURES are passed as
# `ignore_features` to each of them.
composite_feature_selector = CompositeFeatureSelector(
    cache_directory=feature_selection_cache_dir,
    feature_selectors=[
        MissingRateFeatureSelector(
            cache_directory=feature_selection_cache_dir,
            missing_rate_threshold=0.7,
            ignore_features=META_FEATURES,
        ),
        VarianceFeatureSelector(
            cache_directory=feature_selection_cache_dir,
            variance_threshold=0.00,
            ignore_features=META_FEATURES,
        ),
        PearsonCorrelationFeatureSelector(
            cache_directory=feature_selection_cache_dir,
            correlation_threshold=0.7,
            ignore_features=META_FEATURES,
        ),
        InvarianceFeatureSelector(
            cache_directory=feature_selection_cache_dir,
            ignore_features=META_FEATURES,
        ),
    ],
)

# ---------------------------------------------------------------------------
# Pipeline: ingest -> convert -> train+validate/test split -> feature selection
#           -> train/validate split
# ---------------------------------------------------------------------------
pipeline = Pipeline(
    steps=[
        IngestStep(kaggle_data_source, outputs=["raw_csv"]),
        ConvertStep(csv_to_arrow_converter, inputs=["raw_csv"], outputs=["df_clean"]),
        SplitStep(
            random_data_splitter,
            inputs=["df_clean"],
            outputs=["df_train_validate", "df_test"],
            cache_group="train_test_split",
        ),
        # Feature selection runs on the train+validate portion only; df_test is
        # not an input to this step.
        FeatureSelectStep(
            composite_feature_selector,
            inputs=["df_train_validate"],
            outputs=["df_train_validate_features_selected"],
            cache_group="feature_selection_train_validate",
        ),
        # Second use of the same splitter. It gets its own cache group so it is
        # not filed under the train/test group used by the first SplitStep.
        # NOTE(review): presumably cache entries are partitioned per group, so
        # sharing a group lets the two splits evict each other — confirm
        # against the mleko cache implementation.
        SplitStep(
            random_data_splitter,
            inputs=["df_train_validate_features_selected"],
            outputs=["df_train", "df_validate"],
            cache_group="train_validate_split",
        ),
    ]
)


[2023-06-26 23:05:25] [[1;32mINFO[0m] Attempting to fetch Kaggle API credentials from environment variables 'KAGGLE_USERNAME' and 'KAGGLE_KEY'. [1m(kaggle_ingester.py:74)[0m
[2023-06-26 23:05:25] [[1;32mINFO[0m] Kaggle credentials successfully fetched. [1m(kaggle_ingester.py:91)[0m


# Run Pipeline
Execute the pipeline steps and keep the resulting data container.

In [9]:
# Run the pipeline and keep the `.data` attribute of the object returned by `run()`.
data_container = pipeline.run().data

[2023-06-26 23:05:28] [[1;32mINFO[0m] No data container provided. Creating an empty one. [1m(pipeline.py:77)[0m
[2023-06-26 23:05:28] [[1;32mINFO[0m] Executing step 1/5: IngestStep. [1m(pipeline.py:81)[0m
[2023-06-26 23:05:29] [[1;32mINFO[0m] [32mCache Hit[0m: Local dataset is up to date with Kaggle, skipping download. [1m(kaggle_ingester.py:279)[0m
[2023-06-26 23:05:29] [[1;32mINFO[0m] Finished step 1/5 execution. [1m(pipeline.py:83)[0m
[2023-06-26 23:05:29] [[1;32mINFO[0m] Executing step 2/5: ConvertStep. [1m(pipeline.py:81)[0m
[2023-06-26 23:05:29] [[1;32mINFO[0m] [32mCache Hit[0m (LRUCache) CSVToVaexConverter.convert: Using cached output. [1m(cache_mixin.py:134)[0m
[2023-06-26 23:05:29] [[1;32mINFO[0m] Finished step 2/5 execution. [1m(pipeline.py:83)[0m
[2023-06-26 23:05:29] [[1;32mINFO[0m] Executing step 3/5: SplitStep. [1m(pipeline.py:81)[0m
[2023-06-26 23:05:29] [[1;32mINFO[0m] [32mCache Hit[0m (LRUCache) RandomSplitter.split: Using cached

Writing DataFrame to .arrow file:   0%|          | 0/100 [00:00<?, ?it/s]

Writing DataFrame to .arrow file:   0%|          | 0/100 [00:00<?, ?it/s]

[2023-06-26 23:05:30] [[1;32mINFO[0m] Finished step 5/5 execution. [1m(pipeline.py:83)[0m
