In [2]:
# Load the IPython autoreload extension (re-loading it if it is already active).
%reload_ext autoreload
# Mode 2: re-import modules before each execution so edits to imported source files are picked up live.
%autoreload 2

# Imports
Import the required classes and functions.

In [3]:
from mleko.dataset.convert import CSVToVaexConverter
from mleko.dataset.feature_select import (
    MissingRateFeatureSelector,
    VarianceFeatureSelector,
    CompositeFeatureSelector,
    PearsonCorrelationFeatureSelector,
    InvarianceFeatureSelector,
)
from mleko.dataset.ingest import KaggleIngester
from mleko.dataset.split import ExpressionSplitter, RandomSplitter
from mleko.pipeline import Pipeline
from mleko.pipeline.steps import ConvertStep, FeatureSelectStep, IngestStep, SplitStep


# Constants
Define configuration variables.

In [4]:
# Kaggle dataset coordinates, combined into an "<owner>/<dataset>" identifier
# that is also used to build the local data directory paths.
OWNER_SLUG = "mlg-ulb"
DATASET_SLUG = "creditcardfraud"
DATASET_NAME = "/".join((OWNER_SLUG, DATASET_SLUG))

# Seed handed to the randomized pipeline components.
RANDOM_STATE = 1337

# Columns passed as `ignore_features` to the feature selectors.
TIME_FEATURE = "Time"
TARGET_FEATURE = "Class"
META_FEATURES = [TIME_FEATURE, TARGET_FEATURE]

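# Pipeline Setup
Configure the ingester, converter, splitters, and feature selectors, then chain them into a five-step pipeline.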

In [5]:
# Ingestion: fetch the dataset files from Kaggle into the raw data directory.
kaggle_data_source = KaggleIngester(
    destination_directory=f"data/{DATASET_NAME}/raw",
    owner_slug=OWNER_SLUG,
    dataset_slug=DATASET_SLUG,
)

# Conversion: turn the downloaded CSV files into a Vaex dataframe.
csv_to_arrow_converter = CSVToVaexConverter(
    output_directory=f"data/{DATASET_NAME}/converted",
    downcast_float=True,
    random_state=RANDOM_STATE,
)

# Both splitters cache their results in the same directory.
split_cache_directory = f"data/{DATASET_NAME}/split"

# Shuffled 80/20 split, stratified on the target column.
random_data_splitter = RandomSplitter(
    cache_directory=split_cache_directory,
    data_split=(0.80, 0.20),
    shuffle=True,
    stratify=TARGET_FEATURE,
    random_state=RANDOM_STATE,
)

# Two-way split driven by a boolean expression on the "Time" column.
expression_data_splitter = ExpressionSplitter(
    cache_directory=split_cache_directory,
    expression="Time > 100",
)
# The composite selector and each of its members cache under one directory.
feature_selection_cache_directory = f"data/{DATASET_NAME}/feature_selection"

# Drops features whose missing rate is at or above the threshold.
missing_rate_selector = MissingRateFeatureSelector(
    cache_directory=feature_selection_cache_directory,
    missing_rate_threshold=0.7,
    ignore_features=META_FEATURES,
)

# Drops features whose normalized variance is at or below the threshold.
variance_selector = VarianceFeatureSelector(
    cache_directory=feature_selection_cache_directory,
    variance_threshold=0.00,
    ignore_features=META_FEATURES,
)

# Drops features whose correlation is at or above the threshold.
pearson_correlation_selector = PearsonCorrelationFeatureSelector(
    cache_directory=feature_selection_cache_directory,
    correlation_threshold=0.7,
    ignore_features=META_FEATURES,
)

# Drops invariant features.
invariance_selector = InvarianceFeatureSelector(
    cache_directory=feature_selection_cache_directory,
    ignore_features=META_FEATURES,
)

# The selectors are applied in the order listed here.
composite_feature_selector = CompositeFeatureSelector(
    cache_directory=feature_selection_cache_directory,
    feature_selectors=[
        missing_rate_selector,
        variance_selector,
        pearson_correlation_selector,
        invariance_selector,
    ],
)


# Steps exchange data through named entries: each step's `outputs` are the
# names that later steps list in their `inputs`.
pipeline_steps = [
    # 1. Download the raw CSV data.
    IngestStep(kaggle_data_source, outputs=["raw_csv"]),
    # 2. Convert the raw CSV data into a dataframe.
    ConvertStep(csv_to_arrow_converter, inputs=["raw_csv"], outputs=["df_clean"]),
    # 3. Hold out the test set.
    SplitStep(
        random_data_splitter,
        inputs=["df_clean"],
        outputs=["df_train_validate", "df_test"],
    ),
    # 4. Select features on the train/validate portion only.
    FeatureSelectStep(
        composite_feature_selector,
        inputs=["df_train_validate"],
        outputs=["df_train_validate_features_selected"],
    ),
    # 5. Separate training data from validation data.
    SplitStep(
        expression_data_splitter,
        inputs=["df_train_validate_features_selected"],
        outputs=["df_train", "df_validate"],
    ),
]

pipeline = Pipeline(steps=pipeline_steps)


[2023-06-03 22:59:19] [[1;32mINFO[0m] Attempting to fetch Kaggle API credentials from environment variables 'KAGGLE_USERNAME' and 'KAGGLE_KEY'. [1m(kaggle_ingester.py:74)[0m
[2023-06-03 22:59:19] [[1;32mINFO[0m] Kaggle credentials successfully fetched. [1m(kaggle_ingester.py:91)[0m


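# Run Pipeline
Execute every pipeline step with `force_recompute=True`, which bypasses any cached results, and keep the resulting data container.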

In [6]:
# force_recompute=True refreshes every step's cache, including the Kaggle download.
pipeline_result = pipeline.run(force_recompute=True)
data_container = pipeline_result.data

[2023-06-03 22:59:20] [[1;32mINFO[0m] No data container provided. Creating an empty one. [1m(pipeline.py:77)[0m
[2023-06-03 22:59:20] [[1;32mINFO[0m] Executing step 1/5: IngestStep. [1m(pipeline.py:81)[0m
[2023-06-03 22:59:21] [[1;32mINFO[0m] [33mForce Cache Refresh[0m: Downloading mlg-ulb/creditcardfraud/* to data/mlg-ulb/creditcardfraud/raw from Kaggle. [1m(kaggle_ingester.py:287)[0m


Downloading files from Kaggle: 100%|██████████| 1/1 [00:04<00:00,  4.29s/it]

[2023-06-03 22:59:26] [[1;32mINFO[0m] Finished downloading 1 files from Kaggle. [1m(kaggle_ingester.py:303)[0m
[2023-06-03 22:59:26] [[1;32mINFO[0m] Finished step 1/5 execution. [1m(pipeline.py:83)[0m
[2023-06-03 22:59:26] [[1;32mINFO[0m] Executing step 2/5: ConvertStep. [1m(pipeline.py:81)[0m





[2023-06-03 22:59:26] [[1;32mINFO[0m] [33mForce Cache Refresh[0m (LRUCache) CSVToVaexConverter.convert: Executing method. [1m(cache_mixin.py:133)[0m


Converting CSV files: 100%|██████████| 1/1 [00:03<00:00,  3.62s/it]
Writing DataFrame to .arrow file: 100%|██████████| 100/100 [00:00<00:00, 739.42it/s]

[2023-06-03 22:59:34] [[1;32mINFO[0m] Finished step 2/5 execution. [1m(pipeline.py:83)[0m
[2023-06-03 22:59:34] [[1;32mINFO[0m] Executing step 3/5: SplitStep. [1m(pipeline.py:81)[0m
[2023-06-03 22:59:35] [[1;32mINFO[0m] [33mForce Cache Refresh[0m (LRUCache) RandomSplitter.split: Executing method. [1m(cache_mixin.py:133)[0m
[2023-06-03 22:59:35] [[1;32mINFO[0m] Shuffling data before splitting. [1m(random_splitter.py:122)[0m
[2023-06-03 22:59:35] [[1;32mINFO[0m] Splitting data with stratification on column 'Class'. [1m(random_splitter.py:126)[0m





[2023-06-03 22:59:35] [[1;32mINFO[0m] Split dataframe into two dataframes with shapes (227845, 32) and (56962, 32). [1m(random_splitter.py:138)[0m


Writing DataFrame to .arrow file: 100%|██████████| 100/100 [00:00<00:00, 398.93it/s]
Writing DataFrame to .arrow file: 100%|██████████| 100/100 [00:00<00:00, 595.79it/s]


[2023-06-03 22:59:35] [[1;32mINFO[0m] Finished step 3/5 execution. [1m(pipeline.py:83)[0m
[2023-06-03 22:59:35] [[1;32mINFO[0m] Executing step 4/5: FeatureSelectStep. [1m(pipeline.py:81)[0m
[2023-06-03 22:59:35] [[1;32mINFO[0m] [33mForce Cache Refresh[0m (LRUCache) CompositeFeatureSelector.select_features: Executing method. [1m(cache_mixin.py:133)[0m
[2023-06-03 22:59:35] [[1;32mINFO[0m] Executing composite feature selection step 1/4: MissingRateFeatureSelector. [1m(composite_feature_selector.py:108)[0m
[2023-06-03 22:59:35] [[1;32mINFO[0m] Selecting features from the following set (29): ['Amount', 'V1', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V2', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9']. [1m(missing_rate_feature_selector.py:100)[0m


Calculating missing rates for features: 100%|██████████| 29/29 [00:00<00:00, 49.68it/s]

[2023-06-03 22:59:36] [[1;32mINFO[0m] Dropping (0) features with missing rate >= 0.7: set(). [1m(missing_rate_feature_selector.py:108)[0m
[2023-06-03 22:59:36] [[1;32mINFO[0m] Finished composite feature selection step 1/4. [1m(composite_feature_selector.py:113)[0m
[2023-06-03 22:59:36] [[1;32mINFO[0m] Executing composite feature selection step 2/4: VarianceFeatureSelector. [1m(composite_feature_selector.py:108)[0m
[2023-06-03 22:59:36] [[1;32mINFO[0m] Selecting features from the following set (29): ['Amount', 'V1', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V2', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9']. [1m(variance_feature_selector.py:104)[0m



Calculating variance for features: 100%|██████████| 29/29 [00:00<00:00, 32.96it/s]

[2023-06-03 22:59:37] [[1;32mINFO[0m] Dropping (0) features with normalized variance <= 0.0: set(). [1m(variance_feature_selector.py:114)[0m
[2023-06-03 22:59:37] [[1;32mINFO[0m] Finished composite feature selection step 2/4. [1m(composite_feature_selector.py:113)[0m
[2023-06-03 22:59:37] [[1;32mINFO[0m] Executing composite feature selection step 3/4: PearsonCorrelationFeatureSelector. [1m(composite_feature_selector.py:108)[0m
[2023-06-03 22:59:37] [[1;32mINFO[0m] Selecting features from the following set (29): ['Amount', 'V1', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V2', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9']. [1m(pearson_correlation_feature_selector.py:101)[0m





[2023-06-03 22:59:38] [[1;32mINFO[0m] Dropping (0) features with correlation >= 0.7: set(). [1m(pearson_correlation_feature_selector.py:147)[0m
[2023-06-03 22:59:38] [[1;32mINFO[0m] Finished composite feature selection step 3/4. [1m(composite_feature_selector.py:113)[0m
[2023-06-03 22:59:38] [[1;32mINFO[0m] Executing composite feature selection step 4/4: InvarianceFeatureSelector. [1m(composite_feature_selector.py:108)[0m
[2023-06-03 22:59:38] [[1;32mINFO[0m] Selecting features from the following set (0): []. [1m(invariance_feature_selector.py:98)[0m


Calculating invariance of features: 0it [00:00, ?it/s]

[2023-06-03 22:59:38] [[1;32mINFO[0m] Dropping (0) invariant features: set(). [1m(invariance_feature_selector.py:106)[0m
[2023-06-03 22:59:38] [[1;32mINFO[0m] Finished composite feature selection step 4/4. [1m(composite_feature_selector.py:113)[0m



Writing DataFrame to .arrow file: 100%|██████████| 100/100 [00:00<00:00, 518.01it/s]


[2023-06-03 22:59:38] [[1;32mINFO[0m] Finished step 4/5 execution. [1m(pipeline.py:83)[0m
[2023-06-03 22:59:38] [[1;32mINFO[0m] Executing step 5/5: SplitStep. [1m(pipeline.py:81)[0m
[2023-06-03 22:59:38] [[1;32mINFO[0m] [33mForce Cache Refresh[0m (LRUCache) ExpressionSplitter.split: Executing method. [1m(cache_mixin.py:133)[0m
[2023-06-03 22:59:38] [[1;32mINFO[0m] Splitting dataframe based on expression 'Time > 100'. [1m(expression_splitter.py:92)[0m
[2023-06-03 22:59:38] [[1;32mINFO[0m] Split dataframe into two dataframes with shapes (227718, 31) and (127, 31). [1m(expression_splitter.py:95)[0m


Writing DataFrame to .arrow file: 100%|██████████| 100/100 [00:00<00:00, 410.57it/s]
Writing DataFrame to .arrow file: 100%|██████████| 100/100 [00:00<00:00, 2517.70it/s]

[2023-06-03 22:59:38] [[1;32mINFO[0m] Finished step 5/5 execution. [1m(pipeline.py:83)[0m



