In [1]:
# Enable IPython's autoreload extension in mode 2, so edited modules are
# reloaded before each cell runs and no kernel restart is needed.
%reload_ext autoreload
%autoreload 2

# Imports
Import the required classes and functions.

In [2]:
from mleko.data.sources import KaggleDataSource
from mleko.data.converters import CsvToArrowConverter
from mleko.data.splitters import RandomDataSplitter, ExpressionDataSplitter
from mleko.pipeline import Pipeline
from mleko.pipeline.steps import IngestStep, ConvertStep, SplitStep

# Constants
Define configuration variables.

In [3]:
# Kaggle coordinates of the dataset; DATASET_NAME has the form "<owner>/<dataset>".
OWNER_SLUG = "mlg-ulb"
DATASET_SLUG = "creditcardfraud"
DATASET_NAME = "/".join((OWNER_SLUG, DATASET_SLUG))

# Column used for stratified splitting, and the seed passed to the seeded components.
TARGET_FEATURE = "Class"
RANDOM_STATE = 1337

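# Pipeline Setup
Configure the data source, the converter and the two splitters, then chain them into a four-step pipeline.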

In [4]:
# Source: the Kaggle dataset identified by OWNER_SLUG / DATASET_SLUG, stored under ".../raw".
# The recorded output of this cell shows Kaggle API credentials being read from the
# environment variables KAGGLE_USERNAME and KAGGLE_KEY.
kaggle_data_source = KaggleDataSource(
    f"data/{DATASET_NAME}/raw",
    owner_slug=OWNER_SLUG,
    dataset_slug=DATASET_SLUG,
)

# Converter: CSV -> Arrow, with float downcasting switched on.
csv_to_arrow_converter = CsvToArrowConverter(
    output_directory=f"data/{DATASET_NAME}/converted",
    downcast_float=True,
    random_state=RANDOM_STATE,
)

# First split: shuffled, seeded 80/20 split stratified on TARGET_FEATURE.
random_data_splitter = RandomDataSplitter(
    output_directory=f"data/{DATASET_NAME}/split",
    data_split=(0.80, 0.20),
    shuffle=True,
    stratify=TARGET_FEATURE,
    random_state=RANDOM_STATE,
)

# Second split: partition rows by the expression on the "Time" column.
# NOTE(review): shares its output directory with random_data_splitter -- confirm the
# written files cannot collide.
expression_data_splitter = ExpressionDataSplitter(
    output_directory=f"data/{DATASET_NAME}/split",
    expression="Time > 100",
)

# Chain the stages: the names listed in `outputs` are the names consumed via `inputs`
# by the following step.
pipeline = Pipeline(
    steps=[
        IngestStep(
            kaggle_data_source,
            outputs=["raw_csv"],
        ),
        ConvertStep(
            csv_to_arrow_converter,
            inputs=["raw_csv"],
            outputs=["df_clean"],
        ),
        SplitStep(
            random_data_splitter,
            inputs=["df_clean"],
            outputs=["df_train_validate", "df_test"],
        ),
        SplitStep(
            expression_data_splitter,
            inputs=["df_train_validate"],
            outputs=["df_train", "df_validate"],
        ),
    ]
)

[2023-05-21 00:04:12] [[1;32mINFO[0m] Attempting to fetch Kaggle API credentials from environment variables 'KAGGLE_USERNAME' and 'KAGGLE_KEY'. [1m(kaggle_data_source.py:77)[0m
[2023-05-21 00:04:12] [[1;32mINFO[0m] Kaggle credentials successfully fetched. [1m(kaggle_data_source.py:94)[0m


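# Run Pipeline
Execute all pipeline steps in order and keep the resulting data container, which is indexed by the output names declared above.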

In [6]:
# Run every step and keep the `.data` attribute of the result; it is indexed by
# output name in the next cell.
data_container = pipeline.run().data

[2023-05-21 00:04:55] [[1;32mINFO[0m] Executing step 1: IngestStep [1m(pipeline.py:74)[0m
[2023-05-21 00:04:55] [[1;32mINFO[0m] Local dataset is up to date with Kaggle, skipping download. [1m(kaggle_data_source.py:271)[0m
[2023-05-21 00:04:55] [[1;32mINFO[0m] Finished step 1 [1m(pipeline.py:76)[0m
[2023-05-21 00:04:55] [[1;32mINFO[0m] Executing step 2: ConvertStep [1m(pipeline.py:74)[0m
[2023-05-21 00:04:56] [[1;32mINFO[0m] [31mCache Miss[0m (LRUCache) CsvToArrowConverter.convert: Executing method. [1m(cache.py:150)[0m


Converting CSV files: 100%|██████████| 1/1 [00:02<00:00,  2.87s/it]
Writing DataFrame to Arrow file: 100%|██████████| 100/100 [00:00<00:00, 744.74it/s]

[2023-05-21 00:05:00] [[1;32mINFO[0m] Finished step 2 [1m(pipeline.py:76)[0m
[2023-05-21 00:05:00] [[1;32mINFO[0m] Executing step 3: SplitStep [1m(pipeline.py:74)[0m
[2023-05-21 00:05:00] [[1;32mINFO[0m] [31mCache Miss[0m (LRUCache) RandomDataSplitter.split: Executing method. [1m(cache.py:150)[0m
[2023-05-21 00:05:00] [[1;32mINFO[0m] Shuffling data before splitting. [1m(splitters.py:128)[0m
[2023-05-21 00:05:00] [[1;32mINFO[0m] Splitting data with stratification on column 'Class'. [1m(splitters.py:132)[0m





[2023-05-21 00:05:00] [[1;32mINFO[0m] Split dataframe into two dataframes with shapes (227845, 32) and (56962, 32). [1m(splitters.py:144)[0m


Writing DataFrame to Arrow file: 100%|██████████| 100/100 [00:00<00:00, 427.14it/s]
Writing DataFrame to Arrow file: 100%|██████████| 100/100 [00:00<00:00, 653.68it/s]

[2023-05-21 00:05:00] [[1;32mINFO[0m] Finished step 3 [1m(pipeline.py:76)[0m
[2023-05-21 00:05:00] [[1;32mINFO[0m] Executing step 4: SplitStep [1m(pipeline.py:74)[0m





[2023-05-21 00:05:01] [[1;32mINFO[0m] [31mCache Miss[0m (LRUCache) ExpressionDataSplitter.split: Executing method. [1m(cache.py:150)[0m
[2023-05-21 00:05:01] [[1;32mINFO[0m] Splitting dataframe based on expression 'Time > 100'. [1m(splitters.py:227)[0m
[2023-05-21 00:05:01] [[1;32mINFO[0m] Split dataframe into two dataframes with shapes (227718, 31) and (127, 31). [1m(splitters.py:230)[0m


Writing DataFrame to Arrow file: 100%|██████████| 100/100 [00:00<00:00, 484.69it/s]
Writing DataFrame to Arrow file: 100%|██████████| 100/100 [00:00<00:00, 2805.54it/s]

[2023-05-21 00:05:01] [[1;32mINFO[0m] Finished step 4 [1m(pipeline.py:76)[0m





In [7]:
# Display the "df_validate" output produced by the expression-based split step.
data_container["df_validate"]

#,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.3598071336738,-0.0727811733098497,2.53634673796914,1.37815522427443,-0.338320769942518,0.462387777762292,0.239598554061257,0.0986979012610507,0.363786969611213,0.0907941719789316,-0.551599533260813,-0.617800855762348,-0.991389847235408,-0.311169353699879,1.46817697209427,-0.470400525259478,0.207971241929242,0.0257905801985591,0.403992960255733,0.251412098239705,-0.018306777944153,0.277837575558899,-0.110473910188767,0.0669280749146731,0.128539358273528,-0.189114843888824,0.133558376740387,-0.0210530534538215,149.62,0
1,0.0,1.19185711131486,0.26615071205963,0.16648011335321,0.448154078460911,0.0600176492822243,-0.0823608088155687,-0.0788029833323113,0.0851016549148104,-0.255425128109186,-0.166974414004614,1.61272666105479,1.06523531137287,0.48909501589608,-0.143772296441519,0.635558093258208,0.463917041022171,-0.114804663102346,-0.183361270123994,-0.145783041325259,-0.0690831352230203,-0.225775248033138,-0.638671952771851,0.101288021253234,-0.339846475529127,0.167170404418143,0.125894532368176,-0.00898309914322813,0.0147241691924927,2.69,0
2,1.0,-1.35835406159823,-1.34016307473609,1.77320934263119,0.379779593034328,-0.503198133318193,1.80049938079263,0.791460956450422,0.247675786588991,-1.51465432260583,0.207642865216696,0.624501459424895,0.066083685268831,0.717292731410831,-0.165945922763554,2.34586494901581,-2.89008319444231,1.10996937869599,-0.121359313195888,-2.26185709530414,0.524979725224404,0.247998153469754,0.771679401917229,0.909412262347719,-0.689280956490685,-0.327641833735251,-0.139096571514147,-0.0553527940384261,-0.0597518405929204,378.66,0
3,1.0,-0.966271711572087,-0.185226008082898,1.79299333957872,-0.863291275036453,-0.0103088796030823,1.24720316752486,0.23760893977178,0.377435874652262,-1.38702406270197,-0.0549519224713749,-0.226487263835401,0.178228225877303,0.507756869957169,-0.28792374549456,-0.631418117709045,-1.0596472454325,-0.684092786345479,1.96577500349538,-1.2326219700892,-0.208037781160366,-0.108300452035545,0.00527359678253453,-0.190320518742841,-1.17557533186321,0.647376034602038,-0.221928844458407,0.0627228487293033,0.0614576285006353,123.5,0
4,2.0,-1.15823309349523,0.877736754848451,1.548717846511,0.403033933955121,-0.407193377311653,0.0959214624684256,0.592940745385545,-0.270532677192282,0.817739308235294,0.753074431976354,-0.822842877946363,0.53819555014995,1.3458515932154,-1.11966983471731,0.175121130008994,-0.451449182813529,-0.237033239362776,-0.0381947870352842,0.803486924960175,0.408542360392758,-0.00943069713232919,0.79827849458971,-0.137458079619063,0.141266983824769,-0.206009587619756,0.502292224181569,0.219422229513348,0.215153147499206,69.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122,97.0,1.27381789287239,0.0580090544630036,-1.31242763447362,-0.0489688219755029,2.2443730490817,3.29621142010218,-0.342624648072111,0.760869775317302,-0.154915304214236,0.068384951397303,-0.161823827705382,0.0292246193395541,-0.0206316861725242,0.528189225867633,1.10597698611605,0.375591596223863,-0.926724745512419,0.292544448245333,-0.00288674673124634,0.00890562616536168,-0.0172920944609741,-0.182882699753908,-0.146980903083662,1.00470003322996,0.842882626683356,-0.314280812961772,0.0137442559543499,0.0159071705760456,17.8,0
123,98.0,-0.646513324100542,1.00419850064253,1.61622395046255,-0.0996280015000763,-0.12247685082686,-0.671326722363723,0.656183050929891,0.00975509061850092,-0.6359632919451,-0.0473640939765236,1.53093752074106,0.824084110047441,0.148016384050428,0.266959455514816,0.120395389180657,0.319464747264325,-0.593328286482491,0.044223291332264,0.232081457920242,0.0931905600426426,-0.147934496844679,-0.420045847357141,0.0614243996703537,0.520997080284645,-0.238845350850685,0.0301351020966576,0.140480523947087,0.101163284755117,14.98,0
124,98.0,1.02711436915778,-1.2725428662228,0.673656443011109,-0.747435800812143,-1.2991070607699,0.293656210351348,-1.05419978026175,0.234899955174753,-0.574990847945411,0.680725593968166,1.21834828084553,-0.247881782174543,-0.242920244872586,-0.103666776917496,0.767171902396622,1.5231419165359,-0.000751675104036528,-0.604428955906459,0.23446053259323,0.345875660383277,0.522112894018293,1.10191150591042,-0.248747022416765,-0.259413925287394,0.296057687734508,-0.0309956046666607,0.0184648520232704,0.0334818412804383,159.0,0
125,99.0,-0.883995649772828,-0.150764822957996,2.29179072147755,-0.263452268327782,-0.814535284284635,0.955840627763703,0.0976306732312271,0.474046969090009,0.139512299928856,-0.729861201923768,0.711062608544338,0.095006434720644,-1.09750534430121,-0.0597015195949617,0.234557225294908,-0.142193908419341,0.193357555365588,0.217853313993545,1.15557112117309,0.358751010196778,0.0709014399364962,0.0518320695040774,0.110297657345214,-0.260628692852532,-0.0975487192089246,1.15543923721475,-0.0211993299630798,0.0625654360473211,142.71,0
