In [1]:
# Load (or reload) IPython's autoreload extension and select mode 2, which
# reloads imported modules before each cell executes so that edits to local
# source files are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

# Imports
Import the required classes and functions.

In [2]:
from mleko.dataset.convert import CSVToVaexConverter
from mleko.dataset.ingest import KaggleIngester
from mleko.dataset.split import RandomSplitter
from mleko.dataset.transform import LabelEncoderTransformer, CompositeTransformer, MinMaxScalerTransformer
from mleko.pipeline import Pipeline
from mleko.pipeline.steps import ConvertStep, IngestStep, SplitStep, TransformStep


# Constants
Define configuration variables.

In [3]:
# Kaggle dataset identifier ("<owner>/<dataset>"), also used below as the
# sub-path for every on-disk data/cache directory.
OWNER_SLUG = "jsphyg"
DATASET_SLUG = "weather-dataset-rattle-package"
DATASET_NAME = f"{OWNER_SLUG}/{DATASET_SLUG}"

# Dataset feature configuration
TARGET_FEATURE = "RainTomorrow"
DATE_FEATURES = ["Date"]
# DATE_FEATURES is itself a list, so it is unpacked here to keep META_FEATURES
# a flat list of column names (["RainTomorrow", "Date"]) instead of a list
# that contains a nested list.
META_FEATURES = [TARGET_FEATURE, *DATE_FEATURES]

# General Configuration
# Seed shared by every component that accepts a `random_state`.
RANDOM_STATE = 1337


# Pipeline Setup

In [30]:
# --- Ingestion -------------------------------------------------------------
# Fetches the Kaggle dataset identified by OWNER_SLUG / DATASET_SLUG into the
# local "raw" directory.
kaggle_data_source = KaggleIngester(
    destination_directory=f"data/{DATASET_NAME}/raw",
    owner_slug=OWNER_SLUG,
    dataset_slug=DATASET_SLUG,
)

# --- Conversion ------------------------------------------------------------
# Converts the raw CSV into Apache Arrow format. "NA" is read as a missing
# value and "Yes"/"No" are read as booleans.
csv_to_arrow_converter = CSVToVaexConverter(
    output_directory=f"data/{DATASET_NAME}/converted",
    na_values=["NA"],
    true_values=["Yes"],
    false_values=["No"],
    downcast_float=True,
    random_state=RANDOM_STATE,
)

# --- Splitting -------------------------------------------------------------
# Shuffled 80/20 split into train(+validate) and test, stratified on the
# target feature.
random_data_splitter = RandomSplitter(
    cache_directory=f"data/{DATASET_NAME}/split",
    data_split=(0.80, 0.20),
    shuffle=True,
    stratify=TARGET_FEATURE,
    random_state=RANDOM_STATE,
)

# --- Transformation --------------------------------------------------------
# Individual transformers; the composite below applies them in list order, so
# they can be chained as needed.
label_encoder = LabelEncoderTransformer(
    cache_directory=f"data/{DATASET_NAME}/transform",
    features=["WindGustDir", "WindDir9am", "WindDir3pm"],
)
min_max_scaler = MinMaxScalerTransformer(
    cache_directory=f"data/{DATASET_NAME}/transform",
    features=["WindGustSpeed"],
    min_value=1,
    max_value=2,
)
composite_transformer = CompositeTransformer(
    cache_directory=f"data/{DATASET_NAME}/transform",
    transformers=[label_encoder, min_max_scaler],
)

# --- Pipeline --------------------------------------------------------------
# Each step reads its `inputs` from, and writes its `outputs` to, named slots
# of the pipeline's data container.
pipeline_steps = [
    IngestStep(kaggle_data_source, outputs=["raw_csv"]),
    ConvertStep(
        csv_to_arrow_converter,
        inputs=["raw_csv"],
        outputs=["df_clean"],
    ),
    SplitStep(
        random_data_splitter,
        inputs=["df_clean"],
        outputs=["df_clean_train_validate", "df_clean_test"],
    ),
    # fit=True: the composite transformer is fitted on the train/validate split.
    TransformStep(
        composite_transformer,
        fit=True,
        inputs=["df_clean_train_validate"],
        outputs=["df_transformed_train_validate"],
        cache_group="fit_transform_train_validate",
    ),
    # fit=False: the same composite transformer is applied to the test split
    # without being fitted again.
    TransformStep(
        composite_transformer,
        fit=False,
        inputs=["df_clean_test"],
        outputs=["df_transformed_test"],
        cache_group="fit_transform_test",
    ),
]
pipeline = Pipeline(steps=pipeline_steps)

[2023-07-07 17:25:25] [[1;32mINFO[0m] Attempting to fetch Kaggle API credentials from environment variables 'KAGGLE_USERNAME' and 'KAGGLE_KEY'. [1m(kaggle_ingester.py:74)[0m
[2023-07-07 17:25:25] [[1;32mINFO[0m] Kaggle API credentials not found in environment variables, attempting to fetch from fallback path at ~/.kaggle/kaggle.json. [1m(kaggle_ingester.py:82)[0m
[2023-07-07 17:25:25] [[1;32mINFO[0m] Kaggle credentials successfully fetched. [1m(kaggle_ingester.py:91)[0m


# Run Pipeline

In [31]:
# Run all pipeline steps (no existing data container is passed in) and keep
# the resulting container of named datasets.
data_container = pipeline.run().data

[2023-07-07 17:25:27] [[1;32mINFO[0m] No data container provided. Creating an empty one. [1m(pipeline.py:77)[0m
[2023-07-07 17:25:27] [[1;32mINFO[0m] Executing step 1/5: IngestStep. [1m(pipeline.py:81)[0m
[2023-07-07 17:25:27] [[1;32mINFO[0m] [32mCache Hit[0m: Local dataset is up to date with Kaggle, skipping download. [1m(kaggle_ingester.py:279)[0m
[2023-07-07 17:25:27] [[1;32mINFO[0m] Finished step 1/5 execution. [1m(pipeline.py:83)[0m
[2023-07-07 17:25:27] [[1;32mINFO[0m] Executing step 2/5: ConvertStep. [1m(pipeline.py:81)[0m
[2023-07-07 17:25:27] [[1;32mINFO[0m] [32mCache Hit[0m (LRUCache) CSVToVaexConverter.convert: Using cached output. [1m(cache_mixin.py:157)[0m
[2023-07-07 17:25:27] [[1;32mINFO[0m] Finished step 2/5 execution. [1m(pipeline.py:83)[0m
[2023-07-07 17:25:27] [[1;32mINFO[0m] Executing step 3/5: SplitStep. [1m(pipeline.py:81)[0m
[2023-07-07 17:25:27] [[1;32mINFO[0m] [32mCache Hit[0m (LRUCache) RandomSplitter.split: Using cached

Writing DataFrame to .arrow file:   0%|          | 0/100 [00:00<?, ?it/s]

[2023-07-07 17:25:28] [[1;32mINFO[0m] Saving transformer to data/jsphyg/weather-dataset-rattle-package/transform/CompositeTransformer.transform.fa4cae8748fc8fe45007d67b5999f33e.pkl. [1m(base_transformer.py:101)[0m
[2023-07-07 17:25:28] [[1;32mINFO[0m] Finished step 4/5 execution. [1m(pipeline.py:83)[0m
[2023-07-07 17:25:28] [[1;32mINFO[0m] Executing step 5/5: TransformStep. [1m(pipeline.py:81)[0m
[2023-07-07 17:25:28] [[1;32mINFO[0m] [31mCache Miss[0m (LRUCache) CompositeTransformer.transform: Executing method. [1m(cache_mixin.py:162)[0m
[2023-07-07 17:25:28] [[1;32mINFO[0m] Executing composite feature transformation step 1/2: LabelEncoderTransformer. [1m(composite_transformer.py:126)[0m
[2023-07-07 17:25:28] [[1;32mINFO[0m] Transforming features using label encoding (3): ('WindGustDir', 'WindDir9am', 'WindDir3pm'). [1m(label_encoder_transformer.py:113)[0m
[2023-07-07 17:25:28] [[1;32mINFO[0m] Finished composite transformation step 1/2. [1m(composite_transf

Writing DataFrame to .arrow file:   0%|          | 0/100 [00:00<?, ?it/s]

[2023-07-07 17:25:28] [[1;32mINFO[0m] Finished step 5/5 execution. [1m(pipeline.py:83)[0m


In [28]:
# Train/validate split as produced by SplitStep, before any transformation
data_container["df_clean_train_validate"]

#,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01 00:00:00,Albury,13.4,22.9,0.6,--,--,W,44,W,WNW,20,24,71,22,1007.7,1007.1,8,--,16.9,21.8,False,False
1,2008-12-02 00:00:00,Albury,7.4,25.1,0.0,--,--,WNW,44,NNW,WSW,4,22,44,25,1010.6,1007.8,--,--,17.2,24.3,False,False
2,2008-12-03 00:00:00,Albury,12.9,25.7,0.0,--,--,WSW,46,W,WSW,19,26,38,30,1007.6,1008.7,--,2,21.0,23.2,False,False
3,2008-12-04 00:00:00,Albury,9.2,28.0,0.0,--,--,NE,24,SE,E,11,9,45,16,1017.6,1012.8,--,--,18.1,26.5,False,False
4,2008-12-05 00:00:00,Albury,17.5,32.3,1.0,--,--,W,41,ENE,NW,7,20,82,33,1010.8,1006.0,7,8,17.8,29.7,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116363,2017-06-21 00:00:00,Uluru,2.8,23.4,0.0,--,--,E,31,SE,ENE,13,11,51,24,1024.6,1020.3,--,--,10.1,22.4,False,False
116364,2017-06-22 00:00:00,Uluru,3.6,25.3,0.0,--,--,NNW,22,SE,N,13,9,56,21,1023.5,1019.1,--,--,10.9,24.5,False,False
116365,2017-06-23 00:00:00,Uluru,5.4,26.9,0.0,--,--,N,37,SE,WNW,9,9,53,24,1021.0,1016.8,--,--,12.5,26.1,False,False
116366,2017-06-24 00:00:00,Uluru,7.8,27.0,0.0,--,--,SE,28,SSE,N,13,7,51,24,1019.4,1016.5,3,2,15.1,26.0,False,False


In [32]:
# Train/validate split after the fitted composite transformer was applied
data_container["df_transformed_train_validate"]

#,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01 00:00:00,Albury,13.4,22.9,0.6,--,--,11,1.2945736434108528,11,10,20,24,71,22,1007.7,1007.1,8,--,16.9,21.8,False,False
1,2008-12-02 00:00:00,Albury,7.4,25.1,0.0,--,--,10,1.2945736434108528,16,12,4,22,44,25,1010.6,1007.8,--,--,17.2,24.3,False,False
2,2008-12-03 00:00:00,Albury,12.9,25.7,0.0,--,--,12,1.310077519379845,11,12,19,26,38,30,1007.6,1008.7,--,2,21.0,23.2,False,False
3,2008-12-04 00:00:00,Albury,9.2,28.0,0.0,--,--,2,1.1395348837209303,4,6,11,9,45,16,1017.6,1012.8,--,--,18.1,26.5,False,False
4,2008-12-05 00:00:00,Albury,17.5,32.3,1.0,--,--,11,1.2713178294573644,15,7,7,20,82,33,1010.8,1006.0,7,8,17.8,29.7,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116363,2017-06-21 00:00:00,Uluru,2.8,23.4,0.0,--,--,6,1.193798449612403,4,15,13,11,51,24,1024.6,1020.3,--,--,10.1,22.4,False,False
116364,2017-06-22 00:00:00,Uluru,3.6,25.3,0.0,--,--,16,1.124031007751938,4,3,13,9,56,21,1023.5,1019.1,--,--,10.9,24.5,False,False
116365,2017-06-23 00:00:00,Uluru,5.4,26.9,0.0,--,--,3,1.2403100775193798,4,10,9,9,53,24,1021.0,1016.8,--,--,12.5,26.1,False,False
116366,2017-06-24 00:00:00,Uluru,7.8,27.0,0.0,--,--,4,1.1705426356589148,12,3,13,7,51,24,1019.4,1016.5,3,2,15.1,26.0,False,False


In [33]:
# Test split as produced by SplitStep, before any transformation
data_container["df_clean_test"]

#,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-09 00:00:00,Albury,9.7,31.9,0.0,--,--,NNW,80,SE,NW,7,28,42,9,1008.9,1003.6,--,--,18.3,30.2,False,True
1,2008-12-15 00:00:00,Albury,8.4,24.6,0.0,--,--,--,--,S,WNW,4,30,57,32,1009.7,1008.7,--,--,15.9,23.5,False,--
2,2008-12-20 00:00:00,Albury,9.8,25.6,0.0,--,--,SSE,26,SE,NNW,17,6,45,26,1019.2,1017.1,--,--,15.8,23.2,False,False
3,2008-12-24 00:00:00,Albury,15.3,30.9,0.0,--,--,N,33,ESE,NW,6,13,55,23,1011.0,1008.2,5,--,20.9,29.0,False,False
4,2008-12-27 00:00:00,Albury,16.9,33.0,0.0,--,--,WSW,57,--,W,0,26,41,28,1006.8,1003.6,--,1,26.6,31.2,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29087,2017-05-28 00:00:00,Uluru,8.0,24.6,0.0,--,--,E,33,SE,ESE,11,13,46,25,1021.7,1018.8,4,--,13.8,23.5,False,False
29088,2017-05-31 00:00:00,Uluru,5.4,20.5,0.0,--,--,E,46,E,E,20,28,56,32,1029.2,1025.3,--,--,11.1,20.2,False,False
29089,2017-06-04 00:00:00,Uluru,4.5,18.8,0.0,--,--,E,31,ESE,E,13,15,42,26,1026.9,1022.4,--,--,9.5,18.6,False,False
29090,2017-06-05 00:00:00,Uluru,4.9,20.7,0.0,--,--,E,37,ESE,ESE,17,24,38,11,1027.1,1023.9,--,--,10.3,19.8,False,False


In [36]:
# Test split after the composite transformer was applied with fit=False
data_container["df_transformed_test"]

#,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-09 00:00:00,Albury,9.7,31.9,0.0,--,--,16,1.5736434108527133,4,7,7,28,42,9,1008.9,1003.6,--,--,18.3,30.2,False,True
1,2008-12-15 00:00:00,Albury,8.4,24.6,0.0,--,--,0,--,8,10,4,30,57,32,1009.7,1008.7,--,--,15.9,23.5,False,--
2,2008-12-20 00:00:00,Albury,9.8,25.6,0.0,--,--,13,1.1550387596899225,4,16,17,6,45,26,1019.2,1017.1,--,--,15.8,23.2,False,False
3,2008-12-24 00:00:00,Albury,15.3,30.9,0.0,--,--,3,1.2093023255813953,1,7,6,13,55,23,1011.0,1008.2,5,--,20.9,29.0,False,False
4,2008-12-27 00:00:00,Albury,16.9,33.0,0.0,--,--,12,1.3953488372093024,0,11,0,26,41,28,1006.8,1003.6,--,1,26.6,31.2,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29087,2017-05-28 00:00:00,Uluru,8.0,24.6,0.0,--,--,6,1.2093023255813953,4,1,11,13,46,25,1021.7,1018.8,4,--,13.8,23.5,False,False
29088,2017-05-31 00:00:00,Uluru,5.4,20.5,0.0,--,--,6,1.310077519379845,6,6,20,28,56,32,1029.2,1025.3,--,--,11.1,20.2,False,False
29089,2017-06-04 00:00:00,Uluru,4.5,18.8,0.0,--,--,6,1.193798449612403,1,6,13,15,42,26,1026.9,1022.4,--,--,9.5,18.6,False,False
29090,2017-06-05 00:00:00,Uluru,4.9,20.7,0.0,--,--,6,1.2403100775193798,1,1,17,24,38,11,1027.1,1023.9,--,--,10.3,19.8,False,False
