In [1]:
# Load (or reload) IPython's autoreload extension and use mode 2, which re-imports
# modules before each cell executes so edits to library code are picked up
# without restarting the kernel.
%reload_ext autoreload
%autoreload 2

# Imports
Import the required classes and functions.

In [2]:
from mleko.dataset.convert import CSVToVaexConverter
from mleko.dataset.ingest import KaggleIngester
from mleko.dataset.feature_select import CompositeFeatureSelector, MissingRateFeatureSelector, PearsonCorrelationFeatureSelector
from mleko.dataset.split import RandomSplitter
from mleko.dataset.transform import LabelEncoderTransformer, CompositeTransformer, MinMaxScalerTransformer
from mleko.pipeline import Pipeline
from mleko.pipeline.steps import ConvertStep, IngestStep, SplitStep, TransformStep, FeatureSelectStep

from mleko.utils.custom_logger import CustomLogger
import logging

# Set the global log level of mleko's CustomLogger to DEBUG for this notebook session.
CustomLogger.set_global_log_level(logging.DEBUG)


# Constants
Define configuration variables.

In [3]:
# Kaggle dataset coordinates; DATASET_NAME has the form "<owner>/<dataset>"
# and is reused to build the on-disk data directories.
OWNER_SLUG, DATASET_SLUG = "jsphyg", "weather-dataset-rattle-package"
DATASET_NAME = "/".join((OWNER_SLUG, DATASET_SLUG))

# Columns with a special role: the prediction target and the observation date.
# META_FEATURES is handed to the feature selectors as `ignore_features`.
TARGET_FEATURE = "RainTomorrow"
DATE_FEATURE = "Date"
META_FEATURES = [TARGET_FEATURE, DATE_FEATURE]

# Seed shared by the converter and the splitter
RANDOM_STATE = 1337


# Pipeline Setup

In [4]:
# Download the dataset from Kaggle.
# Constructing the ingester already looks up the Kaggle API credentials
# (environment variables first, then ~/.kaggle/kaggle.json), as the log output shows.
kaggle_data_source = KaggleIngester(
    destination_directory=f"data/{DATASET_NAME}/raw", owner_slug=OWNER_SLUG, dataset_slug=DATASET_SLUG
)

# Convert the CSV file to Apache Arrow format.
# "NA" is read as missing and "Yes"/"No" are read as booleans.
csv_to_arrow_converter = CSVToVaexConverter(
    output_directory=f"data/{DATASET_NAME}/converted",
    na_values=["NA"],
    true_values=["Yes"],
    false_values=["No"],
    downcast_float=True,
    random_state=RANDOM_STATE,
)

# Divide the dataset into train (80%) and test (20%) sets,
# shuffled and stratified on the target so both parts keep its class balance
random_data_splitter = RandomSplitter(
    cache_directory=f"data/{DATASET_NAME}/split",
    data_split=(0.80, 0.20),
    shuffle=True,
    stratify=TARGET_FEATURE,
    random_state=RANDOM_STATE,
)

# Configure all the feature selectors in a grouped composite feature selector
# The feature selectors are applied in the order they are defined and can be chained as needed
composite_feature_selector = CompositeFeatureSelector(
    cache_directory=f"data/{DATASET_NAME}/feature_select",
    feature_selectors=[
        # Drops features whose missing rate is >= 0.5
        MissingRateFeatureSelector(
            cache_directory=f"data/{DATASET_NAME}/feature_select",
            missing_rate_threshold=0.5,
            ignore_features=META_FEATURES,
        ),
        # Drops features whose correlation is >= 0.6
        PearsonCorrelationFeatureSelector(
            cache_directory=f"data/{DATASET_NAME}/feature_select",
            correlation_threshold=0.6,
            ignore_features=META_FEATURES,
        ),
    ],
)

# Configure all the transformers in a grouped composite transformer
# The transformers are applied in the order they are defined and can be chained as needed
composite_transformer = CompositeTransformer(
    cache_directory=f"data/{DATASET_NAME}/transform",
    transformers=[
        # Label-encode the categorical wind direction columns
        LabelEncoderTransformer(
            cache_directory=f"data/{DATASET_NAME}/transform", features=["WindGustDir", "WindDir9am", "WindDir3pm"]
        ),
        # Rescale WindGustSpeed into the range [1, 2]
        MinMaxScalerTransformer(
            cache_directory=f"data/{DATASET_NAME}/transform", features=["WindGustSpeed"], min_value=1, max_value=2
        ),
    ],
)

# Configure the pipeline and the data flow.
# Each step reads its `inputs` from and writes its `outputs` to the shared data container.
# Feature selection and transformation are fitted on the train/validate split only (fit=True)
# and then re-applied unchanged to the test split (fit=False) to avoid leaking test data.
pipeline = Pipeline(
    steps=[
        IngestStep(kaggle_data_source, outputs=["raw_csv"]),
        ConvertStep(csv_to_arrow_converter, inputs=["raw_csv"], outputs=["df_clean"]),
        SplitStep(random_data_splitter, inputs=["df_clean"], outputs=["df_clean_train_validate", "df_clean_test"]),
        FeatureSelectStep(
            composite_feature_selector,
            fit=True,
            inputs=["df_clean_train_validate"],
            outputs=["df_selected_train_validate"],
            cache_group="fit_transform_train_validate",
        ),
        FeatureSelectStep(
            composite_feature_selector,
            fit=False,
            inputs=["df_clean_test"],
            outputs=["df_selected_test"],
            cache_group="transform_test",
        ),
        TransformStep(
            composite_transformer,
            fit=True,
            inputs=["df_selected_train_validate"],
            outputs=["df_transformed_train_validate"],
            cache_group="fit_transform_train_validate",
        ),
        TransformStep(
            composite_transformer,
            fit=False,
            inputs=["df_selected_test"],
            outputs=["df_transformed_test"],
            # This step only transforms (fit=False), so the cache group is named like
            # the matching feature-selection step instead of "fit_transform_test".
            cache_group="transform_test",
        ),
    ]
)

[2023-07-08 00:26:20] [[1;32mINFO[0m] Attempting to fetch Kaggle API credentials from environment variables 'KAGGLE_USERNAME' and 'KAGGLE_KEY'. [1m(kaggle_ingester.py:74)[0m
[2023-07-08 00:26:20] [[1;32mINFO[0m] Kaggle API credentials not found in environment variables, attempting to fetch from fallback path at ~/.kaggle/kaggle.json. [1m(kaggle_ingester.py:82)[0m
[2023-07-08 00:26:20] [[1;32mINFO[0m] Kaggle credentials successfully fetched. [1m(kaggle_ingester.py:91)[0m


# Run Pipeline

In [5]:
# Execute all pipeline steps in order and keep the resulting data container,
# which is indexed by the output names declared on the steps.
data_container = pipeline.run().data

[2023-07-08 00:26:22] [[1;32mINFO[0m] No data container provided. Creating an empty one. [1m(pipeline.py:77)[0m
[2023-07-08 00:26:22] [[1;32mINFO[0m] Executing step 1/7: IngestStep. [1m(pipeline.py:81)[0m
[2023-07-08 00:26:23] [[1;32mINFO[0m] [31mCache Miss[0m: Downloading jsphyg/weather-dataset-rattle-package/* to data/jsphyg/weather-dataset-rattle-package/raw from Kaggle. [1m(kaggle_ingester.py:292)[0m


Downloading files from Kaggle:   0%|          | 0/1 [00:00<?, ?it/s]

[2023-07-08 00:26:24] [[1;32mINFO[0m] Finished downloading 1 files from Kaggle. [1m(kaggle_ingester.py:303)[0m
[2023-07-08 00:26:24] [[1;32mINFO[0m] Finished step 1/7 execution. [1m(pipeline.py:83)[0m
[2023-07-08 00:26:24] [[1;32mINFO[0m] Executing step 2/7: ConvertStep. [1m(pipeline.py:81)[0m
[2023-07-08 00:26:25] [[1;32mINFO[0m] [31mCache Miss[0m (LRUCache) CSVToVaexConverter.convert: Executing method. [1m(cache_mixin.py:162)[0m


Converting CSV files:   0%|          | 0/1 [00:00<?, ?it/s]

Writing DataFrame to .arrow file:   0%|          | 0/100 [00:00<?, ?it/s]

[2023-07-08 00:26:28] [[1;32mINFO[0m] Finished step 2/7 execution. [1m(pipeline.py:83)[0m
[2023-07-08 00:26:28] [[1;32mINFO[0m] Executing step 3/7: SplitStep. [1m(pipeline.py:81)[0m
[2023-07-08 00:26:28] [[1;32mINFO[0m] [31mCache Miss[0m (LRUCache) RandomSplitter.split: Executing method. [1m(cache_mixin.py:162)[0m
[2023-07-08 00:26:28] [[1;32mINFO[0m] Shuffling data before splitting. [1m(random_splitter.py:135)[0m
[2023-07-08 00:26:28] [[1;32mINFO[0m] Splitting data with stratification on column 'RainTomorrow'. [1m(random_splitter.py:139)[0m
[2023-07-08 00:26:28] [[1;32mINFO[0m] Split dataframe into two dataframes with shapes (116368, 24) and (29092, 24). [1m(random_splitter.py:151)[0m


Writing DataFrame to .arrow file:   0%|          | 0/100 [00:00<?, ?it/s]

Writing DataFrame to .arrow file:   0%|          | 0/100 [00:00<?, ?it/s]

[2023-07-08 00:26:28] [[1;32mINFO[0m] Finished step 3/7 execution. [1m(pipeline.py:83)[0m
[2023-07-08 00:26:28] [[1;32mINFO[0m] Executing step 4/7: FeatureSelectStep. [1m(pipeline.py:81)[0m
[2023-07-08 00:26:28] [[1;32mINFO[0m] [31mCache Miss[0m (LRUCache) CompositeFeatureSelector.select_features: Executing method. [1m(cache_mixin.py:162)[0m
[2023-07-08 00:26:28] [[1;32mINFO[0m] Executing composite feature selection step 1/2: MissingRateFeatureSelector. [1m(composite_feature_selector.py:129)[0m
[2023-07-08 00:26:28] [[1;32mINFO[0m] Fitting missing rate feature selector on 21 features: ['Cloud3pm', 'Cloud9am', 'Evaporation', 'Humidity3pm', 'Humidity9am', 'Location', 'MaxTemp', 'MinTemp', 'Pressure3pm', 'Pressure9am', 'RainToday', 'Rainfall', 'Sunshine', 'Temp3pm', 'Temp9am', 'WindDir3pm', 'WindDir9am', 'WindGustDir', 'WindGustSpeed', 'WindSpeed3pm', 'WindSpeed9am']. [1m(missing_rate_feature_selector.py:102)[0m


Calculating missing rates for features:   0%|          | 0/21 [00:00<?, ?it/s]

[2023-07-08 00:26:29] [[1;32mINFO[0m] Dropping (0) features with missing rate >= 0.5: set(). [1m(missing_rate_feature_selector.py:88)[0m
[2023-07-08 00:26:29] [[1;32mINFO[0m] Finished composite feature selection step 1/2. [1m(composite_feature_selector.py:135)[0m
[2023-07-08 00:26:29] [[1;32mINFO[0m] Executing composite feature selection step 2/2: PearsonCorrelationFeatureSelector. [1m(composite_feature_selector.py:129)[0m
[2023-07-08 00:26:29] [[1;32mINFO[0m] Fitting pearson correlation feature selector on 16 features: ['Cloud3pm', 'Cloud9am', 'Evaporation', 'Humidity3pm', 'Humidity9am', 'MaxTemp', 'MinTemp', 'Pressure3pm', 'Pressure9am', 'Rainfall', 'Sunshine', 'Temp3pm', 'Temp9am', 'WindGustSpeed', 'WindSpeed3pm', 'WindSpeed9am']. [1m(pearson_correlation_feature_selector.py:105)[0m
[2023-07-08 00:26:29] [[1;32mINFO[0m] Dropping (11) features with correlation >= 0.6: {'Sunshine', 'MaxTemp', 'Cloud9am', 'WindSpeed3pm', 'Temp3pm', 'Pressure3pm', 'WindSpeed9am', 'Temp9

Writing DataFrame to .arrow file:   0%|          | 0/100 [00:00<?, ?it/s]

[2023-07-08 00:26:29] [[1;32mINFO[0m] Saving feature selector to data/jsphyg/weather-dataset-rattle-package/feature_select/CompositeFeatureSelector.select_features.fit_transform_train_validate.d818e3237ba342b5628c70d7d0066add.feature_selector. [1m(base_feature_selector.py:183)[0m
[2023-07-08 00:26:29] [[1;32mINFO[0m] Finished step 4/7 execution. [1m(pipeline.py:83)[0m
[2023-07-08 00:26:29] [[1;32mINFO[0m] Executing step 5/7: FeatureSelectStep. [1m(pipeline.py:81)[0m
[2023-07-08 00:26:29] [[1;32mINFO[0m] [31mCache Miss[0m (LRUCache) CompositeFeatureSelector.select_features: Executing method. [1m(cache_mixin.py:162)[0m
[2023-07-08 00:26:29] [[1;32mINFO[0m] Executing composite feature selection step 1/2: MissingRateFeatureSelector. [1m(composite_feature_selector.py:129)[0m
[2023-07-08 00:26:29] [[1;32mINFO[0m] Dropping (0) features with missing rate >= 0.5: set(). [1m(missing_rate_feature_selector.py:88)[0m
[2023-07-08 00:26:29] [[1;32mINFO[0m] Finished composi

Writing DataFrame to .arrow file:   0%|          | 0/100 [00:00<?, ?it/s]

[2023-07-08 00:26:29] [[1;32mINFO[0m] Finished step 5/7 execution. [1m(pipeline.py:83)[0m
[2023-07-08 00:26:29] [[1;32mINFO[0m] Executing step 6/7: TransformStep. [1m(pipeline.py:81)[0m
[2023-07-08 00:26:29] [[1;32mINFO[0m] [31mCache Miss[0m (LRUCache) CompositeTransformer.transform: Executing method. [1m(cache_mixin.py:162)[0m
[2023-07-08 00:26:29] [[1;32mINFO[0m] Executing composite feature transformation step 1/2: LabelEncoderTransformer. [1m(composite_transformer.py:126)[0m
[2023-07-08 00:26:29] [[1;32mINFO[0m] Fitting label encoder transformer (3): ('WindGustDir', 'WindDir9am', 'WindDir3pm'). [1m(label_encoder_transformer.py:93)[0m
[2023-07-08 00:26:29] [[1;32mINFO[0m] Transforming features using label encoding (3): ('WindGustDir', 'WindDir9am', 'WindDir3pm'). [1m(label_encoder_transformer.py:83)[0m
[2023-07-08 00:26:29] [[1;32mINFO[0m] Finished composite transformation step 1/2. [1m(composite_transformer.py:132)[0m
[2023-07-08 00:26:29] [[1;32mINFO

Writing DataFrame to .arrow file:   0%|          | 0/100 [00:00<?, ?it/s]

[2023-07-08 00:26:29] [[1;32mINFO[0m] Saving transformer to data/jsphyg/weather-dataset-rattle-package/transform/CompositeTransformer.transform.fit_transform_train_validate.5f7ac699de5bfd3305c122756f83951a.transformer. [1m(base_transformer.py:128)[0m
[2023-07-08 00:26:29] [[1;32mINFO[0m] Finished step 6/7 execution. [1m(pipeline.py:83)[0m
[2023-07-08 00:26:29] [[1;32mINFO[0m] Executing step 7/7: TransformStep. [1m(pipeline.py:81)[0m
[2023-07-08 00:26:29] [[1;32mINFO[0m] [31mCache Miss[0m (LRUCache) CompositeTransformer.transform: Executing method. [1m(cache_mixin.py:162)[0m
[2023-07-08 00:26:29] [[1;32mINFO[0m] Executing composite feature transformation step 1/2: LabelEncoderTransformer. [1m(composite_transformer.py:126)[0m
[2023-07-08 00:26:29] [[1;32mINFO[0m] Transforming features using label encoding (3): ('WindGustDir', 'WindDir9am', 'WindDir3pm'). [1m(label_encoder_transformer.py:83)[0m
[2023-07-08 00:26:29] [[1;32mINFO[0m] Finished composite transforma

Writing DataFrame to .arrow file:   0%|          | 0/100 [00:00<?, ?it/s]

[2023-07-08 00:26:30] [[1;32mINFO[0m] Finished step 7/7 execution. [1m(pipeline.py:83)[0m


In [6]:
# Preview the transformed test split; the bare last expression renders the dataframe.
data_container["df_transformed_test"]

#,Date,Location,Rainfall,Evaporation,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,Humidity3pm,Cloud3pm,RainToday,RainTomorrow
0,2008-12-09 00:00:00,Albury,0.0,--,16,1.5736434108527133,4,7,9,--,False,True
1,2008-12-15 00:00:00,Albury,0.0,--,0,--,8,10,32,--,False,--
2,2008-12-20 00:00:00,Albury,0.0,--,13,1.1550387596899225,4,16,26,--,False,False
3,2008-12-24 00:00:00,Albury,0.0,--,3,1.2093023255813953,1,7,23,--,False,False
4,2008-12-27 00:00:00,Albury,0.0,--,12,1.3953488372093024,0,11,28,1,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
29087,2017-05-28 00:00:00,Uluru,0.0,--,6,1.2093023255813953,4,1,25,--,False,False
29088,2017-05-31 00:00:00,Uluru,0.0,--,6,1.310077519379845,6,6,32,--,False,False
29089,2017-06-04 00:00:00,Uluru,0.0,--,6,1.193798449612403,1,6,26,--,False,False
29090,2017-06-05 00:00:00,Uluru,0.0,--,6,1.2403100775193798,1,1,11,--,False,False
