## DataCatalog

In [28]:
from kedro.io import KedroDataCatalog

The [`kedro-datasets`](https://github.com/kedro-org/kedro-plugins/tree/main/kedro-datasets) package offers connectors to load data from different formats, such as CSV, Excel, Parquet, and more. 

In [29]:
# Catalog configuration for the companies, shuttles, and reviews datasets.
# Each entry maps a dataset name to the kedro-datasets connector that reads it
# (`type`) and the location of the file (`filepath`).
catalog_config = dict(
    companies=dict(
        type="pandas.CSVDataset",
        filepath="../data/01_raw/companies.csv",
    ),
    shuttles=dict(
        type="pandas.ExcelDataset",
        filepath="../data/01_raw/shuttles.xlsx",
    ),
    reviews=dict(
        type="pandas.CSVDataset",
        filepath="../data/01_raw/reviews.csv",
    ),
)

In [30]:
# Build the catalog from the configuration dictionary defined above.
catalog = KedroDataCatalog.from_config(catalog_config)

In [31]:
# Show the names of the datasets registered in the catalog.
catalog.list()

['companies', 'shuttles', 'reviews']

In [38]:
# Load the "companies" dataset through the catalog and preview its first rows.
companies = catalog.load("companies")
companies.head()

Unnamed: 0,id,company_rating,company_location,total_fleet_count,iata_approved
0,3888,100%,Isle of Man,1.0,f
1,46728,100%,,1.0,f
2,34618,38%,Isle of Man,1.0,f
3,28619,100%,Bosnia and Herzegovina,1.0,f
4,8240,,Chile,1.0,t


### Load and preprocess the data

In [39]:
# Load the two remaining raw datasets through the catalog.
shuttles = catalog.load("shuttles")
reviews = catalog.load("reviews")

In [40]:
# Give the raw text columns proper types: the "t"/"f" flag becomes a boolean
# and the percentage string (e.g. "38%") becomes a fraction between 0 and 1.
companies = companies.assign(
    iata_approved=companies["iata_approved"] == "t",
    company_rating=companies["company_rating"].str.replace("%", "").astype(float) / 100,
)

In [41]:
# Turn the "t"/"f" flag columns into booleans.
shuttles["d_check_complete"] = shuttles["d_check_complete"] == "t"
shuttles["moon_clearance_complete"] = shuttles["moon_clearance_complete"] == "t"

# Strip the currency formatting before casting the price to float.
# `regex=False` is passed explicitly: "$" is a regex anchor and the default of
# `regex` differs between pandas versions, so the literal match must not rely
# on the default.
shuttles["price"] = (
    shuttles["price"]
    .str.replace("$", "", regex=False)
    .str.replace(",", "", regex=False)
    .astype(float)
)

# Attach each review to the shuttle it refers to (shuttle `id` == review `shuttle_id`).
rated_shuttles = shuttles.merge(reviews, left_on="id", right_on="shuttle_id")

Unnamed: 0,id_x,shuttle_location,shuttle_type,engine_type,engine_vendor,engines,passenger_capacity,cancellation_policy,crew,d_check_complete,...,review_scores_crew,review_scores_location,review_scores_price,number_of_reviews,reviews_per_month,id_y,company_rating,company_location,total_fleet_count,iata_approved
0,45163,Sao Tome and Principe,Type V5,Plasma,ThetaBase Services,2.0,4,moderate,2.0,False,...,9.0,9.0,9.0,26,0.77,32413,1.0,Faroe Islands,1.0,False
1,49438,Wallis and Futuna,Type V2,Plasma,ThetaBase Services,3.0,5,moderate,3.0,False,...,10.0,10.0,9.0,61,0.62,14122,1.0,Malta,1.0,True
2,10750,Niue,Type F5,Quantum,ThetaBase Services,1.0,2,strict,1.0,True,...,10.0,10.0,10.0,467,4.66,47761,1.0,Niue,2.0,False
3,4146,Malta,Type V2,Quantum,ThetaBase Services,1.0,2,moderate,1.0,False,...,10.0,9.0,9.0,318,3.22,26648,1.0,Niue,2.0,True
4,5067,Malta,Type V2,Plasma,ThetaBase Services,5.0,10,strict,5.0,False,...,10.0,9.0,10.0,22,0.29,26648,1.0,Niue,2.0,True


In [None]:
# Join the company attributes onto the reviewed shuttles, discard every row
# that still has a missing value, then preview the result.
model_input_table = (
    rated_shuttles
    .merge(companies, left_on="company_id", right_on="id")
    .dropna()
)
model_input_table.head()

In [42]:
# Model training
from sklearn.model_selection import train_test_split

def split_train_test_val(data, target, test_size, random_state):
    """Split features and target into train and test subsets.

    Thin wrapper around scikit-learn's ``train_test_split``. Despite the
    ``_val`` suffix in the name, only a train/test split is produced; no
    separate validation subset is returned.

    Args:
        data: Feature table to split.
        target: Target values aligned with ``data``.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        random_state: Forwarded to ``train_test_split`` as ``random_state``.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)``.
    """
    return tuple(
        train_test_split(data, target, test_size=test_size, random_state=random_state)
    )

In [43]:
# Settings passed to the train/test split: the test-set share and the seed
# that makes the split reproducible.
test_size, random_state = 0.3, 3

In [45]:
# Columns used as model inputs; "price" is the value to predict.
feature_columns = [
    "engines",
    "passenger_capacity",
    "crew",
    "d_check_complete",
    "moon_clearance_complete",
    "iata_approved",
    "company_rating",
    "review_scores_rating",
]
data = model_input_table[feature_columns]
target = model_input_table["price"]

X_train, X_test, y_train, y_test = split_train_test_val(
    data, target, test_size, random_state
)

## OmegaConfigLoader

To further simplify things and separate out the configuration from the code, we can use Kedro's `OmegaConfigLoader` to load the configuration from a YAML file. This way, we can easily change the configuration without modifying the code.

`OmegaConfigLoader` expects the following folder structure for the configuration source directory:
```
conf/
    base/
        catalog.yml
        parameters.yml
    local/
        catalog.yml
        parameters.yml
    logging.yml
```

To use `OmegaConfigLoader` together with `KedroDataCatalog`, we need to define the dataset configurations in the `catalog.yml` file and parameters in the `parameters.yml` file.

In your `conf/base/catalog.yml` file, add the following configuration:
```yaml
companies:
  type: pandas.CSVDataset
  filepath: data/companies.csv

reviews:
  type: pandas.CSVDataset
  filepath: data/reviews.csv

shuttles:
  type: pandas.ExcelDataset
  filepath: data/shuttles.xlsx
```

In your `conf/base/parameters.yml` file, add the following configuration:
```yaml
# parameters.yml

split_options:
  test_size: 0.3
  random_state: 3
feature_engineering:
  feature:
    static:
      - engines
      - passenger_capacity
      - crew
      - d_check_complete
      - moon_clearance_complete
      - iata_approved
      - company_rating
      - review_scores_rating
```



In [67]:
from kedro.config import OmegaConfigLoader

In [68]:
# Point the loader at the conf/ folder; "base" and "local" are the environment
# sub-folders passed as `base_env` and `default_run_env`.
config_loader = OmegaConfigLoader(conf_source="../conf/", base_env="base", default_run_env="local")

In [69]:
# Read the "catalog" configuration through the config loader.
catalog_config = config_loader["catalog"]

In [70]:
# Read the "parameters" configuration through the config loader.
parameters = config_loader["parameters"]

In [71]:
# Take the split settings and the feature list from the loaded parameters
# instead of hard-coding them in the notebook.
test_size = parameters["split_options"]["test_size"]
random_state = parameters["split_options"]["random_state"]
X = model_input_table[parameters["feature_engineering"]["feature"]["static"]]
y = model_input_table["price"]

# Split the parameter-driven X / y. Passing the `data` / `target` variables
# from the earlier hard-coded cell here would ignore the configured features.
X_train, X_test, y_train, y_test = split_train_test_val(X, y, test_size, random_state)

## Use Kedro’s configuration loader to load the Data Catalog


In [73]:
# Build the catalog from the "catalog" configuration returned by the config loader.
catalog = KedroDataCatalog.from_config(config_loader["catalog"])


In [74]:
# Show the names of the datasets registered in the catalog built from the loaded configuration.
catalog.list()

['companies',
 'reviews',
 'shuttles',
 'ingestion.int_typed_companies',
 'ingestion.int_typed_shuttles@pandas1',
 'ingestion.int_typed_shuttles@pandas2',
 'ingestion.int_typed_reviews',
 'ingestion.prm_agg_companies',
 'prm_shuttle_company_reviews',
 'prm_spine_table',
 'feature_importance_output',
 'model_input_table',
 'train_evaluation.linear_regression.regressor',
 'train_evaluation.random_forest.regressor',
 'reporting.cancellation_policy_breakdown',
 'reporting.price_histogram',
 'reporting.feature_importance',
 'reporting.cancellation_policy_grid',
 'reporting.confusion_matrix',
 'reporting.top_shuttle_data',
 'X_train',
 'X_test',
 'y_train',
 'y_test',
 'ingestion.prm_spine_table_clone',
 'feature_engineering.feat_static_features',
 'feature_engineering.feat_derived_features',
 'train_evaluation.random_forest.r2_score',
 'train_evaluation.random_forest.experiment_params',
 'train_evaluation.linear_regression.r2_score',
 'train_evaluation.linear_regression.experiment_params']

## Visualise pipelines inside the notebook

In [6]:
from kedro.pipeline import pipeline, node

from typing import List, Tuple

import numpy as np
import pandas as pd


def _is_true(column: pd.Series) -> pd.Series:
    """Return a boolean Series that is True where ``column`` equals the string "t"."""
    return column == "t"


def apply_types_to_companies(companies: pd.DataFrame) -> pd.DataFrame:
    """Return a typed copy of the companies table.

    ``iata_approved`` is converted from "t"/other strings to booleans and
    ``company_rating`` from a percentage string (e.g. "38%") to a float
    fraction (0.38).

    The input frame is left untouched: the result is a new DataFrame, so the
    function can be called repeatedly on the same raw data.

    Args:
        companies: Raw companies data with string-valued ``iata_approved`` and
            ``company_rating`` columns.

    Returns:
        A new DataFrame with the two columns converted; all other columns and
        the column order are unchanged.
    """
    rating_fraction = (
        companies["company_rating"].str.replace("%", "", regex=False).astype(float) / 100
    )
    # `assign` works on a copy, so the caller's frame is not modified.
    return companies.assign(
        iata_approved=_is_true(companies["iata_approved"]),
        company_rating=rating_fraction,
    )


def apply_types_to_shuttles(shuttles: pd.DataFrame) -> pd.DataFrame:
    """Return a typed copy of the shuttles table.

    ``d_check_complete`` and ``moon_clearance_complete`` are converted from
    "t"/other strings to booleans, and ``price`` has its "$" and ","
    characters removed before being cast to float.

    The input frame is left untouched: the result is a new DataFrame, so the
    function can be called repeatedly on the same raw data.

    Args:
        shuttles: Raw shuttles data with string-valued flag and price columns.

    Returns:
        A new DataFrame with the three columns converted; all other columns
        and the column order are unchanged.
    """
    numeric_price = (
        shuttles["price"].str.replace(r"[\$,]", "", regex=True).astype(float)
    )
    # `assign` works on a copy, so the caller's frame is not modified.
    return shuttles.assign(
        d_check_complete=_is_true(shuttles["d_check_complete"]),
        moon_clearance_complete=_is_true(shuttles["moon_clearance_complete"]),
        price=numeric_price,
    )


def apply_types_to_reviews(
    reviews: pd.DataFrame, columns_as_floats: List[str]
) -> pd.DataFrame:
    """Drop incomplete reviews, cast the columns, and add a ``review_id``.

    Rows containing any missing value are removed. The columns named in
    ``columns_as_floats`` are cast to ``float``; every other column is cast
    to ``int``. A ``review_id`` column equal to the row's index label plus
    one is appended.

    Args:
        reviews: Raw reviews table.
        columns_as_floats: Names of the columns to cast to ``float``.

    Returns:
        A new DataFrame with the typed columns and the extra ``review_id``.
    """
    complete_reviews = reviews.dropna()

    # Everything not explicitly requested as float is treated as an integer.
    float_names = set(columns_as_floats)
    integer_names = set(complete_reviews.columns) - float_names

    dtype_by_column = {name: int for name in integer_names}
    dtype_by_column.update({name: float for name in float_names})

    typed_reviews = complete_reviews.astype(dtype_by_column)

    # Add an ID column derived from the (post-dropna) index labels.
    return typed_reviews.assign(review_id=lambda df: df.index + 1)


def aggregate_company_data(typed_companies: pd.DataFrame) -> pd.DataFrame:
    """Collapse the typed companies table to one row per company.

    Rows are grouped by ``id`` and aggregated as follows:

    * ``company_rating``: mean of the group.
    * ``company_location``: the first value of the group, in row order.
    * ``total_fleet_count``: maximum of the group.
    * ``iata_approved``: True if any row of the group is truthy.

    Args:
        typed_companies: Companies table containing the columns listed above
            plus ``id``.

    Returns:
        A DataFrame with one row per company, where the grouping key is
        exposed as a regular ``company_id`` column.
    """
    working_companies = typed_companies.groupby(["id"]).agg(
        {
            "company_rating": "mean",
            # Take the first item in row order. Going through `set(...)` would
            # pick an arbitrary element, because set iteration order is not
            # defined and can change between interpreter runs.
            "company_location": lambda x: x.iloc[0],
            "total_fleet_count": "max",
            "iata_approved": any,
        }
    )
    return working_companies.reset_index().rename(columns={"id": "company_id"})


def combine_shuttle_level_information(
    shuttles: pd.DataFrame, companies: pd.DataFrame, reviews: pd.DataFrame
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Join shuttles, reviews and companies into one table plus an ID spine.

    The shuttles' ``id`` column is renamed to ``shuttle_id`` and inner-joined
    with ``reviews`` on it; the result is inner-joined with ``companies`` on
    ``company_id``. Rows with any missing value are then dropped.

    Args:
        shuttles: Shuttles table with ``id`` and ``company_id`` columns.
        companies: Companies table with a ``company_id`` column.
        reviews: Reviews table with a ``shuttle_id`` column.

    Returns:
        A pair ``(table, spine)``: the combined table without missing values,
        and the subset of its columns whose names end in "id".
    """
    shuttles_keyed = shuttles.rename(columns={"id": "shuttle_id"})
    with_reviews = pd.merge(shuttles_keyed, reviews, on="shuttle_id", how="inner")
    with_companies = pd.merge(with_reviews, companies, on="company_id", how="inner")

    complete_rows = with_companies.dropna(how="any")

    # The spine keeps only the identifier columns of the combined table.
    spine_columns = [name for name in complete_rows.columns if name.endswith("id")]
    spine = complete_rows[spine_columns]
    return complete_rows, spine


# Wrap each function in a Kedro node. The first three node variables carry a
# `_node` suffix so they do not rebind the names of the functions they wrap;
# without it, `apply_types_to_companies` etc. would refer to Node objects
# after this cell has run.
apply_types_to_companies_node = node(
    func=apply_types_to_companies,
    inputs="companies",
    outputs="int_typed_companies",
)
apply_types_to_shuttles_node = node(
    func=apply_types_to_shuttles,
    inputs="shuttles",
    outputs="int_typed_shuttles@pandas1",
)
apply_types_to_reviews_node = node(
    func=apply_types_to_reviews,
    inputs=["reviews", "params:typing.reviews.columns_as_floats"],
    outputs="int_typed_reviews",
)
company_agg = node(
    func=aggregate_company_data,
    inputs="int_typed_companies",
    outputs="prm_agg_companies",
)
# Inputs are given as a dict so each dataset is bound to the matching
# parameter name of `combine_shuttle_level_information`.
combine_step = node(
    func=combine_shuttle_level_information,
    inputs={
        "shuttles": "int_typed_shuttles@pandas2",
        "reviews": "int_typed_reviews",
        "companies": "prm_agg_companies",
    },
    outputs=["prm_shuttle_company_reviews", "prm_spine_table"],
)
# Identity node: passes `prm_spine_table` through unchanged under a new name.
prm_spine_table_clone = node(
    func=lambda x: x,
    inputs="prm_spine_table",
    outputs="prm_spine_table_clone",
    name="prm_spine_table_clone",
)

dummy_pipe = pipeline(
    [
        apply_types_to_companies_node,
        apply_types_to_shuttles_node,
        apply_types_to_reviews_node,
        company_agg,
        combine_step,
        prm_spine_table_clone,
    ]
)

# Render the pipeline graph inline in the notebook.
from kedro_viz.integrations.notebook import NotebookVisualizer
NotebookVisualizer(dummy_pipe).show()

                        