In [4]:
import numpy as np
import pandas as pd
from kfp.dsl import Input, Output, component

@component(
    base_image="python:3.10",
    # xlrd is the pandas engine for legacy .xls workbooks; openpyxl only
    # handles the newer .xlsx format.
    packages_to_install=["pandas", "numpy", "openpyxl", "xlrd"]
)
def clean_electrification_data(
    input_path: str,
    country: str,
    output_path: str
):
    """Filter an electrification workbook to one country and write a cleaned CSV.

    Steps: read the Excel file, keep the 'Country Name', 'Year' and 'Value'
    columns for ``country``, drop rows with missing values, parse 'Year'
    as a datetime and remove 'Value' outliers with the 1.5 * IQR rule.

    Args:
        input_path: Path of the Excel workbook to read.
        country: Exact 'Country Name' value to keep.
        output_path: Path the cleaned CSV is written to (no index column).

    Raises:
        KeyError: If one of the three expected columns is missing.
        ValueError: If no complete rows remain for ``country``.
    """
    # Lightweight KFP components must import their dependencies inside the
    # function body; notebook-level imports are not shipped with the step.
    import pandas as pd

    raw = pd.read_excel(input_path)

    # Keep relevant columns for the requested country only.
    wanted_columns = ['Country Name', 'Year', 'Value']
    rows = raw.loc[raw['Country Name'] == country, wanted_columns]

    # Drop missing values; .copy() so the column assignment below works on
    # an independent frame instead of a slice of ``raw``.
    rows = rows.dropna().copy()
    if rows.empty:
        raise ValueError(f"No complete rows found for country {country!r}")

    # Convert year
    rows['Year'] = pd.to_datetime(rows['Year'], format='%Y')

    # IQR outlier removal
    q1 = rows['Value'].quantile(0.25)
    q3 = rows['Value'].quantile(0.75)
    iqr = q3 - q1

    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr

    rows = rows[rows['Value'].between(lower, upper)]

    rows.to_csv(output_path, index=False)


In [5]:
@component(
    base_image="python:3.10",
    packages_to_install=["pandas", "prophet", "scikit-learn"]
)
def prophet_model(
    input_path: str,
    metrics_path: str
):
    """Fit Prophet on all but the last 5 rows and score it on those 5 rows.

    Args:
        input_path: CSV with at least a 'Year' (date) and a 'Value' column.
        metrics_path: Path of the one-row CSV written with columns
            'model', 'MAE' and 'RMSE'.

    Raises:
        ValueError: If the CSV has too few rows for a 5-row hold-out plus
            at least 2 training rows.
    """
    import pandas as pd
    import numpy as np
    from prophet import Prophet
    from sklearn.metrics import mean_absolute_error, mean_squared_error

    holdout = 5

    history = pd.read_csv(input_path, parse_dates=['Year'])
    # Sort chronologically so the positional split below really holds out
    # the most recent observations.
    history = (
        history
        .rename(columns={'Year': 'ds', 'Value': 'y'})
        .sort_values('ds')
        .reset_index(drop=True)
    )

    if len(history) < holdout + 2:
        raise ValueError(
            f"Need at least {holdout + 2} rows to train and evaluate, got {len(history)}"
        )

    # Train-test split
    train = history.iloc[:-holdout]
    test = history.iloc[-holdout:]

    model = Prophet(yearly_seasonality=True)
    model.fit(train)

    # Predict on the exact held-out dates. Generating future dates with a
    # year-end frequency would not line up with the year-start dates in the
    # data, and it would silently mis-pair rows if a year were missing.
    forecast = model.predict(test[['ds']])
    preds = forecast['yhat'].values

    mae = mean_absolute_error(test['y'], preds)
    rmse = np.sqrt(mean_squared_error(test['y'], preds))

    pd.DataFrame({
        "model": ["Prophet"],
        "MAE": [mae],
        "RMSE": [rmse]
    }).to_csv(metrics_path, index=False)


## Kubeflow Component: SARIMA Model

In [6]:
@component(
    base_image="python:3.10",
    packages_to_install=["pandas", "statsmodels", "scikit-learn"]
)
def sarima_model(
    input_path: str,
    metrics_path: str
):
    """Fit an ARIMA(1,1,1) model via SARIMAX and score it on the last 5 rows.

    Args:
        input_path: CSV with at least a 'Year' (date) and a 'Value' column.
        metrics_path: Path of the one-row CSV written with columns
            'model', 'MAE' and 'RMSE'.

    Raises:
        ValueError: If the CSV has no rows left for training after the
            5-row hold-out.
    """
    import pandas as pd
    import numpy as np
    from statsmodels.tsa.statespace.sarimax import SARIMAX
    from sklearn.metrics import mean_absolute_error, mean_squared_error

    holdout = 5

    df = pd.read_csv(input_path, parse_dates=['Year'])
    # Sort chronologically so the hold-out is the most recent observations.
    ts = df.sort_values('Year').set_index('Year')['Value']

    if len(ts) <= holdout:
        raise ValueError(
            f"Need more than {holdout} rows to train and evaluate, got {len(ts)}"
        )

    train = ts.iloc[:-holdout]
    test = ts.iloc[-holdout:]

    # One observation per year leaves no seasonal cycle to model, and a
    # seasonal period of 0 cannot be combined with seasonal AR/MA/differencing
    # terms, so the seasonal part is switched off explicitly.
    model = SARIMAX(train, order=(1, 1, 1), seasonal_order=(0, 0, 0, 0))
    fit = model.fit(disp=False)

    preds = fit.forecast(steps=holdout)

    mae = mean_absolute_error(test, preds)
    rmse = np.sqrt(mean_squared_error(test, preds))

    pd.DataFrame({
        "model": ["SARIMA"],
        "MAE": [mae],
        "RMSE": [rmse]
    }).to_csv(metrics_path, index=False)


## Kubeflow Component: Model Comparison

In [7]:
@component(
    base_image="python:3.10",
    packages_to_install=["pandas"]
)
def compare_models(
    prophet_metrics: str,
    sarima_metrics: str,
    output_path: str
):
    """Stack the Prophet and SARIMA metric CSVs into a single comparison CSV.

    Args:
        prophet_metrics: Path of the Prophet metrics CSV.
        sarima_metrics: Path of the SARIMA metrics CSV.
        output_path: Path the combined CSV is written to (Prophet rows
            first, no index column).
    """
    import pandas as pd

    metric_frames = [
        pd.read_csv(csv_path)
        for csv_path in (prophet_metrics, sarima_metrics)
    ]
    pd.concat(metric_frames).to_csv(output_path, index=False)


## Kubeflow Pipeline Definition

In [8]:
from kfp.dsl import pipeline

@pipeline(
    name="electrification-forecasting-pipeline",
    description="Kubeflow pipeline for Prophet vs SARIMA electrification forecasting"
)
def electrification_pipeline(
    input_data: str = "API_EG.ELC.ACCS.ZS_DS2_en_excel_v2_25.xls",
    country: str = "Kenya"
):
    """Clean the data, fit Prophet and SARIMA, then merge their metrics.

    The components called here take plain string paths and declare no
    outputs, so the steps are wired by passing the same file paths along
    and ordering the tasks explicitly with ``.after()``.

    NOTE(review): exchanging data through bare file paths only works when
    every step sees the same filesystem — confirm for the target cluster.

    Args:
        input_data: Path of the source Excel workbook.
        country: Country name forwarded to the cleaning step.
    """
    clean_path = "clean_data.csv"
    prophet_metrics_path = "prophet_metrics.csv"
    sarima_metrics_path = "sarima_metrics.csv"

    clean_task = clean_electrification_data(
        input_path=input_data,
        country=country,
        output_path=clean_path
    )

    prophet_task = prophet_model(
        input_path=clean_path,
        metrics_path=prophet_metrics_path
    )
    # No data dependency exists between the tasks, so state the order.
    prophet_task.after(clean_task)

    sarima_task = sarima_model(
        input_path=clean_path,
        metrics_path=sarima_metrics_path
    )
    sarima_task.after(clean_task)

    compare_task = compare_models(
        prophet_metrics=prophet_metrics_path,
        sarima_metrics=sarima_metrics_path,
        output_path="model_comparison.csv"
    )
    compare_task.after(prophet_task, sarima_task)

## Compile the Pipeline (SUBMISSION STEP)

In [9]:
from kfp import compiler
# Actually invoke the compiler: the pipeline function is turned into a YAML
# package at package_path.
package_path = "electrification_pipeline.yaml"
compiler.Compiler().compile(
    pipeline_func=electrification_pipeline,
    package_path=package_path
)


## Imports (Required for Artifacts)

In [10]:
from kfp.dsl import Dataset, Metrics


In [11]:
from kfp.dsl import component, pipeline, Dataset, Metrics


In [12]:
from kfp.dsl import component, Dataset, Metrics

@component(
    base_image="python:3.10",
    # pandas is imported in the body, so it has to be installed for the step.
    packages_to_install=["pandas"]
)
def example_component(
    output_data: Output[Dataset],
    metrics: Output[Metrics]
):
    """Minimal example: write a tiny CSV dataset and log its row count.

    Args:
        output_data: Output dataset artifact; a 3-row CSV with column 'x'
            is written to its ``path``.
        metrics: Output metrics artifact; receives the metric 'rows'.
    """
    import pandas as pd

    df = pd.DataFrame({"x": [1, 2, 3]})
    df.to_csv(output_data.path, index=False)

    metrics.log_metric("rows", len(df))


## Data Cleaning Component (Outputs a Dataset Artifact)

In [13]:
@component(
    base_image="python:3.10",
    # xlrd is the pandas engine for legacy .xls workbooks; openpyxl only
    # handles the newer .xlsx format.
    packages_to_install=["pandas", "numpy", "openpyxl", "xlrd"]
)
def clean_electrification_data(
    input_path: str,
    country: str,
    cleaned_data: Output[Dataset]
):
    """Filter an electrification workbook to one country and emit a Dataset.

    Steps: read the Excel file, keep the 'Country Name', 'Year' and 'Value'
    columns for ``country``, drop rows with missing values, parse 'Year'
    as a datetime and remove 'Value' outliers with the 1.5 * IQR rule.

    Args:
        input_path: Path of the Excel workbook to read.
        country: Exact 'Country Name' value to keep.
        cleaned_data: Output dataset artifact; the cleaned CSV (no index
            column) is written to its ``path``.

    Raises:
        KeyError: If one of the three expected columns is missing.
        ValueError: If no complete rows remain for ``country``.
    """
    import pandas as pd

    raw = pd.read_excel(input_path)

    wanted_columns = ['Country Name', 'Year', 'Value']
    rows = raw.loc[raw['Country Name'] == country, wanted_columns]

    # .copy() so the column assignment below works on an independent frame
    # instead of a slice of ``raw``.
    rows = rows.dropna().copy()
    if rows.empty:
        raise ValueError(f"No complete rows found for country {country!r}")

    rows['Year'] = pd.to_datetime(rows['Year'], format='%Y')

    # IQR outlier removal
    q1 = rows['Value'].quantile(0.25)
    q3 = rows['Value'].quantile(0.75)
    iqr = q3 - q1

    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr

    rows = rows[rows['Value'].between(lower, upper)]

    rows.to_csv(cleaned_data.path, index=False)

## Prophet Component (Logs Metrics Artifact)

In [14]:
@component(
    base_image="python:3.10",
    packages_to_install=["pandas", "prophet", "scikit-learn"]
)
def prophet_model(
    cleaned_data: Input[Dataset],
    metrics: Output[Metrics]
):
    """Fit Prophet on all but the last 5 rows and log MAE/RMSE on those rows.

    Args:
        cleaned_data: Input dataset artifact; its ``path`` is read as a CSV
            with at least a 'Year' (date) and a 'Value' column.
        metrics: Output metrics artifact; receives 'MAE' and 'RMSE'.

    Raises:
        ValueError: If the CSV has too few rows for a 5-row hold-out plus
            at least 2 training rows.
    """
    import pandas as pd
    import numpy as np
    from prophet import Prophet
    from sklearn.metrics import mean_absolute_error, mean_squared_error

    holdout = 5

    history = pd.read_csv(cleaned_data.path, parse_dates=['Year'])
    # Sort chronologically so the positional split below really holds out
    # the most recent observations.
    history = (
        history
        .rename(columns={'Year': 'ds', 'Value': 'y'})
        .sort_values('ds')
        .reset_index(drop=True)
    )

    if len(history) < holdout + 2:
        raise ValueError(
            f"Need at least {holdout + 2} rows to train and evaluate, got {len(history)}"
        )

    train = history.iloc[:-holdout]
    test = history.iloc[-holdout:]

    model = Prophet(yearly_seasonality=True)
    model.fit(train)

    # Predict on the exact held-out dates. Generating future dates with a
    # year-end frequency would not line up with the year-start dates in the
    # data, and it would silently mis-pair rows if a year were missing.
    forecast = model.predict(test[['ds']])
    preds = forecast['yhat'].values

    mae = mean_absolute_error(test['y'], preds)
    rmse = np.sqrt(mean_squared_error(test['y'], preds))

    # Log metrics (VISIBLE in Kubeflow UI); cast to plain Python floats.
    metrics.log_metric("MAE", float(mae))
    metrics.log_metric("RMSE", float(rmse))


## SARIMA Component (Logs Metrics Artifact)

In [15]:
@component(
    base_image="python:3.10",
    packages_to_install=["pandas", "statsmodels", "scikit-learn"]
)
def sarima_model(
    cleaned_data: Input[Dataset],
    metrics: Output[Metrics]
):
    """Fit an ARIMA(1,1,1) model via SARIMAX and log MAE/RMSE on the last 5 rows.

    Args:
        cleaned_data: Input dataset artifact; its ``path`` is read as a CSV
            with at least a 'Year' (date) and a 'Value' column.
        metrics: Output metrics artifact; receives 'MAE' and 'RMSE'.

    Raises:
        ValueError: If the CSV has no rows left for training after the
            5-row hold-out.
    """
    import pandas as pd
    import numpy as np
    from statsmodels.tsa.statespace.sarimax import SARIMAX
    from sklearn.metrics import mean_absolute_error, mean_squared_error

    holdout = 5

    df = pd.read_csv(cleaned_data.path, parse_dates=['Year'])
    # Sort chronologically so the hold-out is the most recent observations.
    ts = df.sort_values('Year').set_index('Year')['Value']

    if len(ts) <= holdout:
        raise ValueError(
            f"Need more than {holdout} rows to train and evaluate, got {len(ts)}"
        )

    train = ts.iloc[:-holdout]
    test = ts.iloc[-holdout:]

    # One observation per year leaves no seasonal cycle to model, and a
    # seasonal period of 0 cannot be combined with seasonal AR/MA/differencing
    # terms, so the seasonal part is switched off explicitly.
    model = SARIMAX(train, order=(1, 1, 1), seasonal_order=(0, 0, 0, 0))
    fit = model.fit(disp=False)

    preds = fit.forecast(steps=holdout)

    mae = mean_absolute_error(test, preds)
    rmse = np.sqrt(mean_squared_error(test, preds))

    # Cast to plain Python floats before logging.
    metrics.log_metric("MAE", float(mae))
    metrics.log_metric("RMSE", float(rmse))


## Pipeline Definition (Artifact Flow)

In [16]:
@pipeline(
    name="electrification-forecasting-with-artifacts",
    description="Kubeflow pipeline with Dataset & Metrics artifacts"
)
def electrification_pipeline(
    input_data: str = "API_EG.ELC.ACCS.ZS_DS2_en_excel_v2_25.xls",
    country: str = "Kenya"
):
    """Clean the data once, then feed the cleaned dataset to both models.

    The 'cleaned_data' output of the cleaning task is passed to the Prophet
    and SARIMA tasks, which gives both a data dependency on it.

    Args:
        input_data: Path of the source Excel workbook.
        country: Country name forwarded to the cleaning step.
    """
    clean_task = clean_electrification_data(
        input_path=input_data,
        country=country
    )

    cleaned = clean_task.outputs["cleaned_data"]

    prophet_model(cleaned_data=cleaned)

    sarima_model(cleaned_data=cleaned)


## Compile Pipeline (Final Submission File)

In [17]:
from kfp import compiler
# Actually invoke the compiler: the pipeline function is turned into a YAML
# package at package_path.
package_path = "electrification_artifact_pipeline.yaml"
compiler.Compiler().compile(
    pipeline_func=electrification_pipeline,
    package_path=package_path
)



In [18]:
import streamlit as st
import pandas as pd