# In [None]:
import kfp
from kfp import dsl
import kfp.components as components
from typing import NamedTuple
from datetime import datetime

def get_data_review() -> NamedTuple('Output', [('mlpipeline_ui_metadata', 'UI_metadata')]):
    """Build an Evidently data-quality report for the raw survey CSV.

    Downloads ``AmazonCustomerBehaviorSurvey.csv`` from the ``kubeflow`` Minio
    bucket, runs Evidently's ``DataQualityPreset`` on it (no reference data)
    and wraps the rendered HTML in an inline ``web-app`` UI-metadata entry.

    Returns:
        A single-field named tuple; ``mlpipeline_ui_metadata`` holds the
        JSON-encoded metadata document containing the report HTML.
    """
    # Imports are kept local: this function is handed to
    # ``create_component_from_func`` below, so it is written to be
    # self-contained.
    import json
    from collections import namedtuple

    import pandas as pd
    from evidently import ColumnMapping
    from evidently.metric_preset import DataQualityPreset
    from evidently.report import Report
    from minio import Minio

    ## get data from minio
    minio_client = Minio(
        "<minio_ep>",
        access_key="<minio_accK>",
        secret_key="<minio_secK>",
        secure=False
    )
    minio_bucket = "kubeflow"
    local_csv = "/tmp/AmazonCustomerBehaviorSurvey.csv"

    minio_client.fget_object(
        minio_bucket,
        'datasets/amazon_customer_behavior/AmazonCustomerBehaviorSurvey.csv',
        local_csv,
    )
    df = pd.read_csv(local_csv)

    column_mapping = ColumnMapping()
    column_mapping.target = 'Recommendation_Helpfulness'
    # The attribute is ``numerical_features`` (plural); the previous spelling
    # ``numerical_feature`` only created an unused attribute, so this list was
    # never taken into account.
    column_mapping.numerical_features = [
        'age', 'Customer_Reviews_Importance', 'Rating_Accuracy', 'Shopping_Satisfaction',
    ]
    # `Personalized_Recommendation_Frequency` holds No/Sometimes/Yes answers in
    # the raw CSV (get_data_batch maps them to 0-2), so it is declared
    # categorical here rather than numerical.
    column_mapping.categorical_features = [
        'Gender', 'Purchase_Frequency', 'Purchase_Categories', 'Browsing_Frequency',
        'Personalized_Recommendation_Frequency',
        'Product_Search_Method', 'Search_Result_Exploration', 'Add_to_Cart_Browsing', 'Cart_Completion_Frequency',
        'Cart_Abandonment_Factors', 'Saveforlater_Frequency', 'Review_Left', 'Review_Reliability',
        'Review_Helpfulness', 'Service_Appreciation', 'Improvement_Areas',
    ]

    report = Report(metrics=[DataQualityPreset()])
    report.run(reference_data=None, current_data=df, column_mapping=column_mapping)

    # Inline web-app output: the whole HTML report travels inside the metadata.
    metadata = {
        'outputs': [{
            'type': 'web-app',
            'storage': 'inline',
            'source': report.get_html(),
        }]
    }

    output = namedtuple('output', ['mlpipeline_ui_metadata'])
    return output(json.dumps(metadata))

def get_data_batch():
    """
    Clean and encode the raw survey CSV, then upload the result to Minio.

    Downloads ``AmazonCustomerBehaviorSurvey.csv`` from the ``kubeflow``
    bucket, drops ``Timestamp`` and rows with missing values, bins ``age``,
    one-hot encodes the multi-valued answers, maps the ordered answers to
    integer codes, merges columns whose names differ only by surrounding
    whitespace, and stores the selected columns as
    ``AmazonCustomerBehaviorSurvey_modified.csv`` in the same bucket.
    """
    print("getting data")
    # Imports are kept local: this function is handed to
    # ``create_component_from_func`` below, so it is written to be
    # self-contained. Only the modules actually used are imported.
    from minio import Minio
    import pandas as pd

    def one_hot(frame, column, prefix, sep='|', discard=None):
        """Swap ``column`` for prefixed indicator columns, minus ``discard``."""
        dummies = frame[column].str.get_dummies(sep=sep).add_prefix(prefix)
        if discard is not None:
            dummies = dummies.drop(columns=[discard])
        return pd.concat([frame.drop(columns=[column]), dummies], axis=1)

    ## get data from minio
    minio_client = Minio(
        "<minio_ep>",
        access_key="<minio_accK>",
        secret_key="<minio_secK>",
        secure=False
    )
    minio_bucket = "kubeflow"
    raw_path = "/tmp/AmazonCustomerBehaviorSurvey.csv"
    modified_path = "/tmp/AmazonCustomerBehaviorSurvey_modified.csv"

    print("getting data from minio")
    minio_client.fget_object(
        minio_bucket,
        'datasets/amazon_customer_behavior/AmazonCustomerBehaviorSurvey.csv',
        raw_path,
    )

    print("data processing")
    df = pd.read_csv(raw_path)
    ## drop Timestamp column
    df = df.drop(columns=['Timestamp'])
    ## drop na -- done while the later-discarded columns are still present,
    ## so a missing value in any of them still removes the row
    df = df.dropna()

    ## split `age` into categories; right=False makes the bins [0,18), [18,40), ...
    bins = [0, 18, 40, 65, float('inf')]
    labels = ['0-18', '18-40', '40-65', '65+']
    df['age'] = pd.cut(df['age'], bins=bins, labels=labels, right=False)
    df['age'] = df['age'].map({'0-18': 0, '18-40': 1, '40-65': 2, '65+': 3})

    ## one hot encoding; `discard` is the indicator column left out of the result
    df = one_hot(df, 'Gender', 'Gender_', discard='Gender_Prefer not to say')
    df = one_hot(df, 'Purchase_Categories', 'BUY ', sep=';')
    df = one_hot(df, 'Product_Search_Method', 'Product_Search_Method_',
                 discard='Product_Search_Method_others')
    df = one_hot(df, 'Cart_Abandonment_Factors', 'Cart_Abandonment_Factors_',
                 discard='Cart_Abandonment_Factors_others')
    df = one_hot(df, 'Service_Appreciation', 'Service_Appreciation_',
                 discard='Service_Appreciation_.')
    ## shorten the long `Cart_Abandonment_Factors` indicator names
    df = df.rename(columns={
        'Cart_Abandonment_Factors_Changed my mind or no longer need the item': 'Cart_Abandonment_Factors_ChangedMind',
        'Cart_Abandonment_Factors_Found a better price elsewhere': 'Cart_Abandonment_Factors_FoundBetterPrice',
        'Cart_Abandonment_Factors_High shipping costs': 'Cart_Abandonment_Factors_HighCosts',
    })

    ## ordered answers -> integer codes (answers missing from a table become NaN)
    never_to_always = {'Never': 0, 'Rarely': 1, 'Sometimes': 2, 'Often': 3, 'Always': 4}
    no_sometimes_yes = {'No': 0, 'Sometimes': 1, 'Yes': 2}
    ordinal_codes = {
        'Purchase_Frequency': {'Less than once a month': 0, 'Once a month': 1, 'Few times a month': 2,
                               'Once a week': 3, 'Multiple times a week': 4},
        'Personalized_Recommendation_Frequency': no_sometimes_yes,
        'Recommendation_Helpfulness': no_sometimes_yes,
        'Browsing_Frequency': {'Rarely': 0, 'Few times a month': 1, 'Few times a week': 2,
                               'Multiple times a day': 3},
        'Search_Result_Exploration': {'First page': 0, 'Multiple pages': 1},
        'Add_to_Cart_Browsing': {'No': 0, 'Maybe': 1, 'Yes': 2},
        'Cart_Completion_Frequency': never_to_always,
        'Saveforlater_Frequency': never_to_always,
        'Review_Left': {'No': 0, 'Yes': 1},
        'Review_Reliability': {'Never': 0, 'Rarely': 1, 'Occasionally': 2, 'Moderately': 3, 'Heavily': 4},
        'Review_Helpfulness': no_sometimes_yes,
    }
    for column, codes in ordinal_codes.items():
        df[column] = df[column].map(codes)

    ## drop `Personalized_Recommendation_Frequency ` (note the trailing space:
    ## a different column from the one mapped above) and `Improvement_Areas`
    df = df.drop(columns=['Personalized_Recommendation_Frequency ', 'Improvement_Areas'])

    ## merge `Service_Appreciation_Customer service` and `Service_Appreciation_Customer service `:
    ## stripping makes the names identical, the transposed group-sum adds them up
    df.columns = df.columns.str.strip()
    df = df.T.groupby(level=0).sum().T

    ## reorder columns
    col = ['age', 'Gender_Female', 'Gender_Male', 'Gender_Others', 'BUY Beauty and Personal Care', 'BUY Clothing and Fashion', 'BUY Groceries and Gourmet Food',
           'BUY Home and Kitchen', 'BUY others', 'Purchase_Frequency', 'Personalized_Recommendation_Frequency', 'Rating_Accuracy', 'Shopping_Satisfaction',
           'Browsing_Frequency', 'Product_Search_Method_Filter', 'Product_Search_Method_Keyword', 'Product_Search_Method_categories', 'Search_Result_Exploration',
           'Customer_Reviews_Importance', 'Add_to_Cart_Browsing', 'Cart_Completion_Frequency', 'Cart_Abandonment_Factors_ChangedMind', 'Cart_Abandonment_Factors_FoundBetterPrice',
           'Cart_Abandonment_Factors_HighCosts', 'Saveforlater_Frequency', 'Review_Left', 'Review_Reliability', 'Review_Helpfulness', 'Service_Appreciation_All the above',
           'Service_Appreciation_Competitive prices', 'Service_Appreciation_Customer service', 'Service_Appreciation_Product recommendations', 'Service_Appreciation_Quick delivery',
           'Service_Appreciation_User-friendly website/app interface', 'Service_Appreciation_Wide product selection', 'Recommendation_Helpfulness']
    df_modify = df[col]
    ## dump dataframe to csv
    df_modify.to_csv(modified_path, index=False)
    # save to dataset file, store in Minio
    print("saving to minio")
    minio_client.fput_object(
        minio_bucket,
        "datasets/amazon_customer_behavior/AmazonCustomerBehaviorSurvey_modified.csv",
        modified_path,
    )

def split_data():
    """Split the preprocessed survey CSV into train and test sets.

    Fetches ``AmazonCustomerBehaviorSurvey_modified.csv`` from the
    ``kubeflow`` Minio bucket, performs an 80/20 split with a fixed seed and
    uploads ``train_dataset.csv`` and ``test_dataset.csv`` to the same
    bucket folder.
    """
    from sklearn.model_selection import train_test_split
    import pandas as pd
    from minio import Minio

    bucket = "kubeflow"
    remote_dir = "datasets/amazon_customer_behavior"
    storage = Minio(
        "<minio_ep>",
        access_key="<minio_accK>",
        secret_key="<minio_secK>",
        secure=False
    )

    modified_csv = "/tmp/AmazonCustomerBehaviorSurvey_modified.csv"
    storage.fget_object(bucket, remote_dir + "/AmazonCustomerBehaviorSurvey_modified.csv", modified_csv)
    frame = pd.read_csv(modified_csv)

    print("splitting data !")
    train_part, test_part = train_test_split(frame, test_size=0.2, random_state=42)

    # Write both files locally first, then upload them in the same order.
    parts = (("train_dataset.csv", train_part), ("test_dataset.csv", test_part))
    for file_name, part in parts:
        part.to_csv("/tmp/" + file_name, index=False)

    print("saving to minio")
    for file_name, _ in parts:
        storage.fput_object(bucket, remote_dir + "/" + file_name, "/tmp/" + file_name)

    

def train_model(
        rand_iter: int, rand_cv: int, n_estimators: int, max_depth: int
) -> NamedTuple('Output', [('mlpipeline_ui_metadata', 'UI_metadata'),('mlpipeline_metrics', 'Metrics')]):
    """Tune a random forest on the stored split and publish model and metrics.

    Downloads ``train_dataset.csv`` and ``test_dataset.csv`` from the
    ``kubeflow`` Minio bucket, runs a randomized hyper-parameter search over a
    ``RandomForestClassifier``, evaluates the best estimator on the test set,
    uploads it as ``models/amazon_customer_behavior/model.joblib`` and returns
    UI metadata (confusion matrix and classification report) and metrics.

    Args:
        rand_iter: Number of parameter settings sampled by the search.
        rand_cv: Number of cross-validation folds used by the search.
        n_estimators: Upper bound for the sampled number of trees; values are
            drawn from ``randint(5, n_estimators)``, whose upper bound is
            exclusive, so it must be greater than 5.
        max_depth: Upper bound for the sampled tree depth; values are drawn
            from ``randint(1, max_depth)``, whose upper bound is exclusive,
            so it must be greater than 1.

    Returns:
        A named tuple with the JSON-encoded ``mlpipeline_ui_metadata`` and
        ``mlpipeline_metrics`` documents.
    """
    # Imports are kept local: this function is handed to
    # ``create_component_from_func`` below, so it is written to be
    # self-contained.
    import json
    from collections import namedtuple

    import joblib
    import pandas as pd
    from minio import Minio
    from scipy.stats import randint
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
    from sklearn.model_selection import RandomizedSearchCV

    target_column = 'Recommendation_Helpfulness'
    # Single source of truth for the class labels: used both to size the
    # confusion matrix and as the label list announced in the UI metadata.
    class_labels = [0, 1, 2]

    minio_client = Minio(
        "<minio_ep>",
        access_key="<minio_accK>",
        secret_key="<minio_secK>",
        secure=False
    )
    minio_bucket = "kubeflow"

    minio_client.fget_object(minio_bucket, 'datasets/amazon_customer_behavior/train_dataset.csv', "/tmp/train_dataset.csv")
    minio_client.fget_object(minio_bucket, 'datasets/amazon_customer_behavior/test_dataset.csv', "/tmp/test_dataset.csv")

    train_dataset = pd.read_csv("/tmp/train_dataset.csv")
    test_dataset = pd.read_csv("/tmp/test_dataset.csv")
    X_train = train_dataset.drop(columns=[target_column])
    y_train = train_dataset[target_column]

    X_test = test_dataset.drop(columns=[target_column])
    y_test = test_dataset[target_column]

    rand_params = {
        'n_estimators': randint(5, n_estimators),
        'max_depth': randint(1, max_depth)
    }
    model_rf = RandomForestClassifier(random_state=42)
    rand_search = RandomizedSearchCV(
        estimator=model_rf,
        param_distributions=rand_params,
        n_iter=rand_iter,
        cv=rand_cv,
        # Seed the sampling as well, matching the seed used for the data
        # split and the forest, so repeated runs pick the same candidates.
        random_state=42,
    )

    rand_search.fit(X_train, y_train)

    test_pred = rand_search.best_estimator_.predict(X_test)
    test_accuracy = accuracy_score(y_test, test_pred)
    test_report = classification_report(y_test, test_pred)
    # Passing ``labels`` fixes the matrix to len(class_labels) x len(class_labels).
    # Without it the matrix is sized by the classes that happen to occur in
    # y_test/test_pred, which could disagree with the label list used to
    # index it (IndexError) and with the labels sent to the UI.
    cm = confusion_matrix(y_test, test_pred, labels=class_labels)

    # Confusion matrix as (target, predicted, count) rows
    data = [
        (class_labels[target_index], class_labels[predicted_index], count)
        for target_index, target_row in enumerate(cm)
        for predicted_index, count in enumerate(target_row)
    ]
    df_cm = pd.DataFrame(data, columns=['target', 'predicted', 'count'])
    cm_csv = df_cm.to_csv(header=False, index=False)

    metadata = {
        "outputs": [
            {
                "type": "confusion_matrix",
                "format": "csv",
                "schema": [
                    {'name': 'target', 'type': 'CATEGORY'},
                    {'name': 'predicted', 'type': 'CATEGORY'},
                    {'name': 'count', 'type': 'NUMBER'},
                ],
                "target_col": "actual",
                "predicted_col": "predicted",
                "source": cm_csv,
                "storage": "inline",
                "labels": class_labels
            },
            {
                'storage': 'inline',
                # Markdown page with the classification report in a code fence.
                'source': '# Model Overview\n## Model Summary\n\n```\n{}\n```\n\n'.format(test_report),
                'type': 'markdown',
            }
        ]
    }

    metrics = {
        'metrics': [{
            'name': 'accuracy',
            'numberValue': float(test_accuracy),
            'format': "PERCENTAGE"
        }]}

    # save model to minio
    joblib.dump(rand_search.best_estimator_, "/tmp/model.joblib")
    minio_client.fput_object(minio_bucket, "models/amazon_customer_behavior/model.joblib", "/tmp/model.joblib")

    output = namedtuple('output', ['mlpipeline_ui_metadata', 'mlpipeline_metrics'])
    return output(json.dumps(metadata), json.dumps(metrics))

# Wrap each step function as a pipeline component. The first three run on a
# plain Python image with their packages installed on top; the training step
# uses a notebook image and installs nothing extra.
component_data_review = components.create_component_from_func(
    get_data_review,
    base_image="python:3.10.0",
    packages_to_install=['scikit-learn', 'minio', 'pandas', 'evidently'],
)
component_get_data_batch = components.create_component_from_func(
    get_data_batch,
    base_image="python:3.10.0",
    packages_to_install=['scikit-learn', 'minio', 'pandas'],
)
component_split_data = components.create_component_from_func(
    split_data,
    base_image="python:3.10.0",
    packages_to_install=['scikit-learn', 'minio', 'pandas'],
)
component_build_model = components.create_component_from_func(
    train_model,
    base_image="kubeflownotebookswg/jupyter-tensorflow-full:v1.7.0",
)


@dsl.pipeline(
    name='amazon-customer-behavior-pipeline',
    description='example pipeline for amazon customer behavior dataset'
)
def output_test( rand_iter, rand_cv, n_estimators, max_depth):
    """Wire the review, preprocessing, split, training and deployment steps.

    Data review and batch preprocessing have no ordering between them; the
    split waits for both, training waits for the split, and a
    SeldonDeployment resource is created after training.

    Args:
        rand_iter: Forwarded to the training component.
        rand_cv: Forwarded to the training component.
        n_estimators: Forwarded to the training component.
        max_depth: Forwarded to the training component.
    """
    # Timestamp taken when this function body runs; it suffixes the resource
    # names below.
    run_stamp = datetime.now().strftime("%Y%m%d%H%M%S")
    bucket = "kubeflow"

    review_op = component_data_review()
    batch_op = component_get_data_batch()
    split_op = component_split_data()
    split_op.after(review_op, batch_op)
    train_op = component_build_model(rand_iter, rand_cv, n_estimators, max_depth)
    train_op.after(split_op)

    # SeldonDeployment manifest, assembled from the inside out.
    classifier_graph = {
        "name": "classifier",
        "implementation": "SKLEARN_SERVER",
        "modelUri": f"s3://{bucket}/models/amazon_customer_behavior/",
        "envSecretRefName": "seldon-init-container-secret",
        "parameters": [
            {"name": "method", "type": "STRING", "value": "predict"},
        ],
    }
    predictor = {
        "name": "sklearn-predictor",
        "replicas": 1,
        "graph": classifier_graph,
    }
    seldon_manifest = {
        "apiVersion": "machinelearning.seldon.io/v1",
        "kind": "SeldonDeployment",
        "metadata": {
            "name": f"customer-behavior-{run_stamp}",
            "namespace": "kubeflow-user-example-com",
        },
        "spec": {
            "protocol": "seldon",
            "predictors": [predictor],
        },
    }

    deploy_op = dsl.ResourceOp(
        name=f'seldon-deployment-{run_stamp}',
        k8s_resource=seldon_manifest,
        action="create",
        attribute_outputs={"name": "{.metadata.name}"},
    )
    deploy_op.after(train_op)

if __name__ == "__main__":
    # Placeholders to fill in before running.
    gateway_host = "<kubeflow-gateway-endpoint>"  # e.g. 172.0.0.1
    session_cookie = "<authservice_session_cookie>"

    # cert/tls.crt has to be stored locally before running the pipeline.
    kfp_client = kfp.Client(
        host=f"https://{gateway_host}/pipeline",
        cookies=f"authservice_session={session_cookie}",
        ssl_ca_cert="cert/tls.crt",
    )

    # Values for the pipeline parameters of `output_test`.
    run_arguments = dict(
        rand_iter=1000,
        rand_cv=5,
        n_estimators=50,
        max_depth=20,
    )

    kfp_client.create_run_from_pipeline_func(
        output_test,
        arguments=run_arguments,
        experiment_name="amazon-customer-behavior",
    )