# 🪙 G-Research Crypto Kubeflow Pipeline
![](./images/vector-blockchain-poster.jpg)

---


In this [Kaggle competition](https://www.kaggle.com/competitions/g-research-crypto-forecasting/overview), you'll use your machine learning expertise to forecast short-term returns in 14 popular cryptocurrencies. The dataset provided contains information on historic trades for several cryptoassets, such as Bitcoin and Ethereum.

> G-Research is a leading quantitative research and technology company. By using the latest scientific techniques, they produce world-beating predictive research and build advanced technology to analyse the world's data.

# Install relevant libraries


>Update pip `pip install --user --upgrade pip`

>Install and upgrade kubeflow sdk `pip install kfp --upgrade --user --quiet`

You may need to restart your notebook kernel after installing the kfp SDK.

In [1]:
!pip install --user --upgrade pip



In [2]:
!pip install kfp --upgrade --user --quiet

In [3]:
# confirm the kfp sdk
! pip show kfp

Name: kfp
Version: 1.8.11
Summary: KubeFlow Pipelines SDK
Home-page: https://github.com/kubeflow/pipelines
Author: The Kubeflow Authors
Author-email: 
License: UNKNOWN
Location: /home/jovyan/.local/lib/python3.6/site-packages
Requires: absl-py, click, cloudpickle, dataclasses, Deprecated, docstring-parser, fire, google-api-python-client, google-auth, google-cloud-storage, jsonschema, kfp-pipeline-spec, kfp-server-api, kubernetes, protobuf, pydantic, PyYAML, requests-toolbelt, strip-hints, tabulate, typer, typing-extensions, uritemplate
Required-by: kubeflow-kale


In [4]:
import kfp
import kfp.components as comp
import kfp.dsl as dsl
from kfp.components import OutputPath
from typing import NamedTuple

# Kubeflow pipeline component creation

## Download the dataset

In [5]:
# download data step
def download_data(dataset, data_path):
    """Download a Kaggle competition archive and extract its CSVs into ``data_path``.

    Kaggle credentials are read from the files mounted at
    ``/secret/kaggle-secret/username`` and ``/secret/kaggle-secret/password``
    and exported as ``KAGGLE_USERNAME`` / ``KAGGLE_KEY`` for the Kaggle CLI.

    Args:
        dataset: Competition slug passed to ``kaggle competitions download -c``.
        data_path: Directory that receives ``train.csv`` and ``asset_details.csv``.

    Returns:
        None. Prints ``Done!`` on completion.

    Raises:
        subprocess.CalledProcessError: If installing the Kaggle CLI or the
            download itself exits with a non-zero status.
    """
    # Imports and installs live inside the body because this function is
    # wrapped with create_component_from_func and runs on its own in a container.
    import os
    import subprocess
    import sys
    import zipfile

    # Upgrading pip is best effort, so its exit status is deliberately ignored.
    subprocess.run([sys.executable, "-m", "pip", "install", "--upgrade", "pip"])
    # The Kaggle CLI is required; fail here rather than at the download call.
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'kaggle'], check=True)

    # setup kaggle environment for data download
    with open('/secret/kaggle-secret/password', 'r') as file:
        kaggle_key = file.read().rstrip()
    with open('/secret/kaggle-secret/username', 'r') as file:
        kaggle_user = file.read().rstrip()

    os.environ['KAGGLE_USERNAME'] = kaggle_user
    os.environ['KAGGLE_KEY'] = kaggle_key

    # create data_path directory (no error if it already exists)
    os.makedirs(data_path, exist_ok=True)

    # Download the competition archive into the current working directory.
    # check=True surfaces authentication/network failures directly instead of
    # as a missing-zip error in the extraction step below.
    subprocess.run(["kaggle", "competitions", "download", "-c", dataset], check=True)

    # extract only 'train.csv' and 'asset_details.csv' from <dataset>.zip to data_path
    with zipfile.ZipFile(f"{dataset}.zip", "r") as zip_ref:
        zip_ref.extractall(data_path, members=['train.csv', 'asset_details.csv'])

    print('Done!')

## Load Data

In [6]:
# load data step
def load_data(data_path):
    """Load the raw CSVs, keep 2020-onwards training rows and pickle the frames.

    Reads ``{data_path}/train.csv`` and ``{data_path}/asset_details.csv`` and
    writes two pickles next to them: ``df_train`` (rows whose timestamp is on
    or after 2020-01-01, with an added ``datetime`` column) and
    ``df_asset_details`` (sorted by ``Asset_ID``).

    Args:
        data_path: Directory containing the CSVs; the pickles are written there too.

    Returns:
        None. Prints ``Done!`` on completion.

    Raises:
        subprocess.CalledProcessError: If installing pandas fails.
    """
    # Imports and installs live inside the body because this function is
    # wrapped with create_component_from_func and runs on its own in a container.
    import pickle
    import subprocess
    import sys

    # Upgrading pip is best effort; use the running interpreter so the upgrade
    # and the install below target the same environment.
    subprocess.run([sys.executable, "-m", "pip", "install", "--upgrade", "pip"])
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'pandas'], check=True)

    import pandas as pd

    TRAIN_CSV = f'{data_path}/train.csv'
    ASSET_DETAILS_CSV = f'{data_path}/asset_details.csv'

    # read TRAIN_CSV and ASSET_DETAILS_CSV
    df_train = pd.read_csv(TRAIN_CSV)
    df_asset_details = pd.read_csv(ASSET_DETAILS_CSV).sort_values("Asset_ID")

    # 'timestamp' is interpreted as seconds since the epoch; keep 2020 onwards.
    df_train['datetime'] = pd.to_datetime(df_train['timestamp'], unit='s')
    df_train = df_train[df_train['datetime'] >= '2020-01-01 00:00:00'].copy()

    # Save df_train as a pickle file to be used by the feature_engineering component.
    with open(f'{data_path}/df_train', 'wb') as f:
        pickle.dump(df_train, f)

    # Save df_asset_details as a pickle file to be used by the merge_assets_features component.
    with open(f'{data_path}/df_asset_details', 'wb') as g:
        pickle.dump(df_asset_details, g)

    print('Done!')

## Feature Engineering

In [7]:
# feature engineering step

def feature_engineering(data_path):
    """Build per-asset indicator, lag and calendar features from ``df_train``.

    Reads the pickled training frame at ``{data_path}/df_train``, computes the
    features for asset ids 0-13 one asset at a time, and pickles the
    concatenated result to ``{data_path}/feature_df``.

    Args:
        data_path: Directory holding the ``df_train`` pickle; ``feature_df`` is
            written to the same directory.

    Returns:
        None. Prints ``Done!`` on completion.

    Raises:
        subprocess.CalledProcessError: If a required package fails to install.
    """
    # Imports and installs live inside the body because this function is
    # wrapped with create_component_from_func and runs on its own in a container.
    import subprocess
    import sys

    # Upgrading pip is best effort; the packages below are required.
    subprocess.run([sys.executable, "-m", "pip", "install", "--upgrade", "pip"])
    for package in ('pandas', 'tqdm', 'talib-binary'):
        subprocess.run([sys.executable, '-m', 'pip', 'install', package], check=True)

    import calendar
    import datetime
    import pickle

    import numpy as np
    import pandas as pd
    import talib
    from tqdm import tqdm

    # loading the df_train data (pickle written by the load_data step)
    with open(f'{data_path}/df_train', 'rb') as f:
        df_train = pickle.load(f)

    # Rows at or after this instant go to the validation split (train_flg == 0).
    # calendar.timegm reads the date as UTC, so the boundary does not move with
    # the container's local timezone the way time.mktime would.
    valid_start = calendar.timegm(
        datetime.datetime.strptime("01/05/2021", "%d/%m/%Y").timetuple())

    # Look-back windows (in rows) shared by the indicators and the lag features.
    windows = [30, 120, 240]

    # creating technical indicators

    # Relative Strength Index over n rows of 'Close'
    def RSI(df, n):
        return talib.RSI(df['Close'], n)

    # Average True Range over n rows
    def ATR(df, n):
        return talib.ATR(df["High"], df.Low, df.Close, n)

    # Double Exponential Moving Average: 2*EMA - EMA(EMA)
    def DEMA(data, time_period):
        ema = data['Close'].ewm(span=time_period, adjust=False).mean()
        return 2 * ema - ema.ewm(span=time_period, adjust=False).mean()

    # Distance from the candle body top to the high
    def upper_shadow(df):
        return df['High'] - np.maximum(df['Close'], df['Open'])

    # Distance from the low to the candle body bottom
    def lower_shadow(df):
        return np.minimum(df['Close'], df['Open']) - df['Low']

    def get_features(df, asset_id, train=True):
        '''
        Take a dataframe with all asset data and return the features for a single asset.

        df - Full dataframe with all assets included
        asset_id - integer from 0-13 inclusive to represent a cryptocurrency asset
        train - True  - training: keeps 'Target' and adds 'train_flg'
              - False - inference: rows are ordered by 'row_id', which is kept
        '''
        # filter based on asset id, then sort based on time stamp
        df = df[df['Asset_ID'] == asset_id].sort_values('timestamp')

        if train:
            df_feat = df[['timestamp', 'Asset_ID', 'High', 'Low', 'Open',
                          'Close', 'Volume', 'Target']].copy()
            # train_flg: 1 = training row, 0 = validation row
            df_feat['train_flg'] = np.where(df_feat['timestamp'] >= valid_start, 0, 1)
        else:
            # 'timestamp' must be kept here: the datetime features below read it.
            df_feat = df.sort_values('row_id')[
                ['timestamp', 'Asset_ID', 'High', 'Low', 'Open', 'Close',
                 'Volume', 'row_id']].copy()

        for i in tqdm(windows):
            # Applying technical indicators
            df_feat[f'RSI_{i}'] = RSI(df_feat, i)
            df_feat[f'ATR_{i}'] = ATR(df_feat, i)
            df_feat[f'DEMA_{i}'] = DEMA(df_feat, i)

        for i in tqdm(windows):
            # creating lag features
            df_feat[f'sma_{i}'] = df_feat['Close'].rolling(i).mean() / df_feat['Close'] - 1
            df_feat[f'return_{i}'] = df_feat['Close'] / df_feat['Close'].shift(i) - 1

        # Log-scaled candle features. np.log yields NaN for negative input and
        # -inf for zero; both are turned into 0 by the replace/fillna below.
        df_feat['HL'] = np.log(df_feat['High'] - df_feat['Low'])
        df_feat['OC'] = np.log(df_feat['Close'] - df_feat['Open'])
        df_feat['lower_shadow'] = np.log(lower_shadow(df_feat))
        df_feat['upper_shadow'] = np.log(upper_shadow(df_feat))

        # replace inf with nan
        df_feat.replace([np.inf, -np.inf], np.nan, inplace=True)

        # datetime features
        df_feat['Date'] = pd.to_datetime(df_feat['timestamp'], unit='s')
        df_feat['Day'] = df_feat['Date'].dt.weekday.astype(np.int32)
        df_feat["dayofyear"] = df_feat['Date'].dt.dayofyear
        # isocalendar().week replaces the deprecated Series.dt.weekofyear.
        # NOTE(review): the cast assumes no missing timestamps - confirm.
        df_feat["weekofyear"] = df_feat['Date'].dt.isocalendar().week.astype(np.int64)
        df_feat["season"] = ((df_feat['Date'].dt.month) % 12 + 3) // 3

        # drop the raw price/volume columns and the helper 'Date' column
        df_feat = df_feat.drop(['Open', 'Close', 'High', 'Low', 'Volume', 'Date'], axis=1)

        # fill nan values with 0
        return df_feat.fillna(0)

    # Build the features for each asset, then concatenate once (concatenating
    # inside the loop re-copies the accumulated frame on every iteration).
    per_asset_features = []
    for asset_id in range(14):
        print(asset_id)
        per_asset_features.append(get_features(df_train, asset_id, train=True))
    feature_df = pd.concat(per_asset_features)

    # save the feature engineered data as a pickle file to be used by the next component.
    with open(f'{data_path}/feature_df', 'wb') as f:
        pickle.dump(feature_df, f)

    print('Done!')

## Merge Assets Data and Features

In [8]:
# merge_assets_features step

def merge_assets_features(data_path):
    # Component step: attach each asset's 'Weight' to the engineered features.
    #   reads : {data_path}/feature_df and {data_path}/df_asset_details (pickles)
    #   writes: {data_path}/merge_feature_df (pickle)
    # data_path is the directory shared with the other pipeline steps.

    # Packages are installed at run time, inside the step's own container.
    import sys
    import subprocess
    bootstrap_commands = (
        ["python", "-m", "pip", "install", "--upgrade", "pip"],
        [sys.executable, '-m', 'pip', 'install', 'pandas'],
    )
    for command in bootstrap_commands:
        subprocess.run(command)

    import os
    import pickle
    import pandas as pd

    # engineered features, one row per (asset, timestamp)
    with open(f'{data_path}/feature_df', 'rb') as features_file:
        features = pickle.load(features_file)

    # asset metadata; only Asset_ID and Weight are needed for the join
    with open(f'{data_path}/df_asset_details', 'rb') as assets_file:
        asset_details = pickle.load(assets_file)
    asset_weights = asset_details[['Asset_ID', 'Weight']]

    # left join keeps every feature row, adding the Weight column
    merged = pd.merge(features, asset_weights, how='left', on=['Asset_ID'])

    # persist for the modelling step
    with open(f'{data_path}/merge_feature_df', 'wb') as merged_file:
        pickle.dump(merged, merged_file)

    print('Done!')
    return None

## Modelling
    

In [9]:
# modeling step

def modeling(data_path):
    """Train a LightGBM regressor on the merged features and save the booster.

    Reads ``{data_path}/merge_feature_df`` (pickle), trains on rows with
    ``train_flg == 1`` and validates on rows with ``train_flg == 0``, using
    RMSE plus a weighted correlation as validation metrics, then writes the
    trained booster to ``{data_path}/lgb.jl`` with joblib.

    Args:
        data_path: Directory holding the merged feature pickle; the model file
            is written to the same directory.

    Returns:
        None. Prints ``Done!`` on completion.

    Raises:
        subprocess.CalledProcessError: If a required package fails to install.
    """
    # Imports and installs live inside the body because this function is
    # wrapped with create_component_from_func and runs on its own in a container.
    import subprocess
    import sys

    # Upgrading pip is best effort; the packages below are required.
    # pandas is needed to unpickle the feature frame; joblib is installed
    # explicitly rather than relying on another package to pull it in.
    subprocess.run([sys.executable, "-m", "pip", "install", "--upgrade", "pip"])
    for package in ('pandas', 'lightgbm', 'joblib'):
        subprocess.run([sys.executable, '-m', 'pip', 'install', package], check=True)

    import pickle

    import joblib
    import lightgbm as lgb
    import numpy as np

    # loading the merged feature data written by the merge_assets_features step
    with open(f'{data_path}/merge_feature_df', 'rb') as f:
        feature_df = pickle.load(f)

    # define features for LGBM
    features = ['Asset_ID', 'RSI_30', 'ATR_30',
                'DEMA_30', 'RSI_120', 'ATR_120', 'DEMA_120', 'RSI_240', 'ATR_240',
                'DEMA_240', 'sma_30', 'return_30', 'sma_120', 'return_120', 'sma_240',
                'return_240', 'HL', 'OC', 'lower_shadow', 'upper_shadow', 'Day',
                'dayofyear', 'weekofyear', 'season']
    categoricals = ['Asset_ID']

    # define the evaluation metric
    def weighted_correlation(a, train_data):
        """Custom LightGBM metric: weighted correlation of predictions and labels.

        ``a`` holds the predictions; the per-row weights are read from the
        ``add_w`` attribute attached to the dataset below. Returns the
        ``(name, value, is_higher_better)`` tuple expected from ``feval``.
        """
        w = np.ravel(train_data.add_w.values)
        a = np.ravel(a)
        b = np.ravel(train_data.get_label())

        sum_w = np.sum(w)
        mean_a = np.sum(a * w) / sum_w
        mean_b = np.sum(b * w) / sum_w
        var_a = np.sum(w * np.square(a - mean_a)) / sum_w
        var_b = np.sum(w * np.square(b - mean_b)) / sum_w

        cov = np.sum(a * b * w) / sum_w - mean_a * mean_b
        corr = cov / np.sqrt(var_a * var_b)

        return 'eval_wcorr', corr, True

    # Split once instead of re-running the same query for every use.
    train_df = feature_df.query('train_flg == 1')
    valid_df = feature_df.query('train_flg == 0')

    train_dataset = lgb.Dataset(train_df[features],
                                train_df['Target'].values,
                                feature_name=features,
                                categorical_feature=categoricals)
    val_dataset = lgb.Dataset(valid_df[features],
                              valid_df['Target'].values,
                              feature_name=features,
                              categorical_feature=categoricals)

    # Attach the weights as a plain attribute; weighted_correlation reads it.
    train_dataset.add_w = train_df[['Weight']]
    val_dataset.add_w = valid_df[['Weight']]

    # LGBM params (the number of boosting rounds is passed to train() directly)
    evals_result = {}
    params = {'objective': 'regression',
              'metric': 'rmse',
              'boosting_type': 'gbdt',
              'max_depth': -1,
              'learning_rate': 0.01,
              'seed': 2022,
              'verbose': -1,
              }

    # train LGBM
    # Early stopping, periodic logging and metric recording are passed as
    # callbacks: the equivalent train() keyword arguments
    # (early_stopping_rounds, verbose_eval, evals_result) were removed in
    # LightGBM 4.0, and the install above is not pinned to a version.
    model = lgb.train(params=params,
                      train_set=train_dataset,
                      num_boost_round=1200,
                      valid_sets=[val_dataset],
                      feval=weighted_correlation,
                      callbacks=[
                          lgb.early_stopping(stopping_rounds=60),
                          lgb.log_evaluation(period=30),
                          lgb.record_evaluation(evals_result),
                      ])

    # saving model
    joblib.dump(model, f'{data_path}/lgb.jl')

    print('Done!')

## Evaluation

In [10]:
# evaluation step

def evaluation_result(data_path, 
                metrics_path: OutputPath(str)) -> NamedTuple("EvaluationOutput", [("mlpipeline_metrics", "Metrics")]):
    """Publish the saved model's best validation scores as Kubeflow metrics.

    Loads the booster from ``{data_path}/lgb.jl`` and reads the ``rmse`` and
    ``eval_wcorr`` entries of ``best_score['valid_0']``. The metrics JSON is
    written to ``metrics_path`` and also returned as ``mlpipeline_metrics``.

    Args:
        data_path: Directory containing ``lgb.jl``.
        metrics_path: File path that receives the metrics JSON.

    Returns:
        An ``EvaluationOutput`` named tuple whose single field
        ``mlpipeline_metrics`` is the metrics dictionary serialized as JSON.

    Raises:
        subprocess.CalledProcessError: If a required package fails to install.
    """
    # Imports and installs live inside the body because this function is
    # wrapped with create_component_from_func and runs on its own in a container.
    import subprocess
    import sys

    # Upgrading pip is best effort; the packages below are required.
    # lightgbm must be installed for the saved booster to be unpickled, and
    # joblib is installed explicitly because nothing else here provides it.
    subprocess.run([sys.executable, "-m", "pip", "install", "--upgrade", "pip"])
    for package in ('lightgbm', 'joblib'):
        subprocess.run([sys.executable, '-m', 'pip', 'install', package], check=True)

    import json
    from collections import namedtuple

    import joblib

    # load model
    model = joblib.load(f'{data_path}/lgb.jl')

    # model evaluation: best scores recorded on the first validation set
    root_mean_squared_error = model.best_score.get('valid_0').get('rmse')
    weighted_correlation = model.best_score.get('valid_0').get('eval_wcorr')

    # create kubeflow metric metadata for UI
    metrics = {
        'metrics': [
            {'name': 'root-mean-squared-error',
             'numberValue': root_mean_squared_error,
             'format': 'RAW'},
            {'name': 'weighted-correlation',
             'numberValue': weighted_correlation,
             'format': 'RAW'},
        ]
    }

    with open(metrics_path, "w") as f:
        json.dump(metrics, f)

    output_tuple = namedtuple("EvaluationOutput", ["mlpipeline_metrics"])

    return output_tuple(json.dumps(metrics))

## Create pipeline components 

using `create_component_from_func`

In [11]:
# create light weight components
# Wrap every step function as a lightweight component; all of them run on the
# same base image and install their own packages at run time.
download_op = comp.create_component_from_func(
    download_data, base_image="python:3.7.1")
load_op = comp.create_component_from_func(
    load_data, base_image="python:3.7.1")
merge_assets_features_op = comp.create_component_from_func(
    merge_assets_features, base_image="python:3.7.1")
feature_eng_op = comp.create_component_from_func(
    feature_engineering, base_image="python:3.7.1")
modeling_op = comp.create_component_from_func(
    modeling, base_image="python:3.7.1")
evaluation_op = comp.create_component_from_func(
    evaluation_result, base_image="python:3.7.1")

## Kubeflow pipeline creation

In [12]:
# define pipeline
@dsl.pipeline(name="g-research-crypto-forecasting-pipeline", 
              description="Forecasting short term returns in 14 popular cryptocurrencies.")
def g_research_crypto_forecast_pipeline(
                             dataset: str,
                             data_path: str
                            ):
    # Pipeline parameters:
    #   dataset   - Kaggle competition slug handed to the download step
    #   data_path - mount point of the shared volume in every step
    #
    # Steps are chained by passing each step's pvolume to the next one, so they
    # run in order: download -> load -> feature engineering -> merge ->
    # modelling -> evaluation.

    # Volume used to share data between the steps.
    shared_volume = dsl.VolumeOp(
        name="create_data_volume",
        resource_name="data-volume",
        size="16Gi",
        modes=dsl.VOLUME_MODE_RWO,
    )

    # Download step; it additionally carries the "kaggle-secret" pod label.
    download_task = (
        download_op(dataset, data_path)
        .add_pvolumes({data_path: shared_volume.volume})
        .add_pod_label("kaggle-secret", "true")
    )

    # Load step.
    load_task = load_op(data_path).add_pvolumes(
        {data_path: download_task.pvolume})

    # Feature engineering step.
    feature_task = feature_eng_op(data_path).add_pvolumes(
        {data_path: load_task.pvolume})

    # Merge asset details into the features.
    merge_task = merge_assets_features_op(data_path).add_pvolumes(
        {data_path: feature_task.pvolume})

    # Modelling step.
    modeling_task = modeling_op(data_path).add_pvolumes(
        {data_path: merge_task.pvolume})

    # Evaluation step.
    evaluation_task = evaluation_op(data_path).add_pvolumes(
        {data_path: modeling_task.pvolume})

In [13]:
# Create the client used to communicate with the Pipelines API server.
# It is used further down to submit the pipeline run.
client = kfp.Client()

In [14]:
# Pipeline arguments:
#   dataset   - Kaggle competition slug used by the download step
#               (`kaggle competitions download -c <dataset>`)
#   data_path - path at which the shared data volume is mounted in every step
dataset = "g-research-crypto-forecasting"
data_path = "/mnt"

In [15]:
pipeline_func = g_research_crypto_forecast_pipeline

# The experiment name is also used as the stem of the compiled package file.
experiment_name = 'g_research_crypto_forecast_pipeline_lightweight'
run_name = f'{pipeline_func.__name__} run'

# Values for the pipeline's two parameters.
arguments = dict(dataset=dataset, data_path=data_path)

# Compile the pipeline into a compressed definition (<experiment_name>.zip).
kfp.compiler.Compiler().compile(pipeline_func, f'{experiment_name}.zip')

# Submit a run directly from the pipeline function.
run_result = client.create_run_from_pipeline_func(
    pipeline_func,
    arguments=arguments,
    run_name=run_name,
    experiment_name=experiment_name,
)
