# JPX Tokyo Stock Exchange Kubeflow Pipeline

The following description is quoted from the overview of this [Kaggle competition](https://www.kaggle.com/competitions/jpx-tokyo-stock-exchange-prediction/overview):

>Japan Exchange Group, Inc. (JPX) is a holding company operating one of the largest stock exchanges in the world, Tokyo Stock Exchange (TSE), and derivatives exchanges Osaka Exchange (OSE) and Tokyo Commodity Exchange (TOCOM). JPX is hosting this competition and is supported by AI technology company AlpacaJapan Co.,Ltd.

> In this competition, you will model real future returns of around 2,000 stocks. The competition will involve building portfolios from the stocks eligible for predictions. The stocks are ranked from highest to lowest expected returns and they are evaluated on the difference in returns between the top and bottom 200 stocks.

# Install relevant libraries


>Update pip `pip install --user --upgrade pip`

>Install and upgrade kubeflow sdk `pip install kfp --upgrade --user --quiet`

You may need to restart your notebook kernel after installing the KFP SDK.

In [1]:
!pip install --user --upgrade pip



In [2]:
!pip install kfp --upgrade --user --quiet

In [3]:
# confirm the kfp sdk
! pip show kfp

Name: kfp
Version: 1.8.11
Summary: KubeFlow Pipelines SDK
Home-page: https://github.com/kubeflow/pipelines
Author: The Kubeflow Authors
Author-email: 
License: UNKNOWN
Location: /home/jovyan/.local/lib/python3.6/site-packages
Requires: absl-py, click, cloudpickle, dataclasses, Deprecated, docstring-parser, fire, google-api-python-client, google-auth, google-cloud-storage, jsonschema, kfp-pipeline-spec, kfp-server-api, kubernetes, protobuf, pydantic, PyYAML, requests-toolbelt, strip-hints, tabulate, typer, typing-extensions, uritemplate
Required-by: kubeflow-kale


In [4]:
import kfp
import kfp.components as comp
import kfp.dsl as dsl
from kfp.components import InputPath, OutputPath
from typing import NamedTuple

# Kubeflow pipeline component creation

## Download and load the dataset

In [5]:
# load data step
def load_data(dataset: str, data_path: OutputPath(str)):
    """Download a Kaggle competition archive and stage the stock price data.

    Args:
        dataset: Kaggle competition name passed to
            ``kaggle competitions download -c``. The downloaded archive is
            expected to be ``<dataset>.zip`` in the working directory.
        data_path: Output directory. The archive is extracted into it and the
            parsed ``train_files/stock_prices.csv`` is pickled to
            ``<data_path>/df_prices``.

    Raises:
        subprocess.CalledProcessError: If the Kaggle download command exits
            with a non-zero status.
    """
    # install the necessary libraries
    # (imports live inside the function body because the function is turned
    # into a component from its own source)
    import os, sys, subprocess, zipfile, pickle
    subprocess.run([sys.executable, "-m", "pip", "install", "--upgrade", "pip"])
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'pandas'])
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'kaggle'])

    # import libraries
    import pandas as pd

    # setup kaggle environment for data download:
    # the credentials are read from files under /secret/kaggle-secret
    with open('/secret/kaggle-secret/password', 'r') as file:
        kaggle_key = file.read().rstrip()
    with open('/secret/kaggle-secret/username', 'r') as file:
        kaggle_user = file.read().rstrip()

    os.environ['KAGGLE_USERNAME'] = kaggle_user
    os.environ['KAGGLE_KEY'] = kaggle_key

    # create data_path directory (no error if it already exists)
    os.makedirs(data_path, exist_ok=True)

    # download kaggle's jpx-tokyo-stock-exchange-prediction data;
    # check=True makes a failed download stop the step here instead of
    # surfacing later as a missing zip file
    subprocess.run(["kaggle", "competitions", "download", "-c", dataset], check=True)

    # extract <dataset>.zip to data_path
    with zipfile.ZipFile(f"{dataset}.zip", "r") as zip_ref:
        zip_ref.extractall(data_path)

    # read train_files/stock_prices.csv
    df_prices = pd.read_csv(f"{data_path}/train_files/stock_prices.csv", parse_dates=['Date'])

    # Save the loaded data as a pickle file to be used by the transform_data component.
    with open(f'{data_path}/df_prices', 'wb') as f:
        pickle.dump(df_prices, f)

    print('Done!')

## Transform data

In [6]:
# transform data step

def transform_data(data_path: InputPath(str),
                   transform_data_path: OutputPath(str)):
    """Filter the pickled price data by date and drop z-score outliers.

    Args:
        data_path: Directory containing the ``df_prices`` pickle.
        transform_data_path: Output directory; the cleaned frame is pickled
            to ``<transform_data_path>/df_zscore``.
    """
    # runtime dependencies are installed before they are imported
    import subprocess
    import sys

    subprocess.run(["python", "-m", "pip", "install", "--upgrade", "pip"])
    for package in ('pandas', 'scipy'):
        subprocess.run([sys.executable, '-m', 'pip', 'install', package])

    import os
    import pickle

    import numpy as np
    import pandas as pd
    from scipy import stats

    # read the frame written by the previous step
    with open(f'{data_path}/df_prices', 'rb') as src:
        prices = pickle.load(src)

    # order rows by 'Date' and then 'SecuritiesCode'
    prices = prices.sort_values(by=['Date', 'SecuritiesCode'])

    # keep only rows dated 2020-12-23 or later; per the original author's note,
    # earlier dates all have fewer than 2000 stocks per day, and this is done
    # to work with consistent data
    recent = prices["Date"] >= "2020-12-23"
    prices = prices[recent].reset_index(drop=True)

    # outlier removal: a row is kept only when every listed column has
    # |z-score| < 3 (a NaN comparison is False, so such rows are dropped too)
    numeric_cols = ['Open', 'High', 'Low', 'Close', 'Volume']
    z = stats.zscore(prices[numeric_cols], nan_policy='omit')
    keep_row = (np.abs(z) < 3).all(axis=1)
    df_zscore = prices[keep_row].reset_index(drop=True)

    # make sure the output directory exists
    os.makedirs(transform_data_path, exist_ok=True)

    # persist df_zscore as a pickle file for the feature_engineering component
    with open(f'{transform_data_path}/df_zscore', 'wb') as dst:
        pickle.dump(df_zscore, dst)

    print('Done!')
    return None

## Feature Engineering

In [7]:
# feature engineering step

def feature_engineering(transform_data_path: InputPath(str),
                        feat_eng_path: OutputPath(str)):
    """Derive lag, price and calendar features from the cleaned price data.

    Args:
        transform_data_path: Directory containing the ``df_zscore`` pickle.
        feat_eng_path: Output directory; the engineered frame is pickled to
            ``<feat_eng_path>/new_feats``.
    """
    # install the necessary libraries
    import sys, subprocess
    subprocess.run([sys.executable, "-m", "pip", "install", "--upgrade", "pip"])
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'pandas'])
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'tqdm'])

    # import Library
    import os, pickle
    import numpy as np
    import pandas as pd
    from tqdm import tqdm

    # loading the df_zscore data
    with open(f'{transform_data_path}/df_zscore', 'rb') as f:
        df_zscore = pickle.load(f)

    def feat_eng(df, features):
        """Return a new frame with lag, price-derived and calendar columns.

        ``features`` must list the price columns first and ``'Volume'`` last;
        ``df`` must also contain ``'SecuritiesCode'`` and ``'Date'``. The
        input frame is not modified.
        """
        price_features = features[:-1]

        # creating lag features.
        # The frame holds many securities per date, so the shift is done per
        # SecuritiesCode: each lag then comes from the same security's earlier
        # row instead of from whichever security sits on the adjacent row.
        per_security = df.groupby('SecuritiesCode')[features]
        lagged = []
        for i in tqdm(range(1, 6)):
            tmp = per_security.shift(i)
            tmp.columns = [f'{c}_next_shift_{i}' for c in tmp.columns]
            lagged.append(tmp)
        # a single concat keeps the original column order
        # (input columns, then lag 1..5) and yields a new frame
        df = pd.concat([df] + lagged, sort=False, axis=1)

        # log(volume * mean price + 1) for every lag
        for i in tqdm(range(1, 6)):
            lag_price_cols = [f'{c}_next_shift_{i}' for c in price_features]
            df[f'weighted_vol_price_{i}'] = np.log(
                df[f'Volume_next_shift_{i}'] * df[lag_price_cols].mean(axis=1) + 1.0
            )

        # feature engineering
        df['weighted_vol_price'] = np.log(df['Volume'] * df[price_features].mean(axis=1) + 1.0)
        df['BOP'] = (df['Open'] - df['Close']) / (df['High'] - df['Low'])
        df['HL'] = df['High'] - df['Low']
        df['OC'] = df['Close'] - df['Open']
        df['OHLCstd'] = df[['Open', 'Close', 'High', 'Low']].std(axis=1)

        # replace inf with nan (e.g. BOP when High == Low)
        df.replace([np.inf, -np.inf], np.nan, inplace=True)

        # datetime features
        df['Date'] = pd.to_datetime(df['Date'])
        df['Day'] = df['Date'].dt.weekday.astype(np.int32)
        df["dayofyear"] = df['Date'].dt.dayofyear
        df["is_weekend"] = df['Day'].isin([5, 6])
        # Series.dt.weekofyear is deprecated; isocalendar().week is its
        # replacement. Cast to int64 to keep a plain integer column.
        df["weekofyear"] = df['Date'].dt.isocalendar().week.astype(np.int64)
        df["month"] = df['Date'].dt.month
        df["season"] = (df["month"] % 12 + 3) // 3

        # fill nan values
        df = df.fillna(0)
        return df

    new_feats = feat_eng(df_zscore, ['High', 'Low', 'Open', 'Close', 'Volume'])
    # feat_eng zero-fills every column, Target included; copy the untouched
    # Target values back from the input frame
    new_feats['Target'] = df_zscore['Target']

    # creating the feat_eng_path
    os.makedirs(feat_eng_path, exist_ok=True)

    # save the feature engineered data as a pickle file to be used by the modeling component.
    with open(f'{feat_eng_path}/new_feats', 'wb') as f:
        pickle.dump(new_feats, f)

    print('Done!')

## Modelling
    

In [8]:
# modeling step

def modeling(feat_eng_path: InputPath(str),
             model_path: OutputPath(str)):
    """Train a LightGBM regressor on the engineered features.

    Rows with ``Date`` before 2021-11-11 are used for training; rows on or
    after that date form the validation set.

    Args:
        feat_eng_path: Directory containing the ``new_feats`` pickle.
        model_path: Output directory. The fitted model is written with joblib
            to ``<model_path>/model`` and the validation split is pickled as
            an ``(X_test, y_test)`` tuple to ``<model_path>/test``.
    """
    # install the necessary libraries
    import sys, subprocess
    subprocess.run([sys.executable, "-m", "pip", "install", "--upgrade", "pip"])
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'pandas'])
    # LGBMRegressor is LightGBM's scikit-learn wrapper and joblib is imported
    # directly below, so both are installed explicitly rather than relying on
    # them arriving as transitive dependencies
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'scikit-learn'])
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'joblib'])
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'lightgbm'])

    # import Library
    import os, pickle, joblib
    from lightgbm import LGBMRegressor

    # loading the new_feats data
    with open(f'{feat_eng_path}/new_feats', 'rb') as f:
        new_feats = pickle.load(f)

    # columns to be used for modelling.
    feats = ['Date', 'SecuritiesCode', 'Open', 'High', 'Low', 'Close', 'Volume',
             'weighted_vol_price_1', 'weighted_vol_price_2', 'weighted_vol_price_3',
             'weighted_vol_price', 'BOP', 'HL', 'OC', 'OHLCstd', 'Day', 'dayofyear',
             'is_weekend', 'weekofyear', 'month', 'season']

    # transform date to int (YYYYMMDD) so it can be compared and used as a feature
    new_feats['Date'] = new_feats['Date'].dt.strftime("%Y%m%d").astype(int)

    # split data into valid for validation and train for model training
    valid = new_feats[(new_feats['Date'] >= 20211111)].copy()
    train = new_feats[(new_feats['Date'] < 20211111)].copy()

    # creating the model_path directory
    os.makedirs(model_path, exist_ok=True)

    # model parameter
    params = {
        'n_estimators': 100,
        'verbose': 2,
        'random_state': 1,
        'learning_rate': 0.379687157316759,
    }

    # model initialization
    model = LGBMRegressor(**params)

    X = train[feats]
    y = train["Target"]

    X_test = valid[feats]
    y_test = valid["Target"]

    # fitting.
    # The `verbose` keyword of fit() was removed in LightGBM 4.0, so passing
    # it raises TypeError there; it is no longer passed.
    model.fit(X, y, eval_set=[(X_test, y_test)])

    # saving model
    joblib.dump(model, f'{model_path}/model')

    # Save the test_data as a pickle file to be used by the predict component.
    with open(f'{model_path}/test', 'wb') as f:
        pickle.dump((X_test, y_test), f)

    print('Done!')

## Evaluation and Prediction

In [9]:
def prediction(model_path: InputPath(str), 
                metrics_path: OutputPath(str)) -> NamedTuple("EvaluationOutput", [("mlpipeline_metrics", "Metrics")]):
    """Score the saved model on the held-out split and report its RMSE.

    Args:
        model_path: Directory containing the joblib ``model`` file and the
            pickled ``test`` tuple ``(X_test, y_test)``.
        metrics_path: File path to which the metrics JSON is written.

    Returns:
        An ``EvaluationOutput`` namedtuple whose single field
        ``mlpipeline_metrics`` holds the same metrics as a JSON string.
    """
    # runtime dependencies are installed before they are imported
    import subprocess
    import sys

    subprocess.run(["python", "-m", "pip", "install", "--upgrade", "pip"])
    for package in ('scikit-learn', 'pandas', 'lightgbm'):
        subprocess.run([sys.executable, '-m', 'pip', 'install', package])

    import json
    import pickle
    from collections import namedtuple

    import pandas as pd
    import numpy as np
    import joblib
    from sklearn.metrics import mean_squared_error
    from lightgbm import LGBMRegressor

    # held-out features and targets saved by the modeling step
    with open(f'{model_path}/test', 'rb') as src:
        X_test, y_test = pickle.load(src)

    # fitted estimator saved by the modeling step
    estimator = joblib.load(f'{model_path}/model')

    # predict, then compute RMSE rounded to 5 decimals
    # (squared error is symmetric, so the argument order does not affect it)
    preds = estimator.predict(X_test)
    mse = mean_squared_error(preds, y_test)
    rmse = np.round(mse ** 0.5, 5)

    # kubeflow metric metadata for the UI
    rmse_entry = {
        'name': 'root-mean-squared-error',
        'numberValue': rmse,
        'format': 'RAW',
    }
    metrics = {'metrics': [rmse_entry]}

    # the metrics are both written to metrics_path and returned
    with open(metrics_path, "w") as dst:
        json.dump(metrics, dst)

    EvaluationOutput = namedtuple("EvaluationOutput", ["mlpipeline_metrics"])
    return EvaluationOutput(json.dumps(metrics))

## Create pipeline components 

Each step function is wrapped into a lightweight component using `create_component_from_func`.

In [10]:
# create light weight components
# Build one component per step function, in pipeline order; every component
# uses the same python:3.7.1 base image.
load_op, transform_op, feature_eng_op, modeling_op, predict_op = (
    comp.create_component_from_func(step_func, base_image="python:3.7.1")
    for step_func in (
        load_data,
        transform_data,
        feature_engineering,
        modeling,
        prediction,
    )
)

## Kubeflow pipeline creation

In [11]:
# define pipeline
@dsl.pipeline(
    name="jpx-tokyo-stock-exchange",
    description="Predicting real future returns of around 2,000 stocks.",
)
def tokyo_stock_exchange_pipeline(
    dataset: str,
    data_path: str,
    transform_data_path: str,
    feat_eng_data_path: str,
    model_path: str,
):
    """Chain the load, transform, feature engineering, modeling and prediction steps.

    Each step receives the single output of the step before it. Only
    ``dataset`` is referenced in the body; ``data_path``,
    ``transform_data_path``, ``feat_eng_data_path`` and ``model_path`` are
    accepted as pipeline parameters but not used here.
    """
    # 2Gi read-write-once volume named "data-volume"
    volume_op = dsl.VolumeOp(
        name="create_volume",
        resource_name="data-volume",
        size="2Gi",
        modes=dsl.VOLUME_MODE_RWO,
    )

    # Load step: mounts the volume at /mnt and carries the "kaggle-secret" pod
    # label (presumably matched by cluster configuration that mounts the
    # Kaggle credentials; verify against the cluster setup).
    load_task = load_op(dataset)
    load_task.add_pvolumes({"/mnt": volume_op.volume})
    load_task.add_pod_label("kaggle-secret", "true")

    # Downstream steps, each fed by the previous step's output.
    transform_task = transform_op(load_task.output)
    feature_eng_task = feature_eng_op(transform_task.output)
    modeling_task = modeling_op(feature_eng_task.output)
    predict_op(modeling_task.output)

In [12]:
# Create the client used to communicate with the Pipelines API server.
# No arguments are passed, so the client is built with its default settings.
client = kfp.Client()

In [13]:
# Values for the pipeline parameters; they are collected into the `arguments`
# dict that is submitted with the run.
# `dataset` is the Kaggle competition name that the load step downloads.
# NOTE(review): the pipeline function does not reference the four path
# parameters in its body, so these path values appear to have no effect on
# where the steps read or write data. Confirm before relying on them.
dataset = "jpx-tokyo-stock-exchange-prediction"
data_path = "mnt/data"
transform_data_path = "tdp"
feat_eng_data_path = "feat"
model_path = "model"

In [14]:
pipeline_func = tokyo_stock_exchange_pipeline

experiment_name = 'tokyo_stock_exchange_pipeline_lightweight'
run_name = f'{pipeline_func.__name__} run1'

# pipeline parameter name -> value
arguments = dict(
    dataset=dataset,
    data_path=data_path,
    transform_data_path=transform_data_path,
    feat_eng_data_path=feat_eng_data_path,
    model_path=model_path,
)

# Compile the pipeline into a compressed package named after the experiment.
kfp.compiler.Compiler().compile(pipeline_func, f'{experiment_name}.zip')

# Submit a run directly from the pipeline function.
run_result = client.create_run_from_pipeline_func(
    pipeline_func,
    experiment_name=experiment_name,
    run_name=run_name,
    arguments=arguments,
)
