In [None]:
# Upgrade pip itself in the user site-packages before installing anything else.
!python -m pip install --user --upgrade pip

# Install the analysis dependencies into the user site-packages.
# Every package is version-pinned except lasio.
# NOTE(review): `!pip3` is resolved by the shell and may not be the same
# interpreter as the notebook kernel — confirm, or consider `%pip`.
!pip3 install lasio pandas==0.24.2 matplotlib==3.2.2 scipy==1.4.1 statsmodels==0.12.0 scikit-learn==0.23.1 --user

In [None]:
# Install (or upgrade to the newest release of) the Kubeflow Pipelines SDK in
# the user site-packages. The version is not pinned, so re-running this cell
# later may pull a different kfp release.
!pip3 install kfp --upgrade --user

In [None]:
# Check that the install succeeded: `which` prints the path of the
# `dsl-compile` command when it is on PATH and prints nothing otherwise.
# (Presumably `dsl-compile` is the CLI installed by the kfp package above.)
!which dsl-compile

In [None]:
# Import Kubeflow SDK
import kfp
import kfp.dsl as dsl
import kfp.components as comp

In [None]:
# Directory where every step reads and writes its pickled artefacts. It is
# passed to the step functions when they are run in the notebook and, via
# DATA_PATH, to the pipeline as its `data_path` argument.
# NOTE(review): hard-coded absolute path; the directory is assumed to exist.
out_dir = "/home/jovyan/Volve_ML/data/"

In [None]:
def train_tranform(data_path):
    """Prepare the training set and pickle it as ``<data_path>/train_data``.

    The training CSV is read from the project's GitHub repository, the unused
    log columns and every row containing a NaN are removed, ``RT`` is replaced
    by its base-10 logarithm, and all columns except the target ``DT`` are
    passed through a Yeo-Johnson power transform.

    Args:
        data_path: Directory in which the ``train_data`` pickle is written.

    The pickle contains a tuple ``(features, target)``: a DataFrame holding the
    transformed feature columns and the untransformed ``DT`` Series.
    """
    import pickle
    import subprocess
    import sys

    # Dependencies are installed at call time and imported inside the body.
    # The function is also wrapped as a Kubeflow container component elsewhere
    # in this notebook, which is presumably why it has to be self-contained.
    # The pip return code is not checked, so a failed install is not reported.
    for requirement in ('pandas==0.24.2', 'scikit-learn==0.23.1', 'ipython'):
        subprocess.run([sys.executable, '-m', 'pip', 'install', requirement])

    import numpy as np
    import pandas as pd
    from sklearn.preprocessing import PowerTransformer

    source_url = "https://raw.githubusercontent.com/Josepholaidepetro/Volve_ML/main/data/train.csv"
    unused_columns = ['DEPTH', 'BS', 'RD', 'ROP', 'RM', 'DRHO']

    # Load, discard unused columns and incomplete rows, then put RT on a
    # log10 scale.
    cleaned = (
        pd.read_csv(source_url)
        .drop(columns=unused_columns)
        .dropna()
        .assign(RT=lambda frame: np.log10(frame['RT']))
    )

    # Yeo-Johnson transform of every feature column (everything except DT).
    raw_features = cleaned.drop(columns='DT')
    transformer = PowerTransformer(method='yeo-johnson')
    features = pd.DataFrame(
        transformer.fit_transform(raw_features),
        columns=raw_features.columns,
    )
    # The target keeps the row labels left after dropna(), whereas `features`
    # is rebuilt from an array and therefore gets a fresh default index.
    target = cleaned['DT']

    # Persist the pair for the downstream steps.
    with open(f'{data_path}/train_data', 'wb') as sink:
        pickle.dump((features, target), sink)

In [None]:
# Run the training-data preparation once in the notebook kernel; this writes
# the `train_data` pickle under out_dir.
train_tranform(out_dir)

In [None]:
def test_tranform(data_path):
    """Prepare the test set and pickle it as ``<data_path>/test_data``.

    The test CSV is read from the project's GitHub repository, the unused log
    columns and every row containing a NaN are removed, ``RT`` is replaced by
    its base-10 logarithm, and all columns except the target ``DT`` are passed
    through a Yeo-Johnson power transform.

    Args:
        data_path: Directory in which the ``test_data`` pickle is written.

    The pickle contains a tuple ``(features, target)``: a DataFrame holding the
    transformed feature columns and the untransformed ``DT`` Series.
    """
    import pickle
    import subprocess
    import sys

    # Dependencies are installed at call time and imported inside the body.
    # The function is also wrapped as a Kubeflow container component elsewhere
    # in this notebook, which is presumably why it has to be self-contained.
    # The pip return code is not checked, so a failed install is not reported.
    for requirement in ('pandas==0.24.2', 'scikit-learn==0.23.1', 'ipython'):
        subprocess.run([sys.executable, '-m', 'pip', 'install', requirement])

    import numpy as np
    import pandas as pd
    from sklearn.preprocessing import PowerTransformer

    source_url = "https://raw.githubusercontent.com/Josepholaidepetro/Volve_ML/main/data/test.csv"
    # NOTE(review): train_tranform additionally drops 'RD' and 'RM'; confirm
    # the feature columns produced here match the training features.
    unused_columns = ['DEPTH', 'BS', 'ROP', 'DRHO']

    # Load, discard unused columns and incomplete rows, then put RT on a
    # log10 scale.
    cleaned = (
        pd.read_csv(source_url)
        .drop(columns=unused_columns)
        .dropna()
        .assign(RT=lambda frame: np.log10(frame['RT']))
    )

    # Yeo-Johnson transform of every feature column (everything except DT).
    # NOTE(review): a new transformer is fitted on the test data here instead
    # of reusing the one fitted on the training data — confirm this is intended.
    raw_features = cleaned.drop(columns='DT')
    transformer = PowerTransformer(method='yeo-johnson')
    features = pd.DataFrame(
        transformer.fit_transform(raw_features),
        columns=raw_features.columns,
    )
    # The target keeps the row labels left after dropna(), whereas `features`
    # is rebuilt from an array and therefore gets a fresh default index.
    target = cleaned['DT']

    # Persist the pair for the downstream steps.
    with open(f'{data_path}/test_data', 'wb') as sink:
        pickle.dump((features, target), sink)

In [None]:
# Run the test-data preparation once in the notebook kernel; this writes the
# `test_data` pickle under out_dir.
test_tranform(out_dir)

In [None]:
def Outlier_removal(data_path):
    """Filter outliers out of the training set with a one-class SVM.

    Reads the ``(features, target)`` tuple stored in ``<data_path>/train_data``,
    fits ``OneClassSVM(nu=0.13)`` on the features, keeps only the rows that are
    not predicted as ``-1``, removes the ``label`` column from the features and
    writes the filtered pair to ``<data_path>/train_data2``.

    Args:
        data_path: Directory containing ``train_data``; ``train_data2`` is
            written to the same directory.
    """
    import pickle
    import subprocess
    import sys

    # Dependencies are installed at call time and imported inside the body.
    # The pip return code is not checked, so a failed install is not reported.
    for requirement in ('pandas==0.24.2', 'scikit-learn==0.23.1', 'ipython'):
        subprocess.run([sys.executable, '-m', 'pip', 'install', requirement])

    from sklearn.svm import OneClassSVM
    import numpy as np   # not referenced below; kept as in the original step
    import pandas as pd  # not referenced below; kept as in the original step

    # Read back the pickled training pair.
    with open(f'{data_path}/train_data', 'rb') as source:
        features, target = pickle.load(source)

    # Rows the detector predicts as -1 are treated as outliers and discarded.
    detector = OneClassSVM(nu=0.13)
    is_inlier = detector.fit_predict(features) != -1

    # Modelling inputs: inlier rows only, without the 'label' column.
    X_train = features[is_inlier].drop(columns='label')
    y_train = target[is_inlier].copy()

    # Persist the filtered pair for the model-building step.
    with open(f'{data_path}/train_data2', 'wb') as sink:
        pickle.dump((X_train, y_train), sink)

    print('Outlier removed successfully!')

In [None]:
# Run the outlier filter once in the notebook kernel; it reads `train_data`
# from out_dir and writes `train_data2` there.
Outlier_removal(out_dir)

In [None]:
# Print the Cython version importable from the notebook kernel.
# NOTE(review): model_building and predict pip-install cython==0.27.3
# themselves; confirm whether this check cell is still needed.
import cython
print(cython.__version__)  

In [None]:
def model_building(data_path):
    """Train an ExtraTreesRegressor, report its test score and pickle it.

    Reads the outlier-filtered training pair from ``<data_path>/train_data2``
    and the test pair from ``<data_path>/test_data``, fits the regressor on the
    training pair, prints the value of ``model.score`` on the test pair and
    writes the fitted model to ``<data_path>/modelo.pkl``.

    Args:
        data_path: Directory containing ``train_data2`` and ``test_data``;
            ``modelo.pkl`` is written to the same directory.

    Returns:
        None. The fitted model is only available through the pickle file.
    """
    import pickle
    import subprocess
    import sys

    # Dependencies are installed at call time and imported inside the body.
    # The pip return code is not checked, so a failed install is not reported.
    for requirement in ('pandas==0.24.2', 'scikit-learn==0.23.1', 'cython==0.27.3'):
        subprocess.run([sys.executable, '-m', 'pip', 'install', requirement])

    import numpy as np   # not referenced below; kept as in the original step
    import pandas as pd  # not referenced below; kept as in the original step
    import cython        # not referenced below; kept as in the original step
    from sklearn.ensemble import ExtraTreesRegressor

    # Load the outlier-filtered training features and target.
    with open(f'{data_path}/train_data2', 'rb') as f:
        X_train, y_train = pickle.load(f)

    # Load the test features and target.
    with open(f'{data_path}/test_data', 'rb') as f:
        X_test, y_test = pickle.load(f)

    # Define the model; random_state is fixed so repeated runs build the same
    # ensemble.
    model = ExtraTreesRegressor(n_estimators=800, max_depth=6, random_state=21)

    # Run a training job.
    model.fit(X_train, y_train)

    # Evaluate the model on the test pair and print the result.
    score = model.score(X_test, y_test)
    print("R-squared of Well 3: {}".format(score))

    # Save the fitted model. A context manager is used so the file handle is
    # closed (and the data flushed) as soon as the dump finishes.
    filename = f'{data_path}/modelo.pkl'
    with open(filename, 'wb') as f:
        pickle.dump(model, f)

In [None]:
# Train and evaluate the model once in the notebook kernel; this prints the
# test score and writes `modelo.pkl` under out_dir.
# NOTE(review): model_building has no return statement, so `regressor` is
# bound to None here; the fitted model is only available from the pickle.
regressor = model_building(out_dir)

In [None]:
def predict(data_path):
    """Predict on the test set with the pickled model and save the results.

    Loads the fitted model from ``<data_path>/modelo.pkl`` and the test pair
    from ``<data_path>/test_data``, predicts on the test features and writes:

    * ``<data_path>/result1`` - the pickled predictions;
    * ``<data_path>/result.txt`` - a one-line text rendering of the
      predictions next to the actual target values.

    Args:
        data_path: Directory containing ``modelo.pkl`` and ``test_data``; the
            result files are written to the same directory.
    """
    import pickle
    import subprocess
    import sys

    # Dependencies are installed at call time and imported inside the body.
    # The pip return code is not checked, so a failed install is not reported.
    for requirement in ('pandas==0.24.2', 'scikit-learn==0.23.1', 'cython==0.27.3'):
        subprocess.run([sys.executable, '-m', 'pip', 'install', requirement])

    import numpy as np   # not referenced below; kept as in the original step
    import pandas as pd  # not referenced below; kept as in the original step
    import cython        # not referenced below; kept as in the original step
    from sklearn.ensemble import ExtraTreesRegressor

    # Load the pickled regressor written by the model-building step. A context
    # manager is used so the file handle is closed right after loading.
    # NOTE: pickle.load executes code embedded in the file; only point
    # data_path at directories whose contents this pipeline produced.
    filename = f'{data_path}/modelo.pkl'
    with open(filename, 'rb') as f:
        regressor = pickle.load(f)

    # Load and unpack the test features and target.
    with open(f'{data_path}/test_data', 'rb') as f:
        test_df_yj_norm, y_test = pickle.load(f)

    # Make predictions.
    y_pred = regressor.predict(test_df_yj_norm)

    # Full predictions, pickled for programmatic use.
    with open(f'{data_path}/result1', 'wb') as f:
        pickle.dump(y_pred, f)

    # Human-readable summary; the values are rendered with str.format, so
    # long arrays/Series are presumably abbreviated in this file.
    with open(f'{data_path}/result.txt', 'w') as result:
        result.write(" Prediction: {}, Actual: {} ".format(y_pred, y_test))

    print('Prediction has be saved successfully!')

In [None]:
# Run the prediction step once in the notebook kernel; it reads `modelo.pkl`
# and `test_data` from out_dir and writes `result1` and `result.txt` there.
predict(out_dir)

In [None]:
# Wrap each step function as a lightweight container component. All five use
# the same base image, so the wrappers are built in one pass and unpacked in
# pipeline order.
# NOTE(review): the image tag is `latest` (unpinned) and is a GPU TensorFlow
# image although the steps only use pandas/scikit-learn — confirm the choice.
train_tranform_op, test_tranform_op, Outlier_removal_op, train_op, predict_op = (
    comp.func_to_container_op(step, base_image="tensorflow/tensorflow:latest-gpu-py3")
    for step in (train_tranform, test_tranform, Outlier_removal, model_building, predict)
)

## Build Kubeflow Pipeline

In [None]:
# Create a client to communicate with the Pipelines API server.
# No host is passed, so the client relies on its default connection settings
# (presumably the in-cluster configuration of the notebook server — confirm).
client = kfp.Client()

In [None]:
# Define the pipeline
@dsl.pipeline(
    name='Sonic Log Prediction Pipeline',
    description='An ML pipeline that performs Sonic Log Prediction prediction.'
)
def dt_container_pipeline(data_path: str):
    """Chain the five step components over one shared volume.

    Each step mounts the volume handed on by the previous step at
    ``data_path``, so the steps run one after another and exchange their
    pickles through that directory. A final bash container prints
    ``result.txt``.

    Args:
        data_path: Mount point of the shared volume inside every container;
            also passed to each step as its working directory.
    """
    # Volume used to share data between the components.
    shared_volume = dsl.VolumeOp(
        name="create_volume",
        resource_name="data-volume",
        size="1Gi",
        modes=dsl.VOLUME_MODE_RWO,
    )

    # Step 1: prepare the training data.
    train_prep_step = train_tranform_op(data_path).add_pvolumes(
        {data_path: shared_volume.volume}
    )

    # Step 2: prepare the test data.
    test_prep_step = test_tranform_op(data_path).add_pvolumes(
        {data_path: train_prep_step.pvolume}
    )

    # Step 3: remove outliers from the training data.
    outlier_step = Outlier_removal_op(data_path).add_pvolumes(
        {data_path: test_prep_step.pvolume}
    )

    # Step 4: train and evaluate the model.
    training_step = train_op(data_path).add_pvolumes(
        {data_path: outlier_step.pvolume}
    )

    # Step 5: predict on the test data.
    predict_step = predict_op(data_path).add_pvolumes(
        {data_path: training_step.pvolume}
    )

    # Step 6: print the text summary written by the prediction step.
    dsl.ContainerOp(
        name="print_prediction",
        image='library/bash:4.4.23',
        pvolumes={data_path: predict_step.pvolume},
        arguments=['cat', f'{data_path}/result.txt'],
    )

## Run the Pipeline

In [None]:
# Alias for the pipeline function; the compile and submit cell refers to the
# pipeline through this name.
pipeline_func = dt_container_pipeline

In [None]:
# Value supplied for the pipeline's `data_path` argument: the same directory
# the steps used when they were run in the notebook.
DATA_PATH = out_dir

In [None]:
# NOTE(review): MODEL_PATH is not referenced anywhere else in the visible
# notebook (the steps hard-code 'modelo.pkl') — confirm before relying on it.
MODEL_PATH='modelo'

In [None]:
# Names under which the run is filed in the Pipelines UI.
experiment_name = 'dt_prediction_kubeflow'
run_name = f'{pipeline_func.__name__} run'

# Values for the pipeline parameters.
arguments = dict(data_path=DATA_PATH)

# Compile the pipeline into a compressed YAML package named after the experiment.
kfp.compiler.Compiler().compile(pipeline_func, f'{experiment_name}.zip')

# Submit a run straight from the pipeline function.
run_result = client.create_run_from_pipeline_func(
    pipeline_func,
    experiment_name=experiment_name,
    run_name=run_name,
    arguments=arguments,
)

http://localhost:8888/pipeline#/experiments/details/644a33c5-3114-4b3c-b6de-3b0b6467eafa

http://localhost:8888/pipeline#/runs/details/2aadb518-ead4-4035-90ea-011a44bfe16d