## Load training dataset from parquet file
We are loading data from a list of Parquet files.

In [None]:
# (run id, directory-name suffix) pairs. Every path below has the shape
# "<run id>_interop<suffix>/<run id>_tile.parquet"; one directory carries no
# timestamp suffix, hence the empty string.
_interop_exports = [
    ('230803_VH00972_180_AACNG7GM5', '_2023_Oct_19_15_09'),
    ('230804_VH00972_181_AACV5CHM5', '_2023_Oct_19_15_11'),
    ('230810_VH00972_185_AAC3WL7HV', '_2023_Oct_19_15_22'),
    ('230821_VH00972_190_AAC5HJYHV', '_2023_Oct_19_23_23'),
    ('230823_VH00972_192_AAC5HKMHV', '_2023_Oct_19_23_27'),
    ('230824_VH00972_193_AACVFWKM5', '_2023_Oct_19_23_29'),
    ('230829_VH00972_195_AAC5HJNHV', '_2023_Oct_19_23_34'),
    ('230830_VH00972_196_AACTYFVM5', '_2023_Oct_19_23_36'),
    ('230904_VH00972_198_AAC5HF5HV', '_2023_Oct_19_23_40'),
    ('230905_VH00972_199_AACVFYGM5', '_2023_Oct_19_23_42'),
    ('230727_VH00972_176_AAC3WKCHV', '_2023_Oct_19_15_00'),
    ('230731_VH00972_177_AAC3WNLHV', '_2023_Oct_19_15_03'),
    ('230925_VH00972_207_AACVY5KM5', '_2023_Oct_20_10_01'),
    ('230928_VH00972_210_AACTYCLM5', '_2023_Oct_20_10_05'),
    ('230920_VH00972_205_AAC5KGMHV', ''),
    ('231003_VH00972_212_AAC5HK3HV', '_2023_Oct_20_10_06'),
]
metrics_parquet_list = [
    f'{run_id}_interop{suffix}/{run_id}_tile.parquet'
    for run_id, suffix in _interop_exports]

## Load all the Python libraries

In [None]:
from pyspark.sql import SparkSession
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pyspark.sql import SparkSession
import pyspark.sql.functions as sf
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import (
    StandardScaler,
    MinMaxScaler)
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import (
    train_test_split,
    cross_validate)
from sklearn.metrics import (
    confusion_matrix,
    f1_score, 
    classification_report,
    precision_recall_curve,
    average_precision_score,
    roc_curve)
from sklearn.pipeline import Pipeline

## Read data using PySpark

In [None]:
# Start (or reuse) a Spark session whose master is "local[4]".
spark_builder = SparkSession.builder.master("local[4]")
spark = spark_builder.getOrCreate()

In [None]:
# Read every listed parquet file into one Spark DataFrame.
parquet_reader = spark.read
training_df = parquet_reader.parquet(*metrics_parquet_list)

In [None]:
# Hand-curated labels, one row per (Run_id, Lane): is_failed is 1 or 0.
label_columns = ["Run_id", "Lane", "is_failed"]
label_rows = [
    ("230803_VH00972_180_AACNG7GM5", 1, 0),
    ("230804_VH00972_181_AACV5CHM5", 1, 0),
    ("230810_VH00972_185_AAC3WL7HV", 1, 0),
    ("230810_VH00972_185_AAC3WL7HV", 2, 0),
    ("230821_VH00972_190_AAC5HJYHV", 1, 1),
    ("230821_VH00972_190_AAC5HJYHV", 2, 1),
    ("230823_VH00972_192_AAC5HKMHV", 1, 0),
    ("230823_VH00972_192_AAC5HKMHV", 2, 0),
    ("230824_VH00972_193_AACVFWKM5", 1, 1),
    ("230829_VH00972_195_AAC5HJNHV", 1, 0),
    ("230829_VH00972_195_AAC5HJNHV", 2, 0),
    ("230830_VH00972_196_AACTYFVM5", 1, 0),
    ("230904_VH00972_198_AAC5HF5HV", 1, 0),
    ("230904_VH00972_198_AAC5HF5HV", 2, 0),
    ("230905_VH00972_199_AACVFYGM5", 1, 1),
    ("230727_VH00972_176_AAC3WKCHV", 1, 1),
    ("230727_VH00972_176_AAC3WKCHV", 2, 1),
    ("230731_VH00972_177_AAC3WNLHV", 1, 1),
    ("230731_VH00972_177_AAC3WNLHV", 2, 1),
    ("230925_VH00972_207_AACVY5KM5", 1, 1),
    ("230928_VH00972_210_AACTYCLM5", 1, 1),
    ("230920_VH00972_205_AAC5KGMHV", 1, 1),
    ("230920_VH00972_205_AAC5KGMHV", 2, 1),
    ("231003_VH00972_212_AAC5HK3HV", 1, 0),
    ("231003_VH00972_212_AAC5HK3HV", 2, 1),
]
label_pdf = pd.DataFrame(label_rows, columns=label_columns)
labelDataDf = spark.createDataFrame(label_pdf)

## Add data label to training data

In [None]:
# Labels are keyed by (Run_id, Lane); the inner join keeps only tile rows
# whose run/lane pair has a label.
joinType = "inner"
joinExpression = (
    (training_df['Run_id'] == labelDataDf['Run_id'])
    & (training_df['Lane'] == labelDataDf['Lane']))
# Aliases let the select below name which side each column comes from,
# since Run_id and Lane exist in both frames.
training_df_alias = training_df.alias('m')
labelDataDf_alias = labelDataDf.alias('l')
# Columns taken from the metrics side ('m'), in output order.
metric_columns = [
    'Run_id',
    'Lane',
    'Tile',
    'PCT_ClusterCountPF',
    'PCT_DensityPF',
    'mean_CalledCount_A',
    'mean_CalledCount_T',
    'mean_CalledCount_G',
    'mean_CalledCount_C',
    'PCT_Q30',
    'mean_ErrorRate',
    'PCT_Occupied',
    'intensity_c1',
    'slope_p',
    'offset_p',
    'slope_pr',
    'offset_pr',
]
selected_columns = [f"m.{name}" for name in metric_columns] + ["l.is_failed"]
joined_df = (
    training_df_alias
    .join(labelDataDf_alias, on=joinExpression, how=joinType)
    .select(*selected_columns))

## Write joined data to a parquet file

In [None]:
# Persist the labelled table, replacing any output left by a previous run.
joined_writer = joined_df.write.mode('overwrite')
joined_writer.parquet('joined_table.parquet')

## Split data into training and test sets

In [None]:
## Reload the joined table from disk and pull it into pandas.
df = spark.read.parquet('joined_table.parquet')
pdf = df.toPandas()
##
## Divide data into training and test sets
##
feature_names = [
    'PCT_ClusterCountPF', 'PCT_DensityPF',
    'mean_CalledCount_A', 'mean_CalledCount_T', 'mean_CalledCount_G',
    'mean_CalledCount_C', 'PCT_Q30', 'PCT_Occupied',
    'intensity_c1', 'slope_p', 'offset_p', 'slope_pr', 'offset_pr']
# These three columns are cast to float explicitly before modelling.
float_columns = ['PCT_ClusterCountPF', 'PCT_DensityPF', 'PCT_Occupied']
X = pdf[feature_names].astype({name: float for name in float_columns})
y = pdf['is_failed']
# NOTE(review): labels are assigned per (Run_id, Lane) but this split is per
# tile row, so tiles of one lane can land in both partitions - confirm that
# is intended, or consider a group-aware split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42)

## Training model using Dummy Classifier

In [None]:
# Baseline model: a DummyClassifier fitted on the training split, used as the
# reference the real classifiers are compared against.
dc = DummyClassifier()
dc.fit(X_train, y_train)
# Predict once and reuse the results for every metric below.
dc_pred = dc.predict(X_test)
dc_proba = dc.predict_proba(X_test)[:, 1]
print(f"Dummy classifier score: {dc.score(X_test, y_test):.2f}")
print(f"Confusion matrix:\n {confusion_matrix(y_test, dc_pred)}")
print(f"F1 score: {f1_score(y_test, dc_pred, average='weighted'):.2f}")
print(f"classification report:\n{classification_report(y_test, dc_pred)}")
print(f"Cross validation score:\n{cross_validate(dc, X_train, y_train, cv=5)}")
print(f"Average precision score: {average_precision_score(y_test, dc_proba):.4f}")

# Precision-recall curve, with the point closest to the 0.5 probability
# cut-off highlighted.
precision, recall, thresholds = \
    precision_recall_curve(y_test, dc_proba)
close_zero = \
    np.argmin(np.abs(thresholds - 0.5))
plt.plot(
    precision[close_zero],
    recall[close_zero],
    "o",
    label="threshold 0.5",
    fillstyle="none")
plt.plot(
    precision,
    recall,
    label="Precision - recall - Dummy")
plt.xlabel("Precision")
plt.ylabel("Recall")
plt.legend(loc='best')
plt.show()

# ROC curve with the same 0.5 cut-off marked.
fpr, tpr, thresholds = \
    roc_curve(y_test, dc_proba)
plt.plot(fpr, tpr, label="ROC - Dummy")
close_zero = \
    np.argmin(np.abs(thresholds - 0.5))
plt.plot(
    fpr[close_zero],
    tpr[close_zero],
    "o",
    label="threshold 0.5",
    fillstyle="none")
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.legend(loc="best")
plt.show()

## Training model using RandomForest classifier

In [None]:
# Random forest on the raw features. The seed is fixed so that re-running the
# notebook reproduces the same forest and therefore the same metrics.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
# Predict once and reuse the results for every metric below.
rf_pred = rf.predict(X_test)
rf_proba = rf.predict_proba(X_test)[:, 1]
print(f"Random forest classifier score: {rf.score(X_test, y_test):.2f}")
print(f"Confusion matrix:\n {confusion_matrix(y_test, rf_pred)}")
print(f"F1 score: {f1_score(y_test, rf_pred, average='weighted'):.2f}")
print(f"classification report:\n{classification_report(y_test, rf_pred)}")
print(f"Cross validation score:\n{cross_validate(rf, X_train, y_train, cv=5)}")
print(f"Average precision score: {average_precision_score(y_test, rf_proba):.4f}")

# Precision-recall curve, with the point closest to the 0.5 probability
# cut-off highlighted.
precision, recall, thresholds = \
    precision_recall_curve(y_test, rf_proba)
close_zero = \
    np.argmin(np.abs(thresholds - 0.5))
plt.plot(
    precision[close_zero],
    recall[close_zero],
    "o",
    label="threshold 0.5",
    fillstyle="none")
plt.plot(
    precision,
    recall,
    label="Precision - recall - RF")
plt.xlabel("Precision")
plt.ylabel("Recall")
plt.legend(loc='best')
plt.show()

# ROC curve with the same 0.5 cut-off marked.
fpr, tpr, thresholds = \
    roc_curve(y_test, rf_proba)
plt.plot(fpr, tpr, label="ROC - RF")
close_zero = \
    np.argmin(np.abs(thresholds - 0.5))
plt.plot(
    fpr[close_zero],
    tpr[close_zero],
    "o",
    label="threshold 0.5",
    fillstyle="none")
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.legend(loc="best")
plt.show()

## Training model with XGBoost

In [None]:
# Gradient-boosted trees with the binary logistic objective, on the raw features.
xgbc = \
    xgb.XGBClassifier(objective='binary:logistic')
xgbc.fit(X_train, y_train)
# Predict once and reuse the results for every metric below.
xgbc_pred = xgbc.predict(X_test)
xgbc_proba = xgbc.predict_proba(X_test)[:, 1]
print(f"XGBoost classifier score: {xgbc.score(X_test, y_test):.2f}")
print(f"Confusion matrix:\n {confusion_matrix(y_test, xgbc_pred)}")
print(f"F1 score: {f1_score(y_test, xgbc_pred, average='weighted'):.2f}")
print(f"classification report:\n{classification_report(y_test, xgbc_pred)}")
print(f"Cross validation score:\n{cross_validate(xgbc, X_train, y_train, cv=5)}")
print(f"Average precision score: {average_precision_score(y_test, xgbc_proba):.4f}")

# Precision-recall curve, with the point closest to the 0.5 probability
# cut-off highlighted.
precision, recall, thresholds = \
    precision_recall_curve(y_test, xgbc_proba)
close_zero = \
    np.argmin(np.abs(thresholds - 0.5))
plt.plot(
    precision[close_zero],
    recall[close_zero],
    "o",
    label="threshold 0.5",
    fillstyle="none")
plt.plot(
    precision,
    recall,
    label="Precision - recall - XGBoost")
plt.xlabel("Precision")
plt.ylabel("Recall")
plt.legend(loc='best')
plt.show()

# ROC curve with the same 0.5 cut-off marked.
fpr, tpr, thresholds = \
    roc_curve(y_test, xgbc_proba)
plt.plot(fpr, tpr, label="ROC - XGBoost")
close_zero = \
    np.argmin(np.abs(thresholds - 0.5))
plt.plot(
    fpr[close_zero],
    tpr[close_zero],
    "o",
    label="threshold 0.5",
    fillstyle="none")
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.legend(loc="best")
plt.show()

## Training model using Support Vector Machine (SVC)

In [None]:
# Support vector classifier with default hyper-parameters on the raw features.
# NOTE(review): the features are not scaled here, which may affect the SVC's
# results - confirm that is intended for this comparison.
svc = SVC()
svc.fit(X_train, y_train)
# SVC() is built without probability=True, so the curves below rank samples
# by the signed decision_function score; its natural cut-off is 0, not 0.5.
svc_pred = svc.predict(X_test)
svc_scores = svc.decision_function(X_test)
print(f"SVC classifier score: {svc.score(X_test, y_test):.2f}")
print(f"Confusion matrix:\n {confusion_matrix(y_test, svc_pred)}")
print(f"F1 score: {f1_score(y_test, svc_pred, average='weighted'):.2f}")
print(f"classification report:\n{classification_report(y_test, svc_pred)}")
print(f"Cross validation score:\n{cross_validate(svc, X_train, y_train, cv=5)}")
print(f"Average precision score: {average_precision_score(y_test, svc_scores):.4f}")

# Precision-recall curve, with the point closest to a decision score of 0
# highlighted.
precision, recall, thresholds = \
    precision_recall_curve(y_test, svc_scores)
close_zero = \
    np.argmin(np.abs(thresholds))
plt.plot(
    precision[close_zero],
    recall[close_zero],
    "o",
    label="threshold zero",
    fillstyle="none")
plt.plot(
    precision,
    recall,
    label="Precision - recall - SVC")
plt.xlabel("Precision")
plt.ylabel("Recall")
plt.legend(loc='best')
plt.show()

# ROC curve with the same zero cut-off marked.
fpr, tpr, thresholds = \
    roc_curve(y_test, svc_scores)
plt.plot(fpr, tpr, label="ROC - SVC")
close_zero = \
    np.argmin(np.abs(thresholds))
plt.plot(
    fpr[close_zero],
    tpr[close_zero],
    "o",
    label="threshold zero",
    fillstyle="none")
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.legend(loc="best")
plt.show()

## Training model using GridSearchCV
We are using GridSearchCV to find the best model.

In [None]:
# Seeded, stratified split, recomputed here so this cell does not depend on
# which model cell ran last.
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, random_state=42, stratify=y)

# Standardise only the four mean_CalledCount_* columns; every other feature
# is passed through unchanged.
ct = ColumnTransformer(
        [("scaling",
          StandardScaler(),
          ['mean_CalledCount_A',
           'mean_CalledCount_T',
           'mean_CalledCount_G',
           'mean_CalledCount_C'])],
        remainder='passthrough')
# The "classifier" step is itself a hyper-parameter: each dict in `params`
# swaps in a different estimator together with its own search grid.
pipe = Pipeline([("classifier", SVC())])
params = [
    {"classifier": [SVC()],
     "classifier__C": [0.001, 0.01, 0.1, 1],
     "classifier__gamma": [0.001, 0.01, 0.1, 1]
    },
    # Seeded so the search result is reproducible across runs.
    {"classifier": [RandomForestClassifier(random_state=42)],
     "classifier__max_depth": [2, 3, 4, 5, 6, 7, 8, 9, 10, None],
     "classifier__max_features": [1, 2, 3, 4]
    },
    {"classifier": [xgb.XGBClassifier(objective='binary:logistic')],
     "classifier__max_depth": [2, 3, 4, 5, 6, 7, 8, 9, 10],
     "classifier__max_leaves": [0, 1, 2, 3, 4],
     "classifier__learning_rate": [None, 0.001, 0.01, 0.1, 1.0]
    }]
grid = GridSearchCV(pipe, param_grid=params, cv=5, n_jobs=4)
# NOTE(review): the scaler is fitted outside the pipeline, so it sees every
# CV fold and is not part of `model`; consider moving `ct` into `pipe`.
X_train_scaled = ct.fit_transform(X_train)
grid.fit(X_train_scaled, y_train)
# Apply the scaling learned on the training split. Re-fitting on the test
# split (fit_transform) would scale it with its own statistics, which is
# inconsistent with what the model was trained on and leaks test data.
X_test_scaled = ct.transform(X_test)
print(grid.score(X_test_scaled, y_test))
print(grid.best_estimator_)
print(grid.best_params_)
## get the best model
model = grid.best_estimator_
model_pred = model.predict(X_test_scaled)
# The winning classifier may be an SVC without probability estimates, which
# exposes no predict_proba. Fall back to decision_function, whose natural
# cut-off is 0 rather than 0.5.
if hasattr(model, "predict_proba"):
    model_scores = model.predict_proba(X_test_scaled)[:, 1]
    decision_threshold = 0.5
    threshold_label = "threshold 0.5"
else:
    model_scores = model.decision_function(X_test_scaled)
    decision_threshold = 0.0
    threshold_label = "threshold zero"
print(f"Best model score: {model.score(X_test_scaled, y_test):.2f}")
print(f"Confusion matrix:\n {confusion_matrix(y_test, model_pred)}")
print(f"F1 score: {f1_score(y_test, model_pred, average='weighted'):.2f}")
print(f"classification report:\n{classification_report(y_test, model_pred)}")
print(f"Cross validation score:\n{cross_validate(model, X_train_scaled, y_train, cv=5)}")
print(f"Average precision score: {average_precision_score(y_test, model_scores):.4f}")

# Precision-recall curve, with the point closest to the decision cut-off
# highlighted.
precision, recall, thresholds = \
    precision_recall_curve(y_test, model_scores)
close_zero = \
    np.argmin(np.abs(thresholds - decision_threshold))
plt.plot(
    precision[close_zero],
    recall[close_zero],
    "o",
    label=threshold_label,
    fillstyle="none")
plt.plot(
    precision,
    recall,
    label="Precision - recall - best model")
plt.xlabel("Precision")
plt.ylabel("Recall")
plt.legend(loc='best')
plt.show()

# ROC curve with the same cut-off marked.
fpr, tpr, thresholds = \
    roc_curve(y_test, model_scores)
plt.plot(fpr, tpr, label="ROC - best model")
close_zero = \
    np.argmin(np.abs(thresholds - decision_threshold))
plt.plot(
    fpr[close_zero],
    tpr[close_zero],
    "o",
    label=threshold_label,
    fillstyle="none")
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.legend(loc="best")
plt.show()

## Export model

In [None]:
import pickle
# Serialise the estimator selected by the grid search.
# NOTE(review): `model` is whichever classifier won the search, so the
# "xgb_reg" file name may not describe its contents. The model was also
# fitted on ColumnTransformer output, and that transformer is not saved here.
file_name = "xgb_reg.pkl"
# The context manager flushes and closes the file even if pickling fails.
with open(file_name, "wb") as model_file:
    pickle.dump(model, model_file)