# Probability To Fail

One of the key performance indicators that we would like to create greater visibility into and for a given moment in time, calculate what is the probability that the test will fail. Using this information, we plan to build a model that uses the probability to fail to predict the time to failure. 

In order to provide maxium flexibility for the end-user of this work, instead of creating a number of dataframes to answer each of these specifc questions, we will define a long and narrow data structure (a list of tuples saved as a csv for now) that contains only 5 columns ("timestamp", "tab","grid","test","flake"). This allows superset (or pandas) to perform the last filter and/or aggreagtion of interest to an end user. Which is to say, there may appear to be a lot of repetion within the final dataset, but each row should be unique, and it should provide the simpelest useability for an end-user. 

Linked issue(s) : [Issue](https://github.com/aicoe-aiops/ocp-ci-analysis/issues/282)

In [1]:
from enum import Enum
import json
import gzip
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from sklearn.svm import SVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import train_test_split
import numpy as np

from ipynb.fs.defs.metric_template import testgrid_labelwise_encoding
from ipynb.fs.defs.metric_template import CephCommunication
from ipynb.fs.defs.metric_template import save_to_disk, read_from_disk
from dotenv import load_dotenv, find_dotenv

load_dotenv(find_dotenv())

True

In [3]:
## Specify variables
METRIC_NAME = "probability_to_fail"
# Specify the path for input grid data,
INPUT_DATA_PATH = "../../../../data/raw/testgrid_183.json.gz"

# Specify the path for output metric data
OUTPUT_DATA_PATH = f"../../../../data/processed/metrics/{METRIC_NAME}"

# Specify whether or not we are running this as a notebook or part of an automation pipeline.
AUTOMATION = os.getenv("True")

## CEPH Bucket variables
## Create a .env file on your local with the correct configs,
s3_endpoint_url = os.getenv(S3_ENDPOINT)
s3_access_key = os.getenv(S3_ACCESS_KEY)
s3_secret_key = os.getenv(S3_SECRET_KEY)
s3_bucket = os.getenv(S3_BUCKET)
s3_input_data_path = "raw_data"
metric_path = f"metrics/{METRIC_NAME}"

In [4]:
## Import data
timestamp = datetime.datetime.today()

if AUTOMATION:
    filename = f"testgrid_{timestamp.day}{timestamp.month}.json"
    cc = CephCommunication(s3_endpoint_url, s3_access_key, s3_secret_key, s3_bucket)
    s3_object = cc.s3_resource.Object(s3_bucket, f"{s3_input_data_path}/{filename}")
    file_content = s3_object.get()["Body"].read().decode("utf-8")
    testgrid_data = json.loads(file_content)

else:
    with gzip.open(INPUT_DATA_PATH, "rb") as read_file:
        testgrid_data = json.load(read_file)

## Calculation

In [5]:
failures_list = testgrid_labelwise_encoding(testgrid_data, 12, overall_only = False)

In [6]:
# Convert to dataframe
failures_df = pd.DataFrame(
    failures_list,
    columns=["timestamp", "tab", "grid", "test", "test_duration", "failure"],
)
failures_df.head()

Unnamed: 0,timestamp,tab,grid,test,test_duration,failure
0,2021-03-15 23:40:20,"""redhat-assisted-installer""",periodic-ci-openshift-release-master-nightly-4...,Overall,80.283333,False
1,2021-03-15 00:01:06,"""redhat-assisted-installer""",periodic-ci-openshift-release-master-nightly-4...,Overall,92.05,False
2,2021-03-13 20:51:32,"""redhat-assisted-installer""",periodic-ci-openshift-release-master-nightly-4...,Overall,80.983333,False
3,2021-03-13 07:51:20,"""redhat-assisted-installer""",periodic-ci-openshift-release-master-nightly-4...,Overall,117.716667,False
4,2021-03-13 06:43:20,"""redhat-assisted-installer""",periodic-ci-openshift-release-master-nightly-4...,Overall,108.633333,False


In [7]:
passing_list = testgrid_labelwise_encoding(testgrid_data, 1, overall_only = False)

In [8]:
# Convert to dataframe
passing_df = pd.DataFrame(
    passing_list,
    columns=["timestamp", "tab", "grid", "test", "test_duration", "passing"],
)
passing_df.head()

Unnamed: 0,timestamp,tab,grid,test,test_duration,passing
0,2021-03-15 23:40:20,"""redhat-assisted-installer""",periodic-ci-openshift-release-master-nightly-4...,Overall,80.283333,True
1,2021-03-15 00:01:06,"""redhat-assisted-installer""",periodic-ci-openshift-release-master-nightly-4...,Overall,92.05,True
2,2021-03-13 20:51:32,"""redhat-assisted-installer""",periodic-ci-openshift-release-master-nightly-4...,Overall,80.983333,True
3,2021-03-13 07:51:20,"""redhat-assisted-installer""",periodic-ci-openshift-release-master-nightly-4...,Overall,117.716667,True
4,2021-03-13 06:43:20,"""redhat-assisted-installer""",periodic-ci-openshift-release-master-nightly-4...,Overall,108.633333,True


In [9]:
combined_df = pd.merge(
    failures_df,
    passing_df,
    on=["timestamp", "tab", "grid", "test", "test_duration"],
)
combined_df.head()

Unnamed: 0,timestamp,tab,grid,test,test_duration,failure,passing
0,2021-03-15 23:40:20,"""redhat-assisted-installer""",periodic-ci-openshift-release-master-nightly-4...,Overall,80.283333,False,True
1,2021-03-15 00:01:06,"""redhat-assisted-installer""",periodic-ci-openshift-release-master-nightly-4...,Overall,92.05,False,True
2,2021-03-13 20:51:32,"""redhat-assisted-installer""",periodic-ci-openshift-release-master-nightly-4...,Overall,80.983333,False,True
3,2021-03-13 07:51:20,"""redhat-assisted-installer""",periodic-ci-openshift-release-master-nightly-4...,Overall,117.716667,False,True
4,2021-03-13 06:43:20,"""redhat-assisted-installer""",periodic-ci-openshift-release-master-nightly-4...,Overall,108.633333,False,True


### Calculate Probability to Fail using Calibrated Classification Model

In [10]:
## This helper function will add pass/fail labels to enable classification model.
def p_f_label(row):
    if row["passing"]:
        return 1
    elif row["failure"]:
        return 0
    else :
        return -1

In [11]:
## Create a sample df with a batch of 1000 to start with.
sample_df = combined_df[1:1000]
## Add the pass_or_fail label using the helper function `p_f_label`
sample_df["pass_or_fail"] = sample_df.apply(lambda row: p_f_label(row), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_df["pass_or_fail"] = sample_df.apply(lambda row: p_f_label(row), axis=1)


In [12]:
## For all the missing test_duration values, the label is -1 since it's NaN.
sample_df = sample_df.fillna(-1)
## Labels for multiclass classification 
sample_df.pass_or_fail.unique()

array([ 1,  0, -1])

In [13]:
## Create X = test_duration and y = pass_or_fail and calculate probabilty.
X, y = np.array([sample_df["test_duration"].values]), np.array([sample_df["pass_or_fail"].values])
X = X.reshape(-1,1)
y = y.reshape(-1,1)

# split into train/test sets.
trainX, testX, trainy, testy = train_test_split(X, y, train_size=0.8, test_size = 0.2,random_state=None)

# fit a Calibrated Classification Model.
model = SVC()
calibrated = CalibratedClassifierCV(model, method='sigmoid', cv=5)
calibrated.fit(trainX, trainy)

# predict probabilities.
probs_train = calibrated.predict_proba(trainX)[:, 1]
probs_test = calibrated.predict_proba(testX)[:, 1]
probs = np.append(probs_train, probs_test, axis=None)
sample_df["prob"] = probs

  return f(*args, **kwargs)


In [14]:
sample_df.head()

Unnamed: 0,timestamp,tab,grid,test,test_duration,failure,passing,pass_or_fail,prob
1,2021-03-15 00:01:06,"""redhat-assisted-installer""",periodic-ci-openshift-release-master-nightly-4...,Overall,92.05,False,True,1,0.081762
2,2021-03-13 20:51:32,"""redhat-assisted-installer""",periodic-ci-openshift-release-master-nightly-4...,Overall,80.983333,False,True,1,0.011232
3,2021-03-13 07:51:20,"""redhat-assisted-installer""",periodic-ci-openshift-release-master-nightly-4...,Overall,117.716667,False,True,1,0.011204
4,2021-03-13 06:43:20,"""redhat-assisted-installer""",periodic-ci-openshift-release-master-nightly-4...,Overall,108.633333,False,True,1,0.011371
5,2021-03-13 03:03:06,"""redhat-assisted-installer""",periodic-ci-openshift-release-master-nightly-4...,Overall,87.066667,False,True,1,0.011205


## Save results to Ceph or locally
* Use the following helper function to save the data frame in a parquet format on the Ceph bucket if we are running in automation, and locally if not.

In [15]:
# timestamp = datetime.datetime.now()

# if AUTOMATION:
#     cc = CephCommunication(s3_endpoint_url, s3_access_key, s3_secret_key, s3_bucket)
#     cc.upload_to_ceph(
#         combined_df.head(1000000),
#         s3_path,
#         f"{METRIC_NAME}/{METRIC_NAME}-{timestamp.year}-{timestamp.month}-{timestamp.day}.parquet",
#     )
# else:
#     save_to_disk(
#         combined_df.head(1000000),
#         OUTPUT_DATA_PATH,
#         f"{METRIC_NAME}-{timestamp.year}-{timestamp.month}-{timestamp.day}.parquet",
#     )

In [16]:
# ## Sanity check to see if the dataset is the same
# if AUTOMATION:
#     sanity_check = cc.read_from_ceph(
#         s3_path,
#         f"{METRIC_NAME}/{METRIC_NAME}-{timestamp.year}-{timestamp.month}-{timestamp.day}.parquet",
#     )
# else:
#     sanity_check = read_from_disk(
#         OUTPUT_DATA_PATH,
#         f"{METRIC_NAME}-{timestamp.year}-{timestamp.month}-{timestamp.day}.parquet",
#     )

# sanity_check