# Shape Tech-Test

Author: Jonatas Cesar 

In [None]:
import matplotlib.pyplot as plt
import pandas
import pandas as pd
import shap
from catboost import CatBoostClassifier, Pool
from pandas_profiling import ProfileReport
from sklearn.metrics import RocCurveDisplay, classification_report
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_sample_weight

In [None]:
#%matplotlib inline

In [None]:
# Load the equipment spreadsheet into a DataFrame, using its "Cycle" column
# as the row index.
df = pd.read_excel(
    io="O&G Equipment Data.xlsx",
    index_col="Cycle",
)

In [None]:
# Summary statistics, transposed so that each described column is shown as a row.
df.describe().transpose()

In [None]:
# Check for class imbalance: the mean of the "Fail" column.
# (This is the positive-class rate if Fail is a 0/1 or boolean flag - TODO confirm.)
df["Fail"].mean()

In [None]:
# Build a profiling report for the whole DataFrame, using the library's
# "minimal" configuration.
profile = ProfileReport(
    df,
    title="Shape Data - Profiling Report",
    minimal=True,
)

In [None]:
# Render the profiling report in the notebook as interactive widgets.
profile.to_widgets()

## Split Data

In [None]:
# Features are every column except the last; the target is the last column.
# NOTE(review): this relies on column order - presumably the last column is
# "Fail" (the column inspected for imbalance earlier); confirm against the data.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

In [None]:
# Hold out 20% of the rows for evaluation, with a fixed seed so the split is
# repeatable.
# The split is stratified on y so that the train and test sets keep the same
# class proportions. This notebook inspects the target for class imbalance, and
# an unstratified random draw on an imbalanced target can leave the test set
# with too few positive examples for stable ROC / classification metrics.
# NOTE(review): rows are indexed by "Cycle"; if cycles are time-ordered, a
# chronological split may be more appropriate than a shuffled one - confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

## Model Fit 

In [None]:
# CatBoost classifier created with the library's default hyperparameters
# (no arguments are overridden here).
model = CatBoostClassifier()

In [None]:
# Columns passed to CatBoost as categorical features (used both when fitting
# the model and when building the Pool for SHAP values).
cat_features = [
    "Preset_1",
    "Preset_2",
]

In [None]:
# Train the classifier on the training split, treating the listed columns as
# categorical; verbose=False turns off the training log output.
model.fit(X=X_train, y=y_train, cat_features=cat_features, verbose=False)

## Classification performance

In [None]:
# Predicted class probabilities for the test split; keep only column 1
# (the second class - presumably the positive "Fail" class; confirm label order).
test_probabilities = model.predict_proba(X_test)
y_pred_prob = test_probabilities[:, 1]

In [None]:
# Draw the ROC curve from the true test labels and the predicted scores
# (column 1 of the model's probability output), then render the figure.
RocCurveDisplay.from_predictions(y_test.values, y_pred_prob)
plt.show()

In [None]:
# Turn probabilities into hard predictions with a 0.5 cut-off.
# The result is a boolean array (True where the probability exceeds 0.5).
# NOTE(review): 0.5 is a fixed threshold; with an imbalanced target a different
# operating point may be preferable - confirm with the ROC curve.
y_pred = y_pred_prob > 0.5

In [None]:
# Print per-class precision / recall / F1 for the thresholded test predictions.
print(
    classification_report(
        y_true=y_test.values,
        y_pred=y_pred,
    )
)

## Feature importance 

In [None]:
# Explain the fitted model with SHAP: compute SHAP values on the training
# split (wrapped in a CatBoost Pool so the categorical columns are declared),
# then draw the summary plot against the training features.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(
    Pool(data=X_train, label=y_train, cat_features=cat_features)
)
shap.summary_plot(shap_values, features=X_train)