In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,confusion_matrix,accuracy_score,recall_score,precision_score,classification_report,roc_auc_score
import shap
import catboost

In [2]:
# Load the raw dataset from the Excel workbook next to the notebook.
# Later cells read the label from its `Class` column.
df2 = pd.read_excel(io=r'fraud.xlsx')

In [None]:
# Train a CatBoost classifier on the `Class` label of `df2` and report its
# performance on a held-out test set.
#
# The data is split three ways:
#   * X_fit / y_fit   - rows the trees are actually fitted on
#   * X_val / y_val   - used only for early stopping / best-iteration selection
#   * X_test / y_test - untouched until the final evaluation below
# Selecting the best iteration on the test set itself (as was done before)
# leaks test information into the model and inflates the reported metrics.

SEED = 42  # fixed so that Restart & Run All reproduces the same splits and model

features = df2.drop(['Class'], axis=1)
y = df2['Class'].values
X = features.values

# Stratify so every split keeps the overall class ratio; presumably the
# positive class is rare here, and an unstratified split could leave a fold
# with almost no positives. NOTE: stratification needs at least two rows of
# each class in the data being split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=SEED
)
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train, random_state=SEED
)

print(len(y_train), y_train.sum(), len(y_test), y_test.sum())
print((X_train.shape), (y_train.shape), (X_test.shape), (y_test.shape))
print('fit/validation rows:', len(y_fit), len(y_val))

# Positional indices of the object-dtype columns, which CatBoost should treat
# as categorical. The builtin `object` is used because the `np.object` alias
# was removed in NumPy 1.24.
categorical_features = np.where(features.dtypes == object)[0]

params = {
    'learning_rate': 0.1,
    'cat_features': categorical_features,
    'depth': 6,
    'eval_metric': 'AUC',
    'verbose': 200,        # log every 200th iteration
    'od_type': "Iter",     # overfit detector
    'od_wait': 500,        # iterations to wait after the best one before stopping
    'random_seed': SEED,
}

cat_model = catboost.CatBoostClassifier(**params)
cat_model.fit(
    X_fit, y_fit,
    eval_set=(X_val, y_val),  # validation fold, NOT the test set
    use_best_model=True,      # drop trees built after the best validation iteration
    plot=True,
);

# Final evaluation on rows the model has never seen in any form.
pred = cat_model.predict(X_test)

print(classification_report(y_test, pred))

print('test-set confusion matrix:\n')
confusion_matrix_df = pd.DataFrame(
    confusion_matrix(y_test, pred),
    index=['actual 0', 'actual 1'],
    columns=['Predicted 0', 'Predicted 1'],
)
print(confusion_matrix_df)

In [None]:
# Persist the fitted classifier to a file named 'fraud' (no extension) in the
# working directory, using CatBoost's default binary format.
cat_model.save_model(fname='fraud', format='cbm')

In [None]:
# Compute SHAP explanations for the fitted model on X_test and export them.
# Two workbooks are written: one with the SHAP values and one with the input
# values they were computed from, both labelled with the feature column names.
feature_names = df2.drop(['Class'], axis=1).columns

explainer = shap.Explainer(cat_model)
shap_values = explainer(X_test)

# Identical column labels on both frames so the two exports line up
# cell-for-cell.
shapvaluedf = pd.DataFrame(shap_values.values, columns=feature_names)
shapdatadf = pd.DataFrame(shap_values.data, columns=feature_names)

shapdatadf.to_excel(r'shapdatadf.xlsx')
shapvaluedf.to_excel(r'shapvaluedf.xlsx')