# Fraud, Waste and Abuse Classifier Walkthrough


Anomaly detection typically deals with highly imbalanced datasets. There are a few ways to address this imbalance. One approach is to upsampling the minority class using the SMOTE algorithm to generate synthetic minority samples (see 1st link).



### Relevant Links:

1. http://www.jmlr.org/papers/volume18/16-365/16-365.pdf
2. https://scikit-learn.org/stable/modules/ensemble.html#adaboost
3. https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html
4. https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html


What follows is the source code comprising the FWA classifier. The codebase is maintained under version control, a working copy of which can be checkout by running the following command (requires that Git be installed, which can be downloaded [here](https://git-scm.com/downloads)):

```
$ git clone file:///s/GLTCVAL.W/Repos/FWA.git
```



### Pre-Processing


In [43]:
"""
Read in FWA training dataset. Update PATH variable to point
to local working copy's `TRAINING.csv`.
"""
import datetime
import os
import os.path
import re
import sys
import time
import uuid
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
import importlib.util
from sklearn import metrics

from sklearn import preprocessing
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import  Pipeline
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.ensemble import (
    RandomForestClassifier, AdaBoostClassifier, VotingClassifier,
    GradientBoostingClassifier, ExtraTreesClassifier
    )
pd.options.mode.chained_assignment = None # "warn"
pd.set_option('display.max_columns', 1000)
pd.set_option('display.width', 500)
np.set_printoptions(
    edgeitems=5, linewidth=200, suppress=True, nanstr='NaN',
    infstr='Inf', precision=5
    )

CSV_PATH = "C:\\Users\\cac9159\\Repos\\LTC\\FWA\\Datasets\\TRAINING.csv"
dfinit = pd.read_csv(DATA_PATH, sep=",", error_bad_lines=False)

keycols   = ["CLAIM_NUMBER", "POLICY_NUMBER", "RESIDENT_STATE", "FRAUD_INDICATOR"]
cont_vars = ["PAID_AMOUNT", "DLR_AMT"]
cat_vars  = ["ATTAINED_AGE_BANDED", "BENEFIT_PERIOD", "BENEFIT_TRIGGER_OPTIONS", "COLI", 
             "DAILY_BENEFIT_INFL_BANDED", "DUAL_WAIVER", "INDEMNITY_VS_EXPENSE_INCURRED", 
             "LINKED_POLICY_INDICATOR", "MAX_REPEATED_CALLS", "PREMIUM_PAYMENT_MODE", 
             "PREMIUM_WAIVED", "RESTORATION_OF_BENEFITS", "SITUS_CURRENT", 
             "TAX_QUALIFIED_STATUS", "UNDERWRITING_CLASS", "ELIM_PERIOD_BANDED",]

df = dfinit[keycols + cont_vars + cat_vars]
nrows, ncols = df.shape[0], df.shape[1]


# Assign unique identifier to each record in sample cohort.
df["ID"] = dforiginal["POLICY_NUMBER"].astype(str) + "-" + dforiginal["CLAIM_NUMBER"].astype(str)

lb = LabelBinarizer()
df["FRAUD_INDICATOR"] = lb.fit_transform(df["FRAUD_INDICATOR"])

dffeatures = df.drop(labels=["ID", "FRAUD_INDICATOR"], axis=1)
response = df["FRAUD_INDICATOR"]
datindex = df["ID"]


# Separate training features from response; partition training and test data.
Xtrain, Xtest, ytrain, ytest, indxtrain, indxtest = \
    train_test_split(
        dffeatures, response, datindex, test_size=.25,
        random_state=RANDOM_STATE
        )


# Impute missing categorical or continuous values ============================]
catimp  = SimpleImputer(missing_values=MISSING_CAT_STR, strategy=IMPUTE_STRATEGY)
contimp = SimpleImputer(missing_values=np.NaN, strategy=IMPUTE_STRATEGY)
dfcont, dfcat = Xtrain[cont_vars], Xtrain[cat_vars]

dfcontimp = contimp.fit_transform(dfcont)
dfcatimp  = catimp.fit_transform(dfcat)


# Scale continuous features to eliminate magnitude bias ======================]
# Standardization of a dataset is a common requirement for many machine
# learning estimators: they might behave badly if the individual features
# do not more or less look like standard normally distributed data.
std_scaler  = StandardScaler()
cont_df_scl = pd.DataFrame(
    std_scaler.fit_transform(dfcontimp), columns=dfcont.columns
    )

# One-hot encode categorical features.
cat_df_enc = pd.get_dummies(
    pd.DataFrame(dfcatimp, columns=dfcat.columns), drop_first=True
    )

Xtrain = pd.concat([cont_df_scl, cat_df_enc], axis=1)

# Transform test data using same objects from training data ==================]
# NOTE: Call only `transform` on test dataset.
_cont_df2, _cat_df2 = Xtest[cont_vars], Xtest[cat_vars]
cont_df_imp2 = contimp.transform(_cont_df2)
cat_df_imp2  = catimp.transform(_cat_df2)

# Scale continuous features to eliminate magnitude bias.
cont_df_scl2 = pd.DataFrame(
    std_scaler.transform(cont_df_imp2), columns=dfcont.columns
    )

# One-hot encode categorical features.
cat_df_enc2 = pd.get_dummies(
    pd.DataFrame(cat_df_imp2, columns=dfcat.columns), drop_first=True
    )

Xtest = pd.concat([cont_df_scl2, cat_df_enc2], axis=1)


##### Modeling ################################################################

scoring_metrics = ["accuracy", "precision", "recall", "f1_micro", "f1_macro", "roc_auc"]


In [47]:
# LogisticRegression Classifier ==============================================]
# Initialize default LogisticRegression model. Then perform grid search
# to identify optimal parameters w.r.t. "recall".




model0 = LogisticRegression()

param_grid = [{
    "fit_intercept":[True, False],
    "solver"       :["newton-cg", "lbfgs", "saga"],
    "C"            :[1/10000, 1/1000, 1/100, 1/10, 1, 10, 100, 1000, 10000],
    }]


# Evaluate performance of optimal model on test data.
grid_search = GridSearchCV(model0, param_grid, cv=5, scoring=scoring_metrics[2], verbose=0)
grid_search.fit(Xtrain, ytrain)
bestparams0 = grid_search.best_params_
bestmodel0  = grid_search.best_estimator_


# Apply bestmodel0 to out-of-sample test data `Xtest`.
# model0_pred = bestmodel0.predict(Xtest)
# model0_prob = bestmodel0.predict_proba(Xtest)


# # Asses performance of model0.
# print("Accuracy : {}".format(metrics.accuracy_score(ytest, model0_pred)))
# print("Precision: {}".format(metrics.precision_score(ytest, model0_pred, average="weighted")))
# print("Recall   : {}".format(metrics.recall_score(ytest, model0_pred, average="weighted")))
# print("f1-score : {}".format(metrics.f1_score(ytest, model0_pred, average="weighted")))
# print("ROC-AUC  : {}".format(metrics.roc_auc_score(ytest, model0_pred))
# print(confusion_matrix(ytest, model0_pred))








In [56]:
type(Xtest)

pandas.core.frame.DataFrame

In [60]:
Xtest = Xtrain.iloc[1000:2555,:]
ytest = ytrain[1000:2555]

In [59]:
Xtest.shape

(1555, 55)

In [61]:
# Evaluate performance of optimal model on test data.
# grid_search = GridSearchCV(model0, param_grid, cv=5, scoring=scoring_metrics[2], verbose=0)
# grid_search.fit(Xtrain, ytrain)
# bestparams0 = grid_search.best_params_
# bestmodel0  = grid_search.best_estimator_


# Apply bestmodel0 to out-of-sample test data `Xtest`.
model0_pred = bestmodel0.predict(Xtest)
model0_prob = bestmodel0.predict_proba(Xtest)


# # Asses performance of model0.
print("Accuracy : {}".format(metrics.accuracy_score(ytest, model0_pred)))
print("Precision: {}".format(metrics.precision_score(ytest, model0_pred, average="weighted")))
print("Recall   : {}".format(metrics.recall_score(ytest, model0_pred, average="weighted")))
print("f1-score : {}".format(metrics.f1_score(ytest, model0_pred, average="weighted")))
print("ROC-AUC  : {}".format(metrics.roc_auc_score(ytest, model0_pred)))
print(confusion_matrix(ytest, model0_pred))


Accuracy : 0.9890675241157556
Precision: 0.9891872741737661
Recall   : 0.9890675241157556
f1-score : 0.9850573636683075
ROC-AUC  : 0.575
[[1535    0]
 [  17    3]]


In [None]:
# RandomForest Classifier ====================================================]
model1 = RandomForestClassifier()


param_grid = [{
    "n_estimators":[10, 25, 50, 75, 100, 150, 200, 250, 500],
    "max_depth"   :[None, 2, 5, 10, 15],
    "criterion"   :["gini", "entropy"],
    "bootstrap"   :[True, False],
    "warm_start"  :[True, False],
    }]


# Evaluate performance of optimal model on test data.
grid_search = GridSearchCV(model1, param_grid, cv=5, scoring=scoring_metrics[2], verbose=1)
grid_search.fit(Xtrain, ytrain)
bestparams1 = grid_search.best_params_
bestmodel1  = grid_search.best_estimator_


# Apply bestmodel0 to out-of-sample test data `Xtest`.
model1_pred = bestmodel1.predict(Xtest)
model1_prob = bestmodel1.predict_proba(Xtest)


# Asses performance of model1.
print("Accuracy : {}".format(metrics.accuracy_score(ytest, model1_pred)))
print("Precision: {}".format(metrics.precision_score(ytest, model1_pred, average="weighted")))
print("Recall   : {}".format(metrics.recall_score(ytest, model1_pred, average="weighted")))
print("f1-score : {}".format(metrics.f1_score(ytest, model1_pred, average="weighted")))
print("ROC-AUC  : {}".format(metrics.roc_auc_score(ytest, model1_pred))
print(confusion_matrix(ytest, model1_pred))

# Difference between RandomForestClassifier and ExtraTreesClassifier:
# ExtraTreesClassifier always tests random splits over fraction of features.
# RandomForestClassifier tests all possible splits over fraction of features.


# RandomForestClassifier can produce feature importances.
importances  = bestmodel1.feature_importances_
stddev       = np.std([tree.feature_importances_ for tree in bestmodel1.estimators_], ddof=1)
feature_indx = np.argsort(importances)[::-1]

print("model1 feature ranking: ")
for f in range(Xtrain.shape[1]):
    print("{}. feature {} ({})".format(f + 1, feature_indx[f], importances[feature_indx[f]]))

plt.figure()
plt.title("Feature Importances")
plt.bar(range(Xtrain.shape[1]), importances[feature_indx], color="r",
        yerr=stddev[feature_indx], align="center")
plt.xlim([-1, Xtrain.shape[1]])
plt.show()


# AdaBoost Classifier ========================================================]
model2 = AdaBoostClassifier()


param_grid = [{
    "algorithm"    :["SAMME", "SAMME.R"],
    "n_estimators" :[10, 50, 100, 150, 200],
    "learning_rate":[.50, 1.0, 1.5],
    }]


# Evaluate performance of optimal model on test data.
grid_search = GridSearchCV(model2, param_grid, cv=5, scoring=scoring_metrics[2], verbose=1)
grid_search.fit(Xtrain, ytrain)
bestparams1 = grid_search.best_params_
bestmodel2  = grid_search.best_estimator_


# Apply bestmodel2 to out-of-sample test data `Xtest`.
model2_pred = bestmodel2.predict(Xtest)
model2_prob = bestmodel2.predict_proba(Xtest)


# Asses performance of model2.
print("Accuracy : {}".format(metrics.accuracy_score(ytest, model2_pred)))
print("Precision: {}".format(metrics.precision_score(ytest, model2_pred, average="weighted")))
print("Recall   : {}".format(metrics.recall_score(ytest, model2_pred, average="weighted")))
print("f1-score : {}".format(metrics.f1_score(ytest, model2_pred, average="weighted")))
print("ROC-AUC  : {}".format(metrics.roc_auc_score(ytest, model2_pred))
print(confusion_matrix(ytest, model2_pred))



# Multi-Layer Perceptron Classifier ==========================================]
model3 = MLPClassifier()

param_grid = [{
    "activation":["identity", "logistic", "tanh", "relu"],
    "solver"    :["lbgfs", "sgd", "adam"],
    "warm_start":[True, False],
    "momentum"  :[.1, .25, .50, .75, .9, .99],
    }]

# Evaluate performance of optimal model on test data.
grid_search = GridSearchCV(model3, param_grid, cv=5, scoring=scoring_metrics[2], verbose=1)
grid_search.fit(Xtrain, ytrain)
bestparams1 = grid_search.best_params_
bestmodel3  = grid_search.best_estimator_


# Apply bestmodel3 to out-of-sample test data `Xtest`.
model3_pred = bestmodel3.predict(Xtest)
model3_prob = bestmodel3.predict_proba(Xtest)


# Asses performance of model3.
print("Accuracy : {}".format(metrics.accuracy_score(ytest, model3_pred)))
print("Precision: {}".format(metrics.precision_score(ytest, model3_pred, average="weighted")))
print("Recall   : {}".format(metrics.recall_score(ytest, model3_pred, average="weighted")))
print("f1-score : {}".format(metrics.f1_score(ytest, model3_pred, average="weighted")))
print("ROC-AUC  : {}".format(metrics.roc_auc_score(ytest, model3_pred))
print(confusion_matrix(ytest, model3_pred))



# Gradient Boosting Classifier ===============================================]
model4 = GradientBoostingClassifier()

param_grid = [{
    "loss"        :["deviance", "exponential"],
    "n_estimators":[50, 100, 250, 500, 1000],
    "subsample"   :[.01, .25, .50, .75, .99, 1.0],
    "warm_start"  :[True, False],
    }]

# Evaluate performance of optimal model on test data.
grid_search = GridSearchCV(model4, param_grid, cv=5, scoring=scoring_metrics[2], verbose=1)
grid_search.fit(Xtrain, ytrain)
bestparams1 = grid_search.best_params_
bestmodel4  = grid_search.best_estimator_


# Apply bestmodel4 to out-of-sample test data `Xtest`.
model4_pred = bestmodel4.predict(Xtest)
model4_prob = bestmodel4.predict_proba(Xtest)


# Asses performance of model4.
print("Accuracy : {}".format(metrics.accuracy_score(ytest, model4_pred)))
print("Precision: {}".format(metrics.precision_score(ytest, model4_pred, average="weighted")))
print("Recall   : {}".format(metrics.recall_score(ytest, model4_pred, average="weighted")))
print("f1-score : {}".format(metrics.f1_score(ytest, model4_pred, average="weighted")))
print("ROC-AUC  : {}".format(metrics.roc_auc_score(ytest, model4_pred))
print(confusion_matrix(ytest, model4_pred))




# Final classification with VotingClassifier =================================]
voting_model = VotingClassifier(
    estimators=[("lr", bestmodel0), ("rf", bestmodel1), ("ada", bestmodel2),
                ("mlp", bestmodel3), ("gb", bestmodel4)],
    voting="hard",
    )

voting_model_pred = voting_model.predict(Xtest)
voting_model_prob = voting_model.predict_proba(Xtest)


# Asses performance of voting_model.
print("Accuracy : {}".format(metrics.accuracy_score(ytest, voting_model_pred)))
print("Precision: {}".format(metrics.precision_score(ytest, voting_model_pred, average="weighted")))
print("Recall   : {}".format(metrics.recall_score(ytest, voting_model_pred, average="weighted")))
print("f1-score : {}".format(metrics.f1_score(ytest, voting_model_pred, average="weighted")))
print("ROC-AUC  : {}".format(metrics.roc_auc_score(ytest, voting_model_pred))
print(confusion_matrix(ytest, voting_model_pred))