In [None]:
import pandas as pd
import numpy as np
import time
from pandas_profiling import ProfileReport
from matplotlib import pyplot
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline 
from sklearn.metrics import roc_auc_score,precision_recall_curve, classification_report, confusion_matrix, auc


1. Get dataset 

In [None]:
 
# Location of the credit-card transactions CSV on the local machine.
file_path = 'C:/temp/dataset/data/creditcard.csv'
# A comma is pandas' default delimiter, so it does not need to be passed explicitly.
df = pd.read_csv(file_path)

 2. Exploratory Data Analysis

In [None]:
 # Expect 31 columns and 284807 rows (492 fraud + 284315 non-fraud, per the class counts noted further down)
df.shape

In [None]:
 # Preview the first 5 rows of the dataset
df.head()

In [None]:
# Preview the last 5 rows of the dataset
df.tail()

In [None]:
# The 'Time' column is the elapsed time of each transaction; it is removed because it is not used as a model feature.
# Columns V1-V28 are (per the dataset description) customer information already transformed via PCA.
# 'Amount' is on a very different scale from the PCA components, so it is standardized.
# NOTE(review): the scaler is fitted on the full dataset before the train/test split, so test-set
# statistics leak into the scaling; consider fitting it on the training split only.
# errors='ignore' keeps this cell re-runnable once 'Time' has already been dropped.
df = df.drop(columns='Time', errors='ignore')
scaler = StandardScaler()
# Reuse the scaler created above instead of instantiating a second, throw-away one.
df[['Amount']] = scaler.fit_transform(df[['Amount']])

In [None]:
# Show a single row to check the frame after dropping 'Time' and rescaling 'Amount'
df.head(1)

In [None]:
# Count missing values per column (sum must be *called*; without the parentheses the cell
# only displays the bound method). The author's run reported no missing values.
df.isnull().sum()

In [None]:
# Check the frequency count of fraud vs. non-fraud; the classes are highly imbalanced.
# There are 492 fraudulent transactions and 284315 non-fraudulent transactions.
print(df['Class'].value_counts())

In [None]:
#  Define function to prepare data

def prep_data(df):
    """Split a frame into a float feature matrix and a float label vector.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns at positions 0-28 are the features and whose
        column at position 29 is the label.

    Returns
    -------
    X : numpy.ndarray
        Columns 0-28 of ``df`` cast to float.
    y : numpy.ndarray
        Column 29 of ``df`` cast to float.
    """
    # The np.float alias was removed in NumPy 1.24; the builtin float is the
    # same dtype (float64) and works on every NumPy version.
    X = df.iloc[:, 0:29]
    X = np.array(X).astype(float)
    y = df.iloc[:, 29]
    y = np.array(y).astype(float)
    return X, y

In [None]:
# Build the NumPy feature matrix X and label vector y from the cleaned frame
X, y = prep_data(df)

In [None]:
# Define a function to create a scatter plot of the data and labels
def plot_data(X, y):
    """Scatter the first two feature columns, one series per class label.

    ``X`` is indexed as a 2-D array (columns 0 and 1 are plotted) and ``y``
    as a matching 1-D label array compared against 0 and 1. Rows labelled 1
    are drawn in red. Returns whatever ``plt.show()`` returns.
    """
    is_class_0 = y == 0
    is_class_1 = y == 1
    # Marker styling shared by both series.
    shared_style = dict(alpha=0.5, linewidth=0.15)
    plt.scatter(X[is_class_0, 0], X[is_class_0, 1], label="Class #0", **shared_style)
    plt.scatter(X[is_class_1, 0], X[is_class_1, 1], label="Class #1", c='r', **shared_style)
    plt.legend()
    return plt.show()

    

In [None]:
# Scatter the first two feature columns by class; fraud and non-fraud are highly imbalanced
plot_data(X, y)

In [None]:
# Check the distribution of the input variables. In the author's run most of the PCA components looked Gaussian.
# drop the target variable
data = df.drop(['Class'], axis=1)
# create a histogram plot of each numeric variable; hist() returns the grid of Axes,
# kept under its own name instead of overwriting the feature frame
axes = data.hist(bins=100)
# disable axis labels to avoid the clutter
for axis in axes.flatten():
    axis.set_xticklabels([])
    axis.set_yticklabels([])
# render once, after every subplot has had its tick labels cleared
# (calling show() inside the loop displayed the figure after only the first subplot was cleaned)
pyplot.show()



3. Split data into train and test

In [None]:
# Predictors: every column except the target
X = df.drop(columns=['Class'])
# Target: the 'Class' column
y = df['Class']

In [None]:
# Hold out 30% of the rows for testing. stratify=y keeps the (very small) fraud ratio
# the same in both splits, so the test metrics are not distorted by an unlucky draw.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0, stratify = y)

4. Define the candidate ML classifiers

In [None]:
# Logistic Regression with library defaults; random_state is fixed for reproducibility
model_lr = LogisticRegression(random_state=1)

In [None]:
# Random Forest with 100 trees; random_state is fixed for reproducibility
model_rf = RandomForestClassifier(n_estimators = 100, random_state=1)

In [None]:
# K-Nearest Neighbors with the library's default settings
model_knn = KNeighborsClassifier()

In [None]:
# Support Vector Machine; probability=True is required because get_model_result calls predict_proba
model_svc = SVC(probability=True,random_state=1)

In [None]:
# Gaussian Naive Bayes with the library's default settings
model_nb = GaussianNB()

In [None]:
# Extreme Gradient Boosting (XGBoost); random_state is fixed for reproducibility
model_xgb = XGBClassifier(random_state=1)

5. Get model results

In [None]:
# Define a function to calculate the area under the precision-recall curve
def pr_auc(y_true, probas_pred):
    """Return the area under the precision-recall curve.

    ``y_true`` and ``probas_pred`` are passed straight to
    ``precision_recall_curve``; the resulting curve is integrated with
    ``auc`` using recall as the x-axis and precision as the y-axis.
    """
    precision, recall, _thresholds = precision_recall_curve(y_true, probas_pred)
    return auc(recall, precision)

In [None]:
# Define a function to print ROC, classification report, confusion matrix and AUC
def get_model_result(X_train, y_train, X_test, y_test, model):
    """Fit ``model`` on the training split and print its test-split metrics.

    Prints, in order: the ROC AUC, the classification report, the confusion
    matrix and the precision-recall AUC (via ``pr_auc``). The two AUC values
    are computed from the second column of ``model.predict_proba``; the
    report and the confusion matrix use ``model.predict``. Returns None.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Column 1 of predict_proba is the score used for both AUC metrics.
    scores = model.predict_proba(X_test)[:, 1]

    roc_value = roc_auc_score(y_test, scores)
    print('The ROC_AUC_SCORE is:\n', roc_value)

    report = classification_report(y_test, y_pred)
    print('The Classification report:\n', report)

    matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
    print('The Confusion matrix:\n', matrix)

    pr_value = pr_auc(y_test, scores)
    print('The AUC is :\n', pr_value)


In [None]:
# Time fit + predict + metric reporting for Logistic Regression.
# perf_counter is a monotonic, high-resolution clock intended for measuring intervals;
# time.time() follows the wall clock and can jump if the system time is adjusted mid-run.
start = time.perf_counter()
get_model_result(X_train, y_train, X_test, y_test, model_lr)
end = time.perf_counter()
print('The total Execution Time for LR is : \n',  end - start)
   

In [None]:
# Time fit + predict + metric reporting for Random Forest.
# perf_counter is a monotonic, high-resolution clock intended for measuring intervals;
# time.time() follows the wall clock and can jump if the system time is adjusted mid-run.
start = time.perf_counter()
get_model_result(X_train, y_train, X_test, y_test, model_rf)
end = time.perf_counter()
print('The total Execution Time for RF is : \n',  end - start)

In [None]:
# Time fit + predict + metric reporting for K-Nearest Neighbors.
# perf_counter is a monotonic, high-resolution clock intended for measuring intervals;
# time.time() follows the wall clock and can jump if the system time is adjusted mid-run.
start = time.perf_counter()
get_model_result(X_train, y_train, X_test, y_test, model_knn)
end = time.perf_counter()
print('The total Execution Time for KNN is : \n',  end - start)

In [None]:
# Time fit + predict + metric reporting for the Support Vector Machine.
# perf_counter is a monotonic, high-resolution clock intended for measuring intervals;
# time.time() follows the wall clock and can jump if the system time is adjusted mid-run.
start = time.perf_counter()
get_model_result(X_train, y_train, X_test, y_test, model_svc)
end = time.perf_counter()
print('The total Execution Time for SVC is : \n',  end - start)

In [None]:
# Time fit + predict + metric reporting for Gaussian Naive Bayes.
# perf_counter is a monotonic, high-resolution clock intended for measuring intervals;
# time.time() follows the wall clock and can jump if the system time is adjusted mid-run.
start = time.perf_counter()
get_model_result(X_train, y_train, X_test, y_test, model_nb)
end = time.perf_counter()
print('The total Execution Time for NB is : \n',  end - start)

In [None]:
# Time fit + predict + metric reporting for XGBoost.
# perf_counter is a monotonic, high-resolution clock intended for measuring intervals;
# time.time() follows the wall clock and can jump if the system time is adjusted mid-run.
start = time.perf_counter()
get_model_result(X_train, y_train, X_test, y_test, model_xgb)
end = time.perf_counter()
print('The total Execution Time for XGB is : \n',  end - start)

6. Based on the models' performance metrics, we combine the top three classifiers (RF, KNN and XGB) into an ensemble model

In [None]:
# Define the ensemble model based on soft voting over the three selected classifiers
top_estimators = [
    ('rf', model_rf),
    ('knn', model_knn),
    ('xgb', model_xgb),
]
model_ensemble = VotingClassifier(estimators=top_estimators, voting='soft')


In [None]:
# Get performance metrics for the ensemble model. In the author's run it predicted more true positives
# and fewer false positives, with high recall and AUC.
# However, XGB alone is almost competitive with the ensemble model and takes less time.
# perf_counter is a monotonic, high-resolution clock intended for measuring intervals;
# time.time() follows the wall clock and can jump if the system time is adjusted mid-run.
start = time.perf_counter()
get_model_result(X_train, y_train, X_test, y_test, model_ensemble)
end = time.perf_counter()
print('The total Execution Time for Ensembled model is : \n',  end - start)

7. Compare this ensemble model with XGB combined with SMOTE (Synthetic Minority Over-sampling Technique) on this imbalanced dataset

In [None]:
# Rebuild the features X and labels y as NumPy arrays from df
# (this overwrites the X / y that were defined before the train/test split)
X, y = prep_data(df)


In [None]:
# Define SMOTE; the seed makes the synthetic minority samples reproducible across runs
method = SMOTE(random_state=1)

In [None]:
# Create the resampled feature set.
# fit_resample is the current imbalanced-learn API; the old fit_sample alias has been removed.
X_resampled, y_resampled = method.fit_resample(X, y)

In [None]:
# Plot the resampled data; the minority class is now much more prominently visible.
plot_data(X_resampled, y_resampled)

In [None]:
# Print the class counts of the original labels y.
# Series.value_counts replaces the deprecated top-level pd.value_counts.
print(pd.Series(y).value_counts())


In [None]:
# Print the class counts after resampling; both classes now have the same number of rows.
# Series.value_counts replaces the deprecated top-level pd.value_counts.
print(pd.Series(y_resampled).value_counts())


In [None]:
# Define the resampling method (seeded so the synthetic samples are reproducible)
resampling = SMOTE(random_state=1)

In [None]:
# Apply the sampling method to the training data only (the test split stays untouched).
# Uses the seeded `resampling` sampler so the result is reproducible, and fit_resample,
# the current imbalanced-learn API (the old fit_sample alias has been removed).
X_resampled, y_resampled = resampling.fit_resample(X_train, y_train)

In [None]:
# Build an imbalanced-learn pipeline that chains SMOTE and the XGB model together
pipeline = Pipeline([('SMOTE', resampling), ('XGB', model_xgb)])

In [None]:
# Fit the SMOTE + XGB pipeline.
# The pipeline's own SMOTE step oversamples during fit, so it is given the raw training split
# rather than data that was already resampled outside the pipeline.
pipeline.fit(X_train, y_train)

In [None]:
# Predict class labels for the held-out test split
predicted = pipeline.predict(X_test)

In [None]:
# Get the predicted class probabilities for the test split (column 1 is used for the AUC metrics below)
probs = pipeline.predict_proba(X_test)

In [None]:
# Get the performance results.
# get_model_result fits the pipeline on the raw training split (SMOTE runs inside the pipeline),
# predicts on the test split and prints the same four metrics as for every other model.
# Timing this call makes the reported duration cover fit + predict, so it is comparable with the
# other models' timings; previously only the metric printing was timed.
start = time.perf_counter()
get_model_result(X_train, y_train, X_test, y_test, pipeline)
end = time.perf_counter()
print('The total Execution Time for SMOTE Combined with XGB is : \n',  end - start)

We can see that this model predicts more true positives (TP) and more false positives (FP), with almost the same AUC as the ensemble model and XGB alone. So overall, the ensemble model is the best choice if training time is not a concern; otherwise, XGB alone could be used.