In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
from pathlib import Path
from datetime import datetime as dt
import time, os
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score, roc_auc_score, roc_curve, balanced_accuracy_score
from sklearn.preprocessing import StandardScaler, TargetEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC 
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
#import custom functions for feature engineering and parameter tuning
from src.fraud import gen_cm_cr, get_distance, get_region, print_dataframe, refit_strategy
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Importing the dataset from a path relative to the notebook's working directory.
# NOTE(review): despite the "test" in the file name, this frame is the complete
# dataset for this notebook; it is split into train/test sets further down.
fraud_test_df = pd.read_csv('Resources/fraud test.csv')

In [None]:
# Reviewing the distribution of the target class.
# Labels are derived from the 0/1 class value itself rather than from the order in
# which value_counts() happens to return the rows, so the bars cannot be mislabelled
# if the ordering ever changes.
class_labels = {0: 'Legitimate', 1: 'Fraudulent'}
ycounts = pd.DataFrame(fraud_test_df['is_fraud'].value_counts())
ycounts['Class'] = ycounts.index.map(class_labels)
ycounts.set_index('Class', inplace = True)
# 'value' is the name plotly gives to the value column of wide-form input.
fig = px.bar(ycounts, title='Distribution of the Target Class for the Transactions', text='value')
fig.show()

Methods for dealing with the high imbalance in the target class will be of primary importance in this analysis

# Preprocessing and Feature Engineering
A number of additional features were extracted from the data as part of feature engineering, in particular:  
1. Cardholder 'Age' was extracted from the DOB feature.
2. Distance_km: the distance between the card holder location ('lat', 'long') and the merchant location ('merch_lat' and 'merch_long')
3. The number of 'Job' categories was reduced to remove classes with very few candidates.
4. A feature called 'region' was added, mapping each state to its region as defined by the U.S. Bureau of Economic Analysis, so that the states are aggregated into a small number of regional categories.
5. The 'amt' feature (transaction amount) was reviewed and found to have very high dispersion.  

In [None]:
# Moving the 'Unnamed: 0' column to the index and renaming it 'ID'.
# verify_integrity is passed as a real boolean (the original passed the string 'True',
# which only worked because a non-empty string is truthy); it makes set_index raise
# if the new index contains duplicates.
# NOTE: this cell mutates fraud_test_df in place, so it can only be run once per
# kernel session (a second run finds no 'ID' column).
fraud_test_df.rename(columns={'Unnamed: 0': 'ID'}, inplace=True)
fraud_test_df.set_index('ID', inplace=True, drop=True, verify_integrity=True)

In [None]:
# Creating the cardholder age feature.
# Both date columns are parsed with explicit day-first formats.
fraud_test_df['date_dob'] = pd.to_datetime(fraud_test_df['dob'], format='%d/%m/%Y')
fraud_test_df['dt_trans_date_time'] = pd.to_datetime(fraud_test_df['trans_date_trans_time'], format='%d/%m/%Y %H:%M')
# Age is measured at the time of the transaction rather than relative to today's date:
# the original used dt.today(), which made the feature change with the day the notebook
# was run and did not describe the cardholder's age when the transaction happened.
# 365.25 days per year accounts for leap years on average.
fraud_test_df['age_years'] = (
    fraud_test_df['dt_trans_date_time'] - fraud_test_df['date_dob']
) / pd.Timedelta('365.25 days')


In [None]:
# The transaction amount spans a very wide range, so a log-transformed copy of the
# column is added; violin plots show the distribution before and after.
fig = px.violin(fraud_test_df['amt'], box=True, orientation='h', title='Transaction Amount')
fig.show()

# Small constant added before taking the log (presumably to keep log() finite for
# zero amounts -- confirm whether zero amounts occur in the data).
myEpsilon = 0.001
shifted_amount = fraud_test_df['amt'] + myEpsilon
fraud_test_df['log_amount'] = np.log(shifted_amount)

fig1 = px.violin(fraud_test_df['log_amount'], box=True, orientation='h', title='Log of Transaction Amount')
fig1.show()

In [None]:
# Creating the cardholder-to-merchant distance feature and the economic region feature
# derived from the state abbreviation.
# Each column is built in a single pass and assigned once, instead of iterating with
# iterrows() and writing cell by cell with .at, which is very slow on large frames.
# get_distance is still called once per row with the same four coordinates, and the
# [0][1] element of its result is kept, exactly as before.
# NOTE(review): the structure of get_distance's return value is defined in src.fraud
# and is not visible here.
fraud_test_df['distance_km'] = [
    get_distance(card_lat, card_long, merch_lat, merch_long)[0][1]
    for card_lat, card_long, merch_lat, merch_long in zip(
        fraud_test_df['lat'],
        fraud_test_df['long'],
        fraud_test_df['merch_lat'],
        fraud_test_df['merch_long'],
    )
]
# get_region is applied to every state value, as in the original loop.
fraud_test_df['region'] = fraud_test_df['state'].map(get_region)

In [None]:
# Collapsing rare job categories into a single 'other' category.
# Any job with fewer than MIN_JOB_COUNT rows is relabelled; the relabelling is done
# with one boolean mask instead of one full-column scan per rare category.
MIN_JOB_COUNT = 225
job_counts = fraud_test_df['job'].value_counts()
job_counts_df = job_counts.to_frame(name='counts')
rare_jobs = job_counts_df.index[job_counts_df['counts'] < MIN_JOB_COUNT]
fraud_test_df.loc[fraud_test_df['job'].isin(rare_jobs), 'job'] = 'other'
        

In [None]:
# Display the number of rows per job category after the rare categories were collapsed.
fraud_test_df['job'].value_counts()

In [None]:
# Display the number of rows per merchant.
fraud_test_df['merchant'].value_counts()

## Feature Encoding
1. get_dummies/one-hot encoding: category, gender, region
2. Target encoding (TargetEncoder): merchant, job. The city, state and cc_num columns are dropped rather than encoded.  

In [None]:
# Target vector: the is_fraud label.
y = fraud_test_df['is_fraud']
# Feature frame starts as a full copy (is_fraud included); unwanted columns are
# dropped in a later cell so fraud_test_df itself stays untouched.
X = fraud_test_df.copy()

In [None]:
# One-hot encode the low-cardinality categorical columns.
dummies= pd.get_dummies(X[['category','gender', 'region']])

In [None]:
# Append the one-hot indicator columns to the feature frame; the source columns
# (category, gender, region) are removed by the drop in the next cell.
dummy_columns = dummies.columns.tolist()
X[dummy_columns] = dummies[dummy_columns]

In [None]:
# Remove the target, the raw columns that were replaced by engineered features,
# identifiers, and the categorical columns that are now one-hot encoded.
columns_to_drop = [
    # target
    'is_fraud',
    # replaced by engineered / encoded columns
    'amt', 'category', 'gender', 'region',
    'dob', 'date_dob', 'trans_date_trans_time', 'dt_trans_date_time',
    'lat', 'long', 'merch_lat', 'merch_long',
    # identifiers and location / personal details not used as features
    'cc_num', 'trans_num', 'unix_time',
    'first', 'last', 'street', 'city', 'state', 'city_pop',
]
X.drop(columns=columns_to_drop, inplace=True)


In [None]:
# Preview the feature frame after the drop.
X.head()

## Splitting Training and Testing data

In [None]:
# Split into training and testing sets (default 75% / 25%).
# stratify=y keeps the fraud rate the same in both sets; with a target this
# imbalanced, an unstratified random split can leave the test set with a noticeably
# different share of fraud cases.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)



In [None]:
# Target-encode the high-cardinality categorical columns.
# The encoder is fitted on the training features and training labels only; the test
# features are transformed with that fitted encoder so no test labels leak in.
target_encoded_cols = ['merchant', 'job']
myEncoder = TargetEncoder(random_state=1)
X_train_encoded = myEncoder.fit_transform(X_train[target_encoded_cols], y_train)
X_test_encoded = myEncoder.transform(X_test[target_encoded_cols])

In [None]:
# Checking the share of positive (fraud) labels in the full data and in each split,
# to confirm the split left both sets with a comparable fraud rate.
# y is 0/1, so its mean is the fraction of fraud rows.
print(f'Average class probability in data set:    {y.mean()*100:.4f}%')
print(f'Average class probability in training set: {y_train.mean()*100:.4f}%') 
print(f'Average class probability in test set:     {y_test.mean()*100:.4f}%')

In [None]:
# Build encoded copies of the train/test feature frames: the raw 'merchant' and 'job'
# columns are overwritten with their target-encoded values (assigned by position).
X_train_enc_df=X_train.copy()
X_test_enc_df=X_test.copy()
X_train_enc_df[['merchant', 'job' ]]=X_train_encoded
X_test_enc_df[['merchant',  'job' ]]=X_test_encoded

In [None]:
# Display the number of rows per class in the test labels.
y_test.value_counts()

In [None]:
#writing out encoded file for sharing
# fraud_encoded_df.to_csv('G:\My Drive\Boot Camp\Project_4\\fraud_test_encoded.csv', sep =',')

In [None]:
# Putting the feature column names into a list for later use (the scaler returns
# plain arrays, so the names are needed to rebuild DataFrames).
X_train_col = X_train_enc_df.columns.tolist()
X_train_col

## Scale the training and testing features

In [None]:
# Scaler used to standardize all feature columns.
scaler=StandardScaler()

In [None]:
# Fit the scaler on the training features only, then apply the same fitted scaling
# to the test features. Both results are NumPy arrays.
X_train_enc_scaled=scaler.fit_transform(X_train_enc_df)
X_test_enc_scaled = scaler.transform(X_test_enc_df)

In [None]:
# Wrap the scaled arrays back into DataFrames with the original column names.
# The original row index (transaction ID) is carried over as well: without it the
# frames get a fresh 0..n-1 index, and any later column assignment from y_train /
# y_test (which still carry the ID index) would be aligned against the wrong rows.
X_train_enc_scaled_df = pd.DataFrame(X_train_enc_scaled, columns=X_train_col, index=X_train_enc_df.index)
X_test_enc_scaled_df = pd.DataFrame(X_test_enc_scaled, columns=X_train_col, index=X_test_enc_df.index)

In [None]:
# Preview the scaled training features.
X_train_enc_scaled_df.head()

In [None]:
# Reviewing the distribution of the positive (fraud) and negative (legitimate) labels
# over the scaled age and log-amount features of the test set.
scatter_df = X_test_enc_scaled_df.copy()
# The labels are attached by position, not by index. The scaled test frame holds the
# rows in the same order as y_test, but its index may not match y_test's ID index;
# assigning the Series directly would align on index and silently produce wrong or
# missing labels.
scatter_df['is_fraud'] = y_test.to_numpy()


def _class_scatter(frame, label, title):
    """Return an age-vs-log-amount scatter (with marginal histograms) for one class.

    frame: DataFrame with 'age_years', 'log_amount' and 'is_fraud' columns.
    label: class value (0 or 1) whose rows are plotted.
    title: figure title.
    """
    class_fig = px.scatter(
        frame.loc[frame['is_fraud'] == label],
        x='age_years',
        y='log_amount',
        color='is_fraud',
        marginal_y='histogram',
        marginal_x='histogram',
        title=title,
        template='plotly_white',
        width=800, height=600,
    )
    # Only one class is shown per figure, so the colour scale carries no information.
    class_fig.update_layout(coloraxis_showscale=False)
    return class_fig


fig3 = _class_scatter(scatter_df, 1, 'Distribution y_positive')
fig3.show()

fig4 = _class_scatter(scatter_df, 0, 'Distribution of y_negative')
fig4.show()

In [None]:
# Reviewing the distribution of the leading features with violin plots.
# Only the first N_FEATURES_TO_PLOT columns are drawn, to limit rendering overhead.
# NOTE(review): the earlier comment described this cell as commented out, but it does
# run; presumably the leading columns are the continuous features -- confirm against
# X_train_col.
N_FEATURES_TO_PLOT = 5
for feature in X_train_col[:N_FEATURES_TO_PLOT]:
    fig = px.violin(X_train_enc_scaled_df[feature], box=True, orientation = 'h', title = feature)
    fig.show()
    


# Models Under Review

## Create a Logistic Regression Classifier

In [None]:
# Logistic regression with library defaults and a fixed seed for reproducibility.
log_model = LogisticRegression(random_state=1)

In [None]:
# Fit the logistic regression on the scaled training features and report the
# wall-clock fitting time.
start_time = time.time()
log_model.fit(X_train_enc_scaled_df, y_train)
elapsed_seconds = time.time() - start_time
print("--- %s seconds ---" % elapsed_seconds)

In [None]:
# Predict test labels with the fitted logistic regression.
# NOTE: y_pred is reused by every model section below.
y_pred=log_model.predict(X_test_enc_scaled_df)

In [None]:
# Report results for this model via the project helper
# (presumably a confusion matrix and classification report -- see src.fraud.gen_cm_cr).
gen_cm_cr('LogisticRegression', y_test, y_pred)

## Create a Non-Linear Support Vector Machine Classifier

In [None]:
# Support vector classifier with an RBF (non-linear) kernel and a fixed seed.
SVCmodel = SVC(kernel='rbf', random_state=1)

In [None]:
# Fit the SVM on the scaled training features and report the wall-clock fitting time.
start_time = time.time()
SVCmodel.fit(X_train_enc_scaled_df, y_train)
elapsed_seconds = time.time() - start_time
print("--- %s seconds ---" % elapsed_seconds)

In [None]:
# Predict test labels with the fitted SVM (overwrites the previous model's y_pred).
y_pred = SVCmodel.predict(X_test_enc_scaled_df)

In [None]:
# Report results for the SVM via the project helper
# (presumably a confusion matrix and classification report -- see src.fraud.gen_cm_cr).
gen_cm_cr('SVM with kernal rbf', y_test, y_pred)

## Create a Random Forest Classifier Model

In [None]:
# Create a random forest classifier with 750 trees and a fixed seed.
rf_model = RandomForestClassifier(n_estimators=750, random_state=1)

In [None]:
# Fit the random forest on the scaled training array and report the wall-clock time.
# The fitted estimator returned by fit() is bound back to rf_model.
start_time = time.time()
rf_model = rf_model.fit(X_train_enc_scaled, y_train)
elapsed_seconds = time.time() - start_time
print("--- %s seconds ---" % elapsed_seconds)

In [None]:
# Making predictions using the testing data (same NumPy representation the forest
# was fitted on); overwrites the previous model's y_pred.
y_pred = rf_model.predict(X_test_enc_scaled)

In [None]:
# Report results for the random forest via the project helper
# (presumably a confusion matrix and classification report -- see src.fraud.gen_cm_cr).
gen_cm_cr('RandomForestClasssifer', y_test, y_pred)

In [None]:
# The fitted forest exposes one importance score per feature.
importances = rf_model.feature_importances_
# Pair each score with its column name and list them from most to least important.
sorted(zip(importances, X.columns), reverse=True)

In [None]:
# Visualize the random-forest features by importance.
# Build a one-column frame of scores indexed by feature name, most important first.
rf_ranked = sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)
importances_df = (
    pd.DataFrame(rf_ranked)          # column 0 = score, column 1 = feature name
    .set_index(1)
    .rename(columns={0: 'Feature Importances'})
)
# Ascending order so the most important feature is drawn at the top of the
# horizontal bar chart.
importances_sorted = importances_df.sort_values(by='Feature Importances')
fig = px.bar(
    importances_sorted,
    x='Feature Importances',
    y=importances_sorted.index,
    orientation='h',
    title='Random Forest Classifier',
    width=800, height=600,
)
fig.show()


## Create XGBoost Classifier

In [None]:
# Weight for the positive class, used to offset the very high imbalance between the
# 0 and 1 labels: the number of negative (0) training labels per positive (1) label,
# rounded to a whole number.
y_dist = y_train.value_counts()
positive_per_negative = y_dist[1] / y_dist[0]
my_scale_weight = (1 / positive_per_negative).round()
my_scale_weight

In [None]:
# Baseline XGBoost classifier: binary logistic objective, fixed seed, and the positive
# class up-weighted by my_scale_weight to counter the class imbalance.
BSTBaseModel = XGBClassifier(tree_method='auto',
                          scale_pos_weight = my_scale_weight,
                          objective='binary:logistic', 
                          random_state = 1)

In [None]:
# Fit the baseline XGBoost model on the scaled training array and report the
# wall-clock fitting time. The fitted estimator is bound back to BSTBaseModel.
start_time = time.time()
BSTBaseModel = BSTBaseModel.fit(X_train_enc_scaled, y_train)
elapsed_seconds = time.time() - start_time
print("--- %s seconds ---" % elapsed_seconds)

In [None]:
# Per-feature importance scores of the baseline XGBoost model.
BST0importances = BSTBaseModel.feature_importances_
# Pair each score with its column name and list them from most to least important.
sorted(
    zip(BSTBaseModel.feature_importances_, X.columns),
    reverse=True,
)


In [None]:
# Visualize the baseline XGBoost features by importance.
# Build a one-column frame of scores indexed by feature name, most important first.
bst_ranked = sorted(zip(BSTBaseModel.feature_importances_, X.columns), reverse=True)
BSTimportances_df = (
    pd.DataFrame(bst_ranked)         # column 0 = score, column 1 = feature name
    .set_index(1)
    .rename(columns={0: 'Feature Importances'})
)
# Ascending order so the most important feature is drawn at the top of the chart.
BSTimportances_sorted = BSTimportances_df.sort_values(by='Feature Importances')
fig = px.bar(
    BSTimportances_sorted,
    x='Feature Importances',
    y=BSTimportances_sorted.index,
    orientation='h',
    title='XGBoost Classifer - Base Model',
    width=800, height=600,
)
fig.show()

In [None]:
# Predict test labels with the baseline XGBoost model.
# The model was fitted on the NumPy array X_train_enc_scaled (no feature names), so
# the matching NumPy test array is used here instead of the DataFrame; this avoids
# feature-name validation differences between fit and predict inputs.
y_pred = BSTBaseModel.predict(X_test_enc_scaled)


In [None]:
# Report results for the baseline XGBoost model via the project helper
# (presumably a confusion matrix and classification report -- see src.fraud.gen_cm_cr).
gen_cm_cr('XGBoost Baseline Model', y_test, y_pred)

## Tuning the XGBoost Base Model with a Parameter Tuner

In [None]:
# Metrics the grid search computes for each candidate; the refit strategy picks the
# best candidate from them.
scores = ['balanced_accuracy', 'precision']

# Default parameter grid: every parameter of the baseline model wrapped in a
# single-element list. The two seed entries below are overwritten by the loop if the
# baseline model reports the same keys.
default_params = {'scale_pos_weight': my_scale_weight, 'random_state': 1}
gparams = BSTBaseModel.get_params()
for key, gp in gparams.items():
    default_params[key] = [gp]

In [None]:
# Tuning n_estimators (the number of boosting rounds) and max_depth (the maximum
# depth of each tree). The grid search tries every combination: 5 x 6 = 30 candidates.
param_grid = [
    {'n_estimators':[32,64,128,256,512], 'max_depth' : [2, 4, 6, 8, 10, 12] },
    ]

In [None]:
# Grid search over param_grid, scored with every metric in `scores`; the custom
# refit_strategy (from src.fraud) chooses which candidate is refitted as the best model.
start_time = time.time()
# Make a failed fit raise immediately. The original assigned this value to a variable
# but never passed it to GridSearchCV, so failures would only have produced a warning
# and a NaN score -- and warnings are silenced at the top of this notebook.
error_score = 'raise'
grid_search = GridSearchCV(
    BSTBaseModel,
    param_grid,
    scoring=scores,
    refit=refit_strategy,
    error_score=error_score,
)
grid_search.fit(X_train_enc_scaled, y_train)
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# Predict test labels with the best estimator found by the grid search.
# The search was fitted on the NumPy array X_train_enc_scaled (no feature names), so
# the matching NumPy test array is used here instead of the DataFrame; this avoids
# feature-name validation differences between fit and predict inputs.
y_pred = grid_search.best_estimator_.predict(X_test_enc_scaled)

In [None]:
# Report results for the tuned XGBoost model via the project helper
# (presumably a confusion matrix and classification report -- see src.fraud.gen_cm_cr).
gen_cm_cr('XGBoost Grid Search Best Model', y_test, y_pred)

In [None]:
# Visualize the tuned XGBoost model's features by importance.
# Build a one-column frame of scores indexed by feature name, most important first.
grid_ranked = sorted(
    zip(grid_search.best_estimator_.feature_importances_, X.columns),
    reverse=True,
)
Gridimportances_df = (
    pd.DataFrame(grid_ranked)        # column 0 = score, column 1 = feature name
    .set_index(1)
    .rename(columns={0: 'Feature Importances'})
)
# Ascending order so the most important feature is drawn at the top of the chart.
Gridimportances_sorted = Gridimportances_df.sort_values(by='Feature Importances')
fig = px.bar(
    Gridimportances_sorted,
    x='Feature Importances',
    y=Gridimportances_sorted.index,
    orientation='h',
    title='XGBoost Classifer - Best Model',
    width=800, height=600,
)
fig.show()

In [None]:
# Combine the three models' feature importances into one table, one column per model,
# keyed by feature name. Each frame's score column is renamed up front and the frames
# are left-joined on their index (the random-forest frame provides the row set).
All_importance_df = (
    importances_df.rename(columns={'Feature Importances': 'RandomForest'})
    .join(BSTimportances_df.rename(columns={'Feature Importances': 'XGBoost Base'}), how='left')
    .join(Gridimportances_df.rename(columns={'Feature Importances': 'XGBoost BEST'}), how='left')
)
# Rank the rows by the tuned XGBoost model's importance, highest first.
All_importance_df = All_importance_df.sort_values('XGBoost BEST', ascending=False)
All_importance_df

In [None]:
# Heatmap comparing feature importance across the three models
# (rows = features, columns = models).
All_importance_bar = px.imshow(
    All_importance_df,
    title='Comparative Model Feature Importance',
    color_continuous_scale='turbo',
    width=1000, height=800,
)
# Request descending-total ordering of the y-axis categories.
All_importance_bar.update_layout(yaxis={'categoryorder': "total descending"})
All_importance_bar.show()