In [245]:
# importing libraries
import boto3
import pandas as pd; pd.set_option('display.max_columns', 50)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, make_scorer, confusion_matrix
import precision_recall_cutoff
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from cost_function import cost_function_cutoff

# defining the bucket
s3 = boto3.resource('s3')
bucket_name = 'webster-data445-bucket'
bucket = s3.Bucket(bucket_name)

# defining the csv file
file_key = 'fraud_train.csv'

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

# reading the csv file
train = pd.read_csv(file_content_stream, sep = '|')
# dropping missing values
train = train.dropna()
train.head()

Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition,fraud
0,5,1054,54.7,7,0,3,0.027514,0.051898,0.241379,0
1,3,108,27.36,5,2,4,0.12963,0.253333,0.357143,0
2,3,1516,62.16,3,10,5,0.008575,0.041003,0.230769,0
3,6,1791,92.31,8,4,4,0.016192,0.051541,0.275862,0
4,5,430,81.53,3,7,2,0.062791,0.189605,0.111111,0


In [246]:
# defining the csv file
file_key = 'fraud_test.csv'

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

# reading the csv file
test = pd.read_csv(file_content_stream, sep = '|')
# dropping missing values
test = test.dropna()
test.head()

Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition
0,4,467,88.48,4,8,4,0.014989,0.189465,0.571429
1,3,1004,58.99,7,6,1,0.026892,0.058755,0.259259
2,1,162,14.0,4,5,4,0.006173,0.08642,4.0
3,5,532,84.79,9,3,4,0.026316,0.15938,0.642857
4,5,890,42.16,4,0,0,0.021348,0.047371,0.210526


In [247]:
## Interactions that were engineered in previous homework assignments

# feature 1
train['total_per_trust_level'] = train['grandTotal'] / train['trustLevel']
test['total_per_trust_level'] = test['grandTotal'] / test['trustLevel']

# feature 2
train['interaction_trust_voids'] = train['trustLevel'] * train['lineItemVoids']
test['interaction_trust_voids'] = test['trustLevel'] * test['lineItemVoids']

# feature 3
train['interaction_voids_without_registration'] = train['scansWithoutRegistration'] * train['lineItemVoids']
test['interaction_voids_without_registration'] = test['scansWithoutRegistration'] * test['lineItemVoids']

# feature 4
train['interaction_quantityM_Time'] = train['quantityModifications'] * train['totalScanTimeInSeconds']
test['interaction_quantityM_Time'] = test['quantityModifications'] * test['totalScanTimeInSeconds']

# feature 5
train['interaction_5'] = np.where((train['trustLevel'] <= 1.5) & (train['total_per_trust_level'] <= 49.573) & (train['trustLevel'] <= 2.5), 1, 0)
test['interaction_5'] = np.where((test['trustLevel'] <= 1.5) & (test['total_per_trust_level'] <= 49.573) & (test['trustLevel'] <= 2.5), 1, 0)

# feature 6
train['interaction_quantity_voids'] = train['quantityModifications'] * train['lineItemVoids']
test['interaction_quantity_voids'] = test['quantityModifications'] * test['lineItemVoids']

# feature 7
train['interaction_value_voids'] = train['valuePerSecond'] * train['lineItemVoids']
test['interaction_value_voids'] = test['valuePerSecond'] * test['lineItemVoids']

# feature 8
train['interaction_8'] = train['trustLevel'] * train['scannedLineItemsPerSecond']
test['interaction_8'] = test['trustLevel'] * test['scannedLineItemsPerSecond']

# feature 9
train['interaction_9'] = train['scannedLineItemsPerSecond'] * train['lineItemVoidsPerPosition']
test['interaction_9'] = test['scannedLineItemsPerSecond'] * test['lineItemVoidsPerPosition']

# feature 10
train['interaction_10'] = train['lineItemVoidsPerPosition'] * train['trustLevel']
test['interaction_10'] = test['lineItemVoidsPerPosition'] * test['trustLevel']

# feature 11
train['interaction_11'] = np.where((train['trustLevel'] <= 1.5) & (train['interaction_8'] <= 0.012) & (train['totalScanTimeInSeconds'] <= 995.0), 1, 0)
test['interaction_11'] = np.where((test['trustLevel'] <= 1.5) & (test['interaction_8'] <= 0.012) & (test['totalScanTimeInSeconds'] <= 995.0), 1, 0)

## Data Splitting

In [248]:
# defining x and y
X = train[['total_per_trust_level', 'interaction_10', 'trustLevel', 'interaction_8', 'scannedLineItemsPerSecond']]
Y = train['fraud']

# splitting the data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y)

In [249]:
# defining x and y with 6 variables
X2 = train[['total_per_trust_level', 'interaction_10', 'trustLevel', 'interaction_8', 'scannedLineItemsPerSecond', 'interaction_5']]

# splitting the data
X_train2, X_test2, Y_train2, Y_test2 = train_test_split(X2, Y, test_size = 0.2, stratify = Y)

In [250]:
# defining x and y with 7 variables
X3 = train[['total_per_trust_level', 'interaction_10', 'trustLevel', 'interaction_8', 'scannedLineItemsPerSecond', 'interaction_5']]

# splitting the data
X_train3, X_test3, Y_train3, Y_test3 = train_test_split(X3, Y, test_size = 0.2, stratify = Y)

## Random Forest


In [172]:
# defining parameter grid
RF_param_grid = {'n_estimators': [100, 300, 500],
                 'min_samples_split': [10, 15],
                 'min_samples_leaf': [5, 7],
                 'max_depth' : [3, 5, 7]}

# defining customized scoring function
my_score_function = make_scorer(cost_function, greater_is_better = True, needs_proba = True)

# performing grid search with 5 variables
RF_grid_search = GridSearchCV(RandomForestClassifier(), RF_param_grid, cv = 3, scoring = 'f1', 
                              n_jobs = -1).fit(X_train, Y_train)

# defining best parameters
RF_best = RF_grid_search.best_estimator_
RF_best

RandomForestClassifier(max_depth=7, min_samples_leaf=7, min_samples_split=15,
                       n_estimators=300)

In [224]:
# creating best random forest model
RF_md = RandomForestClassifier(max_depth = 7, min_samples_leaf = 7, min_samples_split = 15, n_estimators = 300).fit(X_train, Y_train)

# predicting on test
RF_test_pred = RF_md.predict_proba(X_test)[:, 1]

# identifying the optimal cutoff values
opt_cutoff = cost_function_cutoff(Y_test, RF_test_pred)

# changing likelihoods to labels
RF_label = np.where(RF_test_pred < opt_cutoff, 0, 1)

# computing confusion matrix
RF_conf = confusion_matrix(Y_test, RF_label)

In [226]:
# computing and printing cost of random forest with 5 variables
print(RF_conf)
RF_cost = -25*RF_conf[1, 0] - 5*RF_conf[0, 1] + 5*RF_conf[1, 1]
print('The cost of the RF model is:', RF_cost)

[[352   3]
 [ 13   8]]
The cost of the RF model is: -300


In [188]:
# performing grid search with 5 variables
RF_grid_search2 = GridSearchCV(RandomForestClassifier(), RF_param_grid, cv = 3, scoring = 'f1', 
                              n_jobs = -1).fit(X_train2, Y_train2)

# defining best parameters
RF_best2 = RF_grid_search2.best_estimator_
RF_best2

RandomForestClassifier(max_depth=7, min_samples_leaf=5, min_samples_split=10)

In [191]:
# creating best random forest model
RF_md2 = RandomForestClassifier(max_depth = 7, min_samples_leaf = 5, min_samples_split = 10).fit(X_train2, Y_train2)

# predicting on test
RF_test_pred2 = RF_md2.predict_proba(X_test2)[:, 1]

# identifying the optimal cutoff values
opt_cutoff2 = cost_function_cutoff(Y_test2, RF_test_pred2)

# changing likelihoods to labels
RF_label2 = np.where(RF_test_pred2 < opt_cutoff2, 0, 1)

In [227]:
# computing confusion matrix
RF_conf2 = confusion_matrix(Y_test2, RF_label2)

# computing and printing cost of random forest with 5 variables
print(RF_conf2)
RF_cost2 = -25*RF_conf2[1, 0] - 5*RF_conf2[0, 1] + 5*RF_conf2[1, 1]
print('The cost of the RF model is:', RF_cost2)

[[348   7]
 [ 10  11]]
The cost of the RF model is: -230


In [198]:
# performing grid search with 5 variables
RF_grid_search3 = GridSearchCV(RandomForestClassifier(), RF_param_grid, cv = 3, scoring = 'f1', 
                              n_jobs = -1).fit(X_train3, Y_train3)

# defining best parameters
RF_best3 = RF_grid_search3.best_estimator_
RF_best3

RandomForestClassifier(max_depth=7, min_samples_leaf=7, min_samples_split=10)

In [201]:
# creating best random forest model
RF_md3 = RandomForestClassifier(max_depth = 7, min_samples_leaf = 7, min_samples_split = 10).fit(X_train3, Y_train3)

# predicting on test
RF_test_pred3 = RF_md3.predict_proba(X_test3)[:, 1]

# identifying the optimal cutoff values
opt_cutoff3 = cost_function_cutoff(Y_test3, RF_test_pred3)

# changing likelihoods to labels
RF_label3 = np.where(RF_test_pred3 < opt_cutoff3, 0, 1)

In [228]:
# computing confusion matrix
RF_conf3 = confusion_matrix(Y_test3, RF_label3)

# computing and printing cost of random forest with 5 variables
print(RF_conf3)
RF_cost3 = -25*RF_conf3[1, 0] - 5*RF_conf3[0, 1] + 5*RF_conf3[1, 1]
print('The cost of the RF model is:', RF_cost3)

[[350   5]
 [ 13   8]]
The cost of the RF model is: -310


#### The best random forest model is the one with 6 input variables

## Ada Boost

In [203]:
# defining parameter grid
Ada_param_grid = {'n_estimators': [100, 300, 500],
                  'base_estimator__min_samples_split': [10, 15],
                  'base_estimator__min_samples_leaf': [5, 7],
                  'base_estimator__max_depth': [3, 5, 7],
                  'learning_rate': [0.001, 0.01, 0.1]}

# running grid search with 5 variables
Ada_grid_search = GridSearchCV(AdaBoostClassifier(base_estimator = DecisionTreeClassifier()), Ada_param_grid, cv = 3, scoring = 'f1', n_jobs = -1).fit(X_train, Y_train)

# extracting best parameters
Ada_best = Ada_grid_search.best_estimator_
Ada_best

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=5,
                                                         min_samples_leaf=5,
                                                         min_samples_split=15),
                   learning_rate=0.001, n_estimators=100)

In [205]:
# building ada boost model
Ada_md = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(min_samples_split = 15, min_samples_leaf = 5, max_depth = 5), n_estimators = 100, learning_rate = 0.001).fit(X_train, Y_train)

# predicting on test
Ada_test_pred = Ada_md.predict_proba(X_test)[:, 1]

# identifying the optimal cutoff
opt_cutoff = cost_function_cutoff(Y_test, Ada_test_pred)

# changing to labels
Ada_label = np.where(Ada_test_pred < opt_cutoff, 0, 1)

In [229]:
# computing confusion matrix
Ada_conf = confusion_matrix(Y_test, Ada_label)

# computing and printing cost of random forest with 5 variables
print(Ada_conf)
Ada_cost = -25*Ada_conf[1, 0] - 5*Ada_conf[0, 1] + 5*Ada_conf[1, 1]
print('The cost of the Ada model is:', Ada_cost)

[[347   8]
 [  8  13]]
The cost of the Ada model is: -175


In [210]:
# running grid search with 6 variables
Ada_grid_search2 = GridSearchCV(AdaBoostClassifier(base_estimator = DecisionTreeClassifier()), Ada_param_grid, cv = 3, scoring = 'f1', n_jobs = -1).fit(X_train2, Y_train2)

# extracting best parameters
Ada_best2 = Ada_grid_search2.best_estimator_
Ada_best2

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=5,
                                                         min_samples_leaf=7,
                                                         min_samples_split=10),
                   learning_rate=0.001, n_estimators=100)

In [211]:
# building ada boost model
Ada_md2 = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(min_samples_split = 10, min_samples_leaf = 7, max_depth = 5), n_estimators = 100, learning_rate = 0.001).fit(X_train2, Y_train2)

# predicting on test
Ada_test_pred2 = Ada_md2.predict_proba(X_test2)[:, 1]

# identifying the optimal cutoff
opt_cutoff2 = cost_function_cutoff(Y_test2, Ada_test_pred2)

# changing to labels
Ada_label2 = np.where(Ada_test_pred2 < opt_cutoff2, 0, 1)

In [230]:
# computing confusion matrix
Ada_conf2 = confusion_matrix(Y_test2, Ada_label2)

# computing and printing cost of random forest with 5 variables
print(Ada_conf2)
Ada_cost2 = -25*Ada_conf2[1, 0] - 5*Ada_conf2[0, 1] + 5*Ada_conf2[1, 1]
print('The cost of the Ada model is:', Ada_cost2)

[[344  11]
 [  6  15]]
The cost of the Ada model is: -130


In [213]:
# running grid search with 5 variables
Ada_grid_search3 = GridSearchCV(AdaBoostClassifier(base_estimator = DecisionTreeClassifier()), Ada_param_grid, cv = 3, scoring = 'f1', n_jobs = -1).fit(X_train3, Y_train3)

# extracting best parameters
Ada_best3 = Ada_grid_search3.best_estimator_
Ada_best3

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=3,
                                                         min_samples_leaf=5,
                                                         min_samples_split=10),
                   learning_rate=0.001, n_estimators=100)

In [218]:
# building ada boost model
Ada_md3 = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(min_samples_split = 10, min_samples_leaf = 5, max_depth = 3), n_estimators = 100, learning_rate = 0.001).fit(X_train3, Y_train3)

# predicting on test
Ada_test_pred3 = Ada_md3.predict_proba(X_test3)[:, 1]

# identifying the optimal cutoff
opt_cutoff3 = cost_function_cutoff(Y_test3, Ada_test_pred3)

# changing to labels
Ada_label3 = np.where(Ada_test_pred3 < opt_cutoff3, 0, 1)

In [231]:
# computing confusion matrix
Ada_conf3 = confusion_matrix(Y_test3, Ada_label3)

# computing and printing cost of random forest with 5 variables
print(Ada_conf3)
Ada_cost3 = -25*Ada_conf3[1, 0] - 5*Ada_conf3[0, 1] + 5*Ada_conf3[1, 1]
print('The cost of the Ada model is:', Ada_cost3)

[[355   0]
 [ 21   0]]
The cost of the Ada model is: -525


#### The best ada boost model is the one with 6 input variables

# Support Vector Machine

In [None]:
# defining hyper parameters
logit_param_grid = {'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
                  'C':  [100, 10, 1.0, 0.1, 0.01],
                  'penalty': ['none', 'l1', 'l2', 'elasticnet']}

# performing grid search
logit_grid_search = GridSearchCV(LogisticRegression(), logit_param_grid, cv = 3, scoring = 'f1',
                                 n_jobs = -1).fit(X_train, Y_train)

logit_best = logit_grid_search.best_estimator_
logit_best

In [257]:
# building the support vector machine with best hyper-parameters
logit_md = LogisticRegression(solver = 'liblinear', C = 10, penalty = 'l1').fit(X_train, Y_train)

# predicting on test
logit_test_pred = logit_md.predict_proba(X_test)[:, 1]

# identifying the optimal cutoff
opt_cutoff = cost_function_cutoff(Y_test, logit_test_pred)

# changing to labels
logit_label = np.where(logit_test_pred < opt_cutoff, 0, 1)

In [258]:
# computing confusion matrix
logit_conf = confusion_matrix(Y_test, logit_label)

# computing and printing cost of random forest with 5 variables
print(logit_conf)
logit_cost = -25*logit_conf[1, 0] - 5*logit_conf[0, 1] + 5*logit_conf[1, 1]
print('The cost of the Logistic Regression model is:', logit_cost)

[[355   0]
 [ 21   0]]
The cost of the Logistic Regression model is: -525


In [None]:
# performing grid search
logit_grid_search2 = GridSearchCV(LogisticRegression(), logit_param_grid, cv = 3, scoring = 'f1',
                                 n_jobs = -1).fit(X_train2, Y_train2)

logit_best2 = logit_grid_search2.best_estimator_
logit_best2

In [260]:
# building the support vector machine with best hyper-parameters
logit_md2 = LogisticRegression(solver = 'newton-cg', C = 100).fit(X_train2, Y_train2)

# predicting on test
logit_test_pred2 = logit_md2.predict_proba(X_test2)[:, 1]

# identifying the optimal cutoff
opt_cutoff2 = cost_function_cutoff(Y_test2, logit_test_pred2)

# changing to labels
logit_label2 = np.where(logit_test_pred2 < opt_cutoff2, 0, 1)

In [261]:
# computing confusion matrix
logit_conf2 = confusion_matrix(Y_test2, logit_label2)

# computing and printing cost of random forest with 5 variables
print(logit_conf2)
logit_cost2 = -25*logit_conf2[1, 0] - 5*logit_conf2[0, 1] + 5*logit_conf2[1, 1]
print('The cost of the Logistic Regression model is:', logit_cost2)

[[354   1]
 [ 20   1]]
The cost of the Logistic Regression model is: -500


In [None]:
# performing grid search
logit_grid_search3 = GridSearchCV(LogisticRegression(), logit_param_grid, cv = 3, scoring = 'f1',
                                 n_jobs = -1).fit(X_train3, Y_train3)

logit_best3 = logit_grid_search3.best_estimator_
logit_best3

In [265]:
# building the support vector machine with best hyper-parameters
logit_md3 = LogisticRegression(solver = 'newton-cg', C = 100).fit(X_train3, Y_train3)

# predicting on test
logit_test_pred3 = logit_md3.predict_proba(X_test3)[:, 1]

# identifying the optimal cutoff
opt_cutoff3 = cost_function_cutoff(Y_test3, logit_test_pred3)

# changing to labels
logit_label3 = np.where(logit_test_pred3 < opt_cutoff3, 0, 1)

In [266]:
# computing confusion matrix
logit_conf3 = confusion_matrix(Y_test3, logit_label3)

# computing and printing cost of random forest with 5 variables
print(logit_conf3)
logit_cost3 = -25*logit_conf3[1, 0] - 5*logit_conf3[0, 1] + 5*logit_conf3[1, 1]
print('The cost of the Logistic Regression model is:', logit_cost3)

[[355   0]
 [ 21   0]]
The cost of the Logistic Regression model is: -525


#### The best Logistic Regression is the one with 6 input variables

#### Now we can compare our best model from each and see which would be the best.

In [268]:
print('The cost of the best Random Forest model is:', RF_cost2)
print('The cost of the best Ada Boost model is:', Ada_cost2)
print('The cost of the best Logistic Regression model is:', logit_cost2)

The cost of the best Random Forest model is: -230
The cost of the best Ada Boost model is: -130
The cost of the best Logistic Regression model is: -500


#### Although all of our models are negative in profit, the Ada boost would be the best model to select. In the future, we can determine that we need better input variables (better feature engineering). 