In [1]:
# importing libraries
import boto3
import pandas as pd; pd.set_option('display.max_columns', 50)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, make_scorer, confusion_matrix
import precision_recall_cutoff
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from cost_function import cost_function_cutoff
import cost_function_for_hw

# defining the bucket
s3 = boto3.resource('s3')
bucket_name = 'webster-data445-bucket'
bucket = s3.Bucket(bucket_name)

# defining the csv file
file_key = 'fraud_train.csv'

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

# reading the csv file
train = pd.read_csv(file_content_stream, sep = '|')
# dropping missing values
train = train.dropna()
train.head()

Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition,fraud
0,5,1054,54.7,7,0,3,0.027514,0.051898,0.241379,0
1,3,108,27.36,5,2,4,0.12963,0.253333,0.357143,0
2,3,1516,62.16,3,10,5,0.008575,0.041003,0.230769,0
3,6,1791,92.31,8,4,4,0.016192,0.051541,0.275862,0
4,5,430,81.53,3,7,2,0.062791,0.189605,0.111111,0


In [2]:
# defining the csv file
file_key = 'fraud_test.csv'

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

# reading the csv file
test = pd.read_csv(file_content_stream, sep = '|')
# dropping missing values
test = test.dropna()
test.head()

Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition
0,4,467,88.48,4,8,4,0.014989,0.189465,0.571429
1,3,1004,58.99,7,6,1,0.026892,0.058755,0.259259
2,1,162,14.0,4,5,4,0.006173,0.08642,4.0
3,5,532,84.79,9,3,4,0.026316,0.15938,0.642857
4,5,890,42.16,4,0,0,0.021348,0.047371,0.210526


In [3]:
## Interactions that were engineered in previous homework assignments

# feature 1
train['total_per_trust_level'] = train['grandTotal'] / train['trustLevel']
test['total_per_trust_level'] = test['grandTotal'] / test['trustLevel']

# feature 2
train['interaction_trust_voids'] = train['trustLevel'] * train['lineItemVoids']
test['interaction_trust_voids'] = test['trustLevel'] * test['lineItemVoids']

# feature 3
train['interaction_voids_without_registration'] = train['scansWithoutRegistration'] * train['lineItemVoids']
test['interaction_voids_without_registration'] = test['scansWithoutRegistration'] * test['lineItemVoids']

# feature 4
train['interaction_quantityM_Time'] = train['quantityModifications'] * train['totalScanTimeInSeconds']
test['interaction_quantityM_Time'] = test['quantityModifications'] * test['totalScanTimeInSeconds']

# feature 5
train['interaction_5'] = np.where((train['trustLevel'] <= 1.5) & (train['total_per_trust_level'] <= 49.573) & (train['trustLevel'] <= 2.5), 1, 0)
test['interaction_5'] = np.where((test['trustLevel'] <= 1.5) & (test['total_per_trust_level'] <= 49.573) & (test['trustLevel'] <= 2.5), 1, 0)

# feature 6
train['interaction_quantity_voids'] = train['quantityModifications'] * train['lineItemVoids']
test['interaction_quantity_voids'] = test['quantityModifications'] * test['lineItemVoids']

# feature 7
train['interaction_value_voids'] = train['valuePerSecond'] * train['lineItemVoids']
test['interaction_value_voids'] = test['valuePerSecond'] * test['lineItemVoids']

# feature 8
train['interaction_8'] = train['trustLevel'] * train['scannedLineItemsPerSecond']
test['interaction_8'] = test['trustLevel'] * test['scannedLineItemsPerSecond']

# feature 9
train['interaction_9'] = train['scannedLineItemsPerSecond'] * train['lineItemVoidsPerPosition']
test['interaction_9'] = test['scannedLineItemsPerSecond'] * test['lineItemVoidsPerPosition']

# feature 10
train['interaction_10'] = train['lineItemVoidsPerPosition'] * train['trustLevel']
test['interaction_10'] = test['lineItemVoidsPerPosition'] * test['trustLevel']

# feature 11
train['interaction_11'] = np.where((train['trustLevel'] <= 1.5) & (train['interaction_8'] <= 0.012) & (train['totalScanTimeInSeconds'] <= 995.0), 1, 0)
test['interaction_11'] = np.where((test['trustLevel'] <= 1.5) & (test['interaction_8'] <= 0.012) & (test['totalScanTimeInSeconds'] <= 995.0), 1, 0)

#### Random Forest

In [4]:
# defining x, x_test, and y
X = train[['total_per_trust_level', 'interaction_10', 'trustLevel', 'interaction_8', 'scannedLineItemsPerSecond', 'interaction_5']]
X_test = test[['total_per_trust_level', 'interaction_10', 'trustLevel', 'interaction_8', 'scannedLineItemsPerSecond', 'interaction_5']]
Y = train['fraud']

# splitting the data into training and validation
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size = 0.2, stratify = Y)

In [5]:
# creating our model with best parameters from previous homework
RF_md = RandomForestClassifier(max_depth = 7, min_samples_leaf = 5, min_samples_split = 10).fit(X_train, Y_train)

# predicting on val and test
RF_val_pred = RF_md.predict_proba(X_val)[:, 1]
RF_test_pred = RF_md.predict_proba(X_test)[:, 1]

# computing cost and cutoff on val
print(f'The cost of the Random Forest Model is {cost_function_for_hw.cost_function(Y_val, RF_val_pred)}')
print(f'The cutoff value for the Random Forest Model is {cost_function_for_hw.cost_function_cutoff(Y_val, RF_val_pred)}')

The cost of the Random Forest Model is -60.0
The cutoff value for the Random Forest Model is 0.4


#### Ada Boost

In [6]:
# building ada boost model with best parameters from previous homework
Ada_md = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(min_samples_split = 10, min_samples_leaf = 7, max_depth = 5), n_estimators = 100, learning_rate = 0.001).fit(X_train, Y_train)

# predicting on val and test
Ada_val_pred = Ada_md.predict_proba(X_val)[:, 1]
Ada_test_pred = Ada_md.predict_proba(X_test)[:, 1]

# computing cost and cutoff on val
print(f'The cost of the Ada Boost Model is {cost_function_for_hw.cost_function(Y_val, Ada_val_pred)}')
print(f'The cutoff value for the Ada Boost Model is {cost_function_for_hw.cost_function_cutoff(Y_val, Ada_val_pred)}')

The cost of the Ada Boost Model is -65.0
The cutoff value for the Ada Boost Model is 0.47


#### Logistic Regression

In [7]:
# building the logistic regression model with best parameters from previous homework
logit_md = LogisticRegression(solver = 'newton-cg', C = 100).fit(X_train, Y_train)

# predicting on test
logit_val_pred = logit_md.predict_proba(X_val)[:, 1]
logit_test_pred = logit_md.predict_proba(X_test)[:, 1]

# computing cost and cutoff on val
print(f'The cost of Logistic Regression Model is {cost_function_for_hw.cost_function(Y_val, logit_val_pred)}')
print(f'The cutoff value for the Logistic Regression Model is {cost_function_for_hw.cost_function_cutoff(Y_val, logit_val_pred)}')

The cost of Logistic Regression Model is -100.0
The cutoff value for the Logistic Regression Model is 0.23


#### Ensemble

In [10]:
## Because our best model was the random forest, I will use it as the base for my ensembled model.

# building ensemble using predictions from previous models
X_ens = pd.DataFrame({'RF': RF_val_pred, 'Ada': Ada_val_pred, 'Logistic': logit_val_pred})
X_test_ens = pd.DataFrame({'RF': RF_test_pred, 'Ada': Ada_test_pred, 'Logistic': logit_test_pred})

# defining customized score function
my_score_function = make_scorer(cost_function_for_hw.cost_function, greater_is_better = True, needs_proba = True)

# defining parameter grid using random forest hyper parameters
RF_ens_param_grid = {'n_estimators': [100, 300, 500],
                 'min_samples_split': [10, 15],
                 'min_samples_leaf': [5, 7],
                 'max_depth' : [3, 5, 7]}

# performing grid search using RF parameters
ens_grid_search = GridSearchCV(RandomForestClassifier(), RF_ens_param_grid, cv = 3, scoring = my_score_function, n_jobs = -1).fit(X_ens, Y_val)

# finding best estimator
ens_md = ens_grid_search.best_estimator_

# printing the best cutoff values
print(f' The best cutoff value for the ensembled model is {cost_function_for_hw.cost_function_cutoff(Y_val, ens_md.predict_proba(X_ens)[:, 1])}')

# predicting on test using best ensemble model
ens_pred = ens_md.predict_proba(X_test_ens)[:, 1]

 The best cutoff value for the ensembled model is 0.12


In [11]:
# exporting our predictions to a csv file
ens_pred_csv = pd.DataFrame({'Ensemble Model Predictions': ens_pred})
ens_pred_csv.to_csv('ens_model_pred.csv', index = False)