In [85]:
# importing libraries
import boto3
import pandas as pd; pd.set_option('display.max_columns', 50)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFE, RFECV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report
import precision_recall_cutoff

# Handle on the S3 bucket that holds the fraud data sets
s3 = boto3.resource('s3')
bucket_name = 'webster-data445-bucket'
bucket = s3.Bucket(bucket_name)

# Object key of the training CSV inside the bucket
file_key = 'fraud_train.csv'

# Fetch the object and take the 'Body' entry of the response as the stream to parse
bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

# Parse the pipe-delimited file and discard every row that has a missing value
train = pd.read_csv(file_content_stream, sep = '|').dropna()

# Preview the first rows
train.head()

Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition,fraud
0,5,1054,54.7,7,0,3,0.027514,0.051898,0.241379,0
1,3,108,27.36,5,2,4,0.12963,0.253333,0.357143,0
2,3,1516,62.16,3,10,5,0.008575,0.041003,0.230769,0
3,6,1791,92.31,8,4,4,0.016192,0.051541,0.275862,0
4,5,430,81.53,3,7,2,0.062791,0.189605,0.111111,0


In [86]:
# Object key of the test CSV (same bucket as the training file)
file_key = 'fraud_test.csv'

# Fetch the object and take the 'Body' entry of the response as the stream to parse
bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

# Parse the pipe-delimited file and discard every row that has a missing value
test = pd.read_csv(file_content_stream, sep = '|').dropna()

# Preview the first rows
test.head()

Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition
0,4,467,88.48,4,8,4,0.014989,0.189465,0.571429
1,3,1004,58.99,7,6,1,0.026892,0.058755,0.259259
2,1,162,14.0,4,5,4,0.006173,0.08642,4.0
3,5,532,84.79,9,3,4,0.026316,0.15938,0.642857
4,5,890,42.16,4,0,0,0.021348,0.047371,0.210526


In [87]:
## Interactions that were engineered in previous homework assignments

def _add_engineered_features(frame):
    """Append the eleven engineered columns to `frame` in place.

    The columns are created in a fixed order (features 1-11) because two of
    them build on earlier ones: `interaction_5` reads `total_per_trust_level`
    and `interaction_11` reads `interaction_8`.
    """
    # feature 1: ratio of basket total to trust level
    frame['total_per_trust_level'] = frame['grandTotal'] / frame['trustLevel']

    # features 2-4: pairwise products
    frame['interaction_trust_voids'] = frame['trustLevel'] * frame['lineItemVoids']
    frame['interaction_voids_without_registration'] = frame['scansWithoutRegistration'] * frame['lineItemVoids']
    frame['interaction_quantityM_Time'] = frame['quantityModifications'] * frame['totalScanTimeInSeconds']

    # feature 5: 1/0 flag for rows satisfying all three threshold conditions
    # (the trustLevel <= 2.5 condition is already implied by trustLevel <= 1.5)
    rule_5 = (
        (frame['trustLevel'] <= 1.5)
        & (frame['total_per_trust_level'] <= 49.573)
        & (frame['trustLevel'] <= 2.5)
    )
    frame['interaction_5'] = np.where(rule_5, 1, 0)

    # features 6-10: pairwise products
    frame['interaction_quantity_voids'] = frame['quantityModifications'] * frame['lineItemVoids']
    frame['interaction_value_voids'] = frame['valuePerSecond'] * frame['lineItemVoids']
    frame['interaction_8'] = frame['trustLevel'] * frame['scannedLineItemsPerSecond']
    frame['interaction_9'] = frame['scannedLineItemsPerSecond'] * frame['lineItemVoidsPerPosition']
    frame['interaction_10'] = frame['lineItemVoidsPerPosition'] * frame['trustLevel']

    # feature 11: 1/0 flag for rows satisfying all three threshold conditions
    rule_11 = (
        (frame['trustLevel'] <= 1.5)
        & (frame['interaction_8'] <= 0.012)
        & (frame['totalScanTimeInSeconds'] <= 995.0)
    )
    frame['interaction_11'] = np.where(rule_11, 1, 0)


# the same transformations are applied to both data sets
_add_engineered_features(train)
_add_engineered_features(test)

In [None]:
# defining input and target variables
X = train.drop(columns = ['fraud'])
Y = train['fraud']

# experiment settings
N_ITERATIONS = 100   # number of random train/validation splits
N_FEATURES = 5       # number of features RFE keeps for each model
SEED = 42            # base seed; iteration i uses SEED + i so every split differs but is reproducible

# one entry (the RFE `support_` attribute) is appended per iteration
logit_results = []
RF_results = []
AB_results = []

for i in range(N_ITERATIONS):

    # seed for this iteration: shared by the split and by the randomized estimators
    iter_seed = SEED + i

    # splitting the data (80/20, stratified on the target)
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size = 0.2, stratify = Y, random_state = iter_seed
    )

    # changing scale to 0-1
    scaler = MinMaxScaler()

    # the scaler is fitted on the training fold only; the hold-out fold is
    # transformed with the training min/max so both folds share one scale
    X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), columns = X_test.columns)

    ### LOGISTIC REGRESSION ###

    # running RFE with logistic
    logit_rfe = RFE(
        estimator = LogisticRegression(),
        n_features_to_select = N_FEATURES,
    ).fit(X_train, Y_train)

    # storing results
    logit_results.append(logit_rfe.support_)


    ### RANDOM FOREST ###

    # running RFE with Random Forest
    RF_rfe = RFE(
        estimator = RandomForestClassifier(n_estimators = 500, max_depth = 3, random_state = iter_seed),
        n_features_to_select = N_FEATURES,
    ).fit(X_train, Y_train)

    # storing results
    RF_results.append(RF_rfe.support_)


    ### ADA BOOST ###

    # running RFE with Ada Boost
    AB_rfe = RFE(
        estimator = AdaBoostClassifier(n_estimators = 500, learning_rate = 0.01, random_state = iter_seed),
        n_features_to_select = N_FEATURES,
    ).fit(X_train, Y_train)

    # storing results
    AB_results.append(AB_rfe.support_)

    # printing iter (progress indicator)
    print(i)

In [102]:
def _label_results(rows):
    """Turn a list of per-iteration result rows into a DataFrame whose
    columns carry the feature names of `X_train`."""
    frame = pd.DataFrame(rows)
    frame.columns = X_train.columns
    return frame


def _selection_pct(frame):
    """For each column, return 100 * (column sum) / (number of rows), sorted
    from highest to lowest.

    Assumes the entries are the boolean selection flags gathered by the RFE
    loop, so the value reads as "% of iterations that kept the feature".
    """
    column_totals = frame.apply(np.sum, axis = 0)
    return (100*column_totals / frame.shape[0]).sort_values(ascending = False)


# identifying features to be selected for logistic regression
logit_results = _label_results(logit_results)

print('Logistic Regression\n')
print(_selection_pct(logit_results), '\n', '\n')


# identifying features to be selected for random forest
RF_results = _label_results(RF_results)

print('Random Forest\n')
print(_selection_pct(RF_results), '\n', '\n')

# identifying features to be selected for ada boost
AB_results = _label_results(AB_results)

print('Ada Boost\n')
print(_selection_pct(AB_results))

Logistic Regression

trustLevel                                100.0
lineItemVoidsPerPosition                  100.0
totalScanTimeInSeconds                    100.0
total_per_trust_level                      64.0
interaction_trust_voids                    40.0
interaction_11                             35.0
lineItemVoids                              33.0
interaction_5                              20.0
scansWithoutRegistration                    6.0
interaction_voids_without_registration      2.0
interaction_value_voids                     0.0
interaction_quantity_voids                  0.0
interaction_quantityM_Time                  0.0
interaction_10                              0.0
interaction_8                               0.0
valuePerSecond                              0.0
scannedLineItemsPerSecond                   0.0
quantityModifications                       0.0
grandTotal                                  0.0
interaction_9                               0.0
dtype: float64 
 



In [None]:
## For our logistic regression, our top 5 features are trustLevel, lineItemVoidsPerPosition, totalScanTimeInSeconds, total_per_trust_level, and interaction_trust_voids.

## For our random forest, our top 5 features are total_per_trust_level, interaction_10, trustLevel, interaction_8, and scannedLineItemsPerSecond.

## For our ada boost, our top 5 features are trustLevel, totalScanTimeInSeconds, scannedLineItemsPerSecond, interaction_voids_without_registration, and 
## lineItemVoidsPerPosition.