In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

In [None]:
# Load the transactions stored in data/fraudTrain.csv (path is relative to the working directory).
card_transactions = pd.read_csv("data/fraudTrain.csv")

In [None]:
# Lift the column limit so wide DataFrames are displayed without truncation.
pd.options.display.max_columns = None

In [None]:
# Preview the first five rows of the raw training data.
card_transactions.head(n=5)

In [10]:
# Read data/fraudTest.csv into a separate frame; it is subsampled and re-exported below.
df = pd.read_csv("data/fraudTest.csv")

In [11]:
# Keep a random 66.2% of the rows. The seed is fixed so that the subsample
# (and therefore the file exported from it) is identical on every run.
df = df.sample(frac=0.662, random_state=42)

In [12]:
# Write the subsample to outTest.csv without the DataFrame index column.
df.to_csv('outTest.csv', index=False)

# 1. Dataset

## EDA

###### Initial Insights

In [None]:
# Inspect the dtype pandas inferred for each column.
card_transactions.dtypes

In [None]:
# Correlate only the numeric columns. At this point the frame still holds
# string columns, and DataFrame.corr() raises on non-numeric data in
# pandas >= 2.0 instead of silently skipping it.
corr = card_transactions.select_dtypes(include='number').corr()
sns.heatmap(corr)

No numerical column appears to be strongly correlated with fraud, so the heatmap alone does not tell us which features are important.

###### Drop columns

The 'street' column is categorical and is perfectly related to the credit card number since the same billing address is attached to the same card, so we can drop the 'street' column.

In [None]:
# Count how many transactions belong to each card number.
card_transactions.value_counts(subset='cc_num')

In [None]:
# Count how many transactions belong to each street address.
card_transactions.value_counts(subset='street')

The 'first' and 'last' name columns remain the same for every card and should not provide new useful information in theory, so we can drop them.

In [None]:
# Spot-check one card: list the distinct first names recorded for it.
card_transactions.loc[card_transactions['cc_num'] == 6538441737335434, 'first'].value_counts()

In [None]:
# Spot-check the same card: list the distinct last names recorded for it.
card_transactions.loc[card_transactions['cc_num'] == 6538441737335434, 'last'].value_counts()

Since 'trans_num' is unique for each transaction, it does not provide useful info so we drop this too.

In [None]:
# Frequency of each transaction number.
card_transactions.loc[:, 'trans_num'].value_counts()

In [None]:
# Drop the redundant columns discussed above together with the duplicate
# index column ('Unnamed: 0') in a single pass.
card_transactions = card_transactions.drop(
    columns=['street', 'first', 'last', 'trans_num', 'Unnamed: 0']
)

Clean the DOB column by converting the date string to the birth year as an int:

In [None]:
# Parse the date of birth, keep only its year, and rename the column to
# 'birth_year' (the column keeps its original position in the frame).
card_transactions = (
    card_transactions
    .assign(dob=lambda frame: pd.to_datetime(frame['dob']).dt.year)
    .rename(columns={'dob': 'birth_year'})
)

This is the result of our initial cleaning of the data.

In [None]:
# Preview the first five rows after the initial cleaning.
card_transactions.head(n=5)

###### Interesting Findings

Class Imbalance

In [None]:
# Bar chart of how many transactions carry each is_fraud label (0 and 1).
f_counts = card_transactions['is_fraud'].value_counts()

x = ['Not Fraud', 'Fraud']
# Look the counts up by label so the bars always follow the order of `x`.
y = [f_counts[label] for label in (0, 1)]

fig, ax = plt.subplots()
ax.set_title('Real vs Fraud transactions in data set')
ax.bar(x, y)

plt.show()

Let's analyze the distribution of each feature for fraud and non-fraud transactions to see which factors may be significant.

In [None]:
# Split the data by label; `fraud` and `not_fraud` are reused by the plots below.
fraud = card_transactions.loc[card_transactions['is_fraud'].eq(1)]
not_fraud = card_transactions.loc[card_transactions['is_fraud'].eq(0)]

# Overlay the density of transaction amounts for both groups.
fig, ax = plt.subplots()
sns.kdeplot(not_fraud['amt'], label='Not Fraud', ax=ax)
sns.kdeplot(fraud['amt'], label='Fraud', ax=ax)
ax.set_title('Distribution of Transaction Amounts')
ax.legend(loc='upper right')
plt.show()

# Summary statistics to go with the plot.
print(f"Not fraud mean amount: {not_fraud['amt'].mean()}")
print(f"Fraud mean amount: {fraud['amt'].mean()}")
print(f"Not fraud max amount: {not_fraud['amt'].max()}")
print(f"Fraud max amount: {fraud['amt'].max()}")

In [None]:
# Share of transactions per category, computed separately for each group.
fraud_props = fraud['category'].value_counts() / len(fraud['category'])
not_fraud_props = not_fraud['category'].value_counts() / len(not_fraud['category'])

# value_counts() sorts each Series by its own frequency, so the two results
# list the categories in different orders. Combine them by category label
# (not by position) so every bar pair describes the same category.
cat_df = (
    pd.concat({'prop_fraud': fraud_props, 'prop_not_fraud': not_fraud_props}, axis=1)
    .fillna(0)  # a category missing from one group has proportion 0 there
    .sort_values('prop_fraud', ascending=False)
    .rename_axis('category')
    .reset_index()
)

fig, ax = plt.subplots()
cat_df.plot(kind='bar', x='category', y=['prop_not_fraud', 'prop_fraud'], ax=ax)
ax.set_title('Real vs Fraud Transactions by Category')

plt.show()

In [None]:
# Density of the 'long' and 'lat' columns for each group, drawn on one
# shared axis. The last legend label previously read 'lat - Fraud lat'.
sns.kdeplot(not_fraud['long'], label='long - Not Fraud')
sns.kdeplot(fraud['long'], label='long - Fraud')
sns.kdeplot(not_fraud['lat'], label='lat - Not Fraud')
sns.kdeplot(fraud['lat'], label='lat - Fraud')
plt.legend(loc='upper left')
plt.xlabel('Position')
plt.title('Real vs Fraud Transactions by Location');
plt.show()

In [None]:
# Density of the 'merch_long' and 'merch_lat' columns for each group, drawn
# on one shared axis. The last legend label previously read 'lat - Fraud lat'.
sns.kdeplot(not_fraud['merch_long'], label='long - Not Fraud')
sns.kdeplot(fraud['merch_long'], label='long - Fraud')
sns.kdeplot(not_fraud['merch_lat'], label='lat - Not Fraud')
sns.kdeplot(fraud['merch_lat'], label='lat - Fraud')
plt.legend(loc='upper left')
plt.xlabel('Position')
plt.title('Real vs Fraud Transactions by Merchant Location');
plt.show()

In [None]:
# Scatter city population against birth year; fraud points are drawn last
# (and slightly larger) so they sit on top of the non-fraud points.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(not_fraud['birth_year'], not_fraud['city_pop'], label='Not Fraud', c='blue', s=8)
ax.scatter(fraud['birth_year'], fraud['city_pop'], label='Fraud', c='red', s=10)

ax.set_xlabel('Birth Year')
ax.set_ylabel('City Population')
ax.set_title('Fraudulent Transactions in City Population vs Birth Year')
ax.legend()
plt.show()

# 2. Predictive Task

Model Imports

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

Evaluation Metrics Imports

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

Shuffle data with random seed so that results between different models are comparable

In [None]:
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

In [None]:
# Shuffle once with a fixed seed, then cut off the last 20% as the validation
# set. shuffle=False keeps the seeded order instead of shuffling again.
cc_shuffled = shuffle(card_transactions, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(
    cc_shuffled.drop(columns='is_fraud'),
    cc_shuffled['is_fraud'],
    test_size=0.2,
    shuffle=False,
)

Features we are using for the baseline model:

In [None]:
# Columns that are one-hot encoded (categories unseen during fit are ignored).
baseline_ord = ['cc_num', 'merchant', 'category', 'zip']
# Columns that are standardised.
baseline_quant = [
    'amt', 'lat', 'long', 'city_pop', 'birth_year',
    'unix_time', 'merch_lat', 'merch_long',
]
baseline_ct = ColumnTransformer(
    transformers=[
        ('one-hot', OneHotEncoder(handle_unknown='ignore'), baseline_ord),
        ('standard', StandardScaler(), baseline_quant),
    ]
)

LogisticRegression

In [None]:
# Baseline pipeline: shared preprocessing followed by logistic regression.
baseline_log_pl = Pipeline(
    steps=[
        ('preprocessing', baseline_ct),
        ('logistic-reg', LogisticRegression(max_iter=1000)),
    ]
)

In [None]:
# Fit the preprocessing and the logistic regression on the training split.
baseline_log_pl.fit(X=X_train, y=y_train)

In [None]:
# Predict labels for the held-out validation split.
baseline_log_pred = baseline_log_pl.predict(X=X_val)

In [None]:
# scikit-learn expects (y_true, y_pred): rows are true labels, columns are
# predictions. The reversed order produced the transposed matrix.
confusion_matrix(y_val, baseline_log_pred)

In [None]:
# Pass (y_true, y_pred) as the API documents. Binary F1 is symmetric, so the
# value is unchanged, but the call now matches the other metrics.
f1_score(y_val, baseline_log_pred)

In [None]:
# Pass (y_true, y_pred): with the arguments reversed this call reported recall.
precision_score(y_val, baseline_log_pred)

In [None]:
# Pass (y_true, y_pred): with the arguments reversed this call reported precision.
recall_score(y_val, baseline_log_pred)

RandomForestClassifier

In [None]:
# Baseline pipeline: shared preprocessing followed by a 30-tree random forest.
# The forest is seeded so repeated runs produce the same model and metrics.
baseline_rfc_pl = Pipeline([
    ('preprocessing', baseline_ct),
    ('random-forest', RandomForestClassifier(n_estimators=30, random_state=42)),
])

In [None]:
# Fit the preprocessing and the random forest on the training split.
baseline_rfc_pl.fit(X=X_train, y=y_train)

In [None]:
# Predict labels for the held-out validation split.
baseline_rfc_pred = baseline_rfc_pl.predict(X=X_val)

In [None]:
# scikit-learn expects (y_true, y_pred): rows are true labels, columns are
# predictions. The reversed order produced the transposed matrix.
confusion_matrix(y_val, baseline_rfc_pred)

In [None]:
# Pass (y_true, y_pred) as the API documents. Binary F1 is symmetric, so the
# value is unchanged, but the call now matches the other metrics.
f1_score(y_val, baseline_rfc_pred)

In [None]:
# Pass (y_true, y_pred): with the arguments reversed this call reported recall.
precision_score(y_val, baseline_rfc_pred)

In [None]:
# Pass (y_true, y_pred): with the arguments reversed this call reported precision.
recall_score(y_val, baseline_rfc_pred)

GradientBoostingClassifier

In [None]:
# Baseline pipeline: shared preprocessing followed by gradient boosting.
# The estimator is seeded so repeated runs produce the same model and metrics.
baseline_gbc_pl = Pipeline([
    ('preprocessing', baseline_ct),
    ('gradient-boosting', GradientBoostingClassifier(random_state=42)),
])

In [None]:
# Fit the preprocessing and the gradient-boosting model on the training split.
baseline_gbc_pl.fit(X=X_train, y=y_train)

In [None]:
# Predict labels for the held-out validation split.
baseline_gbc_pred = baseline_gbc_pl.predict(X=X_val)

In [None]:
# scikit-learn expects (y_true, y_pred): rows are true labels, columns are
# predictions. The reversed order produced the transposed matrix.
confusion_matrix(y_val, baseline_gbc_pred)

In [None]:
# Pass (y_true, y_pred) as the API documents. Binary F1 is symmetric, so the
# value is unchanged, but the call now matches the other metrics.
f1_score(y_val, baseline_gbc_pred)

In [None]:
# Pass (y_true, y_pred): with the arguments reversed this call reported recall.
precision_score(y_val, baseline_gbc_pred)

In [None]:
# Pass (y_true, y_pred): with the arguments reversed this call reported precision.
recall_score(y_val, baseline_gbc_pred)

# 3. Model

###### Create Features

Our model may be able to extract specific cyclic patterns from our data better if we split the date and time into multiple features. It would be easier to find patterns over the months of the year or through different times in the day than if time was stored as a single attribute.

In [None]:
# Parse the timestamp, then derive two cyclic features from it. assign()
# evaluates its arguments in order, so 'month' and 'hour' read the parsed column.
card_transactions = card_transactions.assign(
    trans_date_trans_time=lambda frame: pd.to_datetime(frame['trans_date_trans_time']),
    # month: probes a relationship between time of year and fraud
    month=lambda frame: frame['trans_date_trans_time'].dt.month,
    # hour: probes a relationship between time of day and fraud
    hour=lambda frame: frame['trans_date_trans_time'].dt.hour,
)

We can now remove 'trans_date_trans_time' since we have unix time representing the cycle of the entire 2 year period.

In [None]:
# Drop the raw timestamp column from the frame.
card_transactions = card_transactions.drop(columns='trans_date_trans_time')

In [None]:
# Re-split by label so both subsets include the new 'month' and 'hour' columns.
fraud = card_transactions.loc[card_transactions['is_fraud'].eq(1)]
not_fraud = card_transactions.loc[card_transactions['is_fraud'].eq(0)]

# Overlay the hour-of-day density for both groups.
fig, ax = plt.subplots()
sns.kdeplot(not_fraud['hour'], label='Not Fraud', ax=ax)
sns.kdeplot(fraud['hour'], label='Fraud', ax=ax)
ax.set_xlim(0, 25)
ax.set_title('Distribution Across Hour of Day')
ax.legend(loc='upper left')
plt.show()

In [None]:
# Overlay the unix_time density for both groups. The curves carry labels, so
# draw the legend (it was missing) and finish with plt.show() like the other plots.
sns.kdeplot(not_fraud['unix_time'], label='Not Fraud')
sns.kdeplot(fraud['unix_time'], label='Fraud')
plt.title('Distribution Across Time Frame');
plt.legend(loc='upper left')
plt.show()

Tuning Hyperparameters

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Preview the first five rows with the engineered time features.
card_transactions.head(n=5)

In [None]:
# Redo the seeded shuffle and 80/20 split so the splits contain the new
# features. shuffle=False keeps the seeded order instead of shuffling again.
cc_shuffled = shuffle(card_transactions, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(
    cc_shuffled.drop(columns='is_fraud'),
    cc_shuffled['is_fraud'],
    test_size=0.2,
    shuffle=False,
)

In [None]:
# Columns that are one-hot encoded (categories unseen during fit are ignored).
ordinal = ['cc_num', 'merchant', 'category', 'city', 'state', 'zip']
# Columns that are standardised, now including the derived 'month' and 'hour'.
quant = [
    'amt', 'lat', 'long', 'city_pop', 'birth_year',
    'unix_time', 'merch_lat', 'merch_long', 'month', 'hour',
]
ct = ColumnTransformer(
    transformers=[
        ('one-hot', OneHotEncoder(handle_unknown='ignore'), ordinal),
        ('standard', StandardScaler(), quant),
    ]
)

Finding optimal hyperparameters for RandomForestClassifier using GridSearchCV

In [None]:
# Full-feature pipeline: preprocessing followed by a random forest.
# The forest is seeded so repeated runs produce the same model and metrics.
rfc_pl = Pipeline([
    ('preprocessing', ct),
    ('random-forest', RandomForestClassifier(random_state=42)),
])

In [None]:
# Candidate tree depths for the random-forest step of the pipeline.
param_grid = {
    'random-forest__max_depth': [8, 10, 15, 20],
}

# Search over the random-forest pipeline defined above. The previous name,
# `search_rfc_pl`, is never defined in this notebook and raised a NameError.
# GridSearchCV clones its estimator, so `rfc_pl` itself stays unfitted and
# keeps its original hyperparameters.
# NOTE(review): the later `rfc_pl.fit(...)` therefore does not use
# `grid_search.best_params_` unless they are applied explicitly - confirm intent.
grid_search = GridSearchCV(rfc_pl, param_grid, cv=5, scoring='f1')
grid_search.fit(X_train, y_train)

In [None]:
# Report the winning parameter combination and its score from the fitted search.
print("Best hyperparameters: ", grid_search.best_params_)
print("Best f1 score: ", grid_search.best_score_)

In [None]:
# Fit the random-forest pipeline on the training split.
rfc_pl.fit(X=X_train, y=y_train)

In [None]:
# Predict labels for the held-out validation split.
rfc_pred = rfc_pl.predict(X=X_val)

In [None]:
# scikit-learn expects (y_true, y_pred): rows are true labels, columns are
# predictions. The reversed order produced the transposed matrix.
confusion_matrix(y_val, rfc_pred)

In [None]:
# Pass (y_true, y_pred) as the API documents. Binary F1 is symmetric, so the
# value is unchanged, but the call now matches the other metrics.
f1_score(y_val, rfc_pred)

In [None]:
# Pass (y_true, y_pred): with the arguments reversed this call reported recall.
precision_score(y_val, rfc_pred)

In [None]:
# Pass (y_true, y_pred): with the arguments reversed this call reported precision.
recall_score(y_val, rfc_pred)

Finding optimal hyperparameters for GradientBoostingClassifier using GridSearchCV

In [None]:
# Pipeline handed to the grid search: preprocessing followed by gradient boosting.
# The estimator is seeded so the cross-validated scores are repeatable.
search_gbc_pl = Pipeline([
    ('preprocessing', ct),
    ('gradient-boosting', GradientBoostingClassifier(random_state=42)),
])

In [None]:
# 3 x 3 x 3 grid over the gradient-boosting step, scored by F1 with 5-fold CV.
param_grid = {
    'gradient-boosting__min_samples_split': [1000, 1500, 3000],
    'gradient-boosting__min_samples_leaf': [50, 100, 200],
    'gradient-boosting__max_depth': [6, 8, 10],
}
grid_search = GridSearchCV(
    estimator=search_gbc_pl,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
)
grid_search.fit(X_train, y_train)

In [None]:
# Report the winning parameter combination and its score from the fitted search.
print("Best hyperparameters: ", grid_search.best_params_)
print("Best f1 score: ", grid_search.best_score_)

In [None]:
# Final gradient-boosting pipeline with hand-set hyperparameters.
# subsample < 1 and max_features < 1 make training stochastic, so the estimator
# is seeded to keep the reported metrics reproducible.
gbc_pl = Pipeline([
    ('preprocessing', ct),
    ('gradient-boosting', GradientBoostingClassifier(
        min_samples_split=1500,
        min_samples_leaf=50,
        max_depth=8,
        max_features=0.3,
        subsample=0.8,
        random_state=42,
    )),
])

In [None]:
# Fit the gradient-boosting pipeline on the training split.
gbc_pl.fit(X=X_train, y=y_train)

In [None]:
# Predict labels for the held-out validation split.
gbc_pred = gbc_pl.predict(X=X_val)

In [None]:
# scikit-learn expects (y_true, y_pred): rows are true labels, columns are
# predictions. The reversed order produced the transposed matrix.
confusion_matrix(y_val, gbc_pred)

In [None]:
# Pass (y_true, y_pred) as the API documents. Binary F1 is symmetric, so the
# value is unchanged, but the call now matches the other metrics.
f1_score(y_val, gbc_pred)

In [None]:
# Pass (y_true, y_pred): with the arguments reversed this call reported recall.
precision_score(y_val, gbc_pred)

In [None]:
# Pass (y_true, y_pred): with the arguments reversed this call reported precision.
recall_score(y_val, gbc_pred)

#### Results on final test set

In [None]:
# Load the held-out transactions from data/fraudTest.csv for the final evaluation.
test_transactions = pd.read_csv('data/fraudTest.csv')

In [None]:
# Preview the first five rows of the test data.
test_transactions.head(n=5)

Clean test set

In [None]:
# Same cleaning as the training data: parse the date of birth, keep the year,
# and rename the column to 'birth_year'.
test_transactions = (
    test_transactions
    .assign(dob=lambda frame: pd.to_datetime(frame['dob']).dt.year)
    .rename(columns={'dob': 'birth_year'})
)

In [None]:
# Same feature engineering as the training data. assign() evaluates its
# arguments in order, so 'month' and 'hour' read the parsed timestamp.
test_transactions = test_transactions.assign(
    trans_date_trans_time=lambda frame: pd.to_datetime(frame['trans_date_trans_time']),
    # month: probes a relationship between time of year and fraud
    month=lambda frame: frame['trans_date_trans_time'].dt.month,
    # hour: probes a relationship between time of day and fraud
    hour=lambda frame: frame['trans_date_trans_time'].dt.hour,
)

Make predictions on test data

In [None]:
# Separate the label from the features for the final evaluation.
y_test = test_transactions['is_fraud']
X_test = test_transactions.drop(columns='is_fraud')

In [None]:
# Predict labels for the test set with the fitted gradient-boosting pipeline.
final_pred = gbc_pl.predict(X=X_test)

Plot confusion matrix and print evaluation metrics

In [None]:
# Confusion matrix on the test set: rows are true labels, columns are predictions.
cm = confusion_matrix(y_test, final_pred)
classes = ['Not Fraud', 'Fraud']

fig, ax = plt.subplots()
im = ax.imshow(cm, interpolation='nearest', cmap=plt.cm.Reds)
cbar = fig.colorbar(im, ax=ax)

ax.set(
    xticks=np.arange(cm.shape[1]),
    yticks=np.arange(cm.shape[0]),
    xticklabels=classes,
    yticklabels=classes,
    title='Confusion matrix',
    ylabel='True label',
    xlabel='Predicted label',
)

# Write each count into its cell; use white text on the darker cells so the
# number stays readable.
for i, j in np.ndindex(cm.shape):
    text_color = "white" if cm[i, j] > cm.max() / 2. else "black"
    ax.text(j, i, f"{cm[i, j]:d}", ha="center", va="center", color=text_color)
plt.show()

In [None]:
# Pass (y_true, y_pred) as the API documents. Binary F1 is symmetric, so the
# value is unchanged, but the call now matches the confusion matrix above.
f1_score(y_test, final_pred)

In [None]:
# Pass (y_true, y_pred): with the arguments reversed this call reported recall.
precision_score(y_test, final_pred)

In [None]:
# Pass (y_true, y_pred): with the arguments reversed this call reported precision.
recall_score(y_test, final_pred)