In [1]:
# 443901, m.joo@wustl.edu, Joo, Michael
# 472926, michael.kincheloe@wustl.edu, Kincheloe, Michael
# 457870, jfeibelman@wustl.edu, Feibelman, Jason
# 463959, m.fishman@wustl.edu, Fishman, Matthew 
# 458013, mattwitzerman@wustl.edu, Witzerman, Matt

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

/kaggle/input/suspicious-transaction-detection/sampleSubmission.csv
/kaggle/input/suspicious-transaction-detection/train.csv
/kaggle/input/suspicious-transaction-detection/test.csv


In [3]:
# Read the labelled training set and the unlabelled test set.
train = pd.read_csv('/kaggle/input/suspicious-transaction-detection/train.csv')
test = pd.read_csv('/kaggle/input/suspicious-transaction-detection/test.csv')

# N_train is used later to split the stacked train+test frame back apart.
N_train, dim = train.shape
N_test = test.shape[0]

# Separate the label column from the predictor columns.
y_train = train['Target'].copy()
X_train = train.drop(columns=['Target'])

Upon inspection of the CSV files, we can categorize the features as follows:

0. Miscellaneous
* Timestamp - Time (Time preprocessing - divide into date and time maybe?)
* Goods - Categorical (one-hot encoding) +5 columns
* Amount - Numerical (no empty values)

1. T
* T_0, T_2 ~ T_14 - Numerical (fill null with median)
* T_1 - Numerical (no empty values)

2. C
* C_0 ~ C_4, C_6 ~ C_8 - Binary Categorical (fill null with mode)
* C_5 - Categorical Text (one-hot encoding) +3 columns
* C_23 - Discrete Numerical (fill null with mode)
* C_28 - Discrete Numerical (fill null with median)
* C_9 ~ C_22, C_26, C_27 - Discrete Numerical (no empty values) 
* C_24, C_25 - Categorical Text (no empty values, one-hot encoding) +8 columns

3. V
* V - Mostly discrete numerical, some are continuous (fill null with median)

4. O
* O_0, O_1, O_5, O_6, O_9, O_18, O_31, O_36 - Numerical (fill null with median)
* O_2, O_3, O_11, O_15, O_20 ~ O_22, O_24, O_26, O_27, O_29, O_32 ~ O_35, O_37, O_38 - Discrete Numerical (fill null with mode)
* O_4, O_7, O_8, O_16, O_17, O_23, O_25, O_39 - Text Categorical (one-hot encoding) +20 columns
* O_10, O_12, O_28, O_30 - Binary Categorical (fill null with mode; note that the code below lists these under the mode-filled features and does not one-hot encode them)
* O_13, O_14, O_19 - Text Categorical

5. A
* A_0 - Numerical (fill null with median)
* A_1 - Discrete Numerical (fill null with mode)

6. E
* E_0, E_1 - Text Categorical

7. M
* M_0 - Text Categorical (one-hot encoding) +2 columns
* M_1 - Numerical (fill null with median)

Using the above analysis, we first parse the text categorical features so that they contain only the relevant information and then apply one-hot encoding.

For O_13 and O_19, we keep only the first word (stored as the new features `browser` and `os`, respectively), since browser and OS version numbers should carry little information about credit card fraud except in very extreme cases, which we ignore. For E_0 and E_1, we generate a new feature, E_same, that is 'T' if the two values are the same for a row and 'F' otherwise. Finally, we simply drop O_14, as screen resolution should not be a significant feature in determining fraudulent transactions.

We ignore the timestamp feature as the time of transaction should be negligible. We also drop the transaction ID during training and prediction as it should not affect whether a transaction is fraudulent.

In [4]:
# Columns whose missing values are filled with the training-set median.
med_features = np.array(
    ['Amount']
    + [f'T_{t}' for t in range(15)]
    + [f'C_{c}' for c in range(9, 23)]
    + ['C_26', 'C_27', 'C_28']
    + [f'V_{v}' for v in range(339)]
    + ['O_0', 'O_1', 'O_5', 'O_6', 'O_9', 'O_18', 'O_31', 'O_36']
    + ['A_0', 'M_1']
)

# Columns whose missing values are filled with the training-set mode.
mod_features = np.array(
    [f'C_{c}' for c in (0, 1, 2, 3, 4, 6, 7, 8, 23)]
    + [f'O_{o}' for o in (2, 3, 10, 11, 12, 15, 20, 21, 22, 24, 26, 27,
                          28, 29, 30, 32, 33, 34, 35, 37, 38)]
    + ['A_1']
)

# Columns that are one-hot encoded. 'browser', 'os' and 'E_same' are derived
# columns created during preprocessing, not raw CSV columns.
oh_features = np.array([
    'Goods', 'C_5', 'C_24', 'C_25',
    'O_4', 'O_7', 'O_8', 'browser', 'O_16', 'O_17', 'os', 'O_23', 'O_25', 'O_39',
    'E_same', 'M_0',
])

# Sanity check: total number of feature columns covered by the three groups.
print(med_features.size + mod_features.size + oh_features.size)

429


In [5]:
# Stack train and test predictors so that imputation and one-hot encoding
# produce the same columns for both; they are split apart again by row
# position at the end of this cell.
features = pd.concat([X_train, test]).reset_index(drop=True)
features = features.drop(columns=['TransactionID', 'Timestamp', 'O_14'])

# Keep only the first space-separated token of O_13 and O_19.
features['browser'] = features['O_13'].str.split(' ').str[0]
features['os'] = features['O_19'].str.split(' ').str[0]

# E_same is 'T' when E_0 equals E_1 and 'F' otherwise; a missing E_0 is
# replaced by the placeholder 'empty' before the comparison.
features['E_0'] = features['E_0'].fillna('empty')
features['E_same'] = np.where(features['E_0'] == features['E_1'], 'T', 'F')

# The raw columns are no longer needed once the derived ones exist.
features = features.drop(columns=['O_13', 'O_19', 'E_0', 'E_1'])

# Imputation values are computed on the training rows only (X_train) and then
# applied to the stacked train+test frame.
fill_values = {column: X_train[column].median() for column in med_features}
fill_values.update((column, X_train[column].mode()[0]) for column in mod_features)
for column, fill_value in fill_values.items():
    features[column] = features[column].fillna(fill_value)

# One-hot encode the categorical columns, then map any remaining 'T'/'F'
# values to 1/0.
features_prepared = (
    pd.get_dummies(features, columns=oh_features)
    .replace('T', 1)
    .replace('F', 0)
)

# The first N_train rows came from X_train, the rest from test.
X_train_prepared = features_prepared.iloc[:N_train].copy()
X_test_prepared = features_prepared.iloc[N_train:].copy()

print(X_train_prepared.shape, X_test_prepared.shape, sep='\n')

(472432, 499)
(118108, 499)


With the data prepared, we train a Random Forest classifier model on the training data. To evaluate the model, we apply 5-fold cross validation and observe the mean and standard deviation of the accuracy of our model.

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Fix the seed so that the cross-validation scores, and the submission later
# produced by fitting this same `forest` object, are reproducible across
# kernel restarts. Without it every run yields slightly different numbers.
RANDOM_STATE = 42

# n_jobs=-1 lets the forest build its trees on all available cores.
forest = RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=-1)

# No `scoring` argument is passed, so the classifier's default scorer
# (accuracy) is used: despite its name, `forest_error` holds one accuracy
# value per fold (higher is better), not an error rate.
# NOTE(review): if Target is heavily imbalanced, accuracy can look high even
# for a model that rarely flags fraud; consider scoring='roc_auc' -- confirm
# against the competition's evaluation metric.
forest_error = cross_val_score(forest, X_train_prepared, y_train, cv=5)
print(forest_error.mean())
print(forest_error.std())

0.9789366509222429
0.0001411476086140835


The results seem promising. Note that the `predict` method of a Random Forest model returns the predicted label of a data point. Since we want the probability of a transaction being fraudulent instead of its predicted label, we use `predict_proba`, which returns the class probabilities averaged over all trees in the forest (for fully grown trees this is essentially the fraction of trees that vote for the transaction being fraudulent), and keep the probability of the fraudulent class.

In [7]:
# Fit the forest on the full prepared training set.
forest.fit(X_train_prepared, y_train)

# predict_proba returns one column per class; column 1 is used as the
# fraud probability.
# NOTE(review): assumes forest.classes_ is [0, 1] with 1 = fraudulent -- confirm.
y_pred = forest.predict_proba(X_test_prepared)
fraud_probability = y_pred[:, 1]

# Write the submission file: one row per test transaction.
my_submission = pd.DataFrame({
    'TransactionID': test['TransactionID'],
    'Target': fraud_probability,
})
my_submission.to_csv('submission.csv', index=False)