In [28]:
#Import Libraries
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import numpy as np
import pandas as pd

In [81]:
# Import data
df = pd.read_csv('data/LabeledTransactions.csv')
df = df.drop(df.columns[[0]], axis = 1)
df = df.drop('acceptor_state', axis = 1)
df = df.drop('pin_present', axis = 1)

# Convert amount to float
def s2f(s):
    return float(str(s).replace(",",""))
df['cc_amount'] = df['cc_amount'].apply(s2f)

# Add new features of time derived variables
df['user_transaction_time'] = pd.to_datetime(df['user_transaction_time'], errors='coerce', utc=True)
df['hour'] = df['user_transaction_time'].dt.hour
df['month'] = df['user_transaction_time'].dt.month
df['dayofweek'] = df['user_transaction_time'].dt.dayofweek
df['year'] = df['user_transaction_time'].dt.year

# Remove user_transaction time and date
df = df.drop(['user_transaction_time','date'], axis = 1)

# Convert categorical variable's type to category
cat_var = ['cardholder_presence','card_presence','partial_approval_capable','hour','month','dayofweek','year']
for col in cat_var:
    df[col] = df[col].fillna(-1)
    df[col] = df[col].astype(int)
    df[col] = df[col].astype(str)
    df[col] = df[col].replace('-1', np.nan)
df.dtypes

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


channel                      object
transaction_risk_score      float64
processing_type              object
cc_amount                   float64
ledger_balance              float64
cc_acceptor_state            object
cc_acceptor_country          object
cardholder_presence          object
card_presence                object
partial_approval_capable     object
is_fraud                      int64
hour                         object
month                        object
dayofweek                    object
year                         object
dtype: object

In [77]:
training_set, test_set = np.split(df, [int(.5 *len(df))])

X_train = training_set.drop("is_fraud",axis=1)
y_train = training_set[['is_fraud']].values.flatten()

X_test = test_set.drop("is_fraud",axis=1)
y_test = test_set[['is_fraud']].values.flatten()

In [78]:
numeric_features = ['transaction_risk_score','cc_amount','ledger_balance']
categorical_features = ['channel','processing_type','cc_acceptor_state','cc_acceptor_country','cardholder_presence',
                       'card_presence','partial_approval_capable','hour','month','dayofweek','year']

numeric_transformer = Pipeline(steps=[
       ('imputer', SimpleImputer(strategy='mean'))
      ,('scaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[
       ('imputer', SimpleImputer(strategy='constant'))
      ,('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
   transformers=[
    ('numeric', numeric_transformer, numeric_features)
   ,('categorical', categorical_transformer, categorical_features)
]) 

pipeline = Pipeline(steps = [
               ('preprocessor', preprocessor)
              ,('classifier',RandomForestClassifier(n_estimators=500, max_depth=15))
           ])

In [79]:
model = pipeline.fit(X_train, y_train)
predictions = model.predict(X_test)

In [80]:
print(classification_report(predictions,y_test))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     17029
           1       0.90      1.00      0.94       430

    accuracy                           1.00     17459
   macro avg       0.95      1.00      0.97     17459
weighted avg       1.00      1.00      1.00     17459

