In [1]:
import pandas as pd
from IPython.display import display, HTML

In [2]:
# Columns that identify a row/entity; they are excluded from the model's
# feature set further down the notebook.
identifiers = [
    'user_id',
    'request_id',
    'target_recipient_id',
]

# Columns handed to ``pd.read_csv(parse_dates=...)`` when the data is loaded.
date_cols = [
    'date_user_created',
    'date_request_submitted',
    'date_request_received',
    'date_request_transferred',
    'first_attempt_date',
    'first_success_date',
]

# Names of the categorical columns.
categorical = [
    'addr_country_code',
    'addr_city',
    'recipient_country_code',
    'flag_personal_business',
    'payment_type',
    'payment_status',
    'ccy_send',
    'ccy_target',
    'transfer_to_self',
    'sending_bank_name',
    'sending_bank_country',
    'payment_reference_classification',
    'device',
]

In [3]:
# Load the raw data set; the columns named in ``date_cols`` are parsed as
# datetimes while reading.
ds = pd.read_csv(
    'AML_dataset.csv',
    parse_dates=date_cols,
)

In [4]:
# Upper-case the city names so spellings that differ only by letter case
# collapse into a single value.
ds = ds.assign(addr_city=ds['addr_city'].str.upper())

In [5]:
# Show the dtype of every column after loading.
ds.dtypes

user_id                                     object
request_id                                  object
target_recipient_id                         object
date_user_created                   datetime64[ns]
addr_country_code                           object
addr_city                                   object
recipient_country_code                      object
flag_personal_business                      object
payment_type                                object
date_request_submitted              datetime64[ns]
date_request_received               datetime64[ns]
date_request_transferred            datetime64[ns]
date_request_cancelled                      object
invoice_value                              float64
invoice_value_cancel                       float64
flag_transferred                             int64
payment_status                              object
ccy_send                                    object
ccy_target                                  object
transfer_to_self               

# Isolation Forest Algorithm

I will use the scikit-learn implementation.
For large datasets, we can use the Spark implementation:
##### https://github.com/titicaca/spark-iforest

### Dealing with missing values

Missing values should be investigated with SMEs

In [6]:
import atexit
import inspect
from shutil import rmtree
from tempfile import mkdtemp

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import IsolationForest
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

try:
    from joblib import Memory
except ImportError:
    # Older scikit-learn releases ship joblib as a vendored sub-package.
    from sklearn.externals.joblib import Memory

class dateEncoder(BaseEstimator, TransformerMixin):
    """Expand datetime columns into calendar/clock component features.

    For every column of the input frame six columns are produced, named
    ``<col>_dayofweek``, ``<col>_weekday``, ``<col>_hour``, ``<col>_minute``,
    ``<col>_day`` and ``<col>_month``. The transformer is stateless.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Return a DataFrame of datetime components for each column of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Every column must support the ``.dt`` accessor.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            Indexed like ``X``; six component columns per input column, in
            input-column order.
        """
        print('Processing DateTime Data')
        print('Encoding....')
        encoded = {}
        # ``DataFrame.items()`` is used because ``iteritems()`` no longer
        # exists in pandas 2.x; both iterate (column name, Series) pairs.
        for colname, col in X.items():
            parts = col.dt
            encoded[colname + '_dayofweek'] = parts.dayofweek
            # NOTE(review): ``weekday`` is an alias of ``dayofweek`` in pandas,
            # so this column duplicates the previous one. It is kept so the
            # emitted feature layout stays exactly as before.
            encoded[colname + '_weekday'] = parts.weekday
            encoded[colname + '_hour'] = parts.hour
            encoded[colname + '_minute'] = parts.minute
            encoded[colname + '_day'] = parts.day
            encoded[colname + '_month'] = parts.month

        # Build the frame in one step instead of inserting column by column.
        return pd.DataFrame(encoded, index=X.index)
    
def _supported_kwargs(estimator_cls, **candidates):
    """Return the subset of ``candidates`` accepted by ``estimator_cls.__init__``.

    Some keyword arguments used below exist only in certain scikit-learn
    releases; filtering them avoids a ``TypeError`` where they are absent.
    """
    accepted = inspect.signature(estimator_cls.__init__).parameters
    return {name: value for name, value in candidates.items() if name in accepted}


def prepare_pipeline(ds, random_state=None):
    """Build the (unfitted) preprocessing + Isolation Forest pipeline.

    Columns of ``ds`` are routed by dtype:

    * ``float64`` columns  -> median imputation
    * ``object`` columns   -> constant ``'missing'`` imputation, then one-hot
      encoding (categories unseen at fit time are ignored)
    * datetime columns     -> ``dateEncoder``, then median imputation

    Parameters
    ----------
    ds : pandas.DataFrame
        Only its column names and dtypes are read here; nothing is fitted.
    random_state : int, RandomState instance or None, default=None
        Forwarded to ``IsolationForest``. Pass a fixed value to make the
        fitted model reproducible; ``None`` keeps the previous unseeded
        behaviour.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Steps ``'preprocessor'`` (a ``ColumnTransformer``) and ``'iforce'``
        (the ``IsolationForest``).
    """
    # ``np.float64`` selects exactly the columns the former
    # ``np.float or np.float`` expression selected: the ``or`` was a no-op and
    # ``np.float`` was an alias of the builtin ``float``, since removed from
    # NumPy.
    # NOTE(review): integer columns (``flag_transferred`` is int64 in the
    # dtypes listing) match none of the three selectors, so the
    # ColumnTransformer drops them — confirm this is intended.
    numeric_features = ds.select_dtypes(include=[np.float64]).columns.tolist()
    categorical_features = ds.select_dtypes(include=['object']).columns.tolist()
    date_features = ds.select_dtypes(include=[np.datetime64]).columns.tolist()

    # Transformer results are cached on disk. The directory has to outlive
    # this call (it is used while fitting), so it is removed at interpreter
    # exit instead of being leaked.
    cachedir = mkdtemp()
    atexit.register(rmtree, cachedir, ignore_errors=True)
    # ``location`` is the argument name joblib asks for in place of the
    # deprecated ``cachedir``.
    memory = Memory(location=cachedir, verbose=1)

    # ``verbose`` (SimpleImputer) and ``behaviour`` (IsolationForest) are only
    # passed when the installed scikit-learn still accepts them.
    imputer_extra = _supported_kwargs(SimpleImputer, verbose=1)
    forest_extra = _supported_kwargs(IsolationForest, behaviour='new')

    date_transformer = Pipeline(
        memory=memory,
        steps=[('dateEncoder', dateEncoder()),
               ('imputer', SimpleImputer(strategy='median', **imputer_extra))])
    numeric_transformer = Pipeline(
        memory=memory,
        steps=[('imputer', SimpleImputer(strategy='median', **imputer_extra))])
    categorical_transformer = Pipeline(
        memory=memory,
        steps=[('imputer', SimpleImputer(strategy='constant', fill_value='missing', **imputer_extra)),
               ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, numeric_features),
                                                   ('dates', date_transformer, date_features),
                                                   ('cat', categorical_transformer, categorical_features)])

    Anomaly_Detector = Pipeline(steps=[('preprocessor', preprocessor),
                                       ('iforce', IsolationForest(n_estimators=1000, max_samples=1000,
                                                                  max_features=0.6, contamination=0.25,
                                                                  n_jobs=-1, random_state=random_state,
                                                                  verbose=1, **forest_extra))])

    return Anomaly_Detector

In [7]:
# Build the unfitted detector from every column that is not an identifier.
Anomaly_Detector = prepare_pipeline(
    ds.drop(columns=identifiers, errors='ignore')
)

You provided "cachedir='C:\\Users\\KHALID~1\\AppData\\Local\\Temp\\tmpyy611tct'", use "location='C:\\Users\\KHALID~1\\AppData\\Local\\Temp\\tmpyy611tct'" instead.


In [8]:
# Fit the whole pipeline on the non-identifier columns.
Anomaly_Detector = Anomaly_Detector.fit(
    ds.loc[:, ~ds.columns.isin(identifiers)]
)

________________________________________________________________________________
[Memory] Calling sklearn.pipeline._fit_transform_one...
_fit_transform_one(dateEncoder(),         date_user_created date_request_submitted date_request_received  \
0     2014-01-27 15:02:00    2016-08-26 07:35:00   2016-01-09 08:31:00   
1     2015-12-10 15:35:00    2016-10-23 22:54:00                   NaT   
2     2016-04-10 11:42:00    2016-10-26 13:42:00   2016-10-26 14:06:00   
3     2014-10-17 00:27:00    2015-01-28 23:36:00   2015-01-28 23:36:00   
4     2015-12-08 07:45:00    2015-08-18 08:55:00   2015-08-18 09:12:00   
5     2016-04-19 16:30:00    2016-02-10 11:59:00   2016-02-10 12:00:00   
6     2015-10-15 19:48:00    2016-12-08 20:54:00   2016-12-08 20:54:00   
7     2015-12-03 21:04:00    2015-05-31 09:49:00   2015-05-31 09:52:00   
8     2015-04-30 16:05:00    2015-..., 
None, None)
Processing DateTime Data
Encoding....
________________________________________________fit_transform_one - 0.2s,

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])


________________________________________________fit_transform_one - 3.0s, 0.1min


[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   2 out of  12 | elapsed:   41.8s remaining:  3.5min
[Parallel(n_jobs=12)]: Done  12 out of  12 | elapsed:   43.3s finished


In [9]:
# Score every row with the fitted detector; later cells treat scores below a
# percentile threshold as anomalous.
ds['anomalous_score'] = Anomaly_Detector.decision_function(
    ds.drop(columns=identifiers, errors='ignore')
)

Processing DateTime Data
Encoding....


In [10]:
from scipy import stats

# Fraction of rows to flag as anomalous.
outliers_fraction = 0.015

# Score found at that percentile of the score distribution.
threshold = stats.scoreatpercentile(
    a=ds['anomalous_score'],
    per=100 * outliers_fraction,
)

In [11]:
# Display the score cut-off computed in the previous cell.
threshold

-0.005248514906945759

In [12]:
# 1 = anomalous (score below the threshold), 0 = otherwise.
# The comparison is written as "not (score >= threshold)" so that a missing
# score is labelled 1, exactly as a row-wise check would label it.
ds['anomalous'] = (~(ds['anomalous_score'] >= threshold)).astype('int64')

# Anomalous Transfers count

In [13]:
# Number of rows labelled anomalous.
ds.loc[ds['anomalous'] == 1, 'anomalous'].count()

1500

# Non-Anomalous Transfers count

In [14]:
# Number of rows labelled non-anomalous.
ds.loc[ds['anomalous'] == 0, 'anomalous'].count()

98500

## The sensitivity of the Anomaly Detector can be validated by SMEs

In [15]:
import pickle

filename = "ds.pkl"
# Persist the scored frame. The ``with`` block guarantees the file handle is
# flushed and closed even if pickling fails; the bare ``open()`` used before
# was never closed.
with open(filename, 'wb') as pickle_file:
    pickle.dump(ds, pickle_file)