In [16]:
import pickle
filename = "ds.pkl"
ds = pickle.load(open(filename, 'rb'))

In [17]:
identifiers = ['user_id', 'request_id', 'target_recipient_id']
date_cols = ['date_user_created', 'date_request_submitted', 'date_request_received','date_request_transferred', \
             'first_attempt_date', 'first_success_date']
categorical = ['addr_country_code', 'addr_city', 'recipient_country_code', 'flag_personal_business', 'payment_type', \
               'payment_status', 'ccy_send', 'ccy_target', 'transfer_to_self', 'sending_bank_name',  'sending_bank_country',\
               'payment_reference_classification', 'device']

# Imbalanced Dataset

In [18]:
ds['anomalous'].value_counts()

0    98500
1     1500
Name: anomalous, dtype: int64

## We should do Over-sampling 

In [4]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import IsolationForest
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import TruncatedSVD
from imblearn.over_sampling import SMOTE

from sklearn.model_selection import train_test_split

from tempfile import mkdtemp
from shutil import rmtree
from sklearn.externals.joblib import Memory

class dateEncoder(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        print('Processing DateTime Data')
        print('Encoding....')
        retX = pd.DataFrame()

        for colname, col in X.iteritems():
            retX[colname+'_dayofweek'] = col.dt.dayofweek
            retX[colname+'_weekday'] = col.dt.weekday
            retX[colname+'_hour'] = col.dt.hour
            retX[colname+'_minute'] = col.dt.minute
            retX[colname+'_day'] = col.dt.day
            retX[colname+'_month'] = col.dt.month
            #retX[colname+'_year'] = col.dt.year
    
        return retX
    
def prepare_pipeline(ds):
    
    numeric_features = ds.select_dtypes(include=[np.float or np.float]).columns.tolist()
    categorical_features = ds.select_dtypes(include=['object']).columns.tolist()
    date_features = ds.select_dtypes(include=[np.datetime64]).columns.tolist()
    
    cachedir = mkdtemp()
    memory = Memory(cachedir=cachedir, verbose=1)

    date_transformer = Pipeline(memory=memory, steps=[('dateEncoder', dateEncoder()), ('imputer', SimpleImputer(strategy='median', verbose=1))])
    numeric_transformer = Pipeline(memory=memory, steps=[('imputer', SimpleImputer(strategy='median', verbose=1))])
    categorical_transformer = Pipeline(memory=memory, steps=[('imputer', SimpleImputer(strategy='constant', fill_value='missing', verbose=1)),
                                                             ('onehot', OneHotEncoder(handle_unknown='ignore'))])    

    preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, numeric_features),
                                                   ('dates', date_transformer, date_features),
                                                   ('cat', categorical_transformer, categorical_features)])

    proc = Pipeline(steps=[('preprocessor', preprocessor)])
    
    return proc


class risk_estimator(IsolationForest):
    def __init__(self, n_estimators=1000, max_samples=1000,max_features=0.6, \
                 contamination=0.25, n_jobs=-1, behaviour='new', bootstrap=True, verbose=1):

                                    
        super(risk_estimator, self).__init__(n_estimators=n_estimators, max_samples=max_samples, \
                                             max_features=max_features, bootstrap=bootstrap, \
                                             contamination=contamination, n_jobs=n_jobs, behaviour=behaviour, verbose=verbose)
        self.preprocessor = None
        
    def fit(self, X, y):
        self.preprocessor = prepare_pipeline(X)
        print('Start Data pre-processing ...')
        self.preprocessor = self.preprocessor.fit(X)
        X = self.preprocessor.transform(X)
        print('Size of the Dataset after processing', X.shape)
        print('SMOTE Over-sampling the minority class ...')
        self.resampler = SMOTE()
        X, y = self.resampler.fit_sample(X, y)
        print('dataset has been added into ') 
        print('Size of the Dateset after resampling', X.shape)
        super(risk_estimator, self).fit(X, y)
        return self
    
    def predict(self, X):
        y_pred = super(risk_estimator, self).predict(self.preprocessor.transform(X))
        return np.array(list(map(lambda p: 1 if -1 else 0, y_pred)))

In [5]:
X = ds[[c for c in ds.columns.tolist() if c not in identifiers+['anomalous_score', 'anomalous']]]
y = ds['anomalous'].values

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
X, X_holdout, y, y_holdout = train_test_split(X, y, stratify=y, test_size=0.1, shuffle=True, random_state=4891)

In [8]:
restimator = risk_estimator()
restimator.fit(X, y)

You provided "cachedir='C:\\Users\\KHALID~1\\AppData\\Local\\Temp\\tmpjagsx631'", use "location='C:\\Users\\KHALID~1\\AppData\\Local\\Temp\\tmpjagsx631'" instead.


Start Data pre-processing ...
________________________________________________________________________________
[Memory] Calling sklearn.pipeline._fit_transform_one...
_fit_transform_one(dateEncoder(),         date_user_created date_request_submitted date_request_received  \
97033 2013-09-08 01:04:00    2013-11-08 17:01:00                   NaT   
93828 2014-10-23 12:10:00    2016-09-11 08:23:00                   NaT   
785   2015-11-20 14:56:00    2016-07-03 16:15:00   2016-07-03 16:42:00   
42891 2016-03-02 21:01:00    2016-11-08 03:03:00   2016-11-08 03:05:00   
75317 2015-11-16 09:11:00    2016-05-14 06:18:00   2016-05-14 06:21:00   
13086 2016-09-30 11:20:00    2016-10-27 05:58:00   2016-10-27 06:47:00   
30749 2015-08-17 20:54:00    2015-08-17 21:13:00   2015-08-17 21:15:00   
50350 2015-02-09 11:15:00    2016-03-18 10:40:00   2016-03-18 10:44:00   
24448 2014-03-29 07:28:00    2015-..., 
None, None)
Processing DateTime Data
Encoding....
___________________________________________

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])


________________________________________________fit_transform_one - 2.8s, 0.0min
Processing DateTime Data
Encoding....
Size of the Dataset after processing (90000, 35074)
SMOTE Over-sampling the minority class ...
dataset has been added into 
Size of the Dateset after resampling (177300, 35074)


[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   2 out of  12 | elapsed:  1.6min remaining:  8.2min
[Parallel(n_jobs=12)]: Done  12 out of  12 | elapsed:  1.7min finished


risk_estimator(behaviour='new', bootstrap=True, contamination=0.25,
        max_features=0.6, max_samples=1000, n_estimators=1000, n_jobs=-1,
        verbose=1)

In [9]:
from sklearn.metrics import classification_report

In [10]:
y_pred = restimator.predict(X_holdout)

Processing DateTime Data
Encoding....


In [11]:
print(classification_report(y_holdout, y_pred, target_names=['0','1']))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      9850
           1       0.01      1.00      0.03       150

   micro avg       0.01      0.01      0.01     10000
   macro avg       0.01      0.50      0.01     10000
weighted avg       0.00      0.01      0.00     10000



  'precision', 'predicted', average, warn_for)


In [12]:
y_pred = restimator.predict(X)

Processing DateTime Data
Encoding....


In [14]:
print(classification_report(y, y_pred, target_names=['0','1']))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00     88650
           1       0.01      1.00      0.03      1350

   micro avg       0.01      0.01      0.01     90000
   macro avg       0.01      0.50      0.01     90000
weighted avg       0.00      0.01      0.00     90000



  'precision', 'predicted', average, warn_for)


In [15]:
import pickle
filename = "restimator.pkl"
modeling_cols = [c for c in ds.columns.tolist() if c not in identifiers+['anomalous_score', 'anomalous']]
pickle.dump({'est': restimator, 'cols': modeling_cols}, open(filename, 'wb'))