In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('swift.csv')

In [3]:
df_copy = df.copy()

In [25]:
size = 10000000

In [26]:
df = df_copy.copy()[:size]

In [27]:
df.isna().any()

Transaction Ref    False
Originator         False
Beneficiary        False
Type               False
Currency           False
Value              False
Flag               False
Fulldate           False
Intermediate       False
dtype: bool

In [28]:
%%time
df = df_copy.copy()[:size]
df['Fulldate'] = pd.to_datetime(df['Fulldate'])

df['OriginatorCountry'] = df['Originator'].str.strip().str[4:6]
df['IntermediaryCountry'] = df['Intermediary'].str.strip().str[4:6]
df['BeneficiaryCountry'] = df['Originator'].str.strip().str[4:6]

df['OriginatorCountry_BeneficiaryCountry'] = df['OriginatorCountry']+df['BeneficiaryCountry']
df['OriginatorCountry_IntermediaryCountry'] = df['OriginatorCountry']+df['IntermediaryCountry']
df['IntermediaryCountry_BeneficiaryCountry'] = df['IntermediaryCountry']+df['BeneficiaryCountry']
df['OriginatorCountry_IntermediaryCountry_BeneficiaryCountry'] = df['OriginatorCountry']+df['IntermediaryCountry']+df['BeneficiaryCountry']

Wall time: 6.02 s


# Features Client

In [29]:
def getLatency(x,name_col):
    return (x[name_col]-x[name_col].shift(1)).dt.total_seconds().fillna(0)

In [30]:
def get_circuit(x):
    return sum(df.loc[x.index,'Beneficiary']==df.loc[x.index,'Intermediary'])

In [31]:
def get_circuit(x):
    return sum(x.str[:2]!=x.str[2:4])

In [32]:
def base_features_client(type_client,period,relation):
    
    cols_start = df.columns
    
    groupby = []
        
    if period == '10Days':
        groupby = type_client+[pd.Grouper(key='Fulldate', axis=0, freq='10D')]
    elif period == 'Month':
        groupby = type_client+[pd.Grouper(key='Fulldate', axis=0, freq='30D')]
    elif period == 'Global':
        groupby = type_client

    name = period+'_'+'_'.join(type_client)
    
    print(groupby)
    #FREQUENCY
    #print('frequency')
    
    df['frequency_'+name] = df.groupby(groupby)['Value'].transform('count')
    
    #AMOUNT
    #print('amount')
    
    df['sum_value_'+name] = df.groupby(groupby)['Value'].transform('sum')
    df['max_value_'+name] = df.groupby(groupby)['Value'].transform('max')
    df['min_value_'+name] = df.groupby(groupby)['Value'].transform('min')
    df['avg_value_'+name] = df.groupby(groupby)['Value'].transform('sum')/df['frequency_'+name]
    
    if relation == False:
        # LATENCY
        #print('latency')
        df['latency_'+name] = df.groupby(groupby, as_index=False).apply(getLatency,name_col='Fulldate').reset_index(level=0, drop=True)


        # NB RELATION
        #print('nb relation')
        if type_client[0] == 'Originator':
            df['number_relation_'+name] = df.groupby(['Originator'])['Beneficiary'].transform('nunique')
        elif type_client[0] =='Beneficiary':
            df['number_relation_'+name] = df.groupby(['Beneficiary'])['Originator'].transform('nunique')
        
    # NB OF DISTINCT CURRENCY
    #print('currency 1')
    
    df['number_distinct_currency_'+name] = df.groupby(['Originator'])['Currency'].transform('nunique')
    
    # NB WITH CURRENT CURRENCY
    #print('currency 2')
        
    df['frequency_with_currency_'+name] = df.groupby(groupby+['Currency'])['Value'].transform('count')
    
    if relation == False:
    
        # NB OF DISTINCT COUNTRIES
        #print('country 1')

        if type_client[0] == 'Originator' :
            df['number_distinct_country_'+name] = df.groupby(['Originator'])['BeneficiaryCountry'].transform('nunique')
        elif type_client[0] =='Beneficiary':
            df['number_distinct_country_'+name] = df.groupby(['Beneficiary'])['OriginatorCountry'].transform('nunique')

        # NB WITH CURRENT COUNTRY
       # print('country 2')

        if type_client[0] == 'Originator':
            df['frequency_to_Beneficiary_Country_'+name] = df.groupby(groupby+['BeneficiaryCountry'])['Value'].transform('count')

        elif type_client[0] =='Beneficiary':
            df['frequency_to_Originator_Country_'+name] = df.groupby(groupby+['OriginatorCountry'])['Value'].transform('count')


        # NB WITH INTERMEDIATE
        #print('intermediary 1')

        if type_client[0] == 'Originator':
            df['number_with_intermediary_'+name] = df.groupby(['Originator'])['IntermediaryCountry_BeneficiaryCountry'].transform(get_circuit)
        elif type_client[0] =='Beneficiary' :
            df['number_with_intermediary_'+name] = df.groupby(['Beneficiary'])['IntermediaryCountry_BeneficiaryCountry'].transform(get_circuit)

        # NB DISTINCT CIRCRUIT
        #print('intermediary 2')

        if type_client[0] == 'Originator':
            df['number_dinstinct_circuit_with_intermediary_'+name] = df.groupby(['Originator'])['IntermediaryCountry_BeneficiaryCountry'].transform('nunique')
        elif type_client[0] =='Beneficiary':
            df['number_dinstinct_circuit_with_intermediary_'+name] = df.groupby(['Beneficiary'])['OriginatorCountry_IntermediaryCountry'].transform('nunique')

        # NB CURRENT CIRCRUIT
        #print('intermediary 3')

        if type_client[0] == 'Originator':
            df['number_current_circuit_with_intermediary_'+name] = df.groupby(['Originator','IntermediaryCountry_BeneficiaryCountry'])['Value'].transform('count')
        elif type_client[0] =='Beneficiary':
            df['number_current_circuit_with_intermediary_'+name] = df.groupby(['Beneficiary','OriginatorCountry_IntermediaryCountry'])['Value'].transform('count')

    
    cols_end = df.columns
    
    new_cols = list(set(cols_end) - set(cols_start))
    
    return new_cols

In [33]:
%%time 

array = []
name_configuration = []

array.append(base_features_client(['Beneficiary'],'Global',False))
array.append(base_features_client(['Beneficiary'],'10Days',False))
array.append(base_features_client(['Beneficiary'],'Month',False))
name_configuration.append('Beneficiary-Global')
name_configuration.append('Beneficiary-10Days')
name_configuration.append('Beneficiary-Month')

array.append(base_features_client(['Originator'],'Global',False))
array.append(base_features_client(['Originator'],'10Days',False))
array.append(base_features_client(['Originator'],'Month',False))
name_configuration.append('Originator-Global')
name_configuration.append('Originator-10Days')
name_configuration.append('Originator-Month')

array.append(base_features_client(['Intermediary'],'Global',False))
array.append(base_features_client(['Intermediary'],'10Days',False))
array.append(base_features_client(['Intermediary'],'Month',False))
name_configuration.append('Intermediary-Global')
name_configuration.append('Intermediary-10Days')
name_configuration.append('Intermediary-Month')

array.append(base_features_client(['Originator','Beneficiary'],'Global',True))
array.append(base_features_client(['Originator','Beneficiary'],'10Days',True))
array.append(base_features_client(['Originator','Beneficiary'],'Month',True))
name_configuration.append('Originator-Beneficiary-Global')
name_configuration.append('Originator-Beneficiary-10Days')
name_configuration.append('Originator-Beneficiary-Month')

array.append(base_features_client(['Originator','Intermediary'],'Global',True))
array.append(base_features_client(['Originator','Intermediary'],'10Days',True))
array.append(base_features_client(['Originator','Intermediary'],'Month',True))
name_configuration.append('Originator-Intermediary-Global')
name_configuration.append('Originator-Intermediary-10Days')
name_configuration.append('Originator-Intermediary-Month')

array.append(base_features_client(['Intermediary','Beneficiary'],'Global',True))
array.append(base_features_client(['Intermediary','Beneficiary'],'10Days',True))
array.append(base_features_client(['Intermediary','Beneficiary'],'Month',True))
name_configuration.append('Intermediary-Beneficiary-Global')
name_configuration.append('Intermediary-Beneficiary-10Days')
name_configuration.append('Intermediary-Beneficiary-Month')

['Beneficiary']
['Beneficiary', TimeGrouper(key='Fulldate', freq=<10 * Days>, axis=0, sort=True, closed='left', label='left', how='mean', convention='e', origin='start_day')]
['Beneficiary', TimeGrouper(key='Fulldate', freq=<30 * Days>, axis=0, sort=True, closed='left', label='left', how='mean', convention='e', origin='start_day')]
['Originator']
['Originator', TimeGrouper(key='Fulldate', freq=<10 * Days>, axis=0, sort=True, closed='left', label='left', how='mean', convention='e', origin='start_day')]
['Originator', TimeGrouper(key='Fulldate', freq=<30 * Days>, axis=0, sort=True, closed='left', label='left', how='mean', convention='e', origin='start_day')]
['Intermediate']
['Intermediate', TimeGrouper(key='Fulldate', freq=<10 * Days>, axis=0, sort=True, closed='left', label='left', how='mean', convention='e', origin='start_day')]
['Intermediate', TimeGrouper(key='Fulldate', freq=<30 * Days>, axis=0, sort=True, closed='left', label='left', how='mean', convention='e', origin='start_day')

# Features Category

In [34]:
def base_features_transaction(column,period):
    
    cols_start = df.columns
        
    if period == '10Days':
        groupby = [column,pd.Grouper(key=name_col_date, axis=0, freq='10D')]
    elif period == 'Month':
        groupby = [column,pd.Grouper(key=name_col_date, axis=0, freq='30D')]
    elif period == 'Global':
        groupby = column

    name = period+'_'+'_'.join(column)
    
    #FREQUENCY
    #print('frequency')
    
    df['frequency_'+name] = df.groupby(groupby)['Value'].transform('count')
    
    #AMOUNT
    #print('amount')
    
    df['sum_value_'+name] = df.groupby(groupby)['Value'].transform('sum')
    df['max_value_'+name] = df.groupby(groupby)['Value'].transform('max')
    df['min_value_'+name] = df.groupby(groupby)['Value'].transform('min')
    df['avg_value_'+name] = df.groupby(groupby)['Value'].transform('sum')/df['frequency_'+name]
    
    # LATENCY
    #print('latency')
    df['latency_'+name] = df.groupby(groupby, as_index=False).apply(getLatency,name_col='Fulldate').reset_index(level=0, drop=True)
    
    # NB RELATION
    #print('nb relation')
    df['number_relation_originator_'+name] = df.groupby(groupby)['Originator'].transform('nunique')
    df['number_relation_beneficiary_'+name] = df.groupby(groupby)['Beneficiary'].transform('nunique')
    df['number_relation_intermediary_'+name] = df.groupby(groupby)['Intermediary'].transform('nunique')
        
    # NB OF DISTINCT CURRENCY
    #print('currency ')
    
    if column != 'Currency':
        df['number_distinct_currency_'+name] = df.groupby(groupby)['Currency'].transform('nunique')
        df['frequency_with_currency_'+name] = df.groupby(groupby+['Currency'])['Value'].transform('count')
    
    
    # NB OF DISTINCT COUNTRIES
    #print('country 1')
    
    if column not in ['OriginatorCountry','BeneficiaryCountry','IntermediaryCountry']:
        df['number_distinct_originator_country_'+name] = df.groupby(groupby)['OriginatorCountry'].transform('nunique')
        df['number_distinct_beneficiary_country_'+name] = df.groupby(groupby)['BeneficiaryCountry'].transform('nunique')
        df['number_distinct_intermediary_country_'+name] = df.groupby(groupby)['IntermediaryCountry'].transform('nunique')
        
        df['number_current_originator_country_'+name] = df.groupby(groupby)['OriginatorCountry'].transform('count')
        df['number_current_beneficiary_country_'+name] = df.groupby(groupby)['BeneficiaryCountry'].transform('count')
        df['number_current_intermediary_country_'+name] = df.groupby(groupby)['IntermediaryCountry'].transform('count')
            
    
    # NB WITH INTERMEDIATE
    #print('intermediary 1')
    df['number_with_intermediary_'+name] = df.groupby(groupby)['IntermediaryCountry_BeneficiaryCountry'].transform(get_circuit)
    
    # NB DISTINCT CIRCRUIT
    #print('intermediary 2')
    
    df['number_dinstinct_circuit_with_intermediary_'+name] = df.groupby(groupby)['IntermediaryCountry_BeneficiaryCountry'].transform('nunique')
        
    # NB CURRENT CIRCRUIT
    #print('intermediary 3')
    
    df['number_current_circuit_intermediaryCountry_beneficiaryCountry_'+name] = df.groupby(groupby+['IntermediaryCountry_BeneficiaryCountry'])['Value'].transform('count')
    df['number_current_circuit_originatorCountry_intermediaryCountry_'+name] = df.groupby(groupby+['OriginatorCountry_IntermediaryCountry'])['Value'].transform('count')
    df['number_current_circuit_originatorCountry_beneficiaryCountry_'+name] = df.groupby(groupby+['OriginatorCountry_BeneficiaryCountry'])['Value'].transform('count')
    df['number_current_circuit_originatorCountry_intermediaryCountry_beneficiaryCountry_'+name] = df.groupby(groupby+['OriginatorCountry_IntermediaryCountry_BeneficiaryCountry'])['Value'].transform('count')
    
    cols_end = df.columns
    
    new_cols = list(set(cols_end) - set(cols_start))
    
    return new_cols
        

In [35]:
%%time


base_feature = []
array.append(base_features_transaction(['Currency'],'Global'))
name_configuration.append('Currency-Global')
array.append(base_features_transaction(['OriginatorCountry'],'Global'))
name_configuration.append('OriginatorCountry-Global')
array.append(base_features_transaction(['BeneficiaryCountry'],'Global'))
name_configuration.append('BeneficiaryCountry-Global')
array.append(base_features_transaction(['IntermediaryCountry'],'Global'))
name_configuration.append('IntermediaryCountry-Global')
array.append(base_features_transaction(['Type'],'Global'))
name_configuration.append('Type-Global')


Wall time: 6min 48s


# Transaction Features

In [36]:


base_feature.extend(list(df_copy.select_dtypes(include=np.number).columns))
cols_start = df.columns

df['Hour'] = df['Fulldate'].dt.hour
df['Week_of_year'] = df['Fulldate'].dt.week
df['day_of_week'] = df['Fulldate'].dt.dayofweek
df['day_of_year'] = df['Fulldate'].dt.dayofyear
df['quarter'] = df['Fulldate'].dt.quarter
df['year'] = df['Fulldate'].dt.year
df['month'] = df['Fulldate'].dt.month
df['day'] = df['Fulldate'].dt.day
df['is_month_start'] = df['Fulldate'].dt.is_month_start
df['is_month_start'] = df['is_month_start'].map({True: 1, False:0})
df['is_month_end'] = df['Fulldate'].dt.is_month_end
df['is_month_end'] = df['is_month_end'].map({True: 1, False:0})
cols_end = df.columns          
new_cols = list(set(cols_end) - set(cols_start))

base_feature.extend(new_cols)

  """


In [37]:
df.isna().any()

Transaction Ref    False
Originator         False
Beneficiary        False
Type               False
Currency           False
                   ...  
year               False
month              False
day                False
is_month_start     False
is_month_end       False
Length: 312, dtype: bool

# Classification

In [38]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import MinMaxScaler
from itertools import combinations
from imblearn.ensemble import BalancedRandomForestClassifier

In [39]:
X = df.select_dtypes(include=np.number)
y = df['Flag']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [40]:
%%time

model = DecisionTreeClassifier()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

report = classification_report(y_test,y_pred,output_dict=True )
CM = confusion_matrix(y_test, y_pred)


TN = CM[0][0]
FN = CM[1][0]
TP = CM[1][1]
FP = CM[0][1]
print('les fraudes : ')
print('bien trouvée TP : ',TP)
print('pas trouvée FN : ',FN) 
print('les normaux')
print('bien trouvée TN : ',TN)
print('pas trouvée FP : ',FP)
report

les fraudes : 
bien trouvée TP :  1768
pas trouvée FN :  877
les normaux
bien trouvée TN :  731623
pas trouvée FP :  1091
Wall time: 28min 13s


{'False': {'precision': 0.9988027303754267,
  'recall': 0.9985110152119381,
  'f1-score': 0.9986568514906354,
  'support': 732714},
 'True': {'precision': 0.6183980412731724,
  'recall': 0.6684310018903592,
  'f1-score': 0.6424418604651163,
  'support': 2645},
 'accuracy': 0.9973237561517572,
 'macro avg': {'precision': 0.8086003858242996,
  'recall': 0.8334710085511486,
  'f1-score': 0.8205493559778758,
  'support': 735359},
 'weighted avg': {'precision': 0.9974344593640222,
  'recall': 0.9973237561517572,
  'f1-score': 0.9973755879836101,
  'support': 735359}}

In [41]:
%%time

modelRF = RandomForestClassifier(n_estimators=30,bootstrap = True,max_features = 'sqrt')

modelRF.fit(X_train, y_train)

y_pred = modelRF.predict(X_test)

report = classification_report(y_test,y_pred,output_dict=True )
report
CM = confusion_matrix(y_test, y_pred)

TN = CM[0][0]
FN = CM[1][0]
TP = CM[1][1]
FP = CM[0][1]
print('les fraudes : ')
print('bien trouvée TP : ',TP)
print('pas trouvée FN : ',FN)
print('les normaux')
print('bien trouvée TN : ',TN)
print('pas trouvée FP : ',FP)
report

les fraudes : 
bien trouvée TP :  1592
pas trouvée FN :  1053
les normaux
bien trouvée TN :  732619
pas trouvée FP :  95
Wall time: 36min 59s


{'False': {'precision': 0.9985647537319129,
  'recall': 0.9998703450459524,
  'f1-score': 0.9992171229130666,
  'support': 732714},
 'True': {'precision': 0.943687018375815,
  'recall': 0.601890359168242,
  'f1-score': 0.7349953831948292,
  'support': 2645},
 'accuracy': 0.9984388577551917,
 'macro avg': {'precision': 0.971125886053864,
  'recall': 0.8008803521070972,
  'f1-score': 0.867106253053948,
  'support': 735359},
 'weighted avg': {'precision': 0.9983673649598751,
  'recall': 0.9984388577551917,
  'f1-score': 0.9982667483320051,
  'support': 735359}}

In [42]:
%%time

import xgboost as xgb


xgb_model = xgb.XGBClassifier(objective="binary:logistic", random_state=42)
xgb_model.fit(X_train, y_train)


y_pred = xgb_model.predict(X_test)

report = classification_report(y_test,y_pred,output_dict=True )
report
CM = confusion_matrix(y_test, y_pred)

TN = CM[0][0]
FN = CM[1][0]
TP = CM[1][1]
FP = CM[0][1]
print('les fraudes : ')
print('bien trouvée TP : ',TP)
print('pas trouvée FN : ',FN)
print('les normaux')
print('bien trouvée TN : ',TN)
print('pas trouvée FP : ',FP)
report




les fraudes : 
bien trouvée TP :  1733
pas trouvée FN :  912
les normaux
bien trouvée TN :  732544
pas trouvée FP :  170
Wall time: 47min 59s


{'False': {'precision': 0.9987565716280186,
  'recall': 0.9997679858717043,
  'f1-score': 0.9992620228213644,
  'support': 732714},
 'True': {'precision': 0.9106673673147662,
  'recall': 0.6551984877126654,
  'f1-score': 0.7620932277924363,
  'support': 2645},
 'accuracy': 0.9985286098354681,
 'macro avg': {'precision': 0.9547119694713924,
  'recall': 0.8274832367921849,
  'f1-score': 0.8806776253069004,
  'support': 735359},
 'weighted avg': {'precision': 0.9984397251008006,
  'recall': 0.9985286098354681,
  'f1-score': 0.9984089545066345,
  'support': 735359}}

# Approche

In [43]:
%%time
y_pred_array = []
y_pred_array_f1_score = []
models = []
models_f1_score = []



for idx,cols in enumerate(name_configuration):
    model = DecisionTreeClassifier()
    model.fit(X_train[array[idx]+base_feature], y_train)
    models.append(model)
    
    y_pred = model.predict(X_test[array[idx]+base_feature])
    
    report = classification_report(y_test,y_pred,output_dict=True )

    models_f1_score.append(report['True']['f1-score'])
    y_pred_array.append(np.multiply(y_pred, 1))
    y_pred_array_f1_score.append(np.multiply(y_pred, round(report['True']['f1-score'],2)))
    
    print(name_configuration[idx], 'base & ',round(report['True']['precision'],2),'&',round(report['True']['recall'],2),'&',round(report['True']['f1-score'],2),'\\\ \hline')

Beneficiary-Global base &  0.51 & 0.59 & 0.55 \\ \hline
Beneficiary-10Days base &  0.48 & 0.56 & 0.52 \\ \hline
Beneficiary-Month base &  0.47 & 0.55 & 0.51 \\ \hline
Originator-Global base &  0.2 & 0.26 & 0.22 \\ \hline
Originator-10Days base &  0.21 & 0.29 & 0.24 \\ \hline
Originator-Month base &  0.21 & 0.29 & 0.24 \\ \hline
Intermediate-Global base &  0.34 & 0.41 & 0.37 \\ \hline
Intermediate-10Days base &  0.18 & 0.23 & 0.2 \\ \hline
Intermediate-Month base &  0.26 & 0.33 & 0.29 \\ \hline
Originator-Beneficiary-Global base &  0.57 & 0.66 & 0.61 \\ \hline
Originator-Beneficiary-10Days base &  0.31 & 0.38 & 0.35 \\ \hline
Originator-Beneficiary-Month base &  0.49 & 0.57 & 0.53 \\ \hline
Originator-Intermediate-Global base &  0.41 & 0.51 & 0.45 \\ \hline
Originator-Intermediate-10Days base &  0.23 & 0.3 & 0.26 \\ \hline
Originator-Intermediate-Month base &  0.29 & 0.36 & 0.32 \\ \hline
Intermediate-Beneficiary-Global base &  0.53 & 0.63 & 0.57 \\ \hline
Intermediate-Beneficiary-10Day

In [44]:
import warnings
warnings.filterwarnings('ignore')

In [52]:
%%time

for alpha in range(10):
    maxF1 = 0
    alphaMax = 0
    threshold = 0
    listf1 = []
    listIndex = []
    y_pred_final = np.sum(y_pred_array_f1_score, 0)/(alpha+np.sum(y_pred_array, 0))
    y_pred_final = (y_pred_final - np.min(y_pred_final)) / (np.max(y_pred_final) - np.min(y_pred_final))
    y_pred_final[np.isnan(y_pred_final)] = 0
    for i in np.arange(0,1,0.005):
        y_pred = np.where(y_pred_final>=i, 1, 0)
        report = classification_report(y_test,y_pred,output_dict=True )
        listf1.append(report['True']['f1-score'])
        listIndex.append(i)
        # print( i,' : ',report['True'])
    if np.max(listf1) > maxF1:
        maxF1 = np.max(listf1)
        alphaMax = alpha
        threshold = listIndex[np.argmax(listf1)]
    print(alpha,':',np.max(listf1))
print('----------------')
print('best alpha : ',alphaMax)
print('best f1-score : ',maxF1)
print('best threshold : ',threshold)

0 : 0.007167982829361359
1 : 0.6570871030690083
2 : 0.7345454545454545
3 : 0.7692947784480965
4 : 0.7782340862422998
5 : 0.7817509247842171
6 : 0.7840837696335078
7 : 0.7848750267036958
8 : 0.7860394537177542
9 : 0.7870129870129869
----------------
best alpha :  9
best f1-score :  0.7870129870129869
best threshold :  0.615
Wall time: 19min 5s


In [275]:
y_pred_final = np.sum(y_pred_array_f1_score, 0)/(alphaMax+np.sum(y_pred_array, 0))
y_pred_final = (y_pred_final - np.min(y_pred_final)) / (np.max(y_pred_final) - np.min(y_pred_final))
y_pred_final[np.isnan(y_pred_final)] = 0

y_pred = np.where(y_pred_final>=threshold, 1, 0)
report = classification_report(y_test,y_pred,output_dict=True )
report

{'False': {'precision': 0.9989457591011812,
  'recall': 0.9996478844405866,
  'f1-score': 0.9992966984391616,
  'support': 732714},
 'True': {'precision': 0.8788732394366198,
  'recall': 0.7077504725897921,
  'f1-score': 0.7840837696335078,
  'support': 2645},
 'accuracy': 0.9985979637156818,
 'macro avg': {'precision': 0.9389094992689004,
  'recall': 0.8536991785151893,
  'f1-score': 0.8916902340363346,
  'support': 735359},
 'weighted avg': {'precision': 0.9985138723431314,
  'recall': 0.9985979637156818,
  'f1-score': 0.9985226027978613,
  'support': 735359}}

# Scores 

In [140]:
arr_conf = []
for i in np.transpose(y_pred_array):
    arr_conf.append([name_configuration[j] for j,x in enumerate(i) if x>0])

In [141]:
new_df = df.loc[X_test.index] 
new_df['Fraudulent Score'] = y_pred_final
new_df['Decision Trees'] = arr_conf
new_df = new_df.sort_values(by=['Fraudulent Score'], ascending=False)

In [142]:
new_df

Unnamed: 0,Transaction Ref,Originator,Beneficiary,Type,Currency,Value,Flag,Fulldate,Intermediate,Fraudulent Score,Decision Trees
3169062,780C11AEADE3,ATCUIDXX,RKGANGXX,MT103,67B,59804,True,2019-11-11 19:26:00,RKGANGXX,1.000000,"[Beneficiary-Global, Beneficiary-10Days, Benef..."
385491,AAA3EB8BA850AD7D,BBWJTLXX,OIKTQAXX,MT103,B6E,60414,True,2019-02-07 11:33:00,OIKTQAXX,0.999213,"[Beneficiary-Global, Beneficiary-10Days, Benef..."
2529248,BED46C41AC,BEMINIXX,GMHUSGXX,MT103,385,831863,True,2019-09-09 13:11:00,PRCFNEXX,0.998568,"[Beneficiary-Global, Beneficiary-10Days, Benef..."
1593214,7E5511214D1E,ARPWSTXX,XWUYTMXX,MT103,A42,74613,True,2019-06-07 08:21:00,TIYOBGXX,0.998527,"[Beneficiary-Global, Beneficiary-10Days, Benef..."
3224905,DCB7A95011CD,BQUSERXX,DBZKBBXX,MT103,3A1,809220,True,2019-11-15 17:40:00,DBZKBBXX,0.998527,"[Beneficiary-Global, Beneficiary-10Days, Benef..."
...,...,...,...,...,...,...,...,...,...,...,...
3516530,CC93E87D1C861B35,AFUEGYXX,VQRRPGXX,MT103,57D,249038,False,2019-12-16 13:29:00,WCWXECXX,0.000000,[]
155697,9A9B4A6740A7,CLAFLAXX,ODWYSCXX,MT202,307,362691,False,2019-01-16 09:06:00,ODWYSCXX,0.000000,[]
1060162,EED49B861D,BCFTZWXX,RLLVKMXX,MT103,EF9,125795,False,2019-04-16 10:40:00,RLLVKMXX,0.000000,[]
2609413,708886A70D48C9EB,CWBECAXX,JTDKSMXX,MT202,41B,73891,False,2019-09-17 10:30:00,SQMLAEXX,0.000000,[]


# Get Path

In [159]:
index_most_fraudulent_transaction = new_df.iloc[0].name

In [170]:
first_decision_tree = new_df.iloc[0]['Decision Trees'][0]

In [173]:
index_model = name_configuration.index(first_decision_tree)

In [213]:
models[index_model].predict([X_test[array[index_model]+base_feature].loc[index_most_fraudulent_transaction]])

array([ True])

In [324]:
index_loc = list(X_test_model.index).index(index_most_fraudulent_transaction)

In [325]:
estimator = models[index_model]

n_nodes = estimator.tree_.node_count
children_left = estimator.tree_.children_left
children_right = estimator.tree_.children_right
feature = estimator.tree_.feature
threshold = estimator.tree_.threshold

X_test_model = X_test[array[index_model]+base_feature]
node_indicator = estimator.decision_path(X_test_model)

leave_id = estimator.apply(X_test_model)


# HERE IS WHAT YOU WANT
sample_id = index_loc
node_index = node_indicator.indices[node_indicator.indptr[sample_id]:
                                    node_indicator.indptr[sample_id + 1]]

print('Rules used to predict sample %s: ' % sample_id)
for node_id in node_index:

    if leave_id[sample_id] == node_id:  # <-- changed != to ==
        #continue # <-- comment out
        print("leaf node {} reached, no decision here".format(leave_id[sample_id])) # <--

    else: # < -- added else to iterate through decision nodes
        if (X_test_model.iloc[[sample_id], feature[node_id]].iloc[0] <= threshold[node_id]):
            threshold_sign = "<="
        else:
            threshold_sign = ">"

        print("%s : %s %s %s"
              % (X_test_model.columns[feature[node_id]],
                  X_test_model.iloc[[sample_id], feature[node_id]].iloc[0],
                 threshold_sign,
                 threshold[node_id]))

Rules used to predict sample 20397: 
frequency_Global_Beneficiary : 1597 > 1.5
frequency_to_Originator_Country_Global_Beneficiary : 216 > 156.5
Value : 59804 <= 469121.0
latency_Global_Beneficiary : 240.00000000000003 <= 450.0
frequency_Global_Beneficiary : 1597 > 425.5
frequency_Global_Beneficiary : 1597 > 985.5
frequency_to_Originator_Country_Global_Beneficiary : 216 <= 513.5
Hour : 19 <= 19.5
Value : 59804 <= 223063.0
Hour : 19 > 18.5
frequency_to_Originator_Country_Global_Beneficiary : 216 <= 217.5
leaf node 6220 reached, no decision here
