In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('../swift.csv')

In [3]:
df_copy = df.copy()

In [4]:
size = 1000000

In [5]:
df = df_copy.copy()[:size]

In [6]:
df.isna().any()

Transaction Ref    False
Originator         False
Beneficiary        False
Type               False
Currency           False
Value              False
Flag               False
Fulldate           False
Intermediate       False
dtype: bool

In [7]:
%%time
df = df_copy.copy()[:size]
df['Fulldate'] = pd.to_datetime(df['Fulldate'])

df['OriginatorCountry'] = df['Originator'].str.strip().str[4:6]
df['IntermediateCountry'] = df['Intermediate'].str.strip().str[4:6]
df['BeneficiaryCountry'] = df['Originator'].str.strip().str[4:6]

df['OriginatorCountry_BeneficiaryCountry'] = df['OriginatorCountry']+df['BeneficiaryCountry']
df['OriginatorCountry_IntermediateCountry'] = df['OriginatorCountry']+df['IntermediateCountry']
df['IntermediateCountry_BeneficiaryCountry'] = df['IntermediateCountry']+df['BeneficiaryCountry']
df['OriginatorCountry_IntermediateCountry_BeneficiaryCountry'] = df['OriginatorCountry']+df['IntermediateCountry']+df['BeneficiaryCountry']

CPU times: user 1.87 s, sys: 238 ms, total: 2.11 s
Wall time: 2.12 s


# Features Client

In [8]:
def getLatency(x,name_col):
    return (x[name_col]-x[name_col].shift(1)).dt.total_seconds().fillna(0)

In [9]:
def get_circuit(x):
    return sum(df.loc[x.index,'Beneficiary']==df.loc[x.index,'Intermediate'])

In [10]:
def get_circuit(x):
    return sum(x.str[:2]!=x.str[2:4])

In [11]:
def base_features_client(type_client,period,relationship):
    
    cols_start = df.columns
    
    groupby = []
        
    if period == '10Days':
        groupby = type_client+[pd.Grouper(key='Fulldate', axis=0, freq='10D')]
    elif period == 'Month':
        groupby = type_client+[pd.Grouper(key='Fulldate', axis=0, freq='30D')]
    elif period == 'Global':
        groupby = type_client

    name = period+'_'+'_'.join(type_client)
    
    print(groupby)
    #FREQUENCY
    
    df['frequency_'+name] = df.groupby(groupby)['Value'].transform('count')
    
    #AMOUNT
    
    df['sum_value_'+name] = df.groupby(groupby)['Value'].transform('sum')
    df['max_value_'+name] = df.groupby(groupby)['Value'].transform('max')
    df['min_value_'+name] = df.groupby(groupby)['Value'].transform('min')
    df['avg_value_'+name] = df.groupby(groupby)['Value'].transform('sum')/df['frequency_'+name]
    
    if relationship == False:
        # LATENCY
        #print('latency')
        df['latency_'+name] = df.groupby(groupby, as_index=False).apply(getLatency,name_col='Fulldate').reset_index(level=0, drop=True)


        # NB RELATION
        #print('nb relation')
        if type_client[0] == 'Originator':
            df['number_relation_'+name] = df.groupby(['Originator'])['Beneficiary'].transform('nunique')
        elif type_client[0] =='Beneficiary':
            df['number_relation_'+name] = df.groupby(['Beneficiary'])['Originator'].transform('nunique')
        
    # NB OF DISTINCT CURRENCY
    #print('currency 1')
    
    df['number_distinct_currency_'+name] = df.groupby(['Originator'])['Currency'].transform('nunique')
    
    # NB WITH CURRENT CURRENCY
    #print('currency 2')
        
    df['frequency_with_currency_'+name] = df.groupby(groupby+['Currency'])['Value'].transform('count')
    
    if relationship == False:
    
        # NB OF DISTINCT COUNTRIES
        #print('country 1')

        if type_client[0] == 'Originator' :
            df['number_distinct_country_'+name] = df.groupby(['Originator'])['BeneficiaryCountry'].transform('nunique')
        elif type_client[0] =='Beneficiary':
            df['number_distinct_country_'+name] = df.groupby(['Beneficiary'])['OriginatorCountry'].transform('nunique')

        # NB WITH CURRENT COUNTRY
       # print('country 2')

        if type_client[0] == 'Originator':
            df['frequency_to_Beneficiary_Country_'+name] = df.groupby(groupby+['BeneficiaryCountry'])['Value'].transform('count')

        elif type_client[0] =='Beneficiary':
            df['frequency_to_Originator_Country_'+name] = df.groupby(groupby+['OriginatorCountry'])['Value'].transform('count')


        # NB WITH INTERMEDIATE
        #print('intermediate 1')

        if type_client[0] == 'Originator':
            df['number_with_intermediate_'+name] = df.groupby(['Originator'])['IntermediateCountry_BeneficiaryCountry'].transform(get_circuit)
        elif type_client[0] =='Beneficiary' :
            df['number_with_intermediate_'+name] = df.groupby(['Beneficiary'])['IntermediateCountry_BeneficiaryCountry'].transform(get_circuit)

        # NB DISTINCT CIRCRUIT
        #print('intermediate 2')

        if type_client[0] == 'Originator':
            df['number_dinstinct_circuit_with_intermediate_'+name] = df.groupby(['Originator'])['IntermediateCountry_BeneficiaryCountry'].transform('nunique')
        elif type_client[0] =='Beneficiary':
            df['number_dinstinct_circuit_with_intermediate_'+name] = df.groupby(['Beneficiary'])['OriginatorCountry_IntermediateCountry'].transform('nunique')

        # NB CURRENT CIRCRUIT
        #print('intermediate 3')

        if type_client[0] == 'Originator':
            df['number_current_circuit_with_intermediate_'+name] = df.groupby(['Originator','IntermediateCountry_BeneficiaryCountry'])['Value'].transform('count')
        elif type_client[0] =='Beneficiary':
            df['number_current_circuit_with_intermediate_'+name] = df.groupby(['Beneficiary','OriginatorCountry_IntermediateCountry'])['Value'].transform('count')

    
    cols_end = df.columns
    
    new_cols = list(set(cols_end) - set(cols_start))
    
    return new_cols

In [12]:
%%time 

array = []
name_configuration = []

array.append(base_features_client(['Beneficiary'],'Global',False))
array.append(base_features_client(['Beneficiary'],'10Days',False))
array.append(base_features_client(['Beneficiary'],'Month',False))
name_configuration.append('Beneficiary-Global')
name_configuration.append('Beneficiary-10Days')
name_configuration.append('Beneficiary-Month')

array.append(base_features_client(['Originator'],'Global',False))
array.append(base_features_client(['Originator'],'10Days',False))
array.append(base_features_client(['Originator'],'Month',False))
name_configuration.append('Originator-Global')
name_configuration.append('Originator-10Days')
name_configuration.append('Originator-Month')

array.append(base_features_client(['Intermediate'],'Global',False))
array.append(base_features_client(['Intermediate'],'10Days',False))
array.append(base_features_client(['Intermediate'],'Month',False))
name_configuration.append('Intermediate-Global')
name_configuration.append('Intermediate-10Days')
name_configuration.append('Intermediate-Month')

array.append(base_features_client(['Originator','Beneficiary'],'Global',True))
array.append(base_features_client(['Originator','Beneficiary'],'10Days',True))
array.append(base_features_client(['Originator','Beneficiary'],'Month',True))
name_configuration.append('Originator-Beneficiary-Global')
name_configuration.append('Originator-Beneficiary-10Days')
name_configuration.append('Originator-Beneficiary-Month')

array.append(base_features_client(['Originator','Intermediate'],'Global',True))
array.append(base_features_client(['Originator','Intermediate'],'10Days',True))
array.append(base_features_client(['Originator','Intermediate'],'Month',True))
name_configuration.append('Originator-Intermediate-Global')
name_configuration.append('Originator-Intermediate-10Days')
name_configuration.append('Originator-Intermediate-Month')

array.append(base_features_client(['Intermediate','Beneficiary'],'Global',True))
array.append(base_features_client(['Intermediate','Beneficiary'],'10Days',True))
array.append(base_features_client(['Intermediate','Beneficiary'],'Month',True))
name_configuration.append('Intermediate-Beneficiary-Global')
name_configuration.append('Intermediate-Beneficiary-10Days')
name_configuration.append('Intermediate-Beneficiary-Month')

['Beneficiary']
['Beneficiary', TimeGrouper(key='Fulldate', freq=<10 * Days>, axis=0, sort=True, closed='left', label='left', how='mean', convention='e', origin='start_day')]
['Beneficiary', TimeGrouper(key='Fulldate', freq=<30 * Days>, axis=0, sort=True, closed='left', label='left', how='mean', convention='e', origin='start_day')]
['Originator']
['Originator', TimeGrouper(key='Fulldate', freq=<10 * Days>, axis=0, sort=True, closed='left', label='left', how='mean', convention='e', origin='start_day')]
['Originator', TimeGrouper(key='Fulldate', freq=<30 * Days>, axis=0, sort=True, closed='left', label='left', how='mean', convention='e', origin='start_day')]
['Intermediate']
['Intermediate', TimeGrouper(key='Fulldate', freq=<10 * Days>, axis=0, sort=True, closed='left', label='left', how='mean', convention='e', origin='start_day')]
['Intermediate', TimeGrouper(key='Fulldate', freq=<30 * Days>, axis=0, sort=True, closed='left', label='left', how='mean', convention='e', origin='start_day')

# Features Category

In [13]:
def base_features_transaction(column,period):
    
    cols_start = df.columns
        
    if period == '10Days':
        groupby = [column,pd.Grouper(key=name_col_date, axis=0, freq='10D')]
    elif period == 'Month':
        groupby = [column,pd.Grouper(key=name_col_date, axis=0, freq='30D')]
    elif period == 'Global':
        groupby = column

    name = period+'_'+'_'.join(column)
    
    #FREQUENCY
    
    df['frequency_'+name] = df.groupby(groupby)['Value'].transform('count')
    
    #AMOUNT
    
    df['sum_value_'+name] = df.groupby(groupby)['Value'].transform('sum')
    df['max_value_'+name] = df.groupby(groupby)['Value'].transform('max')
    df['min_value_'+name] = df.groupby(groupby)['Value'].transform('min')
    df['avg_value_'+name] = df.groupby(groupby)['Value'].transform('sum')/df['frequency_'+name]
    
    # LATENCY
    df['latency_'+name] = df.groupby(groupby, as_index=False).apply(getLatency,name_col='Fulldate').reset_index(level=0, drop=True)
    
    # NB RELATION
    df['number_relation_originator_'+name] = df.groupby(groupby)['Originator'].transform('nunique')
    df['number_relation_beneficiary_'+name] = df.groupby(groupby)['Beneficiary'].transform('nunique')
    df['number_relation_intermediate_'+name] = df.groupby(groupby)['Intermediate'].transform('nunique')
        
    # NB OF DISTINCT CURRENCY
    
    if column != 'Currency':
        df['number_distinct_currency_'+name] = df.groupby(groupby)['Currency'].transform('nunique')
        df['frequency_with_currency_'+name] = df.groupby(groupby+['Currency'])['Value'].transform('count')
    
    
    # NB OF DISTINCT COUNTRIES
    
    if column not in ['OriginatorCountry','BeneficiaryCountry','IntermediateCountry']:
        df['number_distinct_originator_country_'+name] = df.groupby(groupby)['OriginatorCountry'].transform('nunique')
        df['number_distinct_beneficiary_country_'+name] = df.groupby(groupby)['BeneficiaryCountry'].transform('nunique')
        df['number_distinct_intermediate_country_'+name] = df.groupby(groupby)['IntermediateCountry'].transform('nunique')
        
        df['number_current_originator_country_'+name] = df.groupby(groupby)['OriginatorCountry'].transform('count')
        df['number_current_beneficiary_country_'+name] = df.groupby(groupby)['BeneficiaryCountry'].transform('count')
        df['number_current_intermediate_country_'+name] = df.groupby(groupby)['IntermediateCountry'].transform('count')
            
    
    # NB WITH INTERMEDIATE
    df['number_with_intermediate_'+name] = df.groupby(groupby)['IntermediateCountry_BeneficiaryCountry'].transform(get_circuit)
    
    # NB DISTINCT CIRCRUIT
    
    df['number_dinstinct_circuit_with_intermediate_'+name] = df.groupby(groupby)['IntermediateCountry_BeneficiaryCountry'].transform('nunique')
        
    # NB CURRENT CIRCRUIT
    
    df['number_current_circuit_intermediateCountry_beneficiaryCountry_'+name] = df.groupby(groupby+['IntermediateCountry_BeneficiaryCountry'])['Value'].transform('count')
    df['number_current_circuit_originatorCountry_intermediateCountry_'+name] = df.groupby(groupby+['OriginatorCountry_IntermediateCountry'])['Value'].transform('count')
    df['number_current_circuit_originatorCountry_beneficiaryCountry_'+name] = df.groupby(groupby+['OriginatorCountry_BeneficiaryCountry'])['Value'].transform('count')
    df['number_current_circuit_originatorCountry_intermediateCountry_beneficiaryCountry_'+name] = df.groupby(groupby+['OriginatorCountry_IntermediateCountry_BeneficiaryCountry'])['Value'].transform('count')
    
    cols_end = df.columns
    
    new_cols = list(set(cols_end) - set(cols_start))
    
    return new_cols
        

In [14]:
%%time


base_feature = []
array.extend(base_features_transaction(['Currency'],'Global'))
name_configuration.append('Currency-Global')
array.extend(base_features_transaction(['OriginatorCountry'],'Global'))
name_configuration.append('OriginatorCountry-Global')
array.extend(base_features_transaction(['BeneficiaryCountry'],'Global'))
name_configuration.append('BeneficiaryCountry-Global')
array.extend(base_features_transaction(['IntermediateCountry'],'Global'))
name_configuration.append('IntermediateCountry-Global')
array.extend(base_features_transaction(['Type'],'Global'))
name_configuration.append('Type-Global')


CPU times: user 33.8 s, sys: 3.74 s, total: 37.6 s
Wall time: 37.6 s


# Transaction Features

In [15]:


base_feature.extend(list(df_copy.select_dtypes(include=np.number).columns))
cols_start = df.columns

df['Hour'] = df['Fulldate'].dt.hour
df['Week_of_year'] = df['Fulldate'].dt.week
df['day_of_week'] = df['Fulldate'].dt.dayofweek
df['day_of_year'] = df['Fulldate'].dt.dayofyear
df['quarter'] = df['Fulldate'].dt.quarter
df['year'] = df['Fulldate'].dt.year
df['month'] = df['Fulldate'].dt.month
df['day'] = df['Fulldate'].dt.day
df['is_month_start'] = df['Fulldate'].dt.is_month_start
df['is_month_start'] = df['is_month_start'].map({True: 1, False:0})
df['is_month_end'] = df['Fulldate'].dt.is_month_end
df['is_month_end'] = df['is_month_end'].map({True: 1, False:0})
cols_end = df.columns          
new_cols = list(set(cols_end) - set(cols_start))

base_feature.extend(new_cols)

  df['Week_of_year'] = df['Fulldate'].dt.week


In [16]:
df.isna().any()

Transaction Ref    False
Originator         False
Beneficiary        False
Type               False
Currency           False
                   ...  
year               False
month              False
day                False
is_month_start     False
is_month_end       False
Length: 312, dtype: bool

# Classification

In [17]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import MinMaxScaler
from itertools import combinations
from imblearn.ensemble import BalancedRandomForestClassifier

In [18]:
X = df.select_dtypes(include=np.number)
y = df['Flag']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
%%time

model = DecisionTreeClassifier()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

report = classification_report(y_test,y_pred,output_dict=True )
CM = confusion_matrix(y_test, y_pred)


TN = CM[0][0]
FN = CM[1][0]
TP = CM[1][1]
FP = CM[0][1]
print('les fraudes : ')
print('bien trouvée TP : ',TP)
print('pas trouvée FN : ',FN) 
print('les normaux')
print('bien trouvée TN : ',TN)
print('pas trouvée FP : ',FP)
report

les fraudes : 
bien trouvée TP :  457
pas trouvée FN :  245
les normaux
bien trouvée TN :  198892
pas trouvée FP :  406
CPU times: user 3min 43s, sys: 1.13 s, total: 3min 44s
Wall time: 3min 44s


{'False': {'precision': 0.998769691217604,
  'recall': 0.9979628496021034,
  'f1-score': 0.9983661073951835,
  'support': 199298},
 'True': {'precision': 0.52954808806489,
  'recall': 0.6509971509971509,
  'f1-score': 0.5840255591054313,
  'support': 702},
 'accuracy': 0.996745,
 'macro avg': {'precision': 0.764158889641247,
  'recall': 0.8244800002996271,
  'f1-score': 0.7911958332503074,
  'support': 200000},
 'weighted avg': {'precision': 0.9971227233905379,
  'recall': 0.996745,
  'f1-score': 0.9969117720706864,
  'support': 200000}}

In [20]:
%%time

model = BalancedRandomForestClassifier(n_estimators=30)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

report = classification_report(y_test,y_pred,output_dict=True )
CM = confusion_matrix(y_test, y_pred)


TN = CM[0][0]
FN = CM[1][0]
TP = CM[1][1]
FP = CM[0][1]
print('les fraudes : ')
print('bien trouvée TP : ',TP)
print('pas trouvée FN : ',FN) 
print('les normaux')
print('bien trouvée TN : ',TN)
print('pas trouvée FP : ',FP)
report

les fraudes : 
bien trouvée TP :  650
pas trouvée FN :  52
les normaux
bien trouvée TN :  187457
pas trouvée FP :  11841
CPU times: user 7.22 s, sys: 495 ms, total: 7.72 s
Wall time: 7.71 s


{'False': {'precision': 0.9997226799780278,
  'recall': 0.940586458469227,
  'f1-score': 0.9692534002745556,
  'support': 199298},
 'True': {'precision': 0.05203746697622288,
  'recall': 0.9259259259259259,
  'f1-score': 0.09853710300917153,
  'support': 702},
 'accuracy': 0.940535,
 'macro avg': {'precision': 0.5258800734771253,
  'recall': 0.9332561921975764,
  'f1-score': 0.5338952516418636,
  'support': 200000},
 'weighted avg': {'precision': 0.9963963048803914,
  'recall': 0.940535,
  'f1-score': 0.9661971860711541,
  'support': 200000}}

In [21]:
%%time

modelRF = RandomForestClassifier(n_estimators=30,bootstrap = True,max_features = 'sqrt')

modelRF.fit(X_train, y_train)

y_pred = modelRF.predict(X_test)

report = classification_report(y_test,y_pred,output_dict=True )
report
CM = confusion_matrix(y_test, y_pred)

TN = CM[0][0]
FN = CM[1][0]
TP = CM[1][1]
FP = CM[0][1]
print('les fraudes : ')
print('bien trouvée TP : ',TP)
print('pas trouvée FN : ',FN)
print('les normaux')
print('bien trouvée TN : ',TN)
print('pas trouvée FP : ',FP)
report

les fraudes : 
bien trouvée TP :  416
pas trouvée FN :  286
les normaux
bien trouvée TN :  199260
pas trouvée FP :  38
CPU times: user 3min 46s, sys: 384 ms, total: 3min 47s
Wall time: 3min 47s


{'False': {'precision': 0.9985667465145881,
  'recall': 0.9998093307509358,
  'f1-score': 0.9991876523151909,
  'support': 199298},
 'True': {'precision': 0.9162995594713657,
  'recall': 0.5925925925925926,
  'f1-score': 0.7197231833910035,
  'support': 702},
 'accuracy': 0.99838,
 'macro avg': {'precision': 0.9574331529929769,
  'recall': 0.7962009616717642,
  'f1-score': 0.8594554178530972,
  'support': 200000},
 'weighted avg': {'precision': 0.9982779886880664,
  'recall': 0.99838,
  'f1-score': 0.998206732029267,
  'support': 200000}}

# Approche

## Decision Tree training

In [22]:
%%time
y_pred_array = []
y_pred_array_f1_score = []
models = []
models_f1_score = []



for idx,cols in enumerate(name_configuration):
    model = DecisionTreeClassifier()
    model.fit(X_train[array[idx]+base_feature], y_train)
    models.append(model)
    
    y_pred = model.predict(X_test[array[idx]+base_feature])
    
    report = classification_report(y_test,y_pred,output_dict=True )

    models_f1_score.append(report['True']['f1-score'])
    y_pred_array.append(np.multiply(y_pred, 1))
    y_pred_array_f1_score.append(np.multiply(y_pred, round(report['True']['f1-score'],2)))
    
    print(name_configuration[idx], 'base & ',round(report['True']['precision'],2),'&',round(report['True']['recall'],2),'&',round(report['True']['f1-score'],2),'\\\ \hline')


Beneficiary-Global base &  0.48 & 0.59 & 0.53 \\ \hline
Beneficiary-10Days base &  0.44 & 0.55 & 0.49 \\ \hline
Beneficiary-Month base &  0.44 & 0.54 & 0.48 \\ \hline
Originator-Global base &  0.19 & 0.26 & 0.22 \\ \hline
Originator-10Days base &  0.21 & 0.29 & 0.24 \\ \hline
Originator-Month base &  0.2 & 0.29 & 0.23 \\ \hline
Intermediate-Global base &  0.31 & 0.4 & 0.35 \\ \hline
Intermediate-10Days base &  0.17 & 0.22 & 0.19 \\ \hline
Intermediate-Month base &  0.26 & 0.32 & 0.29 \\ \hline
Originator-Beneficiary-Global base &  0.53 & 0.64 & 0.58 \\ \hline
Originator-Beneficiary-10Days base &  0.32 & 0.42 & 0.36 \\ \hline
Originator-Beneficiary-Month base &  0.47 & 0.56 & 0.51 \\ \hline
Originator-Intermediate-Global base &  0.38 & 0.5 & 0.43 \\ \hline
Originator-Intermediate-10Days base &  0.23 & 0.3 & 0.26 \\ \hline
Originator-Intermediate-Month base &  0.29 & 0.39 & 0.34 \\ \hline
Intermediate-Beneficiary-Global base &  0.39 & 0.47 & 0.42 \\ \hline
Intermediate-Beneficiary-10Days

TypeError: can only concatenate str (not "list") to str

In [23]:
import warnings
warnings.filterwarnings('ignore')

In [24]:
%%time

for alpha in range(10):
    maxF1 = 0
    alphaMax = 0
    threshold = 0
    listf1 = []
    listIndex = []
    y_pred_final = np.sum(y_pred_array_f1_score, 0)/(alpha+np.sum(y_pred_array, 0))
    y_pred_final = (y_pred_final - np.min(y_pred_final)) / (np.max(y_pred_final) - np.min(y_pred_final))
    y_pred_final[np.isnan(y_pred_final)] = 0
    for i in np.arange(0,1,0.005):
        y_pred = np.where(y_pred_final>=i, 1, 0)
        report = classification_report(y_test,y_pred,output_dict=True )
        listf1.append(report['True']['f1-score'])
        listIndex.append(i)
        # print( i,' : ',report['True'])
    if np.max(listf1) > maxF1:
        maxF1 = np.max(listf1)
        alphaMax = alpha
        threshold = listIndex[np.argmax(listf1)]
    print(alpha,':',np.max(listf1))
print('----------------')
print('best alpha : ',alphaMax)
print('best f1-score : ',maxF1)
print('best threshold : ',threshold)

0 : 0.006995445984594076
1 : 0.6550569323509712
2 : 0.7285382830626449
3 : 0.7449062754686226
4 : 0.7567114093959731
5 : 0.7616666666666666
6 : 0.7654941373534337
7 : 0.7675585284280936
8 : 0.7675585284280936
9 : 0.7679465776293823
----------------
best alpha :  9
best f1-score :  0.7679465776293823
best threshold :  0.63
CPU times: user 5min 17s, sys: 7.94 ms, total: 5min 17s
Wall time: 5min 17s


In [25]:
y_pred_final = np.sum(y_pred_array_f1_score, 0)/(alphaMax+np.sum(y_pred_array, 0))
y_pred_final = (y_pred_final - np.min(y_pred_final)) / (np.max(y_pred_final) - np.min(y_pred_final))
y_pred_final[np.isnan(y_pred_final)] = 0

y_pred = np.where(y_pred_final>=threshold, 1, 0)
report = classification_report(y_test,y_pred,output_dict=True )
report

{'False': {'precision': 0.998786991739514,
  'recall': 0.9998193659745708,
  'f1-score': 0.9993029122221053,
  'support': 199298},
 'True': {'precision': 0.9274193548387096,
  'recall': 0.6552706552706553,
  'f1-score': 0.7679465776293823,
  'support': 702},
 'accuracy': 0.99861,
 'macro avg': {'precision': 0.9631031732891118,
  'recall': 0.827545010622613,
  'f1-score': 0.8836247449257438,
  'support': 200000},
 'weighted avg': {'precision': 0.9985364913339924,
  'recall': 0.99861,
  'f1-score': 0.998490851487685,
  'support': 200000}}

# Scores 

In [26]:
arr_conf = []
for i in np.transpose(y_pred_array):
    arr_conf.append([name_configuration[j] for j,x in enumerate(i) if x>0])

In [27]:
new_df = df.loc[X_test.index] 
new_df['Fraudulent Score'] = y_pred_final
new_df['Decision Trees'] = arr_conf
new_df = new_df.sort_values(by=['Fraudulent Score'], ascending=False)

In [28]:
new_df

Unnamed: 0,Transaction Ref,Originator,Beneficiary,Type,Currency,Value,Flag,Fulldate,Intermediate,OriginatorCountry,...,day_of_week,day_of_year,quarter,year,month,day,is_month_start,is_month_end,Fraudulent Score,Decision Trees
537076,EB1C7121B88E60B6,AXNZSHXX,QCCYGGXX,MT103,D9A,598900,True,2019-02-22 09:20:00,QCCYGGXX,SH,...,4,53,1,2019,2,22,0,0,1.000000,"[Beneficiary-Global, Beneficiary-10Days, Benef..."
834719,761AABAECBCAACB8,CHLXHKXX,MCRXSGXX,MT103,F30,256332,True,2019-03-25 10:29:00,CWTBSYXX,HK,...,0,84,1,2019,3,25,0,0,0.997853,"[Beneficiary-Global, Beneficiary-10Days, Benef..."
299112,6E3C3B5031B7,CLAFLAXX,EEKJASXX,MT103,D9A,809492,True,2019-01-30 10:32:00,EEKJASXX,LA,...,2,30,1,2019,1,30,0,0,0.997853,"[Beneficiary-Global, Beneficiary-10Days, Benef..."
346019,CA916A9EA947CD9D,BBWJTLXX,XYJAMAXX,MT103,A26,74608,True,2019-02-04 13:21:00,NLULBMXX,TL,...,0,35,1,2019,2,4,0,0,0.997853,"[Beneficiary-Global, Beneficiary-10Days, Benef..."
951584,ECC2A36180AA,BQUSERXX,XJOAAEXX,MT103,0A6,491710,True,2019-04-04 13:02:00,XJOAAEXX,ER,...,3,94,2,2019,4,4,0,0,0.995074,"[Beneficiary-Global, Beneficiary-10Days, Benef..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
599036,098EEDBBEEBABBC4,CHLXHKXX,KROEMYXX,MT202,4D8,500181,False,2019-02-28 12:39:00,KROEMYXX,HK,...,3,59,1,2019,2,28,0,1,0.000000,[]
175619,9A5DCDAD1D50,BEOWBIXX,URUKSBXX,MT202,5C9,208900,False,2019-01-17 12:43:00,URUKSBXX,BI,...,3,17,1,2019,1,17,0,0,0.000000,[]
26619,EED84A465E,BCFTZWXX,SUZEHKXX,MT103,505,146032,False,2019-01-02 16:14:00,NHDFPGXX,ZW,...,2,2,1,2019,1,2,0,0,0.000000,[]
723470,155550D40C71D2AC,CWBECAXX,CROKBSXX,MT202C,738,112595,False,2019-03-13 11:09:00,JKEETMXX,CA,...,2,72,1,2019,3,13,0,0,0.000000,[]
