In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('data.csv')

In [3]:
df_copy = df.copy()

In [4]:
size = 1000000

In [5]:
df = df_copy.copy()[:size]

In [6]:
df.isna().any()

Transaction Ref    False
Originator         False
Beneficiary        False
Type               False
Currency           False
Value              False
Flag               False
Fulldate           False
Intermediate       False
dtype: bool

In [7]:
%%time
df = df_copy.copy()[:size]
df['Fulldate'] = pd.to_datetime(df['Fulldate'])

df['OriginatorCountry'] = df['Originator'].str.strip().str[4:6]
df['IntermediateCountry'] = df['Intermediate'].str.strip().str[4:6]
df['BeneficiaryCountry'] = df['Originator'].str.strip().str[4:6]

df['OriginatorCountry_BeneficiaryCountry'] = df['OriginatorCountry']+df['BeneficiaryCountry']
df['OriginatorCountry_IntermediateCountry'] = df['OriginatorCountry']+df['IntermediateCountry']
df['IntermediateCountry_BeneficiaryCountry'] = df['IntermediateCountry']+df['BeneficiaryCountry']
df['OriginatorCountry_IntermediateCountry_BeneficiaryCountry'] = df['OriginatorCountry']+df['IntermediateCountry']+df['BeneficiaryCountry']

CPU times: user 1.76 s, sys: 233 ms, total: 1.99 s
Wall time: 2.01 s


# Features Client

In [8]:
def getLatency(x,name_col):
    return (x[name_col]-x[name_col].shift(1)).dt.total_seconds().fillna(0)

In [9]:
def get_circuit(x):
    return sum(df.loc[x.index,'Beneficiary']==df.loc[x.index,'Intermediate'])

In [10]:
def get_circuit(x):
    return sum(x.str[:2]!=x.str[2:4])

In [11]:
def base_features_client(type_client,period,relationship):
    
    cols_start = df.columns
    
    groupby = []
        
    if period == '10Days':
        groupby = type_client+[pd.Grouper(key='Fulldate', axis=0, freq='10D')]
    elif period == 'Month':
        groupby = type_client+[pd.Grouper(key='Fulldate', axis=0, freq='30D')]
    elif period == 'Global':
        groupby = type_client

    name = period+'_'+'_'.join(type_client)
    
    print(groupby)
    #FREQUENCY
    
    df['frequency_'+name] = df.groupby(groupby)['Value'].transform('count')
    
    #AMOUNT
    
    df['sum_value_'+name] = df.groupby(groupby)['Value'].transform('sum')
    df['max_value_'+name] = df.groupby(groupby)['Value'].transform('max')
    df['min_value_'+name] = df.groupby(groupby)['Value'].transform('min')
    df['avg_value_'+name] = df.groupby(groupby)['Value'].transform('sum')/df['frequency_'+name]
    
    if relationship == False:
        # LATENCY
        #print('latency')
        df['latency_'+name] = df.groupby(groupby, as_index=False).apply(getLatency,name_col='Fulldate').reset_index(level=0, drop=True)


        # NB RELATION
        #print('nb relation')
        if type_client[0] == 'Originator':
            df['number_relation_'+name] = df.groupby(['Originator'])['Beneficiary'].transform('nunique')
        elif type_client[0] =='Beneficiary':
            df['number_relation_'+name] = df.groupby(['Beneficiary'])['Originator'].transform('nunique')
        
    # NB OF DISTINCT CURRENCY
    #print('currency 1')
    
    df['number_distinct_currency_'+name] = df.groupby(['Originator'])['Currency'].transform('nunique')
    
    # NB WITH CURRENT CURRENCY
    #print('currency 2')
        
    df['frequency_with_currency_'+name] = df.groupby(groupby+['Currency'])['Value'].transform('count')
    
    if relationship == False:
    
        # NB OF DISTINCT COUNTRIES
        #print('country 1')

        if type_client[0] == 'Originator' :
            df['number_distinct_country_'+name] = df.groupby(['Originator'])['BeneficiaryCountry'].transform('nunique')
        elif type_client[0] =='Beneficiary':
            df['number_distinct_country_'+name] = df.groupby(['Beneficiary'])['OriginatorCountry'].transform('nunique')

        # NB WITH CURRENT COUNTRY
       # print('country 2')

        if type_client[0] == 'Originator':
            df['frequency_to_Beneficiary_Country_'+name] = df.groupby(groupby+['BeneficiaryCountry'])['Value'].transform('count')

        elif type_client[0] =='Beneficiary':
            df['frequency_to_Originator_Country_'+name] = df.groupby(groupby+['OriginatorCountry'])['Value'].transform('count')


        # NB WITH INTERMEDIATE
        #print('intermediate 1')

        if type_client[0] == 'Originator':
            df['number_with_intermediate_'+name] = df.groupby(['Originator'])['IntermediateCountry_BeneficiaryCountry'].transform(get_circuit)
        elif type_client[0] =='Beneficiary' :
            df['number_with_intermediate_'+name] = df.groupby(['Beneficiary'])['IntermediateCountry_BeneficiaryCountry'].transform(get_circuit)

        # NB DISTINCT CIRCRUIT
        #print('intermediate 2')

        if type_client[0] == 'Originator':
            df['number_dinstinct_circuit_with_intermediate_'+name] = df.groupby(['Originator'])['IntermediateCountry_BeneficiaryCountry'].transform('nunique')
        elif type_client[0] =='Beneficiary':
            df['number_dinstinct_circuit_with_intermediate_'+name] = df.groupby(['Beneficiary'])['OriginatorCountry_IntermediateCountry'].transform('nunique')

        # NB CURRENT CIRCRUIT
        #print('intermediate 3')

        if type_client[0] == 'Originator':
            df['number_current_circuit_with_intermediate_'+name] = df.groupby(['Originator','IntermediateCountry_BeneficiaryCountry'])['Value'].transform('count')
        elif type_client[0] =='Beneficiary':
            df['number_current_circuit_with_intermediate_'+name] = df.groupby(['Beneficiary','OriginatorCountry_IntermediateCountry'])['Value'].transform('count')

    
    cols_end = df.columns
    
    new_cols = list(set(cols_end) - set(cols_start))
    
    return new_cols

In [12]:
%%time 

array = []
name_configuration = []

array.append(base_features_client(['Beneficiary'],'Global',False))
array.append(base_features_client(['Beneficiary'],'10Days',False))
array.append(base_features_client(['Beneficiary'],'Month',False))
name_configuration.append('Beneficiary-Global')
name_configuration.append('Beneficiary-10Days')
name_configuration.append('Beneficiary-Month')

array.append(base_features_client(['Originator'],'Global',False))
array.append(base_features_client(['Originator'],'10Days',False))
array.append(base_features_client(['Originator'],'Month',False))
name_configuration.append('Originator-Global')
name_configuration.append('Originator-10Days')
name_configuration.append('Originator-Month')

array.append(base_features_client(['Intermediate'],'Global',False))
array.append(base_features_client(['Intermediate'],'10Days',False))
array.append(base_features_client(['Intermediate'],'Month',False))
name_configuration.append('Intermediate-Global')
name_configuration.append('Intermediate-10Days')
name_configuration.append('Intermediate-Month')

array.append(base_features_client(['Originator','Beneficiary'],'Global',True))
array.append(base_features_client(['Originator','Beneficiary'],'10Days',True))
array.append(base_features_client(['Originator','Beneficiary'],'Month',True))
name_configuration.append('Originator-Beneficiary-Global')
name_configuration.append('Originator-Beneficiary-10Days')
name_configuration.append('Originator-Beneficiary-Month')

array.append(base_features_client(['Originator','Intermediate'],'Global',True))
array.append(base_features_client(['Originator','Intermediate'],'10Days',True))
array.append(base_features_client(['Originator','Intermediate'],'Month',True))
name_configuration.append('Originator-Intermediate-Global')
name_configuration.append('Originator-Intermediate-10Days')
name_configuration.append('Originator-Intermediate-Month')

array.append(base_features_client(['Intermediate','Beneficiary'],'Global',True))
array.append(base_features_client(['Intermediate','Beneficiary'],'10Days',True))
array.append(base_features_client(['Intermediate','Beneficiary'],'Month',True))
name_configuration.append('Intermediate-Beneficiary-Global')
name_configuration.append('Intermediate-Beneficiary-10Days')
name_configuration.append('Intermediate-Beneficiary-Month')

['Beneficiary']
['Beneficiary', TimeGrouper(key='Fulldate', freq=<10 * Days>, axis=0, sort=True, closed='left', label='left', how='mean', convention='e', origin='start_day')]
['Beneficiary', TimeGrouper(key='Fulldate', freq=<30 * Days>, axis=0, sort=True, closed='left', label='left', how='mean', convention='e', origin='start_day')]
['Originator']
['Originator', TimeGrouper(key='Fulldate', freq=<10 * Days>, axis=0, sort=True, closed='left', label='left', how='mean', convention='e', origin='start_day')]
['Originator', TimeGrouper(key='Fulldate', freq=<30 * Days>, axis=0, sort=True, closed='left', label='left', how='mean', convention='e', origin='start_day')]
['Intermediate']
['Intermediate', TimeGrouper(key='Fulldate', freq=<10 * Days>, axis=0, sort=True, closed='left', label='left', how='mean', convention='e', origin='start_day')]
['Intermediate', TimeGrouper(key='Fulldate', freq=<30 * Days>, axis=0, sort=True, closed='left', label='left', how='mean', convention='e', origin='start_day')

# Features Category

In [13]:
def base_features_transaction(column,period):
    
    cols_start = df.columns
        
    if period == '10Days':
        groupby = [column,pd.Grouper(key=name_col_date, axis=0, freq='10D')]
    elif period == 'Month':
        groupby = [column,pd.Grouper(key=name_col_date, axis=0, freq='30D')]
    elif period == 'Global':
        groupby = column

    name = period+'_'+'_'.join(column)
    
    #FREQUENCY
    
    df['frequency_'+name] = df.groupby(groupby)['Value'].transform('count')
    
    #AMOUNT
    
    df['sum_value_'+name] = df.groupby(groupby)['Value'].transform('sum')
    df['max_value_'+name] = df.groupby(groupby)['Value'].transform('max')
    df['min_value_'+name] = df.groupby(groupby)['Value'].transform('min')
    df['avg_value_'+name] = df.groupby(groupby)['Value'].transform('sum')/df['frequency_'+name]
    
    # LATENCY
    df['latency_'+name] = df.groupby(groupby, as_index=False).apply(getLatency,name_col='Fulldate').reset_index(level=0, drop=True)
    
    # NB RELATION
    df['number_relation_originator_'+name] = df.groupby(groupby)['Originator'].transform('nunique')
    df['number_relation_beneficiary_'+name] = df.groupby(groupby)['Beneficiary'].transform('nunique')
    df['number_relation_intermediate_'+name] = df.groupby(groupby)['Intermediate'].transform('nunique')
        
    # NB OF DISTINCT CURRENCY
    
    if column != 'Currency':
        df['number_distinct_currency_'+name] = df.groupby(groupby)['Currency'].transform('nunique')
        df['frequency_with_currency_'+name] = df.groupby(groupby+['Currency'])['Value'].transform('count')
    
    
    # NB OF DISTINCT COUNTRIES
    
    if column not in ['OriginatorCountry','BeneficiaryCountry','IntermediateCountry']:
        df['number_distinct_originator_country_'+name] = df.groupby(groupby)['OriginatorCountry'].transform('nunique')
        df['number_distinct_beneficiary_country_'+name] = df.groupby(groupby)['BeneficiaryCountry'].transform('nunique')
        df['number_distinct_intermediate_country_'+name] = df.groupby(groupby)['IntermediateCountry'].transform('nunique')
        
        df['number_current_originator_country_'+name] = df.groupby(groupby)['OriginatorCountry'].transform('count')
        df['number_current_beneficiary_country_'+name] = df.groupby(groupby)['BeneficiaryCountry'].transform('count')
        df['number_current_intermediate_country_'+name] = df.groupby(groupby)['IntermediateCountry'].transform('count')
            
    
    # NB WITH INTERMEDIATE
    df['number_with_intermediate_'+name] = df.groupby(groupby)['IntermediateCountry_BeneficiaryCountry'].transform(get_circuit)
    
    # NB DISTINCT CIRCRUIT
    
    df['number_dinstinct_circuit_with_intermediate_'+name] = df.groupby(groupby)['IntermediateCountry_BeneficiaryCountry'].transform('nunique')
        
    # NB CURRENT CIRCRUIT
    
    df['number_current_circuit_intermediateCountry_beneficiaryCountry_'+name] = df.groupby(groupby+['IntermediateCountry_BeneficiaryCountry'])['Value'].transform('count')
    df['number_current_circuit_originatorCountry_intermediateCountry_'+name] = df.groupby(groupby+['OriginatorCountry_IntermediateCountry'])['Value'].transform('count')
    df['number_current_circuit_originatorCountry_beneficiaryCountry_'+name] = df.groupby(groupby+['OriginatorCountry_BeneficiaryCountry'])['Value'].transform('count')
    df['number_current_circuit_originatorCountry_intermediateCountry_beneficiaryCountry_'+name] = df.groupby(groupby+['OriginatorCountry_IntermediateCountry_BeneficiaryCountry'])['Value'].transform('count')
    
    cols_end = df.columns
    
    new_cols = list(set(cols_end) - set(cols_start))
    
    return new_cols
        

In [14]:
%%time


base_feature = []
array.extend(base_features_transaction(['Currency'],'Global'))
name_configuration.append('Currency-Global')
array.extend(base_features_transaction(['OriginatorCountry'],'Global'))
name_configuration.append('OriginatorCountry-Global')
array.extend(base_features_transaction(['BeneficiaryCountry'],'Global'))
name_configuration.append('BeneficiaryCountry-Global')
array.extend(base_features_transaction(['IntermediateCountry'],'Global'))
name_configuration.append('IntermediateCountry-Global')
array.extend(base_features_transaction(['Type'],'Global'))
name_configuration.append('Type-Global')


CPU times: user 31.5 s, sys: 3.7 s, total: 35.2 s
Wall time: 35.2 s


# Transaction Features

In [15]:


base_feature.extend(list(df_copy.select_dtypes(include=np.number).columns))
cols_start = df.columns

df['Hour'] = df['Fulldate'].dt.hour
df['Week_of_year'] = df['Fulldate'].dt.week
df['day_of_week'] = df['Fulldate'].dt.dayofweek
df['day_of_year'] = df['Fulldate'].dt.dayofyear
df['quarter'] = df['Fulldate'].dt.quarter
df['year'] = df['Fulldate'].dt.year
df['month'] = df['Fulldate'].dt.month
df['day'] = df['Fulldate'].dt.day
df['is_month_start'] = df['Fulldate'].dt.is_month_start
df['is_month_start'] = df['is_month_start'].map({True: 1, False:0})
df['is_month_end'] = df['Fulldate'].dt.is_month_end
df['is_month_end'] = df['is_month_end'].map({True: 1, False:0})
cols_end = df.columns          
new_cols = list(set(cols_end) - set(cols_start))

base_feature.extend(new_cols)

  df['Week_of_year'] = df['Fulldate'].dt.week


In [16]:
df.isna().any()

Transaction Ref    False
Originator         False
Beneficiary        False
Type               False
Currency           False
                   ...  
year               False
month              False
day                False
is_month_start     False
is_month_end       False
Length: 312, dtype: bool

# Classification

In [17]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import MinMaxScaler
from itertools import combinations
from imblearn.ensemble import BalancedRandomForestClassifier

In [18]:
X = df.select_dtypes(include=np.number)
y = df['Flag']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
%%time

model = DecisionTreeClassifier()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

report = classification_report(y_test,y_pred,output_dict=True )
CM = confusion_matrix(y_test, y_pred)


TN = CM[0][0]
FN = CM[1][0]
TP = CM[1][1]
FP = CM[0][1]
print('les fraudes : ')
print('bien trouvée TP : ',TP)
print('pas trouvée FN : ',FN) 
print('les normaux')
print('bien trouvée TN : ',TN)
print('pas trouvée FP : ',FP)
report

les fraudes : 
bien trouvée TP :  456
pas trouvée FN :  246
les normaux
bien trouvée TN :  198905
pas trouvée FP :  393
CPU times: user 3min 23s, sys: 1.8 s, total: 3min 25s
Wall time: 3min 26s


{'False': {'precision': 0.9987647563908792,
  'recall': 0.9980280785557306,
  'f1-score': 0.9983962815818336,
  'support': 199298},
 'True': {'precision': 0.5371024734982333,
  'recall': 0.6495726495726496,
  'f1-score': 0.5880077369439072,
  'support': 702},
 'accuracy': 0.996805,
 'macro avg': {'precision': 0.7679336149445563,
  'recall': 0.8238003640641901,
  'f1-score': 0.7932020092628704,
  'support': 200000},
 'weighted avg': {'precision': 0.9971443217779261,
  'recall': 0.996805,
  'f1-score': 0.9969558177901544,
  'support': 200000}}

In [20]:
%%time

model = BalancedRandomForestClassifier(n_estimators=30)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

report = classification_report(y_test,y_pred,output_dict=True )
CM = confusion_matrix(y_test, y_pred)


TN = CM[0][0]
FN = CM[1][0]
TP = CM[1][1]
FP = CM[0][1]
print('les fraudes : ')
print('bien trouvée TP : ',TP)
print('pas trouvée FN : ',FN) 
print('les normaux')
print('bien trouvée TN : ',TN)
print('pas trouvée FP : ',FP)
report

les fraudes : 
bien trouvée TP :  649
pas trouvée FN :  53
les normaux
bien trouvée TN :  187342
pas trouvée FP :  11956
CPU times: user 7.22 s, sys: 912 ms, total: 8.13 s
Wall time: 8.14 s


{'False': {'precision': 0.9997171749513061,
  'recall': 0.9400094331102169,
  'f1-score': 0.9689443563757296,
  'support': 199298},
 'True': {'precision': 0.05148750495834986,
  'recall': 0.9245014245014245,
  'f1-score': 0.09754264672728638,
  'support': 702},
 'accuracy': 0.939955,
 'macro avg': {'precision': 0.525602339954828,
  'recall': 0.9322554288058207,
  'f1-score': 0.533243501551508,
  'support': 200000},
 'weighted avg': {'precision': 0.9963888888096307,
  'recall': 0.939955,
  'f1-score': 0.9658857363748636,
  'support': 200000}}

In [21]:
%%time

modelRF = RandomForestClassifier(n_estimators=30,bootstrap = True,max_features = 'sqrt')

modelRF.fit(X_train, y_train)

y_pred = modelRF.predict(X_test)

report = classification_report(y_test,y_pred,output_dict=True )
report
CM = confusion_matrix(y_test, y_pred)

TN = CM[0][0]
FN = CM[1][0]
TP = CM[1][1]
FP = CM[0][1]
print('les fraudes : ')
print('bien trouvée TP : ',TP)
print('pas trouvée FN : ',FN)
print('les normaux')
print('bien trouvée TN : ',TN)
print('pas trouvée FP : ',FP)
report

les fraudes : 
bien trouvée TP :  409
pas trouvée FN :  293
les normaux
bien trouvée TN :  199257
pas trouvée FP :  41
CPU times: user 3min 39s, sys: 469 ms, total: 3min 39s
Wall time: 3min 39s


{'False': {'precision': 0.9985316963167126,
  'recall': 0.9997942779154834,
  'f1-score': 0.999162588254172,
  'support': 199298},
 'True': {'precision': 0.9088888888888889,
  'recall': 0.5826210826210826,
  'f1-score': 0.7100694444444444,
  'support': 702},
 'accuracy': 0.99833,
 'macro avg': {'precision': 0.9537102926028007,
  'recall': 0.791207680268283,
  'f1-score': 0.8546160163493082,
  'support': 200000},
 'weighted avg': {'precision': 0.9982170500626409,
  'recall': 0.99833,
  'f1-score': 0.9981478713193999,
  'support': 200000}}

# Approche

## Decision Tree training

In [22]:
%%time
y_pred_array = []
y_pred_array_f1_score = []
models = []
models_f1_score = []



for idx,cols in enumerate(name_configuration):
    model = DecisionTreeClassifier()
    model.fit(X_train[array[idx]+base_feature], y_train)
    models.append(model)
    
    y_pred = model.predict(X_test[array[idx]+base_feature])
    
    report = classification_report(y_test,y_pred,output_dict=True )

    models_f1_score.append(report['True']['f1-score'])
    y_pred_array.append(np.multiply(y_pred, 1))
    y_pred_array_f1_score.append(np.multiply(y_pred, round(report['True']['f1-score'],2)))
    
    print(name_configuration[idx], 'base & ',round(report['True']['precision'],2),'&',round(report['True']['recall'],2),'&',round(report['True']['f1-score'],2),'\\\ \hline')


Beneficiary-Global base &  0.47 & 0.58 & 0.52 \\ \hline
Beneficiary-10Days base &  0.43 & 0.55 & 0.48 \\ \hline
Beneficiary-Month base &  0.43 & 0.54 & 0.48 \\ \hline
Originator-Global base &  0.19 & 0.26 & 0.22 \\ \hline
Originator-10Days base &  0.21 & 0.3 & 0.25 \\ \hline
Originator-Month base &  0.2 & 0.29 & 0.24 \\ \hline
Intermediate-Global base &  0.31 & 0.39 & 0.35 \\ \hline
Intermediate-10Days base &  0.17 & 0.22 & 0.19 \\ \hline
Intermediate-Month base &  0.25 & 0.31 & 0.28 \\ \hline
Originator-Beneficiary-Global base &  0.52 & 0.64 & 0.57 \\ \hline
Originator-Beneficiary-10Days base &  0.32 & 0.41 & 0.36 \\ \hline
Originator-Beneficiary-Month base &  0.48 & 0.56 & 0.52 \\ \hline
Originator-Intermediate-Global base &  0.38 & 0.49 & 0.43 \\ \hline
Originator-Intermediate-10Days base &  0.23 & 0.3 & 0.26 \\ \hline
Originator-Intermediate-Month base &  0.29 & 0.39 & 0.33 \\ \hline
Intermediate-Beneficiary-Global base &  0.39 & 0.46 & 0.42 \\ \hline
Intermediate-Beneficiary-10Day

TypeError: can only concatenate str (not "list") to str

In [23]:
import warnings
warnings.filterwarnings('ignore')

In [34]:
%%time

for alpha in range(10):
    listf1 = []
    listIndex = []
    y_pred_final = np.sum(y_pred_array_f1_score, 0)/(alpha+np.sum(y_pred_array, 0))
    y_pred_final = (y_pred_final - np.min(y_pred_final)) / (np.max(y_pred_final) - np.min(y_pred_final))
    y_pred_final[np.isnan(y_pred_final)] = 0
    for i in np.arange(0,1,0.005):
        y_pred = np.where(y_pred_final>=i, 1, 0)
        report = classification_report(y_test,y_pred,output_dict=True )
        listf1.append(report['True']['f1-score'])
        listIndex.append(i)
        # print( i,' : ',report['True'])
    print(alpha,':',np.max(listf1))
    listIndex[np.argmax(listf1)]

0 : 0.006995445984594076
1 : 0.6589673913043479
2 : 0.7402298850574712
3 : 0.7522658610271903
4 : 0.760408483896308
5 : 0.7673151750972762
6 : 0.7654901960784314
7 : 0.7650793650793651
8 : 0.7658495350803043
9 : 0.7653490328006728
CPU times: user 5min 15s, sys: 3.15 ms, total: 5min 15s
Wall time: 5min 15s


In [35]:
y_pred_final = np.sum(y_pred_array_f1_score, 0)/(alpha+np.sum(y_pred_array, 0))
y_pred_final = (y_pred_final - np.min(y_pred_final)) / (np.max(y_pred_final) - np.min(y_pred_final))
y_pred_final[np.isnan(y_pred_final)] = 0


In [36]:
np.arange(0,1,0.1)

array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])

In [37]:
for i in np.arange(0,1.1,0.1):
    y_pred = np.where(y_pred_final>=i, 1, 0)
    report = classification_report(y_test,y_pred,output_dict=True )
    listf1.append(report['True']['f1-score'])
    print( '(',round(i,3),' , ',round(report['True']['precision'],3),')')
    listIndex.append(i)

( 0.0  ,  0.004 )
( 0.1  ,  0.081 )
( 0.2  ,  0.216 )
( 0.3  ,  0.426 )
( 0.4  ,  0.622 )
( 0.5  ,  0.786 )
( 0.6  ,  0.899 )
( 0.7  ,  0.959 )
( 0.8  ,  0.98 )
( 0.9  ,  0.984 )
( 1.0  ,  1.0 )


In [38]:
y_pred_final = np.sum(y_pred_array_f1_score, 0)/(1+np.sum(y_pred_array, 0))
y_pred_final = (y_pred_final - np.min(y_pred_final)) / (np.max(y_pred_final) - np.min(y_pred_final))
y_pred_final[np.isnan(y_pred_final)] = 0

In [39]:
y_pred_final = np.sum(y_pred_array_f1_score, 0)/(6+np.sum(y_pred_array, 0))
y_pred_final = (y_pred_final - np.min(y_pred_final)) / (np.max(y_pred_final) - np.min(y_pred_final))
y_pred_final[np.isnan(y_pred_final)] = 0

y_pred = np.where(y_pred_final>=0.69, 1, 0)
report = classification_report(y_test,y_pred,output_dict=True )
report

{'False': {'precision': 0.998751954769638,
  'recall': 0.9998243835863883,
  'f1-score': 0.9992878814473057,
  'support': 199298},
 'True': {'precision': 0.9282786885245902,
  'recall': 0.6452991452991453,
  'f1-score': 0.761344537815126,
  'support': 702},
 'accuracy': 0.99858,
 'macro avg': {'precision': 0.9635153216471141,
  'recall': 0.8225617644427667,
  'f1-score': 0.8803162096312158,
  'support': 200000},
 'weighted avg': {'precision': 0.9985045936051179,
  'recall': 0.99858,
  'f1-score': 0.9984527003111568,
  'support': 200000}}

In [40]:
y_pred_final = np.sum(y_pred_array_f1_score, 0)/(9+np.sum(y_pred_array, 0))
y_pred_final = (y_pred_final - np.min(y_pred_final)) / (np.max(y_pred_final) - np.min(y_pred_final))
y_pred_final[np.isnan(y_pred_final)] = 0

In [41]:
%%time
listf1 = []
listIndex = []
y_pred_final = np.sum(y_pred_array_f1_score, 0)/(9+np.sum(y_pred_array, 0))
y_pred_final = (y_pred_final - np.min(y_pred_final)) / (np.max(y_pred_final) - np.min(y_pred_final))
y_pred_final[np.isnan(y_pred_final)] = 0
for i in np.arange(0,1,0.005):
    y_pred = np.where(y_pred_final>=i, 1, 0)
    report = classification_report(y_test,y_pred,output_dict=True )
    listf1.append(report['True']['f1-score'])
    listIndex.append(i)
    # print( i,' : ',report['True'])
print(alpha,':',np.max(listf1))
listIndex[np.argmax(listf1)]

9 : 0.7653490328006728
CPU times: user 31.5 s, sys: 0 ns, total: 31.5 s
Wall time: 31.5 s


0.63

In [46]:
for i in np.arange(0,1.1,0.1):
    y_pred = np.where(y_pred_final>=i, 1, 0)
    report = classification_report(y_test,y_pred,output_dict=True )
    listf1.append(report['True']['f1-score'])
    print(report['True'])
    # print( '(',round(i,3),' , ',round(report['True']['precision'],3),')')
    listIndex.append(i)

{'precision': 0.00351, 'recall': 1.0, 'f1-score': 0.006995445984594076, 'support': 702}
{'precision': 0.081375050355848, 'recall': 0.8632478632478633, 'f1-score': 0.14872990550987852, 'support': 702}
{'precision': 0.2155621742367833, 'recall': 0.8247863247863247, 'f1-score': 0.3417945690672963, 'support': 702}
{'precision': 0.42615384615384616, 'recall': 0.7891737891737892, 'f1-score': 0.5534465534465535, 'support': 702}
{'precision': 0.6220379146919431, 'recall': 0.7478632478632479, 'f1-score': 0.6791720569210866, 'support': 702}
{'precision': 0.7858267716535433, 'recall': 0.7108262108262108, 'f1-score': 0.7464472700074795, 'support': 702}
{'precision': 0.8990291262135922, 'recall': 0.6595441595441596, 'f1-score': 0.7608874281018899, 'support': 702}
{'precision': 0.958904109589041, 'recall': 0.5982905982905983, 'f1-score': 0.7368421052631579, 'support': 702}
{'precision': 0.9798850574712644, 'recall': 0.48575498575498577, 'f1-score': 0.6495238095238095, 'support': 702}
{'precision': 0

# Compare with all classifiers

In [47]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis


In [49]:
names = [
    
    
    

    "Neural Net",
    "AdaBoost",
    "Naive Bayes",
    "Linear SVM"
    
    
]

classifiers = [
    
    
    

    MLPClassifier(),
    AdaBoostClassifier(),
    GaussianNB(),
    SVC(kernel="linear", C=0.025),
]

In [None]:
%%time
for name, clf in zip(names, classifiers):
    print('----------------------------------')
    print(name)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    report = classification_report(y_test,y_pred,output_dict=True )
    print(report)

----------------------------------
Neural Net
{'False': {'precision': 0.99649, 'recall': 1.0, 'f1-score': 0.9982419145600528, 'support': 199298}, 'True': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 702}, 'accuracy': 0.99649, 'macro avg': {'precision': 0.498245, 'recall': 0.5, 'f1-score': 0.4991209572800264, 'support': 200000}, 'weighted avg': {'precision': 0.9929923200999999, 'recall': 0.99649, 'f1-score': 0.994738085439947, 'support': 200000}}
----------------------------------
AdaBoost
{'False': {'precision': 0.997931379914851, 'recall': 0.9996939256791337, 'f1-score': 0.9988118752318598, 'support': 199298}, 'True': {'precision': 0.8257142857142857, 'recall': 0.4116809116809117, 'f1-score': 0.5494296577946768, 'support': 702}, 'accuracy': 0.99763, 'macro avg': {'precision': 0.9118228328145683, 'recall': 0.7056874186800227, 'f1-score': 0.7741207665132683, 'support': 200000}, 'weighted avg': {'precision': 0.997326897914207, 'recall': 0.99763, 'f1-score': 0.99723454364

# Scores 

In [None]:
y_pred_index = np.argsort(y_pred_final)[::-1]

In [None]:
df = df_copy.copy()[:size]
first_10_transaction_fr_index = X_test.iloc[y_pred_index[0:10]].index
df.loc[first_10_transaction_fr_index]

In [None]:
y_pred_final[y_pred_index[0:10]]

# Get Configurations

In [None]:
tab_configuration = []
for pred_index in y_pred_index[0:10]:
    tab = []
    for index,configuration in enumerate(y_pred_array):
        if configuration[pred_index] > 0 :
            tab.append(name_configuration[index])
          
        #print(configuration[pred_index])
    print(pred_index, ' : ',tab)
    tab_configuration.append(tab)
    print('--------------------------------------------------')

In [None]:
tab_configuration

In [None]:
for idx,tr in df.loc[first_10_transaction_fr_index].iterrows():
    index = list(first_10_transaction_fr_index).index(idx)
    print(index,'&',tr['Originator'],'&',tr['Intermediate'],'&',tr['Beneficiary'],'&',tr['Value'],'&',tr['Fulldate'].strftime('%Y-%m-%d'),'&',tr['Currency'],'&',round(y_pred[y_pred_index[index]],2),'\\\ \hline')


In [None]:
for idx,tr in df.loc[first_10_transaction_fr_index].iterrows():
    index = list(first_10_transaction_fr_index).index(idx)
    print(index , '&',' '.join(tab_configuration[index]),'\\\ \hline')
