In [8]:
import pandas as pd
import numpy as np
from os.path import isfile
from extra_codes import calc_vif
import matplotlib.pyplot as plt
from datetime import datetime
from statsmodels.tsa.stattools import kpss
import warnings
warnings.filterwarnings("ignore")

class ML_Fraud:
    __version__='1.0.5'
    def __init__(self,sample_start=1991,test_sample=range(2001,2011),
                 OOS_per=1,OOS_gap=0,sampling='expanding',adjust_serial=True,
                 cv_type='kfold',temp_year=1,cv_flag=False,cv_k=10,write=True,IS_per=10):

        if isfile('FraudDB2020.csv')==False:
            df=pd.DataFrame()
            for s in range(1,5):
                fl_name='FraudDB2020_Part'+str(s)+'.csv'
                new_df=pd.read_csv(fl_name)
                df=df.append(new_df)
            df.to_csv('FraudDB2020.csv',index=False)
            
        df=pd.read_csv('FraudDB2020.csv')
        self.df=df
        self.ss=sample_start
        self.se=np.max(df.fyear)
        self.ts=test_sample
        self.cv_t=cv_type
        self.cv=cv_flag
        self.cv_k=cv_k
        self.cv_t_y=temp_year
        
        sampling_set=['expanding','rolling']
        if sampling in sampling_set:
            pass
        else:
            raise ValueError('Invalid sampling choice. Permitted options are "expanding" and "rolling"')
        
        self.sa=sampling
        self.w=write
        self.ip=IS_per
        self.op=OOS_per
        self.og=OOS_gap
        self.a_s=adjust_serial
        print('Module initiated successfully ...')
        #The dir() function returns all properties and methods of the specified object, without the values.
        list_methods=dir(self)
        # .any: It checks for any element satisfying a condition and returns a True in case it finds any one element.
        reduced_methods=[item+'()' for item in list_methods if any(['analy' in item,'compare' in item,item=='sumstats'])]
        #string.join(iterable)
        print('Procedures are: '+'; '.join(reduced_methods))
    
    
    
    def analyse_ratio(self,C_FN=30,C_FP=1):
        """
        This code uses 11 financial ratios to predict the likelihood of fraud in a financial statement.
        
        Parameters:
            – C_FN: Cost of a False Negative for ECM
            – C_FP: Cost of a False Positive for ECM

        Predictive models:
            – Support Vector Machine (SVM)
            – Logistic Regression (LR)
            – SGD Tree Boosting (SGD)
            – Adaptive Boosting with Logistic Regression/LogitBoost (ADA)
            – MUlti-layered Perceptron (MLP)
            – FUSED (weighted average of estimated probs of other methods)

        Outputs: 
        Main results are stored in the table variable "perf_tbl_general" written into
        2 csv files: time period 2001-2010 and 2003-2008. 

        Steps:
            1. Cross-validate to find optimal hyperparameters.
            2. Estimating the performance for each OOS period.

        Warnings: 
            – Running this code can take up to 85 mins. The cross-validation takes up
            to 60 mins (you can skip this step) main analysis up to 15 mins. 
            These figures are estimates based on a MacBook Pro 2021.
            
        """

        from sklearn.linear_model import LogisticRegression
        from sklearn.linear_model import SGDClassifier
        from sklearn.svm import SVC
        from sklearn.neural_network import MLPClassifier
        from sklearn.ensemble import AdaBoostClassifier
        from imblearn.ensemble import RUSBoostClassifier
        from sklearn.model_selection import GridSearchCV,train_test_split
        from sklearn.metrics import roc_auc_score
        from sklearn.tree import DecisionTreeClassifier
        from extra_codes import ndcg_k,relogit
        from statsmodels.discrete.discrete_model import Logit
        from statsmodels.tools import add_constant
        from sklearn.preprocessing import MinMaxScaler
        
        t0=datetime.now()
        # setting the parameters
        IS_period=self.ip
        k_fold=self.cv_k
        OOS_period=self.op # 1 year ahead prediction
        OOS_gap=self.og # Gap between training and testing period
        start_OOS_year=self.ts[0] #2001
        end_OOS_year=self.ts[-1] #2010
        sample_start=self.ss #1991
        adjust_serial=self.a_s
        cv_type=self.cv_t
        cross_val=self.cv
        temp_year=self.cv_t_y #1
        case_window=self.sa
        fraud_df=self.df.copy(deep=True)
        write=self.w

        reduced_tbl_1=fraud_df.iloc[:,[0,1,3,7,8]]
        reduced_tbl_2=fraud_df.iloc[:,-14:-3]
        reduced_tblset=[reduced_tbl_1,reduced_tbl_2]
        reduced_tbl=pd.concat(reduced_tblset,axis=1)
        reduced_tbl=reduced_tbl[reduced_tbl.fyear>=sample_start] #1991
        reduced_tbl=reduced_tbl[reduced_tbl.fyear<=end_OOS_year] #2010

        # Setting the cross-validation setting
        # IC sample: fyear 1991-2000
        tbl_year_IS_CV=reduced_tbl.loc[np.logical_and(reduced_tbl.fyear<start_OOS_year,\
                                                   reduced_tbl.fyear>=start_OOS_year-IS_period)]
        tbl_year_IS_CV=tbl_year_IS_CV.reset_index(drop=True)
        misstate_firms=np.unique(tbl_year_IS_CV.gvkey[tbl_year_IS_CV.AAER_DUMMY==1])

        X_CV=tbl_year_IS_CV.iloc[:,-11:]

        mean_vals=np.mean(X_CV)
        std_vals=np.std(X_CV)
        #Z-score: calculate the probability of a score occurring within a standard normal distribution
        X_CV=(X_CV-mean_vals)/std_vals

        Y_CV=tbl_year_IS_CV.AAER_DUMMY

        P_f=np.sum(Y_CV==1)/len(Y_CV)
        P_nf=1-P_f

        print('prior probablity of fraud between '+str(sample_start)+'-'+
              str(start_OOS_year-1)+' is '+str(np.round(P_f*100,2))+'%')
        
        range_oos=range(start_OOS_year,end_OOS_year+1,OOS_period) #(2001,2010+1,1)

        roc_rus=np.zeros(len(range_oos))
        sensitivity_OOS_rus1=np.zeros(len(range_oos))
        specificity_OOS_rus1=np.zeros(len(range_oos))
        precision_rus1=np.zeros(len(range_oos))
        ndcg_rus1=np.zeros(len(range_oos))
        ecm_rus1=np.zeros(len(range_oos))


        roc_svm=np.zeros(len(range_oos))
        sensitivity_OOS_svm1=np.zeros(len(range_oos))
        specificity_OOS_svm1=np.zeros(len(range_oos))
        precision_svm1=np.zeros(len(range_oos))
        ndcg_svm1=np.zeros(len(range_oos))
        ecm_svm1=np.zeros(len(range_oos))

        roc_lr=np.zeros(len(range_oos))
        sensitivity_OOS_lr1=np.zeros(len(range_oos))
        specificity_OOS_lr1=np.zeros(len(range_oos))
        precision_lr1=np.zeros(len(range_oos))
        ndcg_lr1=np.zeros(len(range_oos))
        ecm_lr1=np.zeros(len(range_oos))
        

        roc_sgd=np.zeros(len(range_oos))
        sensitivity_OOS_sgd1=np.zeros(len(range_oos))
        specificity_OOS_sgd1=np.zeros(len(range_oos))
        precision_sgd1=np.zeros(len(range_oos))
        ndcg_sgd1=np.zeros(len(range_oos))
        ecm_sgd1=np.zeros(len(range_oos))

        roc_ada=np.zeros(len(range_oos))
        sensitivity_OOS_ada1=np.zeros(len(range_oos))
        specificity_OOS_ada1=np.zeros(len(range_oos))
        precision_ada1=np.zeros(len(range_oos))
        ndcg_ada1=np.zeros(len(range_oos))
        ecm_ada1=np.zeros(len(range_oos))


        roc_mlp=np.zeros(len(range_oos))
        sensitivity_OOS_mlp1=np.zeros(len(range_oos))
        specificity_OOS_mlp1=np.zeros(len(range_oos))
        precision_mlp1=np.zeros(len(range_oos))
        ndcg_mlp1=np.zeros(len(range_oos))
        ecm_mlp1=np.zeros(len(range_oos))


        roc_fused=np.zeros(len(range_oos))
        sensitivity_OOS_fused1=np.zeros(len(range_oos))
        specificity_OOS_fused1=np.zeros(len(range_oos))
        precision_fused1=np.zeros(len(range_oos))
        ndcg_fused1=np.zeros(len(range_oos))
        ecm_fused1=np.zeros(len(range_oos))
        
        
        n_opt_rus=1000
        r_opt_rus=1e-4
        score_rus=0.6953935928499526
                
        opt_params_svm={'class_weight': {0: 0.01, 1: 1}, 'kernel': 'linear'}
        C_opt=opt_params_svm['class_weight'][0]
        kernel_opt=opt_params_svm['kernel']
        score_svm=0.701939025416111
                
        score_lr=0.7056438104977343
                
        opt_params_sgd={'class_weight': {0: 5e-3, 1: 1}, 'loss': 'log', 'penalty': 'l2'}
        score_sgd=0.7026775920776185
    
        opt_params_ada={'learning_rate': 0.9, 'n_estimators': 20}
        score_ada=0.700229450411913
                                
        opt_params_mlp={'activation': 'logistic', 'hidden_layer_sizes': 5, 'solver': 'adam'}
        score_mlp=0.706333862286029


        m=0
        for yr in range_oos: #2001-2010
            t1=datetime.now()
            if case_window=='expanding':
                year_start_IS=sample_start #1991
            else:
                year_start_IS=yr-IS_period #1991
            #how many years between training and testing sample: 
            #expanding: 1991-2000, 1991-2001
            #rolling: 1991-2000, 1992-2001
            tbl_year_IS=reduced_tbl.loc[np.logical_and(reduced_tbl.fyear<yr-OOS_gap,\
                                                       reduced_tbl.fyear>=year_start_IS)]
            tbl_year_IS=tbl_year_IS.reset_index(drop=True)
            
            
            
            
            misstate_firms=np.unique(tbl_year_IS.gvkey[tbl_year_IS.AAER_DUMMY==1])
            #How many periods constitute the testing sample at a time: 2001, 2002
            tbl_year_OOS=reduced_tbl.loc[np.logical_and(reduced_tbl.fyear>=yr,\
                                                        reduced_tbl.fyear<yr+OOS_period)]
            
            if adjust_serial==True:
                ok_index=np.zeros(tbl_year_OOS.shape[0])
                for s in range(0,tbl_year_OOS.shape[0]):
                    if not tbl_year_OOS.iloc[s,1] in misstate_firms:
                        ok_index[s]=True
                    
                
            else:
                #filled with ones and keep all observations including serial frauds
                ok_index=np.ones(tbl_year_OOS.shape[0]).astype(bool)
                
            #deleting observations where a company appears both in IS and OOS samples
            tbl_year_OOS=tbl_year_OOS.iloc[ok_index==True,:]
            tbl_year_OOS=tbl_year_OOS.reset_index(drop=True)
                
            
            X=tbl_year_IS.iloc[:,-11:]
            mean_vals=np.mean(X)
            std_vals=np.std(X)
            X=(X-mean_vals)/std_vals
            Y=tbl_year_IS.AAER_DUMMY
            
            X_OOS=tbl_year_OOS.iloc[:,-11:]
            X_OOS=(X_OOS-mean_vals)/std_vals
            
            Y_OOS=tbl_year_OOS.AAER_DUMMY
            n_P=np.sum(Y_OOS==1)
            n_N=np.sum(Y_OOS==0)
            
            scaling = MinMaxScaler(feature_range=(-1,1)).fit(X)
            X = scaling.transform(X)
            X_OOS = scaling.transform(X_OOS)
            
            
            # Support Vector Machines
            
            clf_svm=SVC(class_weight={0:C_opt,1:1},kernel=kernel_opt,shrinking=False,\
                            probability=False,random_state=0,max_iter=-1,\
                                tol=X.shape[-1]*1e-3)
                
            clf_svm=clf_svm.fit(X,Y)
            
            pred_test_svm=clf_svm.decision_function(X_OOS)
            probs_oos_fraud_svm=np.exp(pred_test_svm)/(1+np.exp(pred_test_svm))
            
            roc_svm[m]=roc_auc_score(Y_OOS,probs_oos_fraud_svm)
            
            
            cutoff_OOS_svm=np.percentile(probs_oos_fraud_svm,99)
            sensitivity_OOS_svm1[m]=np.sum(np.logical_and(probs_oos_fraud_svm>=cutoff_OOS_svm, \
                                                          Y_OOS==1))/np.sum(Y_OOS)
            specificity_OOS_svm1[m]=np.sum(np.logical_and(probs_oos_fraud_svm<cutoff_OOS_svm, \
                                                          Y_OOS==0))/np.sum(Y_OOS==0)
            precision_svm1[m]=np.sum(np.logical_and(probs_oos_fraud_svm>=cutoff_OOS_svm, \
                                                         Y_OOS==1))/np.sum(probs_oos_fraud_svm>=cutoff_OOS_svm)
            ndcg_svm1[m]=ndcg_k(Y_OOS,probs_oos_fraud_svm,99)
            
            FN_svm1=np.sum(np.logical_and(probs_oos_fraud_svm<cutoff_OOS_svm, \
                                                          Y_OOS==1))
            FP_svm1=np.sum(np.logical_and(probs_oos_fraud_svm>=cutoff_OOS_svm, \
                                                          Y_OOS==0))
                
            ecm_svm1[m]=C_FN*P_f*FN_svm1/n_P+C_FP*P_nf*FP_svm1/n_N
                
            
            # Logistic Regression – Dechow et al (2011)
            X_lr=add_constant(X)
            X_OOS_lr=add_constant(X_OOS)
            clf_lr = Logit(Y,X_lr)
            clf_lr=clf_lr.fit(disp=0)
            probs_oos_fraud_lr=clf_lr.predict(X_OOS_lr)

            roc_lr[m]=roc_auc_score(Y_OOS,probs_oos_fraud_lr)
            
            
            cutoff_OOS_lr=np.percentile(probs_oos_fraud_lr,99)
            sensitivity_OOS_lr1[m]=np.sum(np.logical_and(probs_oos_fraud_lr>=cutoff_OOS_lr, \
                                                         Y_OOS==1))/np.sum(Y_OOS)
            specificity_OOS_lr1[m]=np.sum(np.logical_and(probs_oos_fraud_lr<cutoff_OOS_lr, \
                                                          Y_OOS==0))/np.sum(Y_OOS==0)
            precision_lr1[m]=np.sum(np.logical_and(probs_oos_fraud_lr>=cutoff_OOS_lr, \
                                                         Y_OOS==1))/np.sum(probs_oos_fraud_lr>=cutoff_OOS_lr)
            ndcg_lr1[m]=ndcg_k(Y_OOS,probs_oos_fraud_lr,99)
            
            FN_lr1=np.sum(np.logical_and(probs_oos_fraud_lr<cutoff_OOS_lr, \
                                                          Y_OOS==1))
            FP_lr1=np.sum(np.logical_and(probs_oos_fraud_lr>=cutoff_OOS_lr, \
                                                          Y_OOS==0))
                
            ecm_lr1[m]=C_FN*P_f*FN_lr1/n_P+C_FP*P_nf*FP_lr1/n_N
                        
            
            # Stochastic Gradient Decent 

            clf_sgd=SGDClassifier(class_weight=opt_params_sgd['class_weight'],\
                                  loss=opt_params_sgd['loss'], random_state=0,\
                                   penalty=opt_params_sgd['penalty'],validation_fraction=.2,shuffle=False)
            clf_sgd=clf_sgd.fit(X,Y)
            probs_oos_fraud_sgd=clf_sgd.predict_proba(X_OOS)[:,-1]
            
            roc_sgd[m]=roc_auc_score(Y_OOS,probs_oos_fraud_sgd)
            
            cutoff_OOS_sgd=np.percentile(probs_oos_fraud_sgd,99)
            sensitivity_OOS_sgd1[m]=np.sum(np.logical_and(probs_oos_fraud_sgd>=cutoff_OOS_sgd, \
                                                         Y_OOS==1))/np.sum(Y_OOS)
            specificity_OOS_sgd1[m]=np.sum(np.logical_and(probs_oos_fraud_sgd<cutoff_OOS_sgd, \
                                                          Y_OOS==0))/np.sum(Y_OOS==0)
            precision_sgd1[m]=np.sum(np.logical_and(probs_oos_fraud_sgd>=cutoff_OOS_sgd, \
                                                         Y_OOS==1))/np.sum(probs_oos_fraud_sgd>=cutoff_OOS_sgd)
            ndcg_sgd1[m]=ndcg_k(Y_OOS,probs_oos_fraud_sgd,99)
            
            FN_sgd1=np.sum(np.logical_and(probs_oos_fraud_sgd<cutoff_OOS_sgd, \
                                                          Y_OOS==1))
            FP_sgd1=np.sum(np.logical_and(probs_oos_fraud_sgd>=cutoff_OOS_sgd, \
                                                          Y_OOS==0))
                
            ecm_sgd1[m]=C_FN*P_f*FN_sgd1/n_P+C_FP*P_nf*FP_sgd1/n_N
            
            
            # LogitBoost
            base_lr=LogisticRegression(random_state=0,solver='newton-cg')
            
            
            clf_ada=AdaBoostClassifier(n_estimators=opt_params_ada['n_estimators'],\
                                       learning_rate=opt_params_ada['learning_rate'],\
                                           base_estimator=base_lr,random_state=0)
            clf_ada=clf_ada.fit(X,Y)
            probs_oos_fraud_ada=clf_ada.predict_proba(X_OOS)[:,-1]
            
            
            labels_ada=clf_ada.predict(X_OOS)
            
            roc_ada[m]=roc_auc_score(Y_OOS,probs_oos_fraud_ada)
            cutoff_OOS_ada=np.percentile(probs_oos_fraud_ada,99)
            sensitivity_OOS_ada1[m]=np.sum(np.logical_and(probs_oos_fraud_ada>=cutoff_OOS_ada, \
                                                         Y_OOS==1))/np.sum(Y_OOS)
            specificity_OOS_ada1[m]=np.sum(np.logical_and(probs_oos_fraud_ada<cutoff_OOS_ada, \
                                                          Y_OOS==0))/np.sum(Y_OOS==0)
            precision_ada1[m]=np.sum(np.logical_and(probs_oos_fraud_ada>=cutoff_OOS_ada, \
                                                         Y_OOS==1))/np.sum(probs_oos_fraud_ada>=cutoff_OOS_ada)
            ndcg_ada1[m]=ndcg_k(Y_OOS,probs_oos_fraud_ada,99)
            
            FN_ada1=np.sum(np.logical_and(probs_oos_fraud_ada<cutoff_OOS_ada, \
                                                          Y_OOS==1))
            FP_ada1=np.sum(np.logical_and(probs_oos_fraud_ada>=cutoff_OOS_ada, \
                                                          Y_OOS==0))
                
            ecm_ada1[m]=C_FN*P_f*FN_ada1/n_P+C_FP*P_nf*FP_ada1/n_N
                
            
            # Multi Layer Perceptron
            clf_mlp=MLPClassifier(hidden_layer_sizes=opt_params_mlp['hidden_layer_sizes'], \
                                  activation=opt_params_mlp['activation'],solver=opt_params_mlp['solver'],\
                                               random_state=0,validation_fraction=.1)
            clf_mlp=clf_mlp.fit(X,Y)
            probs_oos_fraud_mlp=clf_mlp.predict_proba(X_OOS)[:,-1]
                        
            roc_mlp[m]=roc_auc_score(Y_OOS,probs_oos_fraud_mlp)
            
            cutoff_OOS_mlp=np.percentile(probs_oos_fraud_mlp,99)
            sensitivity_OOS_mlp1[m]=np.sum(np.logical_and(probs_oos_fraud_mlp>=cutoff_OOS_mlp, \
                                                         Y_OOS==1))/np.sum(Y_OOS)
            specificity_OOS_mlp1[m]=np.sum(np.logical_and(probs_oos_fraud_mlp<cutoff_OOS_mlp, \
                                                          Y_OOS==0))/np.sum(Y_OOS==0)
            precision_mlp1[m]=np.sum(np.logical_and(probs_oos_fraud_mlp>=cutoff_OOS_mlp, \
                                                         Y_OOS==1))/np.sum(probs_oos_fraud_mlp>=cutoff_OOS_mlp)
            ndcg_mlp1[m]=ndcg_k(Y_OOS,probs_oos_fraud_mlp,99)
            
            FN_mlp1=np.sum(np.logical_and(probs_oos_fraud_mlp<cutoff_OOS_mlp, \
                                                          Y_OOS==1))
            FP_mlp1=np.sum(np.logical_and(probs_oos_fraud_mlp>=cutoff_OOS_mlp, \
                                                          Y_OOS==0))
                
            ecm_mlp1[m]=C_FN*P_f*FN_mlp1/n_P+C_FP*P_nf*FP_mlp1/n_N
                
            
            
            # Fused approach
            #highest fine tunning scores for each model
            weight_ser=np.array([score_svm,score_lr,score_sgd,score_ada,score_mlp])
            #give each model weight based on their score ranking
            weight_ser=weight_ser/np.sum(weight_ser)
            # ** Exponentiation
            probs_oos_fraud_svm=(1+np.exp(-1*probs_oos_fraud_svm))**-1
                
            probs_oos_fraud_lr=(1+np.exp(-1*probs_oos_fraud_lr))**-1
                
            probs_oos_fraud_sgd=(1+np.exp(-1*probs_oos_fraud_sgd))**-1
            
            probs_oos_fraud_ada=(1+np.exp(-1*probs_oos_fraud_ada))**-1
                
            probs_oos_fraud_mlp=(1+np.exp(-1*probs_oos_fraud_mlp))**-1
            
            #dot.product   - weighted probability of the classification made by each model
            clf_fused=np.dot(np.array([probs_oos_fraud_svm,\
                                  probs_oos_fraud_lr,probs_oos_fraud_sgd,probs_oos_fraud_ada,\
                                      probs_oos_fraud_mlp]).T,weight_ser)
            
            probs_oos_fraud_fused=clf_fused
                        
            roc_fused[m]=roc_auc_score(Y_OOS,probs_oos_fraud_fused)
            
            
            cutoff_OOS_fused=np.percentile(probs_oos_fraud_fused,99)
            sensitivity_OOS_fused1[m]=np.sum(np.logical_and(probs_oos_fraud_fused>=cutoff_OOS_fused, \
                                                         Y_OOS==1))/np.sum(Y_OOS)
            specificity_OOS_fused1[m]=np.sum(np.logical_and(probs_oos_fraud_fused<cutoff_OOS_fused, \
                                                          Y_OOS==0))/np.sum(Y_OOS==0)
            precision_fused1[m]=np.sum(np.logical_and(probs_oos_fraud_fused>=cutoff_OOS_fused, \
                                                         Y_OOS==1))/np.sum(probs_oos_fraud_fused>=cutoff_OOS_fused)
            ndcg_fused1[m]=ndcg_k(Y_OOS,probs_oos_fraud_fused,99)
            
            FN_fused1=np.sum(np.logical_and(probs_oos_fraud_fused<cutoff_OOS_fused, \
                                                          Y_OOS==1))
            FP_fused1=np.sum(np.logical_and(probs_oos_fraud_fused>=cutoff_OOS_fused, \
                                                          Y_OOS==0))
                
            ecm_fused1[m]=C_FN*P_f*FN_fused1/n_P+C_FP*P_nf*FP_fused1/n_N
            

            
            t2=datetime.now() 
            dt=t2-t1
            print('analysis finished for OOS period '+str(yr)+' after '+str(dt.total_seconds())+' sec')
            m+=1

        print('average top percentile sensitivity for the period '+str(start_OOS_year)+' to '+\
              str(end_OOS_year)+' is '+ str(round(np.mean(sensitivity_OOS_svm1)*100,2))+\
                          '% for SVM vs '+ str(round(np.mean(sensitivity_OOS_lr1)*100,2))+\
                          '% for Dechow-LR vs '+ str(round(np.mean(sensitivity_OOS_sgd1)*100,2))+\
                              '% for SGD vs '+ str(round(np.mean(sensitivity_OOS_ada1)*100,2))+\
                                  '% for ADA vs '+ str(round(np.mean(sensitivity_OOS_mlp1)*100,2))+\
                                      '% for MLP vs '+ str(round(np.mean(sensitivity_OOS_fused1)*100,2))+\
                                          '% for FUSED')

        
        f1_score_svm1=2*(precision_svm1*sensitivity_OOS_svm1)/\
            (precision_svm1+sensitivity_OOS_svm1+1e-8)
        
        f1_score_lr1=2*(precision_lr1*sensitivity_OOS_lr1)/\
            (precision_lr1+sensitivity_OOS_lr1+1e-8)
        
        f1_score_sgd1=2*(precision_sgd1*sensitivity_OOS_sgd1)/\
            (precision_sgd1+sensitivity_OOS_sgd1+1e-8)
        
        f1_score_ada1=2*(precision_ada1*sensitivity_OOS_ada1)/\
            (precision_ada1+sensitivity_OOS_ada1+1e-8)
        
        f1_score_mlp1=2*(precision_mlp1*sensitivity_OOS_mlp1)/\
            (precision_mlp1+sensitivity_OOS_mlp1+1e-8)
        
        f1_score_fused1=2*(precision_fused1*sensitivity_OOS_fused1)/\
            (precision_fused1+sensitivity_OOS_fused1+1e-8)
        
        # create performance table now
        perf_tbl_general=pd.DataFrame()
        perf_tbl_general['models']=['SVM','LR','SGD','LogitBoost','MLP','FUSED']
        perf_tbl_general['Roc']=[str(np.round(
                np.mean(roc_svm)*100,2))+'% ('+\
                str(np.round(np.std(roc_svm)*100,2))+'%)',str(np.round(
                    np.mean(roc_lr)*100,2))+'% ('+\
                    str(np.round(np.std(roc_lr)*100,2))+'%)',str(np.round(
                        np.mean(roc_sgd)*100,2))+'% ('+\
                        str(np.round(np.std(roc_sgd)*100,2))+'%)',str(np.round(
                            np.mean(roc_ada)*100,2))+'% ('+\
                            str(np.round(np.std(roc_ada)*100,2))+'%)',str(np.round(
                                np.mean(roc_mlp)*100,2))+'% ('+\
                                str(np.round(np.std(roc_mlp)*100,2))+'%)',
                                str(np.round(
                                    np.mean(roc_fused)*100,2))+'% ('+\
                                    str(np.round(np.std(roc_fused)*100,2))+'%)']
        
        perf_tbl_general['Roc_noise_to_signal']=[str(np.round(
                np.std(roc_svm)/np.mean(roc_svm)*100,2))+'%',str(np.round(
                    np.std(roc_lr)/np.mean(roc_lr)*100,2))+'%',str(np.round(
                        np.std(roc_sgd)/np.mean(roc_sgd)*100,2))+'%',str(np.round(
                            np.std(roc_ada)/np.mean(roc_ada)*100,2))+'%',str(np.round(
                                np.std(roc_mlp)/np.mean(roc_mlp)*100,2))+'%',
                                str(np.round(np.std(roc_fused)/np.mean(roc_fused)*100,2))+'%']
        
        

                                                    
        perf_tbl_general['Sensitivity @ 1 Prc']=[str(np.round(
                np.mean(sensitivity_OOS_svm1)*100,2))+'% ('+\
                str(np.round(np.std(sensitivity_OOS_svm1)*100,2))+'%)',str(np.round(
                    np.mean(sensitivity_OOS_lr1)*100,2))+'% ('+\
                    str(np.round(np.std(sensitivity_OOS_lr1)*100,2))+'%)',str(np.round(
                        np.mean(sensitivity_OOS_sgd1)*100,2))+'% ('+\
                        str(np.round(np.std(sensitivity_OOS_sgd1)*100,2))+'%)',str(np.round(
                            np.mean(sensitivity_OOS_ada1)*100,2))+'% ('+\
                            str(np.round(np.std(sensitivity_OOS_ada1)*100,2))+'%)',str(np.round(
                                np.mean(sensitivity_OOS_mlp1)*100,2))+'% ('+\
                                str(np.round(np.std(sensitivity_OOS_mlp1)*100,2))+'%)',
                                str(np.round(
                                    np.mean(sensitivity_OOS_fused1)*100,2))+'% ('+\
                                    str(np.round(np.std(sensitivity_OOS_fused1)*100,2))+'%)']
        
        
        perf_tbl_general['Sensitivity_noise_to_signal @ 1 Prc']=[str(np.round(
                np.std(sensitivity_OOS_svm1)/np.mean(sensitivity_OOS_svm1)*100,2))+'%',str(np.round(
                    np.std(sensitivity_OOS_lr1)/np.mean(sensitivity_OOS_lr1)*100,2))+'%',str(np.round(
                        np.std(sensitivity_OOS_sgd1)/np.mean(sensitivity_OOS_sgd1)*100,2))+'%',str(np.round(
                            np.std(sensitivity_OOS_ada1)/np.mean(sensitivity_OOS_ada1)*100,2))+'%',str(np.round(
                                np.std(sensitivity_OOS_mlp1)/np.mean(sensitivity_OOS_mlp1)*100,2))+'%',
                                str(np.round(np.std(sensitivity_OOS_fused1)/np.mean(sensitivity_OOS_fused1)*100,2))+'%']
        
        

        perf_tbl_general['Specificity @ 1 Prc']=[str(np.round(
                np.mean(specificity_OOS_svm1)*100,2))+'% ('+\
                str(np.round(np.std(specificity_OOS_svm1)*100,2))+'%)',str(np.round(
                    np.mean(specificity_OOS_lr1)*100,2))+'% ('+\
                    str(np.round(np.std(specificity_OOS_lr1)*100,2))+'%)',str(np.round(
                        np.mean(specificity_OOS_sgd1)*100,2))+'% ('+\
                        str(np.round(np.std(specificity_OOS_sgd1)*100,2))+'%)',str(np.round(
                            np.mean(specificity_OOS_ada1)*100,2))+'% ('+\
                            str(np.round(np.std(specificity_OOS_ada1)*100,2))+'%)',str(np.round(
                                np.mean(specificity_OOS_mlp1)*100,2))+'% ('+\
                                str(np.round(np.std(specificity_OOS_mlp1)*100,2))+'%)',
                                str(np.round(np.mean(specificity_OOS_fused1)*100,2))+'% ('+\
                                    str(np.round(np.std(specificity_OOS_fused1)*100,2))+'%)']
        
        
        perf_tbl_general['Specificity_noise_to_signal @ 1 Prc']=[str(np.round(
                np.std(specificity_OOS_svm1)/np.mean(specificity_OOS_svm1)*100,2))+'%',str(np.round(
                    np.std(specificity_OOS_lr1)/np.mean(specificity_OOS_lr1)*100,2))+'%',str(np.round(
                        np.std(specificity_OOS_sgd1)/np.mean(specificity_OOS_sgd1)*100,2))+'%',str(np.round(
                            np.std(specificity_OOS_ada1)/np.mean(specificity_OOS_ada1)*100,2))+'%',str(np.round(
                                np.std(specificity_OOS_mlp1)/np.mean(specificity_OOS_mlp1)*100,2))+'%',
                                str(np.round(np.std(specificity_OOS_fused1)/np.mean(specificity_OOS_fused1)*100,2))+'%']
        
        
        

        perf_tbl_general['Precision @ 1 Prc']=[str(np.round(
                np.mean(precision_svm1)*100,2))+'% ('+\
                str(np.round(np.std(precision_svm1)*100,2))+'%)',str(np.round(
                    np.mean(precision_lr1)*100,2))+'% ('+\
                    str(np.round(np.std(precision_lr1)*100,2))+'%)',str(np.round(
                        np.mean(precision_sgd1)*100,2))+'% ('+\
                        str(np.round(np.std(precision_sgd1)*100,2))+'%)',str(np.round(
                            np.mean(precision_ada1)*100,2))+'% ('+\
                            str(np.round(np.std(precision_ada1)*100,2))+'%)',str(np.round(
                                np.mean(precision_mlp1)*100,2))+'% ('+\
                                str(np.round(np.std(precision_mlp1)*100,2))+'%)',
                                str(np.round(
                                    np.mean(precision_fused1)*100,2))+'% ('+\
                                    str(np.round(np.std(precision_fused1)*100,2))+'%)']
        
        perf_tbl_general['Precision_noise_to_signal @ 1 Prc']=[str(np.round(
                np.std(precision_svm1)/np.mean(precision_svm1)*100,2))+'%',str(np.round(
                    np.std(precision_lr1)/np.mean(precision_lr1)*100,2))+'%',str(np.round(
                        np.std(precision_sgd1)/np.mean(precision_sgd1)*100,2))+'%',str(np.round(
                            np.std(precision_ada1)/np.mean(precision_ada1)*100,2))+'%',str(np.round(
                                np.std(precision_mlp1)/np.mean(precision_mlp1)*100,2))+'%',
                                str(np.round(np.std(precision_fused1)/np.mean(precision_fused1)*100,2))+'%']
                                    
                                    
        perf_tbl_general['F1 Score @ 1 Prc']=[str(np.round(
                np.mean(f1_score_svm1)*100,2))+'% ('+\
                str(np.round(np.std(f1_score_svm1)*100,2))+'%)',str(np.round(
                    np.mean(f1_score_lr1)*100,2))+'% ('+\
                    str(np.round(np.std(f1_score_lr1)*100,2))+'%)',str(np.round(
                        np.mean(f1_score_sgd1)*100,2))+'% ('+\
                        str(np.round(np.std(f1_score_sgd1)*100,2))+'%)',str(np.round(
                            np.mean(f1_score_ada1)*100,2))+'% ('+\
                            str(np.round(np.std(f1_score_ada1)*100,2))+'%)',str(np.round(
                                np.mean(f1_score_mlp1)*100,2))+'% ('+\
                                str(np.round(np.std(f1_score_mlp1)*100,2))+'%)',
                                str(np.round(
                                    np.mean(f1_score_fused1)*100,2))+'% ('+\
                                    str(np.round(np.std(f1_score_fused1)*100,2))+'%)']
        
        perf_tbl_general['F1 Score_noise_to_signal @ 1 Prc']=[str(np.round(
                np.std(f1_score_svm1)/np.mean(f1_score_svm1)*100,2))+'%',str(np.round(
                    np.std(f1_score_lr1)/np.mean(f1_score_lr1)*100,2))+'%',str(np.round(
                        np.std(f1_score_sgd1)/np.mean(f1_score_sgd1)*100,2))+'%',str(np.round(
                            np.std(f1_score_ada1)/np.mean(f1_score_ada1)*100,2))+'%',str(np.round(
                                np.std(f1_score_mlp1)/np.mean(f1_score_mlp1)*100,2))+'%',
                                str(np.round(np.std(f1_score_fused1)/np.mean(f1_score_fused1)*100,2))+'%']
            
        
        perf_tbl_general['NDCG @ 1 Prc']=[str(np.round(
                np.mean(ndcg_svm1)*100,2))+'% ('+\
                str(np.round(np.std(ndcg_svm1)*100,2))+'%)',str(np.round(
                    np.mean(ndcg_lr1)*100,2))+'% ('+\
                    str(np.round(np.std(ndcg_lr1)*100,2))+'%)',str(np.round(
                        np.mean(ndcg_sgd1)*100,2))+'% ('+\
                        str(np.round(np.std(ndcg_sgd1)*100,2))+'%)',str(np.round(
                            np.mean(ndcg_ada1)*100,2))+'% ('+\
                            str(np.round(np.std(ndcg_ada1)*100,2))+'%)',str(np.round(
                                np.mean(ndcg_mlp1)*100,2))+'% ('+\
                                str(np.round(np.std(ndcg_mlp1)*100,2))+'%)',
                                str(np.round(
                                    np.mean(ndcg_fused1)*100,2))+'% ('+\
                                    str(np.round(np.std(ndcg_fused1)*100,2))+'%)']
        
        perf_tbl_general['NDCG_noise_to_signal @ 1 Prc']=[str(np.round(
                np.std(ndcg_svm1)/np.mean(ndcg_svm1)*100,2))+'%',str(np.round(
                    np.std(ndcg_lr1)/np.mean(ndcg_lr1)*100,2))+'%',str(np.round(
                        np.std(ndcg_sgd1)/np.mean(ndcg_sgd1)*100,2))+'%',str(np.round(
                            np.std(ndcg_ada1)/np.mean(ndcg_ada1)*100,2))+'%',str(np.round(
                                np.std(ndcg_mlp1)/np.mean(ndcg_mlp1)*100,2))+'%',
                                str(np.round(np.std(ndcg_fused1)/np.mean(ndcg_fused1)*100,2))+'%']
        
        
        
        
        
        perf_tbl_general['ECM @ 1 Prc']=[str(np.round(
                np.mean(ecm_svm1)*100,2))+'% ('+\
                str(np.round(np.std(ecm_svm1)*100,2))+'%)',str(np.round(
                    np.mean(ecm_lr1)*100,2))+'% ('+\
                    str(np.round(np.std(ecm_lr1)*100,2))+'%)',str(np.round(
                        np.mean(ecm_sgd1)*100,2))+'% ('+\
                        str(np.round(np.std(ecm_sgd1)*100,2))+'%)',str(np.round(
                            np.mean(ecm_ada1)*100,2))+'% ('+\
                            str(np.round(np.std(ecm_ada1)*100,2))+'%)',str(np.round(
                                np.mean(ecm_mlp1)*100,2))+'% ('+\
                                str(np.round(np.std(ecm_mlp1)*100,2))+'%)',
                                str(np.round(
                                    np.mean(ecm_fused1)*100,2))+'% ('+\
                                    str(np.round(np.std(ecm_fused1)*100,2))+'%)']
        
        
        perf_tbl_general['ECM_noise_to_signal @ 1 Prc']=[str(np.round(
                np.std(ecm_svm1)/np.mean(ecm_svm1)*100,2))+'%',str(np.round(
                    np.std(ecm_lr1)/np.mean(ecm_lr1)*100,2))+'%',str(np.round(
                        np.std(ecm_sgd1)/np.mean(ecm_sgd1)*100,2))+'%',str(np.round(
                            np.std(ecm_ada1)/np.mean(ecm_ada1)*100,2))+'%',str(np.round(
                                np.std(ecm_mlp1)/np.mean(ecm_mlp1)*100,2))+'%',
                                str(np.round(np.std(ecm_fused1)/np.mean(ecm_fused1)*100,2))+'%']
    
            


        lbl_perf_tbl='perf_tbl_'+str(start_OOS_year)+'_'+str(end_OOS_year)+\
                    '_'+case_window+',OOS='+str(OOS_period)+','+\
                    str(k_fold)+'fold'+',serial='+str(adjust_serial)+\
                    ',gap='+str(OOS_gap)+'_6ratios_kfold.csv'



        if write==True:
            perf_tbl_general.to_csv(lbl_perf_tbl,index=False)
        print(perf_tbl_general)
        t_last=datetime.now()
        dt_total=t_last-t0
        print('total run time is '+str(dt_total.total_seconds())+' sec')




In [9]:
# Build the fraud-analysis object. cv_type and temp_year are not passed here,
# so they keep the constructor defaults ('kfold' and 1 respectively).
a = ML_Fraud(
    sample_start=1991,               # stored on the instance as the sample start year
    test_sample=range(2001, 2011),   # out-of-sample years 2001 through 2010
    OOS_per=1,
    OOS_gap=0,
    sampling="expanding",            # the constructor accepts "expanding" or "rolling"
    adjust_serial=True,
    cv_flag=False,
    cv_k=10,
    write=True,                      # presumably enables saving the results table to CSV -- confirm in analyse_ratio
    IS_per=10,
)

# Run the ratio-based analysis; progress and the performance table are printed.
a.analyse_ratio()

Module initiated successfully ...
Procedures are: analyse_ratio()
prior probablity of fraud between 1991-2000 is 0.78%
analysis finished for OOS period 2001 after 24.121504 sec
analysis finished for OOS period 2002 after 30.244797 sec
analysis finished for OOS period 2003 after 36.652687 sec
analysis finished for OOS period 2004 after 43.547498 sec
analysis finished for OOS period 2005 after 50.652381 sec
analysis finished for OOS period 2006 after 68.725007 sec
analysis finished for OOS period 2007 after 84.684954 sec
analysis finished for OOS period 2008 after 91.410226 sec
analysis finished for OOS period 2009 after 105.368471 sec
analysis finished for OOS period 2010 after 117.555866 sec
average top percentile sensitivity for the period 2001 to 2010 is 7.33% for SVM vs 7.33% for Dechow-LR vs 8.04% for SGD vs 6.38% for ADA vs 5.54% for MLP vs 8.04% for FUSED
       models              Roc Roc_noise_to_signal Sensitivity @ 1 Prc  \
0         SVM  63.17% (11.49%)               18.2%  