In [1]:
import pandas as pd
import numpy as np
from os.path import isfile
from extra_codes import calc_vif
import matplotlib.pyplot as plt
from datetime import datetime
from statsmodels.tsa.stattools import kpss
import warnings
warnings.filterwarnings("ignore")

class ML_Fraud:
    """Out-of-sample fraud-prediction experiments on the FraudDB2020 panel.

    The constructor loads (and, if needed, assembles) ``FraudDB2020.csv`` from
    the current working directory and stores the experiment settings on the
    instance. ``analyse_raw`` then runs a RUSBoost model year by year over the
    test sample and reports the performance table.
    """
    __version__='1.0.5'

    def __init__(self,sample_start=1991,test_sample=range(2001,2011),
                 OOS_per=1,OOS_gap=0,sampling='expanding',adjust_serial=True,
                 cv_type='kfold',temp_year=1,cv_flag=False,cv_k=10,write=True,IS_per=10):
        """Load the fraud panel and record the experiment configuration.

        Parameters:
            – sample_start: first fiscal year kept in the training sample
            – test_sample: sequence of out-of-sample years; only its first and
              last elements are read by ``analyse_raw``
            – OOS_per: out-of-sample horizon (only used in the output file name
              by ``analyse_raw``)
            – OOS_gap: number of years left out between training and test year
            – sampling: 'expanding' or 'rolling' (``analyse_raw`` always trains
              on an expanding window and only uses this in the file name)
            – adjust_serial: if true, firms already flagged in the training
              window are dropped from the test year
            – cv_type, temp_year, cv_flag, cv_k, IS_per: stored on the
              instance; not read by ``analyse_raw``
            – write: if true, ``analyse_raw`` writes its table to csv

        Raises:
            ValueError: if ``sampling`` is not 'expanding' or 'rolling'.

        Side effects:
            Creates ``FraudDB2020.csv`` from ``FraudDB2020_Part1..4.csv`` when
            the combined file does not exist yet, and prints a short banner.
        """
        # Validate first so a bad argument fails before any file is read or written.
        if sampling not in ('expanding','rolling'):
            raise ValueError('Invalid sampling choice. Permitted options are "expanding" and "rolling"')

        if not isfile('FraudDB2020.csv'):
            # DataFrame.append was removed in pandas 2.0; a single concat is
            # also linear instead of re-copying the frame for every part.
            parts=[pd.read_csv('FraudDB2020_Part'+str(s)+'.csv') for s in range(1,5)]
            pd.concat(parts,ignore_index=True).to_csv('FraudDB2020.csv',index=False)

        # Always read back from the combined file so both paths yield the same frame.
        df=pd.read_csv('FraudDB2020.csv')
        self.df=df
        self.ss=sample_start
        self.se=np.max(df.fyear)
        self.ts=test_sample
        self.cv_t=cv_type
        self.cv=cv_flag
        self.cv_k=cv_k
        self.cv_t_y=temp_year
        self.sa=sampling
        self.w=write
        self.ip=IS_per
        self.op=OOS_per
        self.og=OOS_gap
        self.a_s=adjust_serial
        print('Module initiated successfully ...')
        # Advertise the public procedures: attribute names containing 'analy'
        # or 'compare', or exactly 'sumstats'.
        reduced_methods=[item+'()' for item in dir(self)
                         if 'analy' in item or 'compare' in item or item=='sumstats']
        print('Procedures are: '+'; '.join(reduced_methods))

    @staticmethod
    def _fmt_mean_std(values):
        """Return 'mean% (std%)' of ``values``, scaled by 100 and rounded to 2 decimals."""
        return str(np.round(np.mean(values)*100,2))+'% ('+\
            str(np.round(np.std(values)*100,2))+'%)'

    @staticmethod
    def _fmt_noise_to_signal(values):
        """Return std/mean of ``values`` as a percentage string rounded to 2 decimals."""
        return str(np.round(np.std(values)/np.mean(values)*100,2))+'%'

    def analyse_raw(self, C_FN=30,C_FP=1):
        """
        Fit a RUSBoost classifier on the last 28 columns of the reduced table
        and evaluate it one out-of-sample year at a time.

        The hyperparameters are fixed (200 estimators, learning rate 1e-5,
        decision trees with min_samples_leaf=5); no cross-validation is run.

        Parameters:
            – C_FN: Cost of a False Negative for ECM
            – C_FP: Cost of a False Positive for ECM

        Predictive models:
            – RUSBoost (imbalanced-learn) on min-max scaled features

        Outputs:
            The table "perf_tbl_general" is printed and, when the instance was
            created with write=True, written to a single csv file whose name
            encodes the test period and settings. Nothing is returned.

        Steps:
            1. Estimate the fraud base rate on the years before the first
               test year (used as prior in the ECM).
            2. For each test year: train on all years from sample_start up to
               (test year - OOS_gap), exclusive; score the test year; compute
               AUC and the top-percentile metrics.
            3. Summarise the per-year metrics as mean, std and std/mean.

        Warnings:
            – roc_auc_score raises if a test year contains only one class.
            – The training sample grows with each test year, so later years
              take longer to fit.
        """
        import inspect
        from sklearn.tree import DecisionTreeClassifier
        from imblearn.ensemble import RUSBoostClassifier
        from sklearn.metrics import roc_auc_score
        from extra_codes import ndcg_k
        from sklearn.preprocessing import MinMaxScaler

        t0=datetime.now()

        ## setting the parameters
        OOS_period=self.op # only used in the output file name
        OOS_gap=self.og # Gap between training and testing period
        start_OOS_year=self.ts[0]
        end_OOS_year=self.ts[-1]
        sample_start=self.ss
        adjust_serial=self.a_s
        case_window=self.sa
        write=self.w
        fraud_df=self.df.copy(deep=True)

        n_features=28 # the model inputs are the last 28 columns of reduced_tbl
        top_pct=99 # percentile of the scores used as the alarm cutoff
        n_opt_rus=200
        r_opt_rus=1e-5

        # Keep the positional columns 0,1,3,7,8 plus columns 9 up to the last 14.
        reduced_tbl=pd.concat([fraud_df.iloc[:,[0,1,3,7,8]],fraud_df.iloc[:,9:-14]],axis=1)
        reduced_tbl=reduced_tbl.reset_index(drop=True)

        range_oos=range(start_OOS_year,end_OOS_year+1)

        # Fraud / non-fraud priors for the ECM, from the pre-test years only.
        pre_test=np.logical_and(reduced_tbl.fyear<start_OOS_year,
                                reduced_tbl.fyear>=sample_start)
        Y_CV=reduced_tbl.loc[pre_test,'AAER_DUMMY']
        P_f=np.sum(Y_CV==1)/len(Y_CV)
        P_nf=1-P_f

        # imbalanced-learn renamed `base_estimator` to `estimator`; pick
        # whichever keyword the installed version accepts.
        rus_params=inspect.signature(RUSBoostClassifier).parameters
        base_kw='estimator' if 'estimator' in rus_params else 'base_estimator'

        n_years=len(range_oos)
        roc_rusboost=np.zeros(n_years)
        sensitivity_OOS_rusboost1=np.zeros(n_years)
        specificity_OOS_rusboost1=np.zeros(n_years)
        precision_rusboost1=np.zeros(n_years)
        ndcg_rusboost1=np.zeros(n_years)
        ecm_rusboost1=np.zeros(n_years)

        for m,yr in enumerate(range_oos):
            t1=datetime.now()

            # Expanding window: every year from sample_start up to yr-OOS_gap (exclusive).
            in_sample=np.logical_and(reduced_tbl.fyear<yr-OOS_gap,
                                     reduced_tbl.fyear>=sample_start)
            tbl_year_IS=reduced_tbl.loc[in_sample].reset_index(drop=True)
            tbl_year_OOS=reduced_tbl.loc[reduced_tbl.fyear==yr]

            if adjust_serial:
                # Drop test rows whose firm id (positional column 1) was
                # already flagged in the training window.
                misstate_firms=np.unique(tbl_year_IS.gvkey[tbl_year_IS.AAER_DUMMY==1])
                ok_index=~np.isin(tbl_year_OOS.iloc[:,1].to_numpy(),misstate_firms)
            else:
                ok_index=np.ones(tbl_year_OOS.shape[0]).astype(bool)

            tbl_year_OOS=tbl_year_OOS.iloc[ok_index,:].reset_index(drop=True)

            X=tbl_year_IS.iloc[:,-n_features:]
            Y=tbl_year_IS.AAER_DUMMY
            X_OOS=tbl_year_OOS.iloc[:,-n_features:]
            Y_OOS=tbl_year_OOS.AAER_DUMMY

            is_fraud=(Y_OOS==1).to_numpy()
            is_clean=(Y_OOS==0).to_numpy()
            n_P=np.sum(is_fraud)
            n_N=np.sum(is_clean)

            # The scaler is fitted on the training data only and reused on the test year.
            scaling=MinMaxScaler(feature_range=(-1,1)).fit(X)
            X=scaling.transform(X)
            X_OOS=scaling.transform(X_OOS)

            base_tree=DecisionTreeClassifier(min_samples_leaf=5)
            bao_RUSboost=RUSBoostClassifier(n_estimators=n_opt_rus,learning_rate=r_opt_rus,
                                            sampling_strategy=1,random_state=0,
                                            **{base_kw:base_tree})
            clf_rusboost=bao_RUSboost.fit(X,Y)

            probs_oos_fraud_rusboost=clf_rusboost.predict_proba(X_OOS)[:,-1]
            roc_rusboost[m]=roc_auc_score(Y_OOS,probs_oos_fraud_rusboost)

            # Raise an alarm for observations at or above the 99th percentile score.
            cutoff_OOS_rusboost=np.percentile(probs_oos_fraud_rusboost,top_pct)
            flagged=probs_oos_fraud_rusboost>=cutoff_OOS_rusboost
            cleared=probs_oos_fraud_rusboost<cutoff_OOS_rusboost

            TP_rusboost1=np.sum(np.logical_and(flagged,is_fraud))
            FN_rusboost1=np.sum(np.logical_and(cleared,is_fraud))
            FP_rusboost1=np.sum(np.logical_and(flagged,is_clean))

            # np.sum(Y_OOS) is kept as the denominator (equals n_P for a 0/1 dummy).
            sensitivity_OOS_rusboost1[m]=TP_rusboost1/np.sum(Y_OOS)
            specificity_OOS_rusboost1[m]=np.sum(np.logical_and(cleared,is_clean))/n_N
            precision_rusboost1[m]=TP_rusboost1/np.sum(flagged)
            ndcg_rusboost1[m]=ndcg_k(Y_OOS,probs_oos_fraud_rusboost,99)

            # Expected cost of misclassification with the pre-test priors.
            ecm_rusboost1[m]=C_FN*P_f*FN_rusboost1/n_P+C_FP*P_nf*FP_rusboost1/n_N

            t2=datetime.now()
            dt=t2-t1
            print('analysis finished for OOS period '+str(yr)+' after '+str(dt.total_seconds())+' sec')

        print('average top percentile sensitivity for the period '+str(start_OOS_year)+' to '+\
              str(end_OOS_year)+' is '+str(round(np.mean(sensitivity_OOS_rusboost1)*100,2))+\
                  '% for RUSBoost-28')

        # Small epsilon keeps F1 finite when precision and sensitivity are both zero.
        f1_score_rusboost1=2*(precision_rusboost1*sensitivity_OOS_rusboost1)/\
            (precision_rusboost1+sensitivity_OOS_rusboost1+1e-8)

        # (column stem, column suffix, per-year values) in output column order.
        reported=[('Roc','',roc_rusboost),
                  ('Sensitivity',' @ 1 Prc',sensitivity_OOS_rusboost1),
                  ('Specificity',' @ 1 Prc',specificity_OOS_rusboost1),
                  ('Precision',' @ 1 Prc',precision_rusboost1),
                  ('F1 Score',' @ 1 Prc',f1_score_rusboost1),
                  ('NDCG',' @ 1 Prc',ndcg_rusboost1),
                  ('ECM',' @ 1 Prc',ecm_rusboost1)]

        # create performance table now
        perf_tbl_general=pd.DataFrame()
        perf_tbl_general['models']=['RUS28_kfold']
        for stem,suffix,values in reported:
            perf_tbl_general[stem+suffix]=self._fmt_mean_std(values)
            perf_tbl_general[stem+'_noise_to_signal'+suffix]=self._fmt_noise_to_signal(values)

        lbl_perf_tbl='perf_tbl_'+str(start_OOS_year)+'_'+str(end_OOS_year)+\
                    '_'+case_window+',OOS='+str(OOS_period)+',serial='+str(adjust_serial)+\
                        ',gap='+str(OOS_gap)+'_kfold_RUSBoost.csv'

        if write:
            perf_tbl_general.to_csv(lbl_perf_tbl,index=False)
        print(perf_tbl_general)
        t_last=datetime.now()
        dt_total=t_last-t0
        print('total run time is '+str(dt_total.total_seconds())+' sec')

In [2]:
# Baseline run: training sample from 1991, test years 2001-2010 scored one
# year at a time, serial-fraud firms removed, results table written to csv.
a = ML_Fraud(
    sample_start=1991,
    test_sample=range(2001, 2011),
    OOS_per=1,
    OOS_gap=0,
    sampling="expanding",
    adjust_serial=True,
    cv_flag=False,
    cv_k=10,
    write=True,
    IS_per=10,
)
a.analyse_raw()

Module initiated successfully ...
Procedures are: analyse_raw()
analysis finished for OOS period 2001 after 5.139632 sec
analysis finished for OOS period 2002 after 5.703513 sec
analysis finished for OOS period 2003 after 6.344101 sec
analysis finished for OOS period 2004 after 7.062975 sec
analysis finished for OOS period 2005 after 7.628896 sec
analysis finished for OOS period 2006 after 8.830884 sec
analysis finished for OOS period 2007 after 9.541539 sec
analysis finished for OOS period 2008 after 10.302172 sec
analysis finished for OOS period 2009 after 10.553144 sec
analysis finished for OOS period 2010 after 10.891895 sec
average top percentile sensitivity for the period 2001 to 2010 is 2.71% for RUSBoost-28
        models             Roc Roc_noise_to_signal Sensitivity @ 1 Prc  \
0  RUS28_kfold  63.12% (9.65%)              15.29%       2.71% (6.14%)   

  Sensitivity_noise_to_signal @ 1 Prc Specificity @ 1 Prc  \
0                             226.32%      98.99% (0.01%)   

  S