In [13]:
import pandas as pd
import numpy as np
from os.path import isfile
from extra_codes import calc_vif
import matplotlib.pyplot as plt
from datetime import datetime
from statsmodels.tsa.stattools import kpss
import warnings
warnings.filterwarnings("ignore")

class ML_Fraud:
    __version__='1.0.5'
    def __init__(self,sample_start=1991,test_sample=range(2001,2011),
                 OOS_per=1,OOS_gap=0,sampling='expanding',adjust_serial=True,
                 cv_type='kfold',temp_year=1,cv_flag=False,cv_k=10,write=True,IS_per=10):

        if isfile('FraudDB2020.csv')==False:
            df=pd.DataFrame()
            for s in range(1,5):
                fl_name='FraudDB2020_Part'+str(s)+'.csv'
                new_df=pd.read_csv(fl_name)
                df=df.append(new_df)
            df.to_csv('FraudDB2020.csv',index=False)
            
        df=pd.read_csv('FraudDB2020.csv')
        self.df=df
        self.ss=sample_start
        self.se=np.max(df.fyear)
        self.ts=test_sample
        self.cv_t=cv_type
        self.cv=cv_flag
        self.cv_k=cv_k
        self.cv_t_y=temp_year
        
        sampling_set=['expanding','rolling']
        if sampling in sampling_set:
            pass
        else:
            raise ValueError('Invalid sampling choice. Permitted options are "expanding" and "rolling"')
        
        self.sa=sampling
        self.w=write
        self.ip=IS_per
        self.op=OOS_per
        self.og=OOS_gap
        self.a_s=adjust_serial
        print('Module initiated successfully ...')
        #The dir() function returns all properties and methods of the specified object, without the values.
        list_methods=dir(self)
        # .any: It checks for any element satisfying a condition and returns a True in case it finds any one element.
        reduced_methods=[item+'()' for item in list_methods if any(['analy' in item,'compare' in item,item=='sumstats'])]
        #string.join(iterable)
        print('Procedures are: '+'; '.join(reduced_methods))
    
    def mc_analysis(self,B=1000,adjust_serial=None,C_FN=30,C_FP=1):
       
        from sklearn.svm import SVC
        from sklearn.model_selection import GridSearchCV
        from sklearn.metrics import roc_auc_score
        from extra_codes import ndcg_k
        import pickle
        from sklearn.preprocessing import MinMaxScaler
        
        t0=datetime.now()
        # setting the parameters
        IS_period=self.ip
        k_fold=self.cv_k
        OOS_period=self.op # 1 year ahead prediction
        OOS_gap=self.og # Gap between training and testing period
        start_OOS_year=self.ts[0] #2001
        end_OOS_year=self.ts[-1] #2010
        sample_start=self.ss #1991
        if adjust_serial==None:
            adjust_serial=self.a_s
        cross_val=self.cv
        case_window=self.sa #expanding or rolling window
        fraud_df=self.df
        write=self.w
        
        dict_db=pickle.load(open('features_fk.pkl','r+b'))
        tbl_ratio_fk=dict_db['lagged_Data']
        mapped_X=dict_db['matrix']
        red_tbl_fk=tbl_ratio_fk.iloc[:,-46:]
        print('pickle file loaded successfully ...')
        
        tbl_year_IS_CV_index=tbl_ratio_fk[np.logical_and(tbl_ratio_fk.fyear<=2010,\
                                                       tbl_ratio_fk.fyear>=1991)].index
        
        tbl_year_IS_CV=tbl_ratio_fk.loc[np.logical_and(tbl_ratio_fk.fyear<=2010,\
                                                   tbl_ratio_fk.fyear>=1991)]
            
        X_CV=mapped_X[tbl_year_IS_CV_index,:]
        idx_set=np.where(np.logical_and(np.isnan(X_CV).any(axis=1)==False,\
                                             np.isinf(X_CV).any(axis=1)==False))[0]
        tbl_year_IS_CV=tbl_year_IS_CV.iloc[idx_set,:]
        
        X_CV=X_CV[idx_set,:]
        X_CV=(X_CV-np.mean(X_CV,axis=0))/np.std(X_CV,axis=0)
        Y_CV=tbl_ratio_fk.AAER_DUMMY[tbl_year_IS_CV_index]
        Y_CV=Y_CV.iloc[idx_set]
        
        P_f=np.sum(Y_CV==1)/len(Y_CV)
        P_nf=1-P_f
        
        print('prior probablity of fraud between '+str(sample_start)+'-'+
              str(start_OOS_year-1)+' is '+str(np.round(P_f*100,2))+'%')

        # Setting the cross-validation setting
        t000=datetime.now() 
        
        range_oos=range(start_OOS_year,end_OOS_year+1,OOS_period) #(2001,2010+1,1)
        roc_ratio=np.zeros(len(range_oos))
        count_positive_train=np.zeros(len(range_oos))
        count_positive_test=np.zeros(len(range_oos))
        drop_serial=np.zeros(len(range_oos))
        ndcg_ratio=np.zeros(len(range_oos))
        sensitivity_ratio=np.zeros(len(range_oos))
        specificity_ratio=np.zeros(len(range_oos))
        precision_ratio=np.zeros(len(range_oos))
        ecm_ratio=np.zeros(len(range_oos))
        f1_ratio=np.zeros(len(range_oos))
        
        m=0
        for yr in range_oos: #2001-2010
            t1=datetime.now()
            if case_window=='expanding':
                year_start_IS=sample_start #1991
            else:
                year_start_IS=yr-IS_period #1991
            #how many years between training and testing sample: 
            #expanding: 1991-2000, 1991-2001
            #rolling: 1991-2000, 1992-2001
            
            reduced_tbl=tbl_ratio_fk[tbl_ratio_fk.fyear>=sample_start] #1991
            reduced_tbl=tbl_ratio_fk[tbl_ratio_fk.fyear<=end_OOS_year] #2010
            
            tbl_year_IS_index=reduced_tbl[np.logical_and(reduced_tbl.fyear<yr-OOS_gap,\
                                                       reduced_tbl.fyear>=year_start_IS)].index
        
            tbl_year_IS=reduced_tbl.loc[np.logical_and(reduced_tbl.fyear<yr-OOS_gap,\
                                                       reduced_tbl.fyear>=year_start_IS)]
            
            X=mapped_X[tbl_year_IS_index,:]
            idx_set=np.where(np.logical_and(np.isnan(X).any(axis=1)==False,\
                                             np.isinf(X).any(axis=1)==False))[0]
            tbl_year_IS=tbl_year_IS.iloc[idx_set,:]
        
            X=X[idx_set,:]
            mean_vals=np.mean(X)
            std_vals=np.std(X)
            X=(X-mean_vals)/std_vals
            Y=tbl_ratio_fk.AAER_DUMMY[tbl_year_IS_index]
            Y=Y.iloc[idx_set]  
            
            misstate_firms=np.unique(tbl_year_IS.gvkey[tbl_year_IS.AAER_DUMMY==1])
            #How many periods constitute the testing sample at a time: 2001, 2002
            tbl_year_OOS=reduced_tbl.loc[np.logical_and(reduced_tbl.fyear>=yr,\
                                                        reduced_tbl.fyear<yr+OOS_period)]
            
            if adjust_serial==True:
                ok_index=np.zeros(tbl_year_OOS.shape[0])
                for s in range(0,tbl_year_OOS.shape[0]):
                    if not tbl_year_OOS.iloc[s,1] in misstate_firms:
                        ok_index[s]=True
                    
                
            else:
                #filled with ones and keep all observations including serial frauds
                ok_index=np.ones(tbl_year_OOS.shape[0]).astype(bool)
                
            #deleting observations where a company appears both in IS and OOS samples
            tbl_year_OOS=tbl_year_OOS.iloc[ok_index==True,:]
            tbl_year_OOS=tbl_year_OOS.reset_index(drop=True)
            tbl_year_OOS_index=tbl_year_OOS.index
            
            X_OOS=mapped_X[tbl_year_OOS_index,:]
            X_OOS=(X_OOS-mean_vals)/std_vals
            
            Y_OOS=tbl_year_OOS.AAER_DUMMY
            n_P=np.sum(Y_OOS==1)
            n_N=np.sum(Y_OOS==0)
            
            scaling = MinMaxScaler(feature_range=(-1,1)).fit(X)
            X = scaling.transform(X)
            X_OOS = scaling.transform(X_OOS)
            
            t01=datetime.now()
            
            print('5')
            #require a pickle file containing lagged data of financial ratios
            svm_fk=SVC(class_weight={0: 0.02, 1: 1},kernel='linear',shrinking=False,\
                            probability=False,random_state=0,cache_size=1000,\
                                tol=X.shape[-1]*1e-3)
            clf_svm_fk=svm_fk.fit(X, Y)
            predicted_test=clf_svm_fk.decision_function(X_OOS)
            predicted_test[predicted_test>=1]=1+np.log(predicted_test[predicted_test>=1])
            predicted_test=np.exp(predicted_test)/(1+np.exp(predicted_test))
            roc_ratio[m]=roc_auc_score(Y_OOS,predicted_test)
            
            #numpy.percentile()function used to compute the nth percentile of the given data (array elements) along the specified axis. 
            #classification threshold (99th percentile) = 1 or 0
            cutoff_ratio=np.percentile(predicted_test,95)
            # predicted value higher than the threshold value will flag the observation as positive, whether correctly or not (TP+FP)
            labels_ratio=(predicted_test>=cutoff_ratio).astype(int)
            #fraud correctly classified 
            sensitivity_ratio[m]=np.sum(np.logical_and(labels_ratio==1,Y_OOS==1))/np.sum(Y_OOS)
            #non-fraud correctly classified
            specificity_ratio[m]=np.sum(np.logical_and(labels_ratio==0,Y_OOS==0))/np.sum(Y_OOS==0)
            #the number of true positives to the total number of positives 
            precision_ratio[m]=np.sum(np.logical_and(labels_ratio==1,Y_OOS==1))/np.sum(labels_ratio)
            #Pandas Series.to_numpy() function is used to return a NumPy ndarray representing the values in given Series or Index.
            ndcg_ratio[m]=ndcg_k(Y_OOS.to_numpy(),predicted_test,95)
            
            FN=np.sum(np.logical_and(predicted_test<cutoff_ratio, \
                                                          Y_OOS==1))
            FP=np.sum(np.logical_and(predicted_test>=cutoff_ratio, \
                                                          Y_OOS==0))
            # C_FN: Cost of a False Negative for ECM  -30
            #C_FP: Cost of a False Positive for ECM  -1
   
            ecm_ratio[m]=C_FN*P_f*FN/n_P+C_FP*P_nf*FP/n_N
    
    
            t2=datetime.now() 
            dt=t2-t1
            print('analysis finished for OOS period '+str(yr)+' after '+str(dt.total_seconds())+' sec')
            m+=1
            
        
        f1_ratio=2*(precision_ratio*sensitivity_ratio)/(precision_ratio+sensitivity_ratio+1e-8)
        
        # create performance table now
        perf_tbl_general=pd.DataFrame()
        perf_tbl_general['models']=['FK23_kfold']
        perf_tbl_general['Roc']=[str(np.round(
            np.mean(roc_ratio)*100,2))+'% ('+\
            str(np.round(np.std(roc_ratio)*100,2))+'%)']
        
        perf_tbl_general['Roc_noise_to_signal']=[str(np.round(
            np.std(roc_ratio)/np.mean(roc_ratio)*100,2))+'%']
                                                    
        perf_tbl_general['Sensitivity @ 1 Prc']=[str(np.round(
            np.mean(sensitivity_ratio)*100,2))+'% ('+\
            str(np.round(np.std(sensitivity_ratio)*100,2))+'%)']
        
        perf_tbl_general['Sensitivity_noise_to_signal']=[str(np.round(
            np.std(sensitivity_ratio)/np.mean(sensitivity_ratio)*100,2))+'%']

        perf_tbl_general['Specificity @ 1 Prc']=[str(np.round(
            np.mean(specificity_ratio)*100,2))+'% ('+\
            str(np.round(np.std(specificity_ratio)*100,2))+'%)'] 
        
        perf_tbl_general['Specificity_noise_to_signal']=[str(np.round(
            np.std(specificity_ratio)/np.mean(specificity_ratio)*100,2))+'%']
        

        perf_tbl_general['Precision @ 1 Prc']=[str(np.round(
            np.mean(precision_ratio)*100,2))+'% ('+\
            str(np.round(np.std(precision_ratio)*100,2))+'%)']
        
        perf_tbl_general['Precision_noise_to_signal']=[str(np.round(
            np.std(precision_ratio)/np.mean(precision_ratio)*100,2))+'%']

        perf_tbl_general['F1 Score @ 1 Prc']=[str(np.round(
            np.mean(f1_ratio)*100,2))+'% ('+\
            str(np.round(np.std(f1_ratio)*100,2))+'%)']
            
        perf_tbl_general['F1 Score_noise_to_signal']=[str(np.round(
            np.std(f1_ratio)/np.mean(f1_ratio)*100,2))+'%']
        
        perf_tbl_general['NDCG @ 1 Prc']=[str(np.round(
            np.mean(ndcg_ratio)*100,2))+'% ('+\
            str(np.round(np.std(ndcg_ratio)*100,2))+'%)']
        
        perf_tbl_general['NDCG_noise_to_signal']=[str(np.round(
            np.std(ndcg_ratio)/np.mean(ndcg_ratio)*100,2))+'%']
        
        
        perf_tbl_general['ECM @ 1 Prc']=[str(np.round(
            np.mean(ecm_ratio)*100,2))+'% ('+\
            str(np.round(np.std(ecm_ratio)*100,2))+'%)']
        
        perf_tbl_general['ECM_noise_to_signal']=[str(np.round(
            np.std(ecm_ratio)/np.mean(ecm_ratio)*100,2))+'%']
        
        lbl_perf_tbl='MC_results_FK23_kfold'+'.csv'
                        
        if write==True:
            perf_tbl_general.to_csv(lbl_perf_tbl,index=True)
        
        t001=datetime.now()
        dt00=t001-t000
        print('MC analysis is completed after '+str(dt00.total_seconds())+' seconds')

In [14]:
a = ML_Fraud(sample_start = 1991,test_sample = range (2001,2011),OOS_per = 1,OOS_gap = 0,sampling = "expanding",adjust_serial = True,
            cv_flag = False,cv_k = 10,write = True,IS_per = 10)
a.mc_analysis()

Module initiated successfully ...
Procedures are: mc_analysis()
pickle file loaded successfully ...
prior probablity of fraud between 1991-2000 is 0.89%
5
analysis finished for OOS period 2001 after 220.423968 sec
5
analysis finished for OOS period 2002 after 299.296065 sec
5
analysis finished for OOS period 2003 after 385.583004 sec
5
analysis finished for OOS period 2004 after 466.655009 sec
5
analysis finished for OOS period 2005 after 549.564889 sec
5
analysis finished for OOS period 2006 after 596.97971 sec
5
analysis finished for OOS period 2007 after 652.756198 sec
5
analysis finished for OOS period 2008 after 735.795254 sec
5
analysis finished for OOS period 2009 after 759.902828 sec
5
analysis finished for OOS period 2010 after 790.267023 sec
MC analysis is completed after 5457.311567 seconds
