# Housekeeping

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import os
import sqlite3

from jsonschema.benchmarks.const_vs_enum import value
from pandas.tseries.offsets import *
from datetime import timedelta
from datetime import datetime
from dateutil.relativedelta import relativedelta
import warnings
warnings.filterwarnings('ignore')

today = dt.datetime.today().strftime('%Y-%m-%d')
today

'2026-01-08'

# Pull AA data from WRDS - accessed 1/8/2026; WRDS last updated 10/30/2025


In [25]:
import wrds
db = wrds.Connection()

WRDS recommends setting up a .pgpass file.
pgpass file created at C:\Users\jessf\AppData\Roaming\postgresql\pgpass.conf
Created .pgpass file successfully.
You can create this file yourself at any time with the create_pgpass_file() function.
Loading library list...
Done


In [3]:
opinion = db.get_table('audit', 'feed34_revised_audit_opinions',columns=['AUDIT_OP_KEY',
                                                                         'AUDITOR_FKEY',
                                                                         'AUDITOR_NAME',
                                                                         'SIG_DATE_OF_OP',
                                                                         'GOING_CONCERN',
                                                                         'AUDITOR_CITY',
                                                                         'AUDITOR_STATE',
                                                                         'AUDITOR_STATE_NAME',
                                                                         'AUDITOR_REGION',
                                                                         'FISCAL_YEAR_END_OP',
                                                                         'OP_AUD_PCAOB',
                                                                         'IS_NTH_ADD_OP',
                                                                         'ACCNT_BASIS',
                                                                         'INTEGRATED_AUDIT',
                                                                         'FILE_DATE',
                                                                         'COMPANY_FKEY'])

opinion.to_pickle(f"../Data/raw/audit_aa_opinions_raw_{today}.pkl")

len(opinion)

527206

In [27]:
fees = db.get_table('audit', 'feed03_audit_fees', columns=['AUDITOR_FKEY',
                                                           'AUDIT_GIG_KEY',
                                                           'FISCAL_YEAR_ENDED',
                                                           'AUDIT_FEES',
                                                           'NON_AUDIT_FEES',
                                                           'TOTAL_FEES',
                                                           'AUDIT_RELATED_FEES',
                                                           'TAX_FEES',
                                                           'RESTATEMENT',
                                                           'FILE_DATE',
                                                           'COMPANY_FKEY'])

fees.to_pickle(f"../Data/raw/audit_aa_fees_raw_{today}.pkl")

len(fees)

285955

In [28]:
restate = db.get_table('audit', 'feed39_financial_restatements', columns=['RESTATEMENT_NOTIFICATION_KEY',
                                                                          'RESTATEMENT_TYPE_FKEY',
                                                                          'RES_ACCOUNTING_RES_CAT_FKE_LIS',
                                                                          'RES_BEGIN_DATE',
                                                                          'RES_END_DATE',
                                                                          'FILE_DATE',
                                                                          'DATE_OF_8K_402',
                                                                          'COMPANY_FKEY'])

restate.to_pickle(f"../Data/raw/audit_aa_restatements_raw_{today}.pkl")

len(restate)

28789

In [30]:
icfr = db.get_table('audit', 'feed11_sox_404_internal_controls', columns=['IS_NTH_RESTATE',
                                                                          'IC_OP_FKEY',
                                                                          'IC_OP_TYPE',
                                                                          'AUDITOR_FKEY',
                                                                          'AUDITOR_AGREES',
                                                                          'IC_IS_EFFECTIVE',
                                                                          'FYE_IC_OP',
                                                                          'SIG_DATE_IC_OP',
                                                                          'COUNT_WEAK',
                                                                          'FILE_DATE',
                                                                          'COMPANY_FKEY'])

icfr.to_pickle(f"../Data/raw/audit_aa_icfr_raw_{today}.pkl")

len(icfr)

235171

# Prepare opinions data for analysis

**Opinions**

In [5]:
# format datetime variables

opinion['fiscal_year_end_op'] = pd.to_datetime(opinion['fiscal_year_end_op'], format='%Y-%m-%d')
opinion['sig_date_of_op'] = pd.to_datetime(opinion['sig_date_of_op'],  format='%Y-%m-%d')
opinion['file_date'] = pd.to_datetime(opinion['file_date'],  format='%Y-%m-%d')

In [6]:
# remove foreign and canadian auditors

op = opinion.loc[(opinion['auditor_region'] != 'Foreign' ) & (opinion['auditor_region'] != 'Canada')]
len(op)

475113

In [7]:
# keep only if original opinion

op2 = op.loc[(op['is_nth_add_op'] == 0)]
len(op2)

352781

In [8]:
# create fiscal year variable based on typical convention (i.e., fiscal year ending before June = prior year)

op2['fyear'] = np.where(op2['fiscal_year_end_op'].dt.month < 6,
                        op2['fiscal_year_end_op'].dt.year - 1,
                        op2['fiscal_year_end_op'].dt.year)

In [13]:
# create audit lag variable (#of days between fiscal year end and opinion date)

op2['audit_lag'] = (op2['sig_date_of_op'] - op2['fiscal_year_end_op']).dt.days
op2['ln_audit_lag'] = np.log(op2['audit_lag'] + 1)

In [14]:
# count number fo clients per auditor-year

op2['auditor_year'] = op2['auditor_fkey'].astype(str) + '_' + op2['fyear'].astype(str)
auditor_counts = op2.groupby('auditor_year')['company_fkey'].nunique().reset_index()
auditor_counts.columns = ['auditor_year', 'num_clients']
op3 = op2.merge(auditor_counts, on='auditor_year', how='left')
op3['ln_num_clients'] = np.log(op3['num_clients'] + 1)

len(op3)

352781

Create Big4 and Mid4 variables

In [20]:
# Create Big 4 indicator variable

big4_fkeys = [1.0, 2.0, 3.0, 4.0]
op3['big4'] = op3['auditor_fkey'].isin(big4_fkeys).astype(int)

In [32]:
#correct some of the mid-4 observations and create mid4 variables

auditor_mapping = {
    2830: 8,   # Crowe
    28440: 8,  # Crowe
    30212: 8,  # Crowe
    11761: 7,  # BDO
    16168: 10, # McGladrey/RSM
    26516: 10  # McGladrey/RSM
}

op3['auditor_fkey'] = op3['auditor_fkey'].replace(auditor_mapping)

mid4_fkeys = [6.0, 7.0, 8.0, 10.0]
op3['mid4'] = op3['auditor_fkey'].isin(mid4_fkeys).astype(int)

In [33]:
# drop observations if missing CIK

op4 = op3.dropna(subset=['company_fkey'])
len(op4)

352781

Determine whether there is a new auditor or first time GCO

In [39]:
# start by lagging variables

needs_lag = ['auditor_fkey','going_concern']

op5 = op4.sort_values(by=['company_fkey', 'fiscal_year_end_op'])

is_consecutive = op5.groupby('company_fkey')['fyear'].diff().eq(1)

for var in needs_lag:
    op5[f'lag_{var}'] = op5.groupby('company_fkey')[var].shift(1)
    op5.loc[~is_consecutive, f'lag_{var}'] = np.nan

In [43]:
op5['new_auditor'] = np.where(
    op5['lag_auditor_fkey'].isna(),
    np.nan,
    (op5['auditor_fkey'] != op5['lag_auditor_fkey']).astype(float)
)

op5['new_gco'] = np.where(
    op5['lag_going_concern'].isna(),
    np.nan,
    ((op5['going_concern'] == 1) & (op5['lag_going_concern'] != 1)).astype(float)
)

In [47]:
len(op5)

352781

Create auditor tenure variables

In [55]:
op6 = op5.copy()

In [58]:
# Sort by company and fiscal year
op6 = op6.sort_values(['company_fkey', 'fyear'])

# Replace NaN with 1 for the cumsum (treat first observation as a change)
op6['new_auditor_temp'] = op6['new_auditor'].fillna(1)

# Create cumulative sum of changes within each company to create auditor "groups"
op6['auditor_group'] = op6.groupby('company_fkey')['new_auditor_temp'].cumsum()

# Count tenure within each company-auditor group
op6['auditor_tenure'] = op6.groupby(['company_fkey', 'auditor_group']).cumcount() + 1

op6 = op6.drop(columns=['new_auditor_temp', 'auditor_group'])

In [60]:
len(op6)

352781

Handle some city/state variables for audit-office variables

In [63]:
op6['city_state'] = op6['auditor_city'].str.strip().str.upper() + op6['auditor_state'].str.strip().str.upper()

In [66]:
#import MSA data

msa = pd.read_csv("../Data/analysis/msadata.csv")

In [68]:
#merge in MSA codes

op7 = op6.merge(msa[['MSA', 'citystate']],
                   left_on=['city_state'],
                   right_on='citystate',
                   how='left')

Save temporary files

In [70]:
# save as temp file

op7.to_pickle(f"../Data/temp/audit_aa_opinions_processed_{today}.pkl")

# Prepare fees data for analysis

In [71]:
# format datetime variables

fees['fiscal_year_ended'] = pd.to_datetime(fees['fiscal_year_ended'], format='%Y-%m-%d')
fees['file_date'] = pd.to_datetime(fees['file_date'],  format='%Y-%m-%d')

In [72]:
#create fee variables

fees['ln_audit_fees'] = np.log(fees['audit_fees'] + 1)
fees['ln_non_audit_fees'] = np.log(fees['non_audit_fees'] + 1)

In [73]:
#merge fee data into opinions file - LEFT OFF HERE WITHOUT DOING MERGE

aa_merge = op7.merge(fees[['auditor_fkey',
                             'fiscal_year_ended',
                             'audit_fees',
                             'non_audit_fees',
                             'total_fees',
                             'audit_related_fees',
                             'tax_fees',
                             'ln_audit_fees',
                             'ln_non_audit_fees']],
                   left_on=['auditor_fkey', 'fiscal_year_end_op'],
                   right_on=['auditor_fkey', 'fiscal_year_ended'],
                   how='left')

MemoryError: Unable to allocate 926. MiB for an array with shape (121357887, 1) and data type object

# Prepare restatements data for analysis

# Prepare ICFR data for analysis

# Merge basic AA data together