In [59]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [60]:
# Load the datasets
audit_fees = pd.read_csv('audit_fees_from_wrds.csv', sep=',')
cams = pd.read_csv('cams_from_wrds_2.csv', sep=',')
audit_opinions = pd.read_csv('audit_opinions_wrds.csv', sep=',')
# Verify the data loaded by printing the first few rows
print("---Audit Fees Data---")
print(audit_fees.head())
print("---CAMs Data---")
print(cams.head())
print("---Audit Opinions Data---")
print(audit_opinions.head())

---Audit Fees Data---
   FISCAL_YEAR  AUDIT_FEES   FILE_DATE  COMPANY_FKEY  MATCHFY_BALSH_BOOK_VAL  \
0         2020     2007000  2020-08-28          1750             780900000.0   
1         2021     1930500  2021-08-18          1750             850600000.0   
2         2022     1998416  2022-08-09          1750             914800000.0   
3         2023     1998132  2023-08-08          1750             859600000.0   
4         2024     2952000  2024-08-06          1750             399600000.0   

   MATCHFY_BALSH_ASSETS  MATCHFY_INCMST_NETINC_TTM  SIC_CODE_FKEY  \
0          2.079000e+09                  4400000.0         3720.0   
1          1.539700e+09                 35800000.0         3720.0   
2          1.573900e+09                 78700000.0         3720.0   
3          1.833100e+09                 90200000.0         3720.0   
4          2.770000e+09                 46300000.0         3720.0   

   CURR_AUD_FKEY CURR_AUD_NAME  
0              4      KPMG LLP  
1              4

In [61]:
# Filter audit_fees for Fiscal Year 2020-2025

audit_fees_filtered = audit_fees[
    (audit_fees['FISCAL_YEAR'] >= 2020) & (audit_fees['FISCAL_YEAR'] <= 2025)
]

In [62]:
# Filter audit_fees_filtered for Big 4 Firms

# Define the keywords for the Big 4 firms
big4_keywords = ['PricewaterhouseCoopers', 'Ernst & Young', 'Deloitte', 'KPMG']

# Create a filter (mask) that checks if the auditor name contains any of these keywords
mask_big4 = audit_fees_filtered['CURR_AUD_NAME'].str.contains('|'.join(big4_keywords), na=False)

# Apply the filter to create a new dataframe
audit_fees_filtered_big_4 = audit_fees_filtered[mask_big4].copy()

# Check the results
print(f"Original Row Count: {len(audit_fees_filtered)}")
print(f"Big 4 Row Count: {len(audit_fees_filtered_big_4)}")
print("\n--- Auditor Counts ---")
print(audit_fees_filtered_big_4['CURR_AUD_NAME'].value_counts())

Original Row Count: 51703
Big 4 Row Count: 28673

--- Auditor Counts ---
CURR_AUD_NAME
PricewaterhouseCoopers LLP    8935
Deloitte & Touche LLP         7661
Ernst & Young LLP             7221
KPMG LLP                      4856
Name: count, dtype: int64


# --- Step 2: Merge Opinions into CAMs to get the Fiscal Year ---

In [63]:
# We only need the key and the year from the opinions file
# We create a subset so we don't duplicate columns we don't need
opinions_subset = audit_opinions[['AUDIT_OP_KEY', 'FISCAL_YEAR_OF_OP']]

# Merge them into the CAMs dataframe
# left_on points to the key in 'cams', right_on points to the key in 'opinions'
cams_with_year = pd.merge(
    cams,
    opinions_subset,
    left_on='AUDIT_OPINION_FKEY',
    right_on='AUDIT_OP_KEY',
    how='left'
)

# Verify the merge: Check if 'FISCAL_YEAR_OF_OP' now exists in the CAMs data
print("--- CAMs with Year Added ---")
print(cams_with_year[['CRITICAL_AUDIT_MATTER_KEY', 'FISCAL_YEAR_OF_OP']].head())

--- CAMs with Year Added ---
   CRITICAL_AUDIT_MATTER_KEY  FISCAL_YEAR_OF_OP
0                      12414             2021.0
1                      12415             2021.0
2                      21834             2022.0
3                      21835             2022.0
4                      30697             2023.0


In [64]:
# Check how many CAMs failed to find a year (Missing values)
missing_years = cams_with_year['FISCAL_YEAR_OF_OP'].isna().sum()
print(f"Rows with missing years: {missing_years}")

# Drop rows where the year is missing (we can't use them)
cams_with_year = cams_with_year.dropna(subset=['FISCAL_YEAR_OF_OP'])

# Convert the year column to Integer (removes the .0)
cams_with_year['FISCAL_YEAR_OF_OP'] = cams_with_year['FISCAL_YEAR_OF_OP'].astype(int)

# Check the result
print("--- Fixed Years ---")
print(cams_with_year[['CRITICAL_AUDIT_MATTER_KEY', 'FISCAL_YEAR_OF_OP']].head())

Rows with missing years: 443
--- Fixed Years ---
   CRITICAL_AUDIT_MATTER_KEY  FISCAL_YEAR_OF_OP
0                      12414               2021
1                      12415               2021
2                      21834               2022
3                      21835               2022
4                      30697               2023


In [65]:
# Pivot the CAMs Table

cam_types = pd.pivot_table(
    cams_with_year,
    index=['COMPANY_FKEY', 'FISCAL_YEAR_OF_OP'],
    columns='TOPIC_NAME',
    aggfunc='size',
    fill_value=0
).reset_index()

# Clean up column names
# We'll add a prefix "CAM_" to make them easy to identify later
cam_types.columns.name = None  # Remove the index name
new_columns = ['COMPANY_FKEY', 'FISCAL_YEAR_OF_OP'] + ['CAM_' + col.replace(' ', '_') for col in cam_types.columns[2:]]
cam_types.columns = new_columns


# We still want the TOTAL count (including the rare ones we filtered out)
# So let's merge this back with our original total count
total_counts = cams_with_year.groupby(['COMPANY_FKEY', 'FISCAL_YEAR_OF_OP']).size().reset_index(name='NUM_CAMS')

# Merge detailed types with total counts
final_cam_data = pd.merge(total_counts, cam_types, on=['COMPANY_FKEY', 'FISCAL_YEAR_OF_OP'], how='left')

print(final_cam_data.head())

   COMPANY_FKEY  FISCAL_YEAR_OF_OP  NUM_CAMS  CAM_Accounts/loans_receivable  \
0          1750               2020         2                              0   
1          1750               2021         2                              0   
2          1750               2022         2                              0   
3          1750               2023         2                              0   
4          1750               2024         2                              0   

   CAM_Allowance_for_credit_losses  \
0                                0   
1                                0   
2                                0   
3                                0   
4                                0   

   CAM_Asset_retirement_and_environmental_obligations  \
0                                                  0    
1                                                  0    
2                                                  0    
3                                                  0    
4          

# --- Step 3: Merge Audit Fees with CAM Counts ---

In [66]:
# Merge audit_fees (left) with final_cam_data (right)
# Left Key: COMPANY_FKEY, FISCAL_YEAR
# Right Key: COMPANY_FKEY, FISCAL_YEAR_OF_OP
df_merged = pd.merge(
    audit_fees_filtered_big_4,
    final_cam_data,
    left_on=['COMPANY_FKEY', 'FISCAL_YEAR'],
    right_on=['COMPANY_FKEY', 'FISCAL_YEAR_OF_OP'],
    how='left'
)

# Check the result
print("--- Merged Data with CAM Types ---")
# Show specific columns to verify
cols_to_show = ['COMPANY_FKEY', 'FISCAL_YEAR', 'AUDIT_FEES', 'NUM_CAMS']
print(df_merged[cols_to_show].head())

--- Merged Data with CAM Types ---
   COMPANY_FKEY  FISCAL_YEAR  AUDIT_FEES  NUM_CAMS
0          1750         2020     2007000       2.0
1          1750         2021     1930500       2.0
2          1750         2022     1998416       2.0
3          1750         2023     1998132       2.0
4          1750         2024     2952000       2.0


# --- Step 4: Clean Merged Data ---

In [67]:
# Rename Columns

df_merged_renamed = df_merged.rename(columns={
    'MATCHFY_BALSH_BOOK_VAL': 'BOOK_VAL',
    'MATCHFY_BALSH_ASSETS': 'TOTAL_ASSETS',
    'MATCHFY_INCMST_NETINC_TTM': 'NET_INCOME_TTM',
    'MATCHFY_INCOME_STMT_REVENUE': 'REVENUE',
    'SIC_CODE_FKEY': 'SIC_CODE'
})

In [68]:
# Handle Missing Data
# Rationale: We must drop any row that is missing a variable
# we intend to use in our model.
print(f"Original row count: {len(df_merged_renamed)}")

# We define our list of "must-have" columns for the regression
required_cols = ['AUDIT_FEES', 'TOTAL_ASSETS', 'BOOK_VAL',
                 'NET_INCOME_TTM', 'SIC_CODE', 'NUM_CAMS']

# .dropna() removes any row with 'NaN' in this subset
df_clean = df_merged_renamed.dropna(subset=required_cols)

print(f"Row count after dropping NaNs: {len(df_clean)}")

Original row count: 28673
Row count after dropping NaNs: 12116


In [69]:
# --- 4.2: Handle Non-Positive Values ---
# Rationale: We cannot take the log of zero or a negative number.
# Financial data (like 'AUDIT_FEES' and 'TOTAL_ASSETS') must be
# positive. We'll filter for this. 'book_value' can be negative,
# so we'll be careful.

df_clean = df_clean[df_clean['AUDIT_FEES'] > 0]
df_clean = df_clean[df_clean['TOTAL_ASSETS'] > 0]

# For book_value, we will only log it if it's positive.
# For net_income, we'll create a separate 'is_loss' flag.

print(f"Row count after ensuring 'fees' and 'assets' > 0: {len(df_clean)}")

Row count after ensuring 'fees' and 'assets' > 0: 12106


In [72]:
### Part 4.3: Feature Engineering (Log Transforms)
# Rationale: This is a key part of our proposal. We log these
# variables to reduce skewness and model the relationships
# in proportional terms (elasticity).

# Log transform fees and assets
df_clean['LOG_AUDIT_FEES'] = np.log(df_clean['AUDIT_FEES'])

# --- CORRECTED LINE BELOW ---
# My original code had a typo. This is the correct line.
df_clean['LOG_TOTAL_ASSETS'] = np.log(df_clean['TOTAL_ASSETS'])

# Log transform book_value only where it's positive
# We create 'LOG_BOOK_VAL' and fill negatives/zeros with NaN
# This line is *expected* to produce a RuntimeWarning, which is fine.
df_clean['LOG_BOOK_VAL'] = np.where(
    df_clean['BOOK_VAL'] > 0,
    np.log(df_clean['BOOK_VAL']),
    np.nan
)

# For cam_count, we use log(x + 1) to handle the '0' values.
# This is a standard transformation for count data.
df_clean['LOG_NUM_CAMS'] = np.log(df_clean['NUM_CAMS'] + 1)

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [73]:
# --- 4.4: Feature Engineering (Other Variables) ---
# Rationale: We create a simple 'IS_LOSS' dummy variable for
# profitability and simplify 'SIC_CODE' to a 2-digit industry.

# Create a 1/0 dummy variable for 'is_loss'
df_clean['IS_LOSS'] = (df_clean['NET_INCOME_TTM'] < 0).astype(int)

# Convert SIC code to a 2-digit industry group
# .str.slice(0, 2) takes the first two characters
df_clean['SIC_CODE'] = df_clean['SIC_CODE'].astype(str).str.slice(0, 2)

print("Feature engineering complete.")

Feature engineering complete.


In [74]:
# --- 4.5: Final Inspection of Cleaned Data ---
print("\n--- Final Clean DataFrame Head ---")
print(df_clean.head())

print("\n--- Final Clean DataFrame Info ---")
df_clean.info()

print("\n--- Final Clean DataFrame Statistics (describe) ---")
# .describe() is a *mandatory* step. It shows us the
# mean, median, min, and max of our new variables.
print(df_clean.describe())


--- Final Clean DataFrame Head ---
   FISCAL_YEAR  AUDIT_FEES   FILE_DATE  COMPANY_FKEY     BOOK_VAL  \
0         2020     2007000  2020-08-28          1750  780900000.0   
1         2021     1930500  2021-08-18          1750  850600000.0   
2         2022     1998416  2022-08-09          1750  914800000.0   
3         2023     1998132  2023-08-08          1750  859600000.0   
4         2024     2952000  2024-08-06          1750  399600000.0   

   TOTAL_ASSETS  NET_INCOME_TTM SIC_CODE  CURR_AUD_FKEY CURR_AUD_NAME  ...  \
0  2.079000e+09       4400000.0       37              4      KPMG LLP  ...   
1  1.539700e+09      35800000.0       37              4      KPMG LLP  ...   
2  1.573900e+09      78700000.0       37              4      KPMG LLP  ...   
3  1.833100e+09      90200000.0       37              4      KPMG LLP  ...   
4  2.770000e+09      46300000.0       37              4      KPMG LLP  ...   

   CAM_Shareholder_valuation  CAM_Subsidiary/affiliate  \
0                     