In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [12]:
# Load the datasets
audit_fees = pd.read_csv('audit_fees_from_wrds.csv', sep=',')
cams = pd.read_csv('cams_from_wrds_2.csv', sep=',')
audit_opinions = pd.read_csv('audit_opinions_wrds.csv', sep=',')

In [13]:
# Verify the data loaded by printing the first few rows
print("---Audit Fees Data---")
print(audit_fees.head())
print("---CAMs Data---")
print(cams.head())
print("---Audit Opinions Data---")
print(audit_opinions.head())

---Audit Fees Data---
   FISCAL_YEAR  AUDIT_FEES   FILE_DATE  COMPANY_FKEY  MATCHFY_BALSH_BOOK_VAL  \
0         2020     2007000  2020-08-28          1750             780900000.0   
1         2021     1930500  2021-08-18          1750             850600000.0   
2         2022     1998416  2022-08-09          1750             914800000.0   
3         2023     1998132  2023-08-08          1750             859600000.0   
4         2024     2952000  2024-08-06          1750             399600000.0   

   MATCHFY_BALSH_ASSETS  MATCHFY_INCMST_NETINC_TTM  CURR_AUD_FKEY  \
0          2.079000e+09                  4400000.0              4   
1          1.539700e+09                 35800000.0              4   
2          1.573900e+09                 78700000.0              4   
3          1.833100e+09                 90200000.0              4   
4          2.770000e+09                 46300000.0              4   

  CURR_AUD_NAME  
0      KPMG LLP  
1      KPMG LLP  
2      KPMG LLP  
3      KPM

# --- Step 2: Merge Opinions into CAMs to get the Fiscal Year ---

In [21]:
# We only need the key and the year from the opinions file
# We create a subset so we don't duplicate columns we don't need
opinions_subset = audit_opinions[['AUDIT_OP_KEY', 'FISCAL_YEAR_OF_OP']]

# Merge them into the CAMs dataframe
# left_on points to the key in 'cams', right_on points to the key in 'opinions'
cams_with_year = pd.merge(
    cams,
    opinions_subset,
    left_on='AUDIT_OPINION_FKEY',
    right_on='AUDIT_OP_KEY',
    how='left'
)

# Verify the merge: Check if 'FISCAL_YEAR_OF_OP' now exists in the CAMs data
print("--- CAMs with Year Added ---")
print(cams_with_year[['CRITICAL_AUDIT_MATTER_KEY', 'FISCAL_YEAR_OF_OP']].head())

--- CAMs with Year Added ---
   CRITICAL_AUDIT_MATTER_KEY  FISCAL_YEAR_OF_OP
0                      12414             2021.0
1                      12415             2021.0
2                      21834             2022.0
3                      21835             2022.0
4                      30697             2023.0


In [22]:
# Check how many CAMs failed to find a year (Missing values)
missing_years = cams_with_year['FISCAL_YEAR_OF_OP'].isna().sum()
print(f"Rows with missing years: {missing_years}")

# Drop rows where the year is missing (we can't use them)
cams_with_year = cams_with_year.dropna(subset=['FISCAL_YEAR_OF_OP'])

# Convert the year column to Integer (removes the .0)
cams_with_year['FISCAL_YEAR_OF_OP'] = cams_with_year['FISCAL_YEAR_OF_OP'].astype(int)

# Check the result
print("--- Fixed Years ---")
print(cams_with_year[['CRITICAL_AUDIT_MATTER_KEY', 'FISCAL_YEAR_OF_OP']].head())

Rows with missing years: 443
--- Fixed Years ---
   CRITICAL_AUDIT_MATTER_KEY  FISCAL_YEAR_OF_OP
0                      12414               2021
1                      12415               2021
2                      21834               2022
3                      21835               2022
4                      30697               2023


# --- Step 3: Count Specific CAM Types per Company-Year ---

In [27]:
# 1. Identify the Top 10 most common CAM topics to focus our analysis
# (We don't want 100 columns for rare topics)
top_topics = cams_with_year['TOPIC_NAME'].value_counts().head(10).index.tolist()
print("Top 10 CAM Topics:", top_topics)

# 2. Filter the CAMs data to only include these top topics
cams_subset = cams_with_year[cams_with_year['TOPIC_NAME'].isin(top_topics)].copy()

# 3. Pivot the table
# Index: Company and Year
# Columns: The Topic Name
# Values: Count (size)
cam_types = pd.pivot_table(
    cams_subset,
    index=['COMPANY_FKEY', 'FISCAL_YEAR_OF_OP'],
    columns='TOPIC_NAME',
    aggfunc='size',
    fill_value=0
).reset_index()

# 4. Clean up column names
# We'll add a prefix "CAM_" to make them easy to identify later
cam_types.columns.name = None  # Remove the index name
new_columns = ['COMPANY_FKEY', 'FISCAL_YEAR_OF_OP'] + ['CAM_' + col.replace(' ', '_') for col in cam_types.columns[2:]]
cam_types.columns = new_columns

# 5. We still want the TOTAL count (including the rare ones we filtered out)
# So let's merge this back with our original total count
total_counts = cams_with_year.groupby(['COMPANY_FKEY', 'FISCAL_YEAR_OF_OP']).size().reset_index(name='NUM_CAMS')

# Merge detailed types with total counts
final_cam_data = pd.merge(total_counts, cam_types, on=['COMPANY_FKEY', 'FISCAL_YEAR_OF_OP'], how='left')

# Fill NaNs with 0 (if a company had CAMs but none were in the Top 10, the specific columns would be NaN)
final_cam_data = final_cam_data.fillna(0)

# Check the result
print("--- Detailed CAM Data ---")
print(final_cam_data.head())

Top 10 CAM Topics: ['Revenue from customer contracts', 'Business combinations', 'Goodwill', 'Allowance for credit losses', 'Inventory', 'Other contingent liabilities', 'Other investments', 'Going concern', 'Other debt', 'Long-lived assets']
--- Detailed CAM Data ---
   COMPANY_FKEY  FISCAL_YEAR_OF_OP  NUM_CAMS  CAM_Allowance_for_credit_losses  \
0          1750               2020         2                              0.0   
1          1750               2021         2                              0.0   
2          1750               2022         2                              0.0   
3          1750               2023         2                              0.0   
4          1750               2024         2                              0.0   

   CAM_Business_combinations  CAM_Going_concern  CAM_Goodwill  CAM_Inventory  \
0                        0.0                0.0           0.0            1.0   
1                        0.0                0.0           0.0            1.0   
2     

# --- Step 4: Merge Audit Fees with CAM Counts ---

In [28]:
# Merge audit_fees (left) with final_cam_data (right)
# Left Key: COMPANY_FKEY, FISCAL_YEAR
# Right Key: COMPANY_FKEY, FISCAL_YEAR_OF_OP
df_merged = pd.merge(
    audit_fees,
    final_cam_data,
    left_on=['COMPANY_FKEY', 'FISCAL_YEAR'],
    right_on=['COMPANY_FKEY', 'FISCAL_YEAR_OF_OP'],
    how='left'
)

# Identify all the CAM columns (NUM_CAMS + any column starting with 'CAM_')
cam_cols = ['NUM_CAMS'] + [col for col in df_merged.columns if col.startswith('CAM_')]

# Fill missing values with 0 for these columns
# (If no match was found, it means 0 CAMs of that type)
df_merged[cam_cols] = df_merged[cam_cols].fillna(0)

# Check the result
print("--- Merged Data with CAM Types ---")
# Show specific columns to verify
cols_to_show = ['COMPANY_FKEY', 'FISCAL_YEAR', 'AUDIT_FEES', 'NUM_CAMS'] + cam_cols[1:4] # Show first few CAM types
print(df_merged[cols_to_show].head())

--- Merged Data with CAM Types ---
   COMPANY_FKEY  FISCAL_YEAR  AUDIT_FEES  NUM_CAMS  \
0          1750         2020     2007000       2.0   
1          1750         2021     1930500       2.0   
2          1750         2022     1998416       2.0   
3          1750         2023     1998132       2.0   
4          1750         2024     2952000       2.0   

   CAM_Allowance_for_credit_losses  CAM_Business_combinations  \
0                              0.0                        0.0   
1                              0.0                        0.0   
2                              0.0                        0.0   
3                              0.0                        0.0   
4                              0.0                        1.0   

   CAM_Going_concern  
0                0.0  
1                0.0  
2                0.0  
3                0.0  
4                0.0  


# --- Step 5: Filter for Big 4 Auditors ---

In [29]:
# Define the keywords for the Big 4 firms
big4_keywords = ['PricewaterhouseCoopers', 'Ernst & Young', 'Deloitte', 'KPMG']

# Create a filter (mask) that checks if the auditor name contains any of these keywords
mask_big4 = df_merged['CURR_AUD_NAME'].str.contains('|'.join(big4_keywords), na=False)

# Apply the filter to create a new dataframe
df_big4 = df_merged[mask_big4].copy()

# Check the results
print(f"Original Row Count: {len(df_merged)}")
print(f"Big 4 Row Count: {len(df_big4)}")
print("\n--- Auditor Counts ---")
print(df_big4['CURR_AUD_NAME'].value_counts())

Original Row Count: 59337
Big 4 Row Count: 32534

--- Auditor Counts ---
CURR_AUD_NAME
PricewaterhouseCoopers LLP    10030
Deloitte & Touche LLP          8651
Ernst & Young LLP              8284
KPMG LLP                       5569
Name: count, dtype: int64


# --- Step 6: Create Variables and Clean Financial Data ---

In [31]:
# --- Step 6 (Re-run): Create Variables and Clean Financial Data ---

# 1. Drop rows where key financial data is missing
# We need Assets, Fees, and Net Income for our analysis
df_clean = df_big4.dropna(subset=['MATCHFY_BALSH_ASSETS', 'AUDIT_FEES', 'MATCHFY_INCMST_NETINC_TTM'])

# 2. Filter for positive values only (required for Log transformation)
# We can't take the log of 0 or negative numbers
df_clean = df_clean[df_clean['MATCHFY_BALSH_ASSETS'] > 0]
df_clean = df_clean[df_clean['AUDIT_FEES'] > 0]

# 3. Create Log Variables
# 'np.log' is the natural logarithm function from numpy
df_clean['LOG_ASSETS'] = np.log(df_clean['MATCHFY_BALSH_ASSETS'])
df_clean['LOG_AUDIT_FEES'] = np.log(df_clean['AUDIT_FEES'])

# 4. Create Profitability Ratio (ROA = Net Income / Total Assets)
df_clean['ROA'] = df_clean['MATCHFY_INCMST_NETINC_TTM'] / df_clean['MATCHFY_BALSH_ASSETS']

# Check the final shape and columns
print(f"Cleaned Dataset Rows: {len(df_clean)}")
print("\n--- Final Data Preview ---")
# Show the first few columns + the new Log/Ratio columns + a couple of CAM columns
cols_to_view = ['COMPANY_FKEY', 'FISCAL_YEAR', 'LOG_AUDIT_FEES', 'LOG_ASSETS', 'ROA', 'NUM_CAMS', 'CAM_Revenue_from_customer_contracts']
print(df_clean[cols_to_view].head())

Cleaned Dataset Rows: 26772

--- Final Data Preview ---
   COMPANY_FKEY  FISCAL_YEAR  LOG_AUDIT_FEES  LOG_ASSETS       ROA  NUM_CAMS  \
0          1750         2020       14.512152   21.455153  0.002116       2.0   
1          1750         2021       14.473290   21.154853  0.023251       2.0   
2          1750         2022       14.507865   21.176822  0.050003       2.0   
3          1750         2023       14.507723   21.329274  0.049206       2.0   
4          1750         2024       14.897993   21.742113  0.016715       2.0   

   CAM_Revenue_from_customer_contracts  
0                                  1.0  
1                                  1.0  
2                                  1.0  
3                                  1.0  
4                                  0.0  


# --- Step 6.5: Filter for Timeframe (2020-2025) ---

In [32]:
# Filter the dataset to include only fiscal years between 2020 and 2025 (inclusive)
df_clean = df_clean[(df_clean['FISCAL_YEAR'] >= 2020) & (df_clean['FISCAL_YEAR'] <= 2025)]

# Check the result
print(f"Row Count after Year Filter: {len(df_clean)}")
print("--- Years Present ---")
print(df_clean['FISCAL_YEAR'].value_counts().sort_index())

Row Count after Year Filter: 23503
--- Years Present ---
FISCAL_YEAR
2020    4529
2021    4689
2022    4594
2023    4745
2024    4570
2025     376
Name: count, dtype: int64
