In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Set plot style
sns.set(style="whitegrid")
plt.rcParams["figure.figsize"] = (10, 6)

# 1. Load Data Files Individually
# (Using cp1252 encoding to handle special characters)

In [3]:
audit_opinions_2020 = pd.read_csv('audit_opinions_2020.csv', encoding='cp1252')
audit_opinions_2021 = pd.read_csv('audit_opinions_2021.csv', encoding='cp1252')
audit_opinions_2022 = pd.read_csv('audit_opinions_2022.csv', encoding='cp1252')
audit_opinions_2023 = pd.read_csv('audit_opinions_2023.csv', encoding='cp1252')
audit_opinions_2024 = pd.read_csv('audit_opinions_2024.csv', encoding='cp1252')
audit_opinions_2025 = pd.read_csv('audit_opinions_2025.csv', encoding='cp1252')

# Load the Detailed CAMs dataset
df_cams_details = pd.read_csv('critical-audit-matters-1764689243.csv', encoding='cp1252')

  audit_opinions_2020 = pd.read_csv('audit_opinions_2020.csv', encoding='cp1252')
  audit_opinions_2021 = pd.read_csv('audit_opinions_2021.csv', encoding='cp1252')
  audit_opinions_2022 = pd.read_csv('audit_opinions_2022.csv', encoding='cp1252')
  audit_opinions_2023 = pd.read_csv('audit_opinions_2023.csv', encoding='cp1252')
  audit_opinions_2024 = pd.read_csv('audit_opinions_2024.csv', encoding='cp1252')
  df_cams_details = pd.read_csv('critical-audit-matters-1764689243.csv', encoding='cp1252')


In [17]:
# 2. Concatenate all into one big dataframe
dfs = [audit_opinions_2020, audit_opinions_2021, audit_opinions_2022,
       audit_opinions_2023, audit_opinions_2024, audit_opinions_2025]

df_fees = pd.concat(dfs, ignore_index=True)

In [18]:
# Initial Validation
print("Combined Fees Data Shape:", df_fees.shape)
print("Detailed CAMs Data Shape:", df_cams_details.shape)
df_fees.head()

Combined Fees Data Shape: (69686, 73)
Detailed CAMs Data Shape: (30552, 42)


Unnamed: 0,Company,CIK Code,Ticker,Market,S&P Index,Russell Index,Bus Street 1,Bus Street 2,City,County,...,Financials Date,Revenue ($),Earnings ($),Book Value ($),Assets ($),Unnamed: 68,Unnamed: 69,Unnamed: 70,Unnamed: 71,Unnamed: 72
0,320193.0,AAPL,NASDAQ Global Select Market,S&P 500,Russell 1000,ONE APPLE PARK WAY,,CUPERTINO,Santa Clara,CA,...,274515000000,57411000000,,323888000000,,,,,,
1,1018724.0,AMZN,NASDAQ Global Select Market,S&P 500,Russell 1000,410 TERRY AVENUE NORTH,,SEATTLE,King,WA,...,386064000000,21331000000,78387000000.0,321195000000,,,,,,
2,789019.0,MSFT,NASDAQ Global Select Market,S&P 500,Russell 1000,ONE MICROSOFT WAY,,REDMOND,King,WA,...,143015000000,44281000000,67915000000.0,301311000000,,,,,,
3,1318605.0,TSLA,NASDAQ Global Select Market,S&P 500,Russell 1000,1 TESLA ROAD,,AUSTIN,Travis,TX,...,31536000000,721000000,22360000000.0,52148000000,,,,,,
4,1326801.0,META,NASDAQ Global Select Market,S&P 500,Russell 1000,1 META WAY,,MENLO PARK,San Mateo,CA,...,85965000000,29146000000,108617000000.0,159316000000,,,,,,


Part 2: Data Cleaning and Preparation

In [19]:
# Function to clean currency strings
def clean_currency(x):
    if isinstance(x, str):
        # Remove '$', ',', and whitespace
        return x.replace('$', '').replace(',', '').strip()
    return x

# 1. Clean Main Audit Fees Dataset
cols_to_clean_fees = ['Audit Fees ($)', 'Total Fees ($)', 'Assets ($)', 'Earnings ($)']

for col in cols_to_clean_fees:
    # Clean the string
    df_fees[col] = df_fees[col].apply(clean_currency)
    # Convert to numeric, turning errors (like empty strings) into NaN
    df_fees[col] = pd.to_numeric(df_fees[col], errors='coerce')

# 2. Clean CAMs Detail Dataset
# This file also has formatted strings for financials that need cleaning
cols_to_clean_cams = ['Revenue ($)', 'Earnings ($)', 'Book Value ($)', 'Assets ($)', 'Market Cap ($)']

# Only clean columns that actually exist in the file to avoid errors
existing_cam_cols = [c for c in cols_to_clean_cams if c in df_cams_details.columns]

for col in existing_cam_cols:
    df_cams_details[col] = df_cams_details[col].apply(clean_currency)
    df_cams_details[col] = pd.to_numeric(df_cams_details[col], errors='coerce')

# Check if it worked
print("Fees Data Info:")
df_fees[cols_to_clean_fees].info()
print("\nCAMs Data Info:")
df_cams_details[existing_cam_cols].info()


Fees Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69686 entries, 0 to 69685
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Audit Fees ($)  53121 non-null  float64
 1   Total Fees ($)  23575 non-null  float64
 2   Assets ($)      0 non-null      float64
 3   Earnings ($)    36712 non-null  float64
dtypes: float64(4)
memory usage: 2.1 MB

CAMs Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30552 entries, 0 to 30551
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Revenue ($)     29519 non-null  float64
 1   Earnings ($)    30048 non-null  float64
 2   Book Value ($)  24136 non-null  float64
 3   Assets ($)      30091 non-null  float64
 4   Market Cap ($)  26813 non-null  float64
dtypes: float64(5)
memory usage: 1.2 MB


In [21]:
# 1. Merge with CAM Details (As requested)
# We merge on 'Audit Opinion Key' to bring in specific CAM topics if needed.
# Note: This creates 'df_merged_full' which might have multiple rows per company (one per CAM).
df_fees['Audit Opinion Key'] = pd.to_numeric(df_fees['Audit Opinion Key'], errors='coerce')
df_cams_details['Audit Opinion Key'] = pd.to_numeric(df_cams_details['Audit Opinion Key'], errors='coerce')

df_merged_full = pd.merge(df_fees, df_cams_details, on='Audit Opinion Key', how='left', suffixes=('', '_cam_detail'))
print("Shape of Merged Dataset (with duplicates for multiple CAMs):", df_merged_full.shape)


Shape of Merged Dataset (with duplicates for multiple CAMs): (139372, 114)


In [24]:
# 2. Prepare Data for Regression (Maintains one row per Company-Year)
# Handle missing CAMs (assume NaN means 0)
df_fees['Total CAMS in Opinion'] = df_fees['Total CAMS in Opinion'].fillna(0)

# Log Transformations
# We use log1p for CAMS just in case the count is 0 (log(0) is undefined)
df_fees['log_audit_fees'] = np.log(df_fees['Audit Fees ($)'])
df_fees['log_assets'] = np.log(df_fees['Assets ($)'])

# Ensure 'Total CAMS in Opinion' is numeric before applying log function
# 'coerce' turns unparseable strings into NaN
df_fees['Total CAMS in Opinion'] = pd.to_numeric(df_fees['Total CAMS in Opinion'], errors='coerce')

df_fees['log_cams'] = np.log1p(df_fees['Total CAMS in Opinion']) 

# Drop rows where Assets or Fees are NaN (cannot run regression on empty data)
df_clean = df_fees.dropna(subset=['log_audit_fees', 'log_assets', 'Earnings ($)'])

print("Final Clean Shape (Unique Company-Years):", df_clean.shape)


Final Clean Shape (Unique Company-Years): (0, 76)


  result = getattr(ufunc, method)(*inputs, **kwargs)
