In [2]:
# 1. Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Yearly audit-opinion exports that get stacked into a single frame
files = [
    'audit_opinions_2020.csv',
    'audit_opinions_2021.csv',
    'audit_opinions_2022.csv',
    'audit_opinions_2023.csv',
    'audit_opinions_2024.csv',
    'audit_opinions_2025.csv'
]

# 1. Load every export with the same parser settings, then concatenate.
# index_col=False stops pandas from promoting the first column to the index
# when rows end with a trailing comma, which would shift the columns.
read_options = dict(index_col=False, low_memory=False, encoding='ISO-8859-1')
dfs = [pd.read_csv(path, **read_options) for path in files]
combined_audit_df = pd.concat(dfs, ignore_index=True)

# 2. Discard the placeholder 'Unnamed' columns
cols_to_drop = [name for name in combined_audit_df.columns if 'Unnamed' in name]
combined_audit_df = combined_audit_df.drop(columns=cols_to_drop)

# 3. Clean numeric/currency columns
# Function to remove '$' and ',' and convert to float
def clean_currency(x):
    """Convert a currency-formatted string to a float.

    Dollar signs, thousands separators and surrounding whitespace are
    removed before parsing. Amounts wrapped in parentheses, the accounting
    convention for negatives (e.g. "($1,234)"), are parsed as negative
    numbers instead of being lost as NaN.

    Parameters
    ----------
    x : object
        Cell value to clean. Non-string values are returned unchanged.

    Returns
    -------
    float or object
        The parsed number; ``np.nan`` for empty strings, a lone "-" or any
        text that cannot be parsed; ``x`` itself when it is not a string.
    """
    if not isinstance(x, str):
        return x

    clean_str = x.replace('$', '').replace(',', '').strip()

    # Accounting notation: "(1234)" means -1234. A doubled sign such as
    # "(-5)" becomes "--5", which float() rejects, so it still yields NaN.
    if clean_str.startswith('(') and clean_str.endswith(')'):
        clean_str = '-' + clean_str[1:-1].strip()

    # Blank cells and a bare dash are placeholders for "no value"
    if clean_str in ('', '-'):
        return np.nan

    try:
        return float(clean_str)
    except ValueError:
        return np.nan

# Columns holding dollar amounts that may arrive as formatted text
currency_cols = [
    'Audit Fees ($)', 'Non-Audit Fees ($)', 'Total Fees ($)',
    'Stock Price ($)', 'Market Cap ($)', 'Revenue ($)',
    'Earnings ($)', 'Book Value ($)', 'Assets ($)'
]

# Convert each listed column that is present in the combined frame
for col in currency_cols:
    if col not in combined_audit_df.columns:
        continue  # not present in the loaded data; nothing to convert
    combined_audit_df[col] = combined_audit_df[col].apply(clean_currency)

# 4. Report the final shape and preview the first rows
print(f"Combined DataFrame Shape: {combined_audit_df.shape}")
combined_audit_df.head()

Combined DataFrame Shape: (69686, 68)


Unnamed: 0,Company,CIK Code,Ticker,Market,S&P Index,Russell Index,Bus Street 1,Bus Street 2,City,County,...,Non-Audit Fees ($),Total Fees ($),Stock Price ($),Stock Price Date,Market Cap ($),Financials Date,Revenue ($),Earnings ($),Book Value ($),Assets ($)
0,Apple Inc.,320193.0,AAPL,NASDAQ Global Select Market,S&P 500,Russell 1000,ONE APPLE PARK WAY,,CUPERTINO,Santa Clara,...,4633300.0,22201600.0,115.81,2020-09-30,1966079000000.0,2020-09-26,274515000000.0,57411000000.0,,323888000000.0
1,AMAZON COM INC,1018724.0,AMZN,NASDAQ Global Select Market,S&P 500,Russell 1000,410 TERRY AVENUE NORTH,,SEATTLE,King,...,4295000.0,30903000.0,3256.93,2020-12-31,1638236000000.0,2020-12-31,386064000000.0,21331000000.0,78387000000.0,321195000000.0
2,MICROSOFT CORP,789019.0,MSFT,NASDAQ Global Select Market,S&P 500,Russell 1000,ONE MICROSOFT WAY,,REDMOND,King,...,13581000.0,55291000.0,203.51,2020-06-30,1540774000000.0,2020-06-30,143015000000.0,44281000000.0,67915000000.0,301311000000.0
3,"Tesla, Inc.",1318605.0,TSLA,NASDAQ Global Select Market,S&P 500,Russell 1000,1 TESLA ROAD,,AUSTIN,Travis,...,1559000.0,14524000.0,705.67,2020-12-31,677443200000.0,2020-12-31,31536000000.0,721000000.0,22360000000.0,52148000000.0
4,"Meta Platforms, Inc.",1326801.0,META,NASDAQ Global Select Market,S&P 500,Russell 1000,1 META WAY,,MENLO PARK,San Mateo,...,11325000.0,27325000.0,273.16,2020-12-31,657223000000.0,2020-12-31,85965000000.0,29146000000.0,108617000000.0,159316000000.0


In [6]:
# Columns carried forward into the analysis
selected_cols = [
    'Company', 'CIK Code', 'SIC Code', 'Audit Opinion Key',
    'Year Ended Date', 'Audit Fees ($)',
    'Revenue ($)', 'Earnings ($)', 'Book Value ($)', 'Assets ($)'
]

# Take an independent copy so later edits do not touch combined_audit_df
audit_selected_df = combined_audit_df[selected_cols].copy()

# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() appends a stray "None" line.
audit_selected_df.info()
audit_selected_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69686 entries, 0 to 69685
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Company            69686 non-null  object 
 1   CIK Code           69674 non-null  float64
 2   SIC Code           30254 non-null  float64
 3   Audit Opinion Key  69674 non-null  float64
 4   Year Ended Date    69674 non-null  object 
 5   Audit Fees ($)     53121 non-null  float64
 6   Revenue ($)        45570 non-null  float64
 7   Earnings ($)       45993 non-null  float64
 8   Book Value ($)     36712 non-null  float64
 9   Assets ($)         46429 non-null  float64
dtypes: float64(8), object(2)
memory usage: 5.3+ MB
None


Unnamed: 0,Company,CIK Code,SIC Code,Audit Opinion Key,Year Ended Date,Audit Fees ($),Revenue ($),Earnings ($),Book Value ($),Assets ($)
0,Apple Inc.,320193.0,3571.0,434747.0,2020-09-26,17568300.0,274515000000.0,57411000000.0,,323888000000.0
1,AMAZON COM INC,1018724.0,5961.0,440738.0,2020-12-31,26608000.0,386064000000.0,21331000000.0,78387000000.0,321195000000.0
2,MICROSOFT CORP,789019.0,7372.0,430176.0,2020-06-30,41710000.0,143015000000.0,44281000000.0,67915000000.0,301311000000.0
3,"Tesla, Inc.",1318605.0,3711.0,441145.0,2020-12-31,12965000.0,31536000000.0,721000000.0,22360000000.0,52148000000.0
4,"Meta Platforms, Inc.",1326801.0,7370.0,440356.0,2020-12-31,16000000.0,85965000000.0,29146000000.0,108617000000.0,159316000000.0


In [11]:
# Keep only rows that report an audit fee (other columns may still be missing),
# then renumber the index from zero
audit_cleaned_df = (
    audit_selected_df
    .dropna(subset=['Audit Fees ($)'])
    .reset_index(drop=True)
)
print(f"Rows with Valid Audit Fees: {len(audit_cleaned_df)}")

Rows with Valid Audit Fees: 53121


In [12]:
# 1. Read the Critical Audit Matters export
cam_df = pd.read_csv(
    'critical-audit-matters-1764689243.csv',
    low_memory=False,
    encoding='ISO-8859-1',
)

# 2. Keep only the CAM key, CAM topic and audit opinion key columns,
# 3. discard rows where any of the three is missing, and renumber the index
cam_final_df = (
    cam_df[['CAM Key', 'CAM Topic', 'Audit Opinion Key']]
    .dropna()
    .reset_index(drop=True)
)

# 4. Show the row count and a preview
print(f"CAM Rows: {len(cam_final_df)}")
cam_final_df.head()

CAM Rows: 30550


Unnamed: 0,CAM Key,CAM Topic,Audit Opinion Key
0,7348,Revenue from customer contracts,444898.0
1,15279,Revenue from customer contracts,474343.0
2,24223,Revenue from customer contracts,499735.0
3,32937,Revenue from customer contracts,524816.0
4,41172,Revenue from customer contracts,551159.0


In [13]:

# Left-join the audit records to the CAM rows on the audit opinion key.
# A left row matching several CAM rows is repeated once per match; a left row
# with no match is kept with missing CAM Key / CAM Topic.
# NOTE(review): this joins audit_selected_df, not the fee-filtered
# audit_cleaned_df, so rows without audit fees are retained — confirm intent.
merged_df = pd.merge(audit_selected_df, cam_final_df, on='Audit Opinion Key', how='left')

# Inspect the merged frame. DataFrame.info() prints its own report and returns
# None, so it is called directly; wrapping it in print() adds a stray "None".
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77252 entries, 0 to 77251
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Company            77252 non-null  object 
 1   CIK Code           77240 non-null  float64
 2   SIC Code           37762 non-null  float64
 3   Audit Opinion Key  77240 non-null  float64
 4   Year Ended Date    77240 non-null  object 
 5   Audit Fees ($)     60235 non-null  float64
 6   Revenue ($)        52958 non-null  float64
 7   Earnings ($)       53372 non-null  float64
 8   Book Value ($)     43007 non-null  float64
 9   Assets ($)         53808 non-null  float64
 10  CAM Key            26864 non-null  object 
 11  CAM Topic          26864 non-null  object 
dtypes: float64(8), object(4)
memory usage: 7.1+ MB
None
