In [797]:
# Load the Drive helper and mount
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [798]:
# @title Import Packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [799]:
# @title Read Datasets
restatement_df = pd.read_csv('/content/drive/MyDrive/RSM8224 Accounting/Financial Restatements Data.csv')
compustat = pd.read_csv('/content/drive/MyDrive/RSM8224 Accounting/Compustat.csv')
# compustat.head(3)

  restatement_df = pd.read_csv('/content/drive/MyDrive/RSM8224 Accounting/Financial Restatements Data.csv')
  compustat = pd.read_csv('/content/drive/MyDrive/RSM8224 Accounting/Compustat.csv')


In [800]:
#@title Clean compustat_df
compustat = compustat[(compustat['fyear'] >= 2010) & (compustat['fyear'] <= 2022)]
# Filter out financial firms based on SIC codes
compustat = compustat[~((compustat['sic'] >= 4400) & (compustat['sic'] < 5000)) &
                                     ~((compustat['sic'] >= 6000) & (compustat['sic'] < 6500))]

# Ensure the dataset is sorted by firm identifier and year
compustat = compustat.sort_values(by=['gvkey', 'fyear'])
# Create an industry category variable from the first two digits of the SIC code
compustat['industry'] = compustat['sic'].astype(str).str[:2]

#Encode industry based on its frequency
# Calculate the frequency
industry_freq = compustat['industry'].value_counts(normalize=True)
# Map the frequencies to the original 'industry' column
compustat['industry_freq'] = compustat['industry'].map(industry_freq)

compustat['cik'] = pd.to_numeric(compustat['cik'])
compustat.head(3)

Unnamed: 0,gvkey,datadate,fyear,indfmt,consol,popsrc,datafmt,tic,cusip,conm,...,spcindcd,spcseccd,spcsrc,state,stko,weburl,dldte,ipodate,industry,industry_freq
3,1004,2011-05-31,2010.0,INDL,C,D,STD,AIR,361105,AAR CORP,...,110.0,925.0,B,IL,0.0,www.aarcorp.com,,1972-04-24,50,0.0108
4,1004,2012-05-31,2011.0,INDL,C,D,STD,AIR,361105,AAR CORP,...,110.0,925.0,B,IL,0.0,www.aarcorp.com,,1972-04-24,50,0.0108
5,1004,2013-05-31,2012.0,INDL,C,D,STD,AIR,361105,AAR CORP,...,110.0,925.0,B,IL,0.0,www.aarcorp.com,,1972-04-24,50,0.0108


In [801]:
#@title company_age Create

# Initial steps already provided
compustat['ipodate'] = pd.to_datetime(compustat['ipodate'])
compustat['ipo_year'] = compustat['ipodate'].dt.year
compustat['company_age'] = compustat['fyear'] - compustat['ipo_year']

# Count the number of fiscal years (fyear) for each company (gvkey)
fyear_counts = compustat.groupby('gvkey')['fyear'].transform('count')

# For entries where company_age is NaN, use the count of fiscal years as company_age
compustat['company_age'] = compustat['company_age'].fillna(fyear_counts)

# Check the results
compustat[['gvkey', 'fyear', 'ipodate','company_age']].info()

<class 'pandas.core.frame.DataFrame'>
Index: 129220 entries, 3 to 200525
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   gvkey        129220 non-null  int64         
 1   fyear        129220 non-null  float64       
 2   ipodate      46176 non-null   datetime64[ns]
 3   company_age  129220 non-null  float64       
dtypes: datetime64[ns](1), float64(2), int64(1)
memory usage: 4.9 MB


note: To adjust for cases where the company_age calculation results in NaN—either because ipodate is missing or for any other reason—and instead use the count of fiscal years (fyear) available for each company (gvkey) in the dataset as the company_age.

In [802]:
def impute_with_avg_of_neighbors(df, column_name):
    """
    Imputes missing values in a specified column of a DataFrame using the average of the
    next and previous year's values for each 'gvkey'.

    Parameters:
    - df: pandas DataFrame with the data.
    - column_name: string, the name of the column to impute.

    Returns:
    - DataFrame with imputed values in the specified column.
    """
    # Sort the DataFrame by 'gvkey' and 'fyear' to ensure chronological order
    df = df.sort_values(by=['gvkey', 'fyear'])

    # Define temporary columns for forward fill and backward fill
    ffill_col = f'{column_name}_ffill'
    bfill_col = f'{column_name}_bfill'

    # Perform forward fill and backward fill within each 'gvkey' group
    df[ffill_col] = df.groupby('gvkey')[column_name].fillna(method='ffill')
    df[bfill_col] = df.groupby('gvkey')[column_name].fillna(method='bfill')

    # Calculate the average of the forward fill and backward fill values
    df[column_name] = df[column_name].fillna((df[ffill_col] + df[bfill_col]) / 2)

    # Drop the temporary columns
    df.drop(columns=[ffill_col, bfill_col], inplace=True)

    return df

In [803]:
#@title Clean restatement_df
# Extract the year from the datetime and create new columns
restatement_df['RES_BEGIN_DATE_YEAR'] = pd.to_datetime(restatement_df['RES_BEGIN_DATE']).dt.year
restatement_df['RES_END_DATE_YEAR'] = pd.to_datetime(restatement_df['RES_END_DATE']).dt.year
restatement_df['COMPANY_FKEY'] = pd.to_numeric(restatement_df['COMPANY_FKEY'])

**Note 1:** The restatement dataset has restatement begin and restatement end date variables. Those
capture the period of “incorrect” financial statements that were later restated. You will need to
convert the dates to an indicator variable where if a firm had a restatement covering any period of
that fiscal year, restatement = 1, otherwise 0.

In [804]:
#@title restatement variable creation
restatement_periods = {}
for index, row in restatement_df.iterrows():
    company_fkey = row['COMPANY_FKEY']
    period = (row['RES_BEGIN_DATE_YEAR'], row['RES_END_DATE_YEAR'])
    if company_fkey not in restatement_periods:
        restatement_periods[company_fkey] = [period]
    else:
        restatement_periods[company_fkey].append(period)

def is_restatement(cik, fyear, restatement_periods):
    """
      Check if any restatement period overlaps with the given fiscal year for a company.

      Parameters:
          cik (int): The company's CIK.
          fyear (int): The fiscal year to check.
    """
    periods = restatement_periods.get(cik, [])
    for start_date, end_date in periods:
        if not (start_date <= fyear <= end_date):
            return 1
    return 0
# Apply the function to the DataFrame
compustat['restatement']=compustat.apply(lambda row: is_restatement(row['cik'], row['fyear'], restatement_periods), axis=1)
compustat[['cik', 'tic', 'conm', 'restatement']].head(3)

Unnamed: 0,cik,tic,conm,restatement
3,1750.0,AIR,AAR CORP,1
4,1750.0,AIR,AAR CORP,1
5,1750.0,AIR,AAR CORP,1


In [805]:
#@title SEC investigation status creation
sec_restatement_periods = {}
for index, row in restatement_df.iterrows():
    company_fkey = row['COMPANY_FKEY']
    result = (row['RES_BEGIN_DATE_YEAR'], row['RES_END_DATE_YEAR'], row['RES_SEC_INVESTIGATION'])
    if company_fkey not in sec_restatement_periods:
        sec_restatement_periods[company_fkey] = [result]  # Use 'result' here
    else:
        sec_restatement_periods[company_fkey].append(result)

def is_sec_restatement(cik, fyear, sec_restatement_periods):
    """
    Check if a given fiscal year for a company falls within any restatement period that was investigated by the SEC.

    Parameters:
        cik (int): The company's CIK.
        fyear (int): The fiscal year to check.
    """
    periods = sec_restatement_periods.get(cik, [])
    for start_year, end_year, sec_invest in periods:
        if start_year <= fyear <= end_year and sec_invest == 1:  # Check if year is within period and SEC investigated
            return 1
    return 0
# Apply the function to the DataFrame
compustat['sec_restatement']=compustat.apply(lambda row: is_sec_restatement(row['cik'], row['fyear'], sec_restatement_periods), axis=1)
compustat[['cik', 'tic', 'conm', 'restatement', 'sec_restatement']].head(3)

Unnamed: 0,cik,tic,conm,restatement,sec_restatement
3,1750.0,AIR,AAR CORP,1,0
4,1750.0,AIR,AAR CORP,1,0
5,1750.0,AIR,AAR CORP,1,0


# **1**

##### filter data

In [806]:
compustat_df1 = compustat[(compustat['fyear'] >= 2010) & (compustat['fyear'] <= 2021)]

In [807]:
# imputation
for column in ['act', 'che', 'lct', 'dlc' , 'sale', 'rect', 'at']:
  compustat_df1 = impute_with_avg_of_neighbors(compustat_df1, column)

In [808]:
compustat_df1 = compustat_df1.sort_values(by=['gvkey', 'fyear'])
# Calculate the year-over-year changes for the specified columns
compustat_df1['delta_ACT'] = compustat_df1.groupby('gvkey')['act'].diff()
compustat_df1['delta_CHE'] = compustat_df1.groupby('gvkey')['che'].diff()
compustat_df1['delta_LCT'] = compustat_df1.groupby('gvkey')['lct'].diff()
compustat_df1['delta_DLC'] = compustat_df1.groupby('gvkey')['dlc'].diff()
compustat_df1['delta_sale'] = compustat_df1.groupby('gvkey')['sale'].diff()
compustat_df1['delta_rect'] = compustat_df1.groupby('gvkey')['rect'].diff()
compustat_df1['at_lag1'] = compustat_df1.groupby('gvkey')['at'].shift(1)
cols = ['gvkey', 'fyear','act', 'che', 'lct', 'dlc', 'at', 'sale', 'ppent', 'dp', 'at_lag1', 'delta_sale', 'delta_ACT', 'delta_CHE', 'delta_LCT','delta_DLC', 'delta_rect','industry']

In [809]:
compustat_df1.replace([np.inf, -np.inf], np.nan, inplace=True)
compustat_df1 = compustat_df1.dropna(subset=['ppent','at_lag1','delta_sale', 'delta_ACT', 'delta_CHE', 'delta_LCT','delta_DLC', 'delta_rect'])

In [810]:
# compustat_df = compustat_df1[(compustat_df1['fyear'] >= 2010) & (compustat_df1['fyear'] <= 2018)]
compustat_df = compustat_df1.copy()

In [811]:
# Regression Helper
import statsmodels.api as sm

def regress(data, yvar, xvars):
    # Make sure to exclude any infinite or missing values
    data = data.replace([np.inf, -np.inf], np.nan).dropna(subset=xvars + [yvar])
    Y = data[yvar]
    X = data[xvars]
    X['intercept'] = 1
    result = sm.OLS(Y, X).fit()
    return result.params

### **Jones Model**

In [812]:
#@title data preparation
jones_model = compustat_df[cols].copy()
jones_model = jones_model.sort_values(by=['gvkey', 'fyear'])
# Calculate Total Accruals based on the formula
jones_model['TA_Jones'] = (compustat_df['delta_ACT'] -
                      compustat_df['delta_CHE'] -
                      compustat_df['delta_LCT'] -
                      compustat_df['dp'])

jones_model['TA_Jones_scaled'] = jones_model['TA_Jones'] / jones_model['at_lag1']
jones_model['delta_sale_scaled'] = jones_model['delta_sale'] / jones_model['at_lag1']
jones_model['ppent_scaled'] = jones_model['ppent'] / jones_model['at_lag1']
jones_model['inverse_at_lag1'] = 1 / jones_model['at_lag1']
# Columns list needed for the Jones Model
cols_jones= ['TA_Jones_scaled', 'delta_sale_scaled', 'ppent_scaled', 'inverse_at_lag1']
# Now, drop all rows that have missing values in any of the required columns
jones_model.dropna(subset=cols_jones, inplace= True)

# Hypothetical dependent variable
yvar = 'TA_Jones_scaled'
# Hypothetical list of independent variables
xvars = ['inverse_at_lag1', 'delta_sale_scaled', 'ppent_scaled']
# Apply the model using the regress function
jones_results = jones_model.groupby(['fyear', 'industry']).apply(
     lambda df: regress(df, yvar, xvars))

# Convert the multi-index Series to a DataFrame
jones_results_df = jones_results.reset_index()

# Rename the columns to reflect that they are parameters
jones_results_df.columns = ['fyear', 'industry'] + ["param_" + var for var in xvars + ['intercept']]

# Show DataFrame
jones_results_df.head()

Unnamed: 0,fyear,industry,param_inverse_at_lag1,param_delta_sale_scaled,param_ppent_scaled,param_intercept
0,2010.0,10,-0.15483,0.46836,-1.00175,0.35022
1,2010.0,13,-0.58137,0.26441,-0.17526,0.09903
2,2010.0,27,0.00015,-0.00093,0.00063,0.0135
3,2010.0,36,-9e-05,0.03841,-0.00292,-0.08591
4,2010.0,51,-1e-05,-0.00031,-0.00255,-0.02007


In [813]:
jones_model = jones_model.merge(jones_results_df, on=['fyear', 'industry'], how='left')

# Compute predicted values
jones_model['predicted_TA_Jones_scaled'] = (
    jones_model['param_inverse_at_lag1'] * jones_model['inverse_at_lag1'] +
    jones_model['param_delta_sale_scaled'] * jones_model['delta_sale_scaled'] +
    jones_model['param_ppent_scaled'] * jones_model['ppent_scaled'] +
    jones_model['param_intercept']
)

#@title Calculate residuals
jones_model['residuals'] = jones_model['TA_Jones_scaled'] - jones_model['predicted_TA_Jones_scaled']
jones_model.head()

Unnamed: 0,gvkey,fyear,act,che,lct,dlc,at,sale,ppent,dp,...,TA_Jones_scaled,delta_sale_scaled,ppent_scaled,inverse_at_lag1,param_inverse_at_lag1,param_delta_sale_scaled,param_ppent_scaled,param_intercept,predicted_TA_Jones_scaled,residuals
0,1004,2011.0,1063.272,67.72,473.226,122.865,2195.653,2074.498,456.015,80.333,...,0.00085,0.17533,0.26766,0.00059,-0.11771,-7.95174,-1.11954,2.09737,0.40346,-0.4026
1,1004,2012.0,1033.7,75.3,389.0,86.4,2136.9,2167.1,426.4,108.6,...,-0.02802,0.04218,0.1942,0.00046,-0.05624,0.21027,0.32075,-0.2438,-0.17267,0.14465
2,1004,2013.0,1116.9,89.2,402.1,69.7,2199.5,2035.0,413.3,113.4,...,-0.02677,-0.06182,0.19341,0.00047,-0.05367,-0.60125,-2.64044,0.12826,-0.34529,0.31852
3,1004,2014.0,954.1,54.7,412.0,69.0,1515.0,1594.3,295.0,92.3,...,-0.1048,-0.20036,0.13412,0.00045,0.7395,-0.98513,-6.57248,0.20247,-0.48132,0.37652
4,1004,2015.0,873.1,31.2,329.0,12.0,1442.1,1662.6,313.9,70.8,...,-0.0299,0.04508,0.20719,0.00066,-0.0398,0.19562,1.07114,-0.21312,0.0176,-0.04751


### **Modified Jones Model**

In [814]:
#@title data preparation
modified_jones_model = compustat_df[cols].copy()
modified_jones_model = modified_jones_model.sort_values(by=['gvkey', 'fyear'])
# Calculate the Modified Jones Total Accruals based on the formula
modified_jones_model['TA_modified_jones'] = (
    modified_jones_model['delta_ACT'] - modified_jones_model['delta_CHE'] -
    modified_jones_model['delta_LCT'] + modified_jones_model['delta_DLC'] -
    modified_jones_model['dp']
)

modified_jones_model['TA_modified_jones_scaled'] = modified_jones_model['TA_modified_jones'] / modified_jones_model['at_lag1']
modified_jones_model['delta_sale_scaled'] = modified_jones_model['delta_sale'] / modified_jones_model['at_lag1']
modified_jones_model['ppent_scaled'] = modified_jones_model['ppent'] / modified_jones_model['at_lag1']
modified_jones_model['inverse_at_lag1'] = 1 / modified_jones_model['at_lag1']

# Calculate the change in revenues adjusted for the change in receivables
modified_jones_model['delta_sale_scaled_adjusted'] = (
    (modified_jones_model['delta_sale'] - modified_jones_model['delta_rect']) / modified_jones_model['at_lag1']
)

In [815]:
# Columns list needed for the Modified Jones Model
cols_modified_jones= ['TA_modified_jones_scaled', 'delta_sale_scaled', 'ppent_scaled', 'inverse_at_lag1']
# Now, drop all rows that have missing values in any of the required columns
modified_jones_model.dropna(subset=cols_modified_jones, inplace= True)

In [816]:
# Hypothetical dependent variable
yvar = 'TA_modified_jones_scaled'
# Hypothetical list of independent variables
xvars = ['inverse_at_lag1', 'delta_sale_scaled', 'ppent_scaled']
# Apply the model using the regress function
modified_jones_results = modified_jones_model.groupby(['fyear', 'industry']).apply(
     lambda df: regress(df, yvar, xvars))

# Convert the multi-index Series to a DataFrame
modified_jones_results_df = modified_jones_results.reset_index()

# Rename the columns to reflect that they are parameters
modified_jones_results_df.columns = ['fyear', 'industry'] + ["param_" + var for var in xvars + ['intercept']]

# Show DataFrame
modified_jones_results_df.head()

Unnamed: 0,fyear,industry,param_inverse_at_lag1,param_delta_sale_scaled,param_ppent_scaled,param_intercept
0,2010.0,10,-0.15483,0.46836,-1.00175,0.35022
1,2010.0,13,-0.58137,0.26441,-0.17526,0.09903
2,2010.0,27,0.00015,-0.00093,0.00063,0.0135
3,2010.0,36,-9e-05,0.03841,-0.00292,-0.08591
4,2010.0,51,-1e-05,-0.00031,-0.00255,-0.02007


In [817]:
modified_jones_model = modified_jones_model.merge(modified_jones_results_df, on=['fyear', 'industry'], how='left')

# Compute predicted values
modified_jones_model['NA'] = (
    modified_jones_model['param_inverse_at_lag1'] * modified_jones_model['inverse_at_lag1'] +
    modified_jones_model['param_delta_sale_scaled'] * modified_jones_model['delta_sale_scaled_adjusted'] +
    modified_jones_model['param_ppent_scaled'] * modified_jones_model['ppent_scaled'] +
    modified_jones_model['param_intercept']
)

#@title Calculate UAA
modified_jones_model['UAA_modified_jones'] = abs(modified_jones_model['TA_modified_jones_scaled'] - modified_jones_model['NA'])
modified_jones_model.head()

Unnamed: 0,gvkey,fyear,act,che,lct,dlc,at,sale,ppent,dp,...,delta_sale_scaled,ppent_scaled,inverse_at_lag1,delta_sale_scaled_adjusted,param_inverse_at_lag1,param_delta_sale_scaled,param_ppent_scaled,param_intercept,NA,UAA_modified_jones
0,1004,2011.0,1063.272,67.72,473.226,122.865,2195.653,2074.498,456.015,80.333,...,0.17533,0.26766,0.00059,0.15867,-0.08601,-7.213,-1.24264,1.93387,0.45674,0.45073
1,1004,2012.0,1033.7,75.3,389.0,86.4,2136.9,2167.1,426.4,108.6,...,0.04218,0.1942,0.00046,0.04649,-0.0161,0.21987,0.35599,-0.1945,-0.11515,0.07052
2,1004,2013.0,1116.9,89.2,402.1,69.7,2199.5,2035.0,413.3,113.4,...,-0.06182,0.19341,0.00047,-0.05363,-0.00242,-0.29379,-0.62723,0.02147,-0.08409,0.04951
3,1004,2014.0,954.1,54.7,412.0,69.0,1515.0,1594.3,295.0,92.3,...,-0.20036,0.13412,0.00045,-0.16999,0.99378,-0.76063,-8.01971,0.25461,-0.69125,0.58614
4,1004,2015.0,873.1,31.2,329.0,12.0,1442.1,1662.6,313.9,70.8,...,0.04508,0.20719,0.00066,0.03743,0.01017,0.22627,0.9605,-0.17927,0.02821,0.09574


### **Teoh et al Model**

In [818]:
#@title data preparation
Teoh_model = compustat_df[cols].copy()
Teoh_model = Teoh_model.sort_values(by=['gvkey', 'fyear'])
# Calculate the Teoh et al Total Current Accruals based on the formula
Teoh_model['TCA'] = (
    Teoh_model['delta_ACT'] - Teoh_model['delta_CHE'] -
    Teoh_model['delta_LCT'] + Teoh_model['delta_DLC']
)

Teoh_model['TCA_scaled'] = Teoh_model['TCA'] / Teoh_model['at_lag1']
Teoh_model['delta_sale_scaled'] = Teoh_model['delta_sale'] / Teoh_model['at_lag1']
Teoh_model['inverse_at_lag1'] = 1 / Teoh_model['at_lag1']
# Calculate the change in revenues adjusted for the change in receivables
Teoh_model['delta_sale_scaled_adjusted'] = (
    (Teoh_model['delta_sale'] - Teoh_model['delta_rect']) / Teoh_model['at_lag1']
)

# Columns list needed for the Teoh et al Model
cols_teoh= ['TCA_scaled', 'delta_sale_scaled',  'inverse_at_lag1']
# Now, drop all rows that have missing values in any of the required columns
Teoh_model.dropna(subset=cols_teoh, inplace= True)

In [819]:
# Hypothetical TCA_scaled variable
yvar = 'TCA_scaled'
# Hypothetical list of independent variables
xvars = ['inverse_at_lag1', 'delta_sale_scaled']
# Apply the model using the regress function
Teoh_results = Teoh_model.groupby(['fyear', 'industry']).apply(
     lambda df: regress(df, yvar, xvars))

# Convert the multi-index Series to a DataFrame
Teoh_results_df = Teoh_results.reset_index()

# Rename the columns to reflect that they are parameters
Teoh_results_df.columns = ['fyear', 'industry'] + ["param_" + var for var in xvars + ['intercept']]

# Show DataFrame
Teoh_results_df.head()

Unnamed: 0,fyear,industry,param_inverse_at_lag1,param_delta_sale_scaled,param_intercept
0,2010.0,10,-0.99065,2.8575,-0.16101
1,2010.0,13,-2.31956,1.05065,0.04401
2,2010.0,27,0.00023,-0.00138,0.02014
3,2010.0,36,-5e-05,0.0225,-0.05032
4,2010.0,51,-1e-05,-0.00011,-0.00725


In [820]:
Teoh_model = Teoh_model.merge(Teoh_results_df, on=['fyear', 'industry'], how='left')

# Compute predicted values
Teoh_model['NCA'] = (
    Teoh_model['param_inverse_at_lag1'] * Teoh_model['inverse_at_lag1'] +
    Teoh_model['param_delta_sale_scaled'] * Teoh_model['delta_sale_scaled_adjusted'] +
    Teoh_model['param_intercept']
)

#@title Calculate UAA
Teoh_model['UAA_Teoh'] = abs(Teoh_model['TCA_scaled'] - Teoh_model['NCA'])
Teoh_model.head()

Unnamed: 0,gvkey,fyear,act,che,lct,dlc,at,sale,ppent,dp,...,TCA,TCA_scaled,delta_sale_scaled,inverse_at_lag1,delta_sale_scaled_adjusted,param_inverse_at_lag1,param_delta_sale_scaled,param_intercept,NCA,UAA_Teoh
0,1004,2011.0,1063.272,67.72,473.226,122.865,2195.653,2074.498,456.015,80.333,...,90.574,0.05316,0.17533,0.00059,0.15867,-0.08651,-7.29979,1.78709,0.6288,0.57564
1,1004,2012.0,1033.7,75.3,389.0,86.4,2136.9,2167.1,426.4,108.6,...,10.609,0.00483,0.04218,0.00046,0.04649,-0.01671,0.21987,-0.12307,-0.11286,0.11769
2,1004,2013.0,1116.9,89.2,402.1,69.7,2199.5,2035.0,413.3,113.4,...,39.5,0.01848,-0.06182,0.00047,-0.05363,-0.00218,-0.29334,-0.03086,-0.01513,0.03361
3,1004,2014.0,954.1,54.7,412.0,69.0,1515.0,1594.3,295.0,92.3,...,-138.9,-0.06315,-0.20036,0.00045,-0.16999,0.99652,-0.77962,-0.68903,-0.55605,0.4929
4,1004,2015.0,873.1,31.2,329.0,12.0,1442.1,1662.6,313.9,70.8,...,-31.5,-0.02079,0.04508,0.00066,0.03743,0.01005,0.43698,-0.05846,-0.0421,0.02131


### **EQ**

In [821]:
# Calculate Earnings Quality
compustat_df = compustat_df.drop_duplicates(subset=['gvkey', 'fyear'], keep='last')
compustat_df_eq = pd.merge(compustat_df, jones_model[['gvkey', 'fyear', 'residuals']], on=['gvkey', 'fyear'], how='left')
compustat_df_eq = pd.merge(compustat_df_eq, modified_jones_model[['gvkey', 'fyear', 'UAA_modified_jones']], on=['gvkey', 'fyear'], how='left')
compustat_df_eq = pd.merge(compustat_df_eq, Teoh_model[['gvkey', 'fyear', 'UAA_Teoh']], on=['gvkey', 'fyear'], how='left')

# filter dataset for q1 analysis
df1 = compustat_df_eq[['gvkey', 'fyear', 'sic','tic', 'residuals', 'UAA_modified_jones', 'UAA_Teoh', 'restatement', 'sec_restatement']]
df1 = df1[(df1['fyear'] >= 2010) & (df1['fyear'] <= 2018)]
df1['jones_eq'] = abs(df1['residuals'])
df1['modified_jones_eq'] = abs(df1['UAA_modified_jones'])
df1['Teoh_eq'] = abs(df1['UAA_Teoh'])

In [822]:
df1 = df1.dropna(subset=['jones_eq', 'modified_jones_eq', 'Teoh_eq'])

all_restating = df1[df1['restatement'] == 1]
non_restating = df1[df1['restatement'] == 0]
only_sec_restating = all_restating[all_restating['sec_restatement'] == 1]
non_sec_restating = all_restating[all_restating['sec_restatement'] == 0]

In [823]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 47333 entries, 0 to 64641
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   gvkey               47333 non-null  int64  
 1   fyear               47333 non-null  float64
 2   sic                 47333 non-null  int64  
 3   tic                 47326 non-null  object 
 4   residuals           47333 non-null  float64
 5   UAA_modified_jones  47333 non-null  float64
 6   UAA_Teoh            47333 non-null  float64
 7   restatement         47333 non-null  int64  
 8   sec_restatement     47333 non-null  int64  
 9   jones_eq            47333 non-null  float64
 10  modified_jones_eq   47333 non-null  float64
 11  Teoh_eq             47333 non-null  float64
dtypes: float64(7), int64(4), object(1)
memory usage: 4.7+ MB


In [824]:
# Calculate descriptive statistics for each dataset
desc_all_restating = all_restating[['jones_eq', 'modified_jones_eq', 'Teoh_eq']].describe().add_prefix('All Restating ')
desc_non_restating = non_restating[['jones_eq', 'modified_jones_eq', 'Teoh_eq']].describe().add_prefix('Non-Restating ')
desc_only_sec_restating = only_sec_restating[['jones_eq', 'modified_jones_eq', 'Teoh_eq']].describe().add_prefix('Only SEC Restating ')
desc_non_sec_restating = non_sec_restating[['jones_eq', 'modified_jones_eq', 'Teoh_eq']].describe().add_prefix('Non-SEC Restating ')

# Combine the descriptive statistics into one DataFrame
combined_desc = pd.concat([desc_all_restating, desc_non_restating, desc_only_sec_restating, desc_non_sec_restating], axis=1)

pd.set_option('display.float_format', '{:.5f}'.format)

# Show the combined descriptive statistics table
combined_desc

Unnamed: 0,All Restating jones_eq,All Restating modified_jones_eq,All Restating Teoh_eq,Non-Restating jones_eq,Non-Restating modified_jones_eq,Non-Restating Teoh_eq,Only SEC Restating jones_eq,Only SEC Restating modified_jones_eq,Only SEC Restating Teoh_eq,Non-SEC Restating jones_eq,Non-SEC Restating modified_jones_eq,Non-SEC Restating Teoh_eq
count,17253.0,17253.0,17253.0,30080.0,30080.0,30080.0,199.0,199.0,199.0,17054.0,17054.0,17054.0
mean,2.14297,1.72279,1.78919,1.81861,1.69925,1.59416,1.31402,0.78031,0.77111,2.15264,1.73379,1.80107
std,32.66022,33.00495,33.51166,18.1909,35.79965,26.88954,4.43288,1.75082,1.71493,32.84664,33.19627,33.70594
min,0.0,0.0,0.0,0.0,0.0,0.0,0.00619,0.00383,0.00085,0.0,0.0,0.0
25%,0.06515,0.04886,0.0415,0.08581,0.06194,0.05739,0.085,0.05456,0.05764,0.06494,0.04885,0.04135
50%,0.21139,0.16776,0.14218,0.2695,0.20724,0.18465,0.24118,0.16002,0.15253,0.21106,0.16792,0.14205
75%,0.732,0.55151,0.49746,0.82349,0.63638,0.58673,1.20636,0.56721,0.49593,0.72657,0.55083,0.49746
max,2926.95089,3245.0273,3245.24752,1627.17871,4409.67468,3622.69026,57.38179,14.33921,11.8663,2926.95089,3245.0273,3245.24752


The dataset's analysis across `jones_eq`, `modified_jones_eq`, and `Teoh_eq` metrics for restating and non-restating firms, including those specifically undergoing a second investigation, uncovers distinct financial reporting patterns. Notably, firms investigated for a second time exhibit markedly lower averages in these metrics, suggesting deeper issues or more pronounced financial adjustments. The reduced variability in these second-investigated firms points to a more uniform set of behaviors or adjustments within this group. Moreover, the presence of significant outliers across all categories highlights extreme cases of financial adjustments, underscoring the complexity and severity of issues leading to restatements and subsequent investigations.

#### hypothesis testing

In [936]:
from scipy import stats

# Define the variables to test
variables_to_test = ['jones_eq', 'modified_jones_eq', 'Teoh_eq']
print('Restating vs. Non-Restating')
# Loop through each variable and perform the Mann-Whitney U test
for variable in variables_to_test:
    # Extracting values for both groups
    restating_values = all_restating[variable]
    non_restating_values = non_restating[variable]

    # Ensure there are no NaN values that could affect the test
    restating_values = restating_values.dropna()
    non_restating_values = non_restating_values.dropna()

    # Performing the Mann-Whitney U test
    u_stat, p_value = stats.mannwhitneyu(restating_values, non_restating_values, alternative='two-sided')

    print(f"{variable}:\nU-statistic: {u_stat}, P-value: {p_value}\n")

Restating vs. Non-Restating
jones_eq:
U-statistic: 243384648.5, P-value: 2.236456196437614e-29

modified_jones_eq:
U-statistic: 245395850.5, P-value: 7.03451875281237e-23

Teoh_eq:
U-statistic: 240731475.0, P-value: 2.984347906730878e-39



In [937]:
# Loop through each variable and perform the Mann-Whitney U test
print('Only Second Investegated Restating vs. Non-Second Investegated Restating')
for variable in variables_to_test:
    # Extracting values for both groups
    only_sec_restating_values = only_sec_restating[variable]
    non_sec_restating_values = non_sec_restating[variable]

    # Performing the Mann-Whitney U test
    u_stat, p_value = stats.mannwhitneyu(non_sec_restating_values, only_sec_restating_values, alternative='two-sided')

    print(f"{variable}:\nU-statistic: {u_stat}, P-value: {p_value}\n")

Only Second Investegated Restating vs. Non-Second Investegated Restating
jones_eq:
U-statistic: 1545379.0, P-value: 0.03010524770128475

modified_jones_eq:
U-statistic: 1668111.0, P-value: 0.6805338038646805

Teoh_eq:
U-statistic: 1632433.0, P-value: 0.3562768601324089



**Conclusion:** The Mann-Whitney U test results highlight significant disparities in earnings quality metrics (`jones_eq`, `modified_jones_eq`, and `Teoh_eq`) between restating and non-restating firms, with extremely low p-values indicating pronounced differences in financial reporting practices or earnings management. These differences underscore potential concerns about financial reporting quality and manipulation among restating firms. Conversely, when comparing non-second investigated restating firms to those subjected to second investigations, only `jones_eq` showed a marginal but significant difference, suggesting slight variations in the nature of restatements that attract second scrutiny. However, no significant differences were observed for `modified_jones_eq` and `Teoh_eq`, indicating that these metrics do not capture distinctly different behaviors or adjustments between the two groups of restating firms. This analysis reveals the substantial impact of restatements on earnings quality, while suggesting that second investigations might not always correlate with fundamentally different financial reporting behaviors.

---



# **2**

In addition to abnormal accruals, there are other firm-level factors (i.e., market-based incentives
like prior stock performance, need for financing, etc.) that may predict fraud. Let's add some additional factors in the model above along with accruals and examine whether these factors
explain restatements.

In [910]:
df2 = df1.copy()

In [911]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_model_performance(model, X, y, threshold=0.5):
    """
    Evaluates the performance of a logistic regression model fitted using statsmodels.

    Returns:
    - A dictionary containing the accuracy and, if requested, precision, recall, and F1 score.
    """
    # Generate predicted probabilities
    predicted_probabilities = model.predict(X)
    # Convert probabilities to binary outcomes based on the specified threshold
    predicted_classes = np.where(predicted_probabilities >= threshold, 1, 0)

    # Calculate accuracy
    accuracy = accuracy_score(y, predicted_classes)

    results = {'accuracy': accuracy}

    precision = precision_score(y, predicted_classes)
    recall = recall_score(y, predicted_classes)
    f1 = f1_score(y, predicted_classes)

    results.update({'precision': precision, 'recall': recall, 'f1_score': f1})

    return results

#### **original model**

In [912]:
import statsmodels.api as sm

# Define the predictor and response variables
X = df2[['jones_eq']]  # Predictor
y = df2['restatement']  # Response variable

# Add a constant to the predictor variable matrix
# This is required to include the intercept term in the model
X = sm.add_constant(X)

# Fit the logistic regression model
model_jones = sm.Logit(y, X).fit()

# Display the model summary to see the results
print(model_jones.summary())

Optimization terminated successfully.
         Current function value: 0.655945
         Iterations 4
                           Logit Regression Results                           
Dep. Variable:            restatement   No. Observations:                47333
Model:                          Logit   Df Residuals:                    47331
Method:                           MLE   Df Model:                            1
Date:                Sun, 07 Apr 2024   Pseudo R-squ.:               2.995e-05
Time:                        18:52:06   Log-Likelihood:                -31048.
converged:                       True   LL-Null:                       -31049.
Covariance Type:            nonrobust   LLR p-value:                    0.1727
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.5569      0.010    -58.124      0.000      -0.576      -0.538
jones_eq       0.0005      0.

In [913]:
performance_metrics = evaluate_model_performance(model_jones, X, y, threshold=0.5)
print(performance_metrics)

{'accuracy': 0.6355396868992035, 'precision': 0.75, 'recall': 0.0001738828029907842, 'f1_score': 0.0003476849973923625}


In [914]:
# Define the predictor and response variables
X = df2[['modified_jones_eq']]  # Predictor
y = df2['restatement']  # Response variable

# Add a constant to the predictor variable matrix
# This is required to include the intercept term in the model
X = sm.add_constant(X)

# Fit the logistic regression model
model_modified_jones = sm.Logit(y, X).fit()

# Display the model summary to see the results
print(model_modified_jones.summary())

Optimization terminated successfully.
         Current function value: 0.655965
         Iterations 4
                           Logit Regression Results                           
Dep. Variable:            restatement   No. Observations:                47333
Model:                          Logit   Df Residuals:                    47331
Method:                           MLE   Df Model:                            1
Date:                Sun, 07 Apr 2024   Pseudo R-squ.:               8.042e-08
Time:                        18:52:06   Log-Likelihood:                -31049.
converged:                       True   LL-Null:                       -31049.
Covariance Type:            nonrobust   LLR p-value:                    0.9437
                        coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------
const                -0.5559      0.010    -58.140      0.000      -0.575      -0.537
modified

In [915]:
performance_metrics_modified = evaluate_model_performance(model_modified_jones, X, y, threshold=0.5)
print(performance_metrics_modified)

{'accuracy': 0.6354974330805147, 'precision': 0.0, 'recall': 0.0, 'f1_score': 0.0}


  _warn_prf(average, modifier, msg_start, len(result))


In [916]:
# Define the predictor and response variables
X = df2[['Teoh_eq']]  # Predictor
y = df2['restatement']  # Response variable

# Add a constant to the predictor variable matrix
# This is required to include the intercept term in the model
X = sm.add_constant(X)

# Fit the logistic regression model
model_teoh = sm.Logit(y, X).fit()

# Display the model summary to see the results
print(model_teoh.summary())

Optimization terminated successfully.
         Current function value: 0.655960
         Iterations 4
                           Logit Regression Results                           
Dep. Variable:            restatement   No. Observations:                47333
Model:                          Logit   Df Residuals:                    47331
Method:                           MLE   Df Model:                            1
Date:                Sun, 07 Apr 2024   Pseudo R-squ.:               7.477e-06
Time:                        18:52:07   Log-Likelihood:                -31049.
converged:                       True   LL-Null:                       -31049.
Covariance Type:            nonrobust   LLR p-value:                    0.4956
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.5562      0.010    -58.153      0.000      -0.575      -0.537
Teoh_eq        0.0002      0.

In [917]:
performance_metrics_teoh = evaluate_model_performance(model_teoh, X, y, threshold=0.5)
print(performance_metrics_teoh)

{'accuracy': 0.6354974330805147, 'precision': 0.5, 'recall': 5.79609343302614e-05, 'f1_score': 0.00011590843233845261}


#### **new models**

In [918]:
features = ['gvkey', 'fyear', 'sale', 'prcc_f' ,'dltt', 'dlc', 'mkvalt', 'company_age', 'industry_freq']
df2 = pd.merge(df2, compustat_df[features], on=['gvkey', 'fyear'], how='left')
df2.head()

Unnamed: 0,gvkey,fyear,sic,tic,residuals,UAA_modified_jones,UAA_Teoh,restatement,sec_restatement,jones_eq,modified_jones_eq,Teoh_eq,sale,prcc_f,dltt,dlc,mkvalt,company_age,industry_freq
0,1004,2011.0,5080,AIR,-0.4026,0.45073,0.57564,1,0,0.4026,0.45073,0.57564,2074.498,12.05,669.489,122.865,485.2897,39.0,0.0108
1,1004,2012.0,5080,AIR,0.14465,0.07052,0.11769,1,0,0.14465,0.07052,0.11769,2167.1,20.06,622.2,86.4,790.0029,40.0,0.0108
2,1004,2013.0,5080,AIR,0.31852,0.04951,0.03361,1,0,0.31852,0.04951,0.03361,2035.0,24.3,564.3,69.7,961.308,41.0,0.0108
3,1004,2014.0,5080,AIR,0.37652,0.58614,0.4929,1,0,0.37652,0.58614,0.4929,1594.3,29.54,85.0,69.0,1046.3954,42.0,0.0108
4,1004,2015.0,5080,AIR,-0.04751,0.09574,0.02131,0,0,0.04751,0.09574,0.02131,1662.6,24.41,136.1,12.0,842.5112,43.0,0.0108


In [919]:
#@title data cleaning
for column in ['prcc_f', 'mkvalt', 'dltt']:
  df2 = impute_with_avg_of_neighbors(df2, column)

df2.replace([np.inf, -np.inf], np.nan, inplace=True)
df2 = df2.dropna(subset=['prcc_f', 'mkvalt', 'dltt'])
df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 41390 entries, 0 to 47332
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   gvkey               41390 non-null  int64  
 1   fyear               41390 non-null  float64
 2   sic                 41390 non-null  int64  
 3   tic                 41383 non-null  object 
 4   residuals           41390 non-null  float64
 5   UAA_modified_jones  41390 non-null  float64
 6   UAA_Teoh            41390 non-null  float64
 7   restatement         41390 non-null  int64  
 8   sec_restatement     41390 non-null  int64  
 9   jones_eq            41390 non-null  float64
 10  modified_jones_eq   41390 non-null  float64
 11  Teoh_eq             41390 non-null  float64
 12  sale                41390 non-null  float64
 13  prcc_f              41390 non-null  float64
 14  dltt                41390 non-null  float64
 15  dlc                 41390 non-null  float64
 16  mkvalt   

In [920]:
#@title Jones
from sklearn.metrics import accuracy_score
# Define the predictor and response variables
X2 = df2[['gvkey', 'sale', 'industry_freq', 'prcc_f' ,'dltt', 'dlc', 'mkvalt', 'company_age','jones_eq']]   # Predictor
y = df2['restatement']  # Response variable

# Add a constant to the predictor variable matrix
# This is required to include the intercept term in the model
X2 = sm.add_constant(X2)

# Fit the logistic regression model
model_jones2 = sm.Logit(y, X2).fit()

# Display the model summary to see the results
print(model_jones2.summary())

Optimization terminated successfully.
         Current function value: 0.655516
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:            restatement   No. Observations:                41390
Model:                          Logit   Df Residuals:                    41380
Method:                           MLE   Df Model:                            9
Date:                Sun, 07 Apr 2024   Pseudo R-squ.:                 0.01502
Time:                        18:52:17   Log-Likelihood:                -27132.
converged:                       True   LL-Null:                       -27545.
Covariance Type:            nonrobust   LLR p-value:                2.937e-172
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
const            -0.5969      0.029    -20.592      0.000      -0.654      -0.540
gvkey         -1.194

In [921]:
performance_metrics2 = evaluate_model_performance(model_jones2, X2, y, threshold=0.5)
print(performance_metrics2)

{'accuracy': 0.6250543609567528, 'precision': 0.5734450816056462, 'recall': 0.08200857935907141, 'f1_score': 0.14349577791268833}


In [922]:
#@title Modified Jones
# Define the predictor and response variables
X2 = df2[['gvkey', 'sale', 'industry_freq', 'prcc_f' ,'dltt', 'dlc', 'mkvalt', 'company_age', 'modified_jones_eq']]   # Predictor
y = df2['restatement']  # Response variable

# Add a constant to the predictor variable matrix
# This is required to include the intercept term in the model
X2 = sm.add_constant(X2)

# Fit the logistic regression model
model_modified_jones2 = sm.Logit(y, X2).fit()

# Display the model summary to see the results
print(model_modified_jones2.summary())

Optimization terminated successfully.
         Current function value: 0.655564
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:            restatement   No. Observations:                41390
Model:                          Logit   Df Residuals:                    41380
Method:                           MLE   Df Model:                            9
Date:                Sun, 07 Apr 2024   Pseudo R-squ.:                 0.01494
Time:                        18:52:18   Log-Likelihood:                -27134.
converged:                       True   LL-Null:                       -27545.
Covariance Type:            nonrobust   LLR p-value:                2.069e-171
                        coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------
const                -0.5956      0.029    -20.555      0.000      -0.652      -0.539
gvkey   

In [923]:
performance_metrics_jones2 = evaluate_model_performance(model_modified_jones2, X2, y, threshold=0.5)
print(performance_metrics_jones2)

{'accuracy': 0.6251026818071999, 'precision': 0.5740822644847413, 'recall': 0.08188241231390361, 'f1_score': 0.14332247557003258}


In [924]:
#@title Teoh
# Define the predictor and response variables
X2 = df2[['gvkey', 'sale', 'industry_freq', 'prcc_f' ,'dltt', 'dlc', 'mkvalt', 'company_age', 'Teoh_eq']]   # Predictor
y = df2['restatement']  # Response variable

# Add a constant to the predictor variable matrix
# This is required to include the intercept term in the model
X2 = sm.add_constant(X2)

# Fit the logistic regression model
model_teoh2 = sm.Logit(y, X2).fit()

# Display the model summary to see the results
print(model_teoh2.summary())

Optimization terminated successfully.
         Current function value: 0.655563
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:            restatement   No. Observations:                41390
Model:                          Logit   Df Residuals:                    41380
Method:                           MLE   Df Model:                            9
Date:                Sun, 07 Apr 2024   Pseudo R-squ.:                 0.01495
Time:                        18:52:18   Log-Likelihood:                -27134.
converged:                       True   LL-Null:                       -27545.
Covariance Type:            nonrobust   LLR p-value:                1.964e-171
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
const            -0.5957      0.029    -20.557      0.000      -0.652      -0.539
gvkey         -1.192

In [925]:
performance_metrics_teoh2 = evaluate_model_performance(model_teoh2, X2, y, threshold=0.5)
print(performance_metrics_teoh2)

{'accuracy': 0.6250785213819763, 'precision': 0.5738284703801945, 'recall': 0.08188241231390361, 'f1_score': 0.14331456332118805}


**Conclusion:**

There are 8 new features added to each logistic regression models.
1. Prior Stock Performance: *prcc_f* (Price Close - Fiscal) could serve as a proxy for the firm's stock performance. Changes in this variable over time might indicate performance trends.
2. Need for Financing:
  * *dltt* (Long-Term Debt - Total) and *dlc* (Debt in Current Liabilities) can indicate a firm's reliance on external financing.
  * *mkvalt* (Market Value Total) might also give insights into the firm's market valuation and potential financing needs.
3. Company Characteristics (*industry*, *company_age*): These factors provide context on the operational environment and the maturity of the firm. The industry classification helps understand sector-specific risks and practices, while the company's age can indicate its stage in the business lifecycle, each influencing the company's approach to financial reporting and the potential for restatement.
4. Operational Metrics (*sale*): Revenue is a key indicator of operational success but is also a common target for manipulation.
5. Company Identifier (*gvkey*)

**modle summary:**
* Jones:
  * old model: *jones_eq* Coefficient is positive but with a p-value of 0.222, indicating a non-significant relationship with the likelihood of restatement at traditional significance levels.
  * new model:
    * Pseudo R-squared, LLR p-value and Log-Likelihood all improved, suggesting the model is a better fit to the data compared to the old model
    * Significant Predictors: The new model identifies several significant predictors, including *gvkey, industry_freq, dltt, dlc, mkvalt, and company_age*, each with a p-value well below 0.05, indicating strong evidence of their association with the likelihood of restatement. Notably, *jones_eq* becomes significant in this model (p-value: 0.021).
  * accuracy: The new model trades off a slight decrease in accuracy and precision for significant gains in recall and F1 score, indicating a much better balance in identifying true restatements. Despite slightly lower precision, its enhanced recall makes it more practical and effective for predicting financial restatements compared to the old model.


* Modified Jones:
  * old model: The *modified_jones_eq* Coefficient, showing a p-value of 0.986, indicates a statistically non-significant relationship with the likelihood of restatement. This suggests that, in isolation, the modified Jones model does not effectively predict restatements.
  * new model:
    * The model shows enhancements in model fit indicators, such as Pseudo R-squared, LLR p-value, and Log-Likelihood, implying a more accurate representation of the data than the simpler old model.
    * Significant Predictors: Improved model complexity reveals key significant predictors including *gvkey, industry_freq, dltt, dlc, mkvalt, and company_age*, all demonstrating a statistically significant relationship with restatement likelihood (p-values below 0.05). Interestingly, within this expanded context, modified_jones_eq still does not reach traditional levels of statistical significance (p-value: 0.147), suggesting its limited predictive power on restatements when compared to other firm-level factors.
    * accuracy: The old model, despite a seemingly decent accuracy, is practically ineffective due to its failure to identify any actual restatement cases. The new model demonstrates a more balanced and effective approach, with substantial improvements in recall and F1 score, making it significantly more useful for practical applications, despite a minor decrease in accuracy. The ability to identify true restatements, as shown by the recall and F1 score improvements, marks a significant advancement in predictive capability.

* Teoh Model:
  * old model: The *Teoh_eq* coefficient is slightly positive but not statistically significant (p-value: 0.549), indicating that the Teoh model, on its own, does not have a significant predictive relationship with the likelihood of financial restatement.
  * new model:
    * An improvement in overall fit and predictive power, as evidenced by better values for Pseudo R-squared, LLR p-value, and Log-Likelihood compared to the old model. This suggests the comprehensive model is better suited to capturing the complexities around financial restatements.

    * Significant Predictors: key predictors emerge as significant, including *gvkey, industry_freq, dltt, dlc, mkvalt, and company_age*, all showing a strong statistical relationship with the restatement likelihood (p-values below 0.05). *Teoh_eq*, while showing a positive coefficient, remains not statistically significant (p-value: 0.132), suggesting its limited direct influence on restatement likelihood in the presence of other firm-level factors.

    * accuracy: Transitioning from the old to the new model demonstrates a negligible accuracy dip for significant recall and F1 score improvements. Despite lower precision, the new model's enhanced recall markedly boosts its practicality, making it far more effective in identifying financial restatements and thus, a preferable choice for predictive analysis.
---

# **3**

In [926]:
df3 = compustat_df_eq[(compustat_df_eq['fyear'] >= 2019) & (compustat_df_eq['fyear'] <= 2021)]
cols = ['gvkey', 'fyear', 'sic','tic', 'residuals', 'UAA_modified_jones', 'UAA_Teoh', 'restatement', 'sec_restatement'] + [ 'sale', 'industry_freq', 'prcc_f' ,'dltt', 'dlc', 'mkvalt', 'company_age']
df3 = df3[cols]
df3['jones_eq'] = abs(df3['residuals'])
df3['modified_jones_eq'] = abs(df3['UAA_modified_jones'])
df3['Teoh_eq'] = abs(df3['UAA_Teoh'])
df3.head()

Unnamed: 0,gvkey,fyear,sic,tic,residuals,UAA_modified_jones,UAA_Teoh,restatement,sec_restatement,sale,industry_freq,prcc_f,dltt,dlc,mkvalt,company_age,jones_eq,modified_jones_eq,Teoh_eq
8,1004,2019.0,5080,AIR,0.12971,0.06362,0.17589,1,0,2089.3,0.0108,20.17,670.9,13.7,707.9065,47.0,0.12971,0.06362,0.17589
9,1004,2020.0,5080,AIR,0.04822,0.0556,0.04506,1,0,1651.4,0.0108,41.75,193.6,11.5,1476.9063,48.0,0.04822,0.0556,0.04506
10,1004,2021.0,5080,AIR,0.01707,0.0327,0.01728,1,0,1817.1,0.0108,48.22,156.3,11.1,1706.554,49.0,0.01707,0.0327,0.01728
19,1019,2019.0,7380,AFAP,0.61316,0.74251,0.7214,0,0,83.088,0.09242,180.0,0.0,1.833,29.16,11.0,0.61316,0.74251,0.7214
20,1019,2020.0,7380,AFAP,-0.15197,0.12573,0.14421,0,0,82.679,0.09242,174.0,6.668,2.0,28.188,11.0,0.15197,0.12573,0.14421


In [927]:
#@title data cleaning
for column in ['prcc_f', 'mkvalt', 'dltt']:
  df3 = impute_with_avg_of_neighbors(df3, column)

df3.replace([np.inf, -np.inf], np.nan, inplace=True)
df3 = df3.dropna(subset=['prcc_f', 'mkvalt', 'dltt'])
df3.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14153 entries, 8 to 64670
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   gvkey               14153 non-null  int64  
 1   fyear               14153 non-null  float64
 2   sic                 14153 non-null  int64  
 3   tic                 14150 non-null  object 
 4   residuals           14045 non-null  float64
 5   UAA_modified_jones  14045 non-null  float64
 6   UAA_Teoh            14090 non-null  float64
 7   restatement         14153 non-null  int64  
 8   sec_restatement     14153 non-null  int64  
 9   sale                14153 non-null  float64
 10  industry_freq       14153 non-null  float64
 11  prcc_f              14153 non-null  float64
 12  dltt                14153 non-null  float64
 13  dlc                 14153 non-null  float64
 14  mkvalt              14153 non-null  float64
 15  company_age         14153 non-null  float64
 16  jones_eq 

##### Jones

In [928]:
# Add a constant to the predictor variable matrix in df3
X3 = sm.add_constant(df3[['gvkey', 'sale', 'industry_freq', 'prcc_f', 'dltt', 'dlc', 'mkvalt', 'company_age', 'jones_eq']])
y3 = df3['restatement']

# Apply the logistic regression model to predict probabilities for df3
df3['predicted_probability'] = model_jones2.predict(X3)

performance_metrics3 = evaluate_model_performance(model_jones2, X3, y3, threshold=0.5)
print(performance_metrics3)

{'accuracy': 0.6122376881226596, 'precision': 0.5733148019457956, 'recall': 0.14476223898929635, 'f1_score': 0.23115718688708317}


In [929]:
# Identify the top 100 firms with the highest predicted probability of restating earnings
sorted_firms = df3.sort_values(by='predicted_probability', ascending=False)
top_100_firms = sorted_firms[['gvkey', 'predicted_probability']].drop_duplicates(subset = 'gvkey').head(100)
print(top_100_firms)

        gvkey  predicted_probability
4242     6385                1.00000
39627  112005                1.00000
53382  177190                1.00000
29580   36607                1.00000
12251   16494                0.99995
...       ...                    ...
16448   20525                0.59342
9374    12713                0.59231
10834   14311                0.59150
9255    12597                0.59057
11356   14985                0.59046

[100 rows x 2 columns]


In [930]:
# Filter df3 for entries corresponding to the top 100 firms
top_100_data = df3[df3['gvkey'].isin(top_100_firms['gvkey'])]

# Verify restatements by checking the 'restatement' column in df3 for each of the years 2019, 2020, and 2021
restatement_summary = top_100_data.groupby('fyear')['restatement'].sum().reindex([2019, 2020, 2021], fill_value=0)

# Display the number of actual restatements in the top 100 firms for each year
restatement_summary

fyear
2019    52
2020    53
2021    51
Name: restatement, dtype: int64

In [931]:
# Pivot table to show restatement status by gvkey and fyear
restatement_pivot = top_100_data.pivot_table(index='gvkey',
                                             columns='fyear',
                                             values='restatement',
                                             aggfunc='first',
                                             fill_value=0).reindex(columns=[2019, 2020, 2021])

# Sum across the rows to count restatements for each firm
restatement_counts_per_firm = restatement_pivot.sum(axis=1)

# Count how many firms have restated 0, 1, 2, or 3 times
restatement_frequency = restatement_counts_per_firm.value_counts().sort_index()

# Display the counts
print(restatement_frequency)

0    52
1     2
2     5
3    41
Name: count, dtype: int64


##### modified jones

In [942]:
# Add a constant to the predictor variable matrix in df3
X3 = sm.add_constant(df3[['gvkey', 'sale', 'industry_freq', 'prcc_f', 'dltt', 'dlc', 'mkvalt', 'company_age', 'modified_jones_eq']])
y3 = df3['restatement']

# Apply the logistic regression model to predict probabilities for df3
df3['predicted_probability_modified_jones'] = model_modified_jones2.predict(X3)

performance_metrics_jones3 = evaluate_model_performance(model_modified_jones2, X3, y3, threshold=0.5)
print(performance_metrics_jones3)

{'accuracy': 0.6129442521020279, 'precision': 0.580952380952381, 'recall': 0.1391472188103176, 'f1_score': 0.22451868629671576}


In [943]:
# Identify the top 100 firms with the highest predicted probability of restating earnings
sorted_firms_modified_jones = df3.sort_values(by='predicted_probability_modified_jones', ascending=False)
top_100_firms_modified_jones = sorted_firms_modified_jones[['gvkey', 'predicted_probability_modified_jones']].drop_duplicates(subset = 'gvkey').head(100)
print(top_100_firms_modified_jones)

       gvkey  predicted_probability_modified_jones
12251  16494                               1.00000
29580  36607                               0.99789
24727  29218                               0.93207
5180    7435                               0.84290
1726    3246                               0.77818
...      ...                                   ...
11272  14913                               0.57203
20016  24350                               0.57182
158     1209                               0.57174
9363   12711                               0.57149
20038  24368                               0.57094

[100 rows x 2 columns]


In [944]:
# Filter df3 for entries corresponding to the top 100 firms
top_100_data_modified_jones = df3[df3['gvkey'].isin(top_100_firms_modified_jones['gvkey'])]

# Verify restatements by checking the 'restatement' column in df3 for each of the years 2019, 2020, and 2021
restatement_summary_modified_jones = top_100_data_modified_jones.groupby('fyear')['restatement'].sum().reindex([2019, 2020, 2021], fill_value=0)

# Display the number of actual restatements in the top 100 firms for each year
restatement_summary_modified_jones

fyear
2019    67
2020    63
2021    61
Name: restatement, dtype: int64

In [945]:
# Pivot table to show restatement status by gvkey and fyear
restatement_pivot_modified_jones = top_100_data_modified_jones.pivot_table(index='gvkey',
                                             columns='fyear',
                                             values='restatement',
                                             aggfunc='first',
                                             fill_value=0).reindex(columns=[2019, 2020, 2021])

# Sum across the rows to count restatements for each firm
restatement_counts_per_firm_modified_jones = restatement_pivot_modified_jones.sum(axis=1)

# Count how many firms have restated 0, 1, 2, or 3 times
restatement_frequency_modified_jones = restatement_counts_per_firm_modified_jones.value_counts().sort_index()

# Display the counts
print(restatement_frequency_modified_jones)

0    40
1     4
2     2
3    54
Name: count, dtype: int64


##### Teoh model

In [951]:
# Add a constant to the predictor variable matrix in df3
X3 = sm.add_constant(df3[['gvkey', 'sale', 'industry_freq', 'prcc_f', 'dltt', 'dlc', 'mkvalt', 'company_age', 'Teoh_eq']])
y3 = df3['restatement']

# Apply the logistic regression model to predict probabilities for df3
df3['predicted_probability_teoh'] = model_teoh2.predict(X3)

performance_metrics_teoh3 = evaluate_model_performance(model_teoh2, X3, y3, threshold=0.5)
print(performance_metrics_teoh3)

{'accuracy': 0.6130149084999647, 'precision': 0.5813782991202346, 'recall': 0.1391472188103176, 'f1_score': 0.22455047430270425}


In [952]:
# Identify the top 100 firms with the highest predicted probability of restating earnings
sorted_firms_teoh = df3.sort_values(by='predicted_probability_teoh', ascending=False)
top_100_firms_teoh= sorted_firms_teoh[['gvkey', 'predicted_probability_teoh']].drop_duplicates(subset = 'gvkey').head(100)
print(top_100_firms_teoh)

       gvkey  predicted_probability_teoh
12251  16494                     1.00000
29580  36607                     0.99775
24727  29218                     0.93038
5180    7435                     0.84701
1726    3246                     0.77819
...      ...                         ...
11272  14913                     0.57199
20016  24350                     0.57182
20038  24368                     0.57094
158     1209                     0.57069
19970  24293                     0.57060

[100 rows x 2 columns]


In [953]:
# Filter df3 for entries corresponding to the top 100 firms
top_100_data_teoh = df3[df3['gvkey'].isin(top_100_firms_teoh['gvkey'])]

# Verify restatements by checking the 'restatement' column in df3 for each of the years 2019, 2020, and 2021
restatement_summary_teoh = top_100_data_teoh.groupby('fyear')['restatement'].sum().reindex([2019, 2020, 2021], fill_value=0)

# Display the number of actual restatements in the top 100 firms for each year
restatement_summary_teoh

fyear
2019    67
2020    63
2021    61
Name: restatement, dtype: int64

In [954]:
# Pivot table to show restatement status by gvkey and fyear
restatement_pivot_teoh = top_100_data_teoh.pivot_table(index='gvkey',
                                             columns='fyear',
                                             values='restatement',
                                             aggfunc='first',
                                             fill_value=0).reindex(columns=[2019, 2020, 2021])

# Sum across the rows to count restatements for each firm
restatement_counts_per_firm_teoh = restatement_pivot_teoh.sum(axis=1)

# Count how many firms have restated 0, 1, 2, or 3 times
restatement_frequency_teoh = restatement_counts_per_firm_teoh.value_counts().sort_index()

# Display the counts
print(restatement_frequency_teoh)

0    40
1     4
2     2
3    54
Name: count, dtype: int64


**Conclusion:**
The 100 firms that are most likely to restate earnings in the futuris are stored in the *top_100_firms*.


**1. Jones model:**

 The model accuracy is 61%. In general, out of 100 firms, there are 52 firms are identified as restatement in 2019, 53 firms are identified as restatement in 2020 and 51 firms are identified as restatement in 2021.
  
  Among the top 100 firms regarding financial statement restatements over the three years studied. A significant portion, 52 firms, did not restate their financial statements at all, indicating a level of financial reporting stability or accuracy within this group. On the other end, 41 firms restated their financial statements all three years, suggesting persistent issues with financial accuracy or reporting practices in these entities. A small minority, 7 firms in total, show intermediate behavior with one or two restatements, pointing to occasional discrepancies in financial reporting. This bifurcation suggests distinct groups within the top 100 firms: one demonstrating consistent reporting accuracy and another facing ongoing challenges in financial statement reliability.

**2. Modified Jones model:**

The model accuracy is still 61%. In general, out of 100 firms, there are 67 firms are identified as restatement in 2019, 63 firms are identified as restatement in 2020 and 61 firms are identified as restatement in 2021.

It reveals a near-even split between firms with no restatements and those restating all three years, 40 and 54 firms respectively. This polarization suggests the model effectively identifies firms with consistent financial reporting practices as well as those with recurring restatement issues. The minimal number of firms (6 in total) with one or two restatements indicates fewer instances of intermittent reporting discrepancies, suggesting that firms tend to either consistently meet reporting standards or persistently face challenges, rather than fluctuating between these states. This analysis underscores the importance of targeted interventions for the latter group to improve financial reporting accuracy.


**3. Teoh model:**

The model accuracy is still 61%. In general, out of 100 firms, there are 67 firms are identified as restatement in 2019, 63 firms are identified as restatement in 2020 and 61 firms are identified as restatement in 2021.

Result is the same as Modified Jones model.

# **4**

In [966]:
df4 = compustat_df_eq[(compustat_df_eq['fyear'] >= 2010) & (compustat_df_eq['fyear'] <= 2021)]
cols = ['gvkey', 'fyear', 'sic','tic', 'residuals', 'UAA_modified_jones', 'UAA_Teoh', 'restatement', 'sec_restatement'] + [ 'sale', 'industry_freq', 'prcc_f' ,'dltt', 'dlc', 'mkvalt', 'company_age']
df4 = df4[cols]
df4['jones_eq'] = abs(df4['residuals'])
df4['modified_jones_eq'] = abs(df4['UAA_modified_jones'])
df4['Teoh_eq'] = abs(df4['UAA_Teoh'])
df4.head()

Unnamed: 0,gvkey,fyear,sic,tic,residuals,UAA_modified_jones,UAA_Teoh,restatement,sec_restatement,sale,industry_freq,prcc_f,dltt,dlc,mkvalt,company_age,jones_eq,modified_jones_eq,Teoh_eq
0,1004,2011.0,5080,AIR,-0.4026,0.45073,0.57564,1,0,2074.498,0.0108,12.05,669.489,122.865,485.2897,39.0,0.4026,0.45073,0.57564
1,1004,2012.0,5080,AIR,0.14465,0.07052,0.11769,1,0,2167.1,0.0108,20.06,622.2,86.4,790.0029,40.0,0.14465,0.07052,0.11769
2,1004,2013.0,5080,AIR,0.31852,0.04951,0.03361,1,0,2035.0,0.0108,24.3,564.3,69.7,961.308,41.0,0.31852,0.04951,0.03361
3,1004,2014.0,5080,AIR,0.37652,0.58614,0.4929,1,0,1594.3,0.0108,29.54,85.0,69.0,1046.3954,42.0,0.37652,0.58614,0.4929
4,1004,2015.0,5080,AIR,-0.04751,0.09574,0.02131,0,0,1662.6,0.0108,24.41,136.1,12.0,842.5112,43.0,0.04751,0.09574,0.02131


In [967]:
#@title data cleaning
for column in ['prcc_f', 'mkvalt', 'dltt']:
  df4 = impute_with_avg_of_neighbors(df4, column)

df4.replace([np.inf, -np.inf], np.nan, inplace=True)
df4 = df4.dropna(subset=['prcc_f', 'mkvalt', 'dltt', 'jones_eq', 'modified_jones_eq', 'Teoh_eq'])
df4.info()

<class 'pandas.core.frame.DataFrame'>
Index: 55688 entries, 0 to 64670
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   gvkey               55688 non-null  int64  
 1   fyear               55688 non-null  float64
 2   sic                 55688 non-null  int64  
 3   tic                 55678 non-null  object 
 4   residuals           55688 non-null  float64
 5   UAA_modified_jones  55688 non-null  float64
 6   UAA_Teoh            55688 non-null  float64
 7   restatement         55688 non-null  int64  
 8   sec_restatement     55688 non-null  int64  
 9   sale                55688 non-null  float64
 10  industry_freq       55688 non-null  float64
 11  prcc_f              55688 non-null  float64
 12  dltt                55688 non-null  float64
 13  dlc                 55688 non-null  float64
 14  mkvalt              55688 non-null  float64
 15  company_age         55688 non-null  float64
 16  jones_eq 

In [969]:
# Function to assign deciles within each year for a given EQ measure
def assign_deciles(df, column_name):
    return df.groupby('fyear')[column_name] \
             .transform(lambda x: pd.qcut(x, 10, labels=range(1, 11), duplicates='drop'))

# Assign deciles for each EQ measure
df4['jones_eq_decile'] = assign_deciles(df4, 'jones_eq')
df4['modified_jones_eq_decile'] = assign_deciles(df4, 'modified_jones_eq')
df4['Teoh_eq_decile'] = assign_deciles(df4, 'Teoh_eq')

# Invert the deciles so that higher values indicate lower EQ (and higher earnings management concern)
df4['jones_eq_decile'] = 11 - df4['jones_eq_decile'].astype(int)
df4['modified_jones_eq_decile'] = 11 - df4['modified_jones_eq_decile'].astype(int)
df4['Teoh_eq_decile'] = 11 - df4['Teoh_eq_decile'].astype(int)

# Sum the decile rankings to create the total EM Index score
df4['total_EM_index'] = df4[['jones_eq_decile', 'modified_jones_eq_decile', 'Teoh_eq_decile']].sum(axis=1)

df4.head()

Unnamed: 0,gvkey,fyear,sic,tic,residuals,UAA_modified_jones,UAA_Teoh,restatement,sec_restatement,sale,...,dlc,mkvalt,company_age,jones_eq,modified_jones_eq,Teoh_eq,jones_eq_decile,modified_jones_eq_decile,Teoh_eq_decile,total_EM_index
0,1004,2011.0,5080,AIR,-0.4026,0.45073,0.57564,1,0,2074.498,...,122.865,485.2897,39.0,0.4026,0.45073,0.57564,5,3,3,11
1,1004,2012.0,5080,AIR,0.14465,0.07052,0.11769,1,0,2167.1,...,86.4,790.0029,40.0,0.14465,0.07052,0.11769,7,7,6,20
2,1004,2013.0,5080,AIR,0.31852,0.04951,0.03361,1,0,2035.0,...,69.7,961.308,41.0,0.31852,0.04951,0.03361,4,8,9,21
3,1004,2014.0,5080,AIR,0.37652,0.58614,0.4929,1,0,1594.3,...,69.0,1046.3954,42.0,0.37652,0.58614,0.4929,5,4,4,13
4,1004,2015.0,5080,AIR,-0.04751,0.09574,0.02131,0,0,1662.6,...,12.0,842.5112,43.0,0.04751,0.09574,0.02131,9,7,9,25


In [970]:
df4_train = df4[df4['fyear'] < 2021]
df4_test = df4[df4['fyear'] == 2021]

In [971]:
#@title final model
from sklearn.metrics import accuracy_score
# Define the predictor and response variables
X4 = df4_train[['gvkey', 'sale', 'industry_freq', 'prcc_f' ,'dltt', 'dlc', 'mkvalt', 'company_age','total_EM_index']]   # Predictor
y = df4_train['restatement']  # Response variable

# Add a constant to the predictor variable matrix
X4 = sm.add_constant(X4)

# Fit the logistic regression model
model = sm.Logit(y, X4).fit()

# Display the model summary to see the results
print(model.summary())

Optimization terminated successfully.
         Current function value: 0.655363
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:            restatement   No. Observations:                50923
Model:                          Logit   Df Residuals:                    50913
Method:                           MLE   Df Model:                            9
Date:                Sun, 07 Apr 2024   Pseudo R-squ.:                 0.01801
Time:                        20:04:33   Log-Likelihood:                -33373.
converged:                       True   LL-Null:                       -33985.
Covariance Type:            nonrobust   LLR p-value:                6.299e-258
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
const             -0.8077      0.033    -24.192      0.000      -0.873      -0.742
gvkey          -1

In [972]:
performance_metrics4 = evaluate_model_performance(model, X4, y, threshold=0.5)
print(performance_metrics4)

{'accuracy': 0.6200930817116038, 'precision': 0.5467980295566502, 'recall': 0.10701781093012636, 'f1_score': 0.17900186725513495}


In [975]:
# Add a constant to the predictors if your model expects it
X_test = sm.add_constant(df4_test[['gvkey', 'sale', 'industry_freq', 'prcc_f', 'dltt', 'dlc', 'mkvalt', 'company_age', 'total_EM_index']])
y_test = df4_test['restatement']  # Response variable

# Predict the probability of restatement
df4_test['predicted_probability'] = model.predict(X_test)

performance_metrics_final = evaluate_model_performance(model, X_test, y_test, threshold=0.5)
print(performance_metrics_final)

# Sort the firms by predicted probability in descending order to get those most likely to restate at the top
df4_test_sorted = df4_test.sort_values(by='predicted_probability', ascending=False)

# Select the top 10 firms
top_10_firms = df4_test_sorted.head(10)

{'accuracy': 0.6186778593913956, 'precision': 0.5821782178217821, 'recall': 0.15473684210526314, 'f1_score': 0.24449064449064445}


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df4_test['predicted_probability'] = model.predict(X_test)


In [976]:
# Now, top_10_firms contains the firms most likely to restate their financials in 2021
print(top_10_firms[['gvkey', 'tic', 'sale', 'industry_freq', 'company_age','total_EM_index','predicted_probability']])

      gvkey   tic        sale  industry_freq  company_age  total_EM_index  \
5181   7435   MMM 35355.00000        0.01570     75.00000               9   
1726   3246   CMC  6729.76000        0.00703     61.00000              26   
807    2111   BDX 20248.00000        0.03662     59.00000              29   
768    2080  BSET   486.53400        0.00247     49.00000              28   
10     1004   AIR  1817.10000        0.01080     49.00000              29   
219    1327  SWKS  5109.10000        0.04129     53.00000              23   
1318   2710   STZ  8820.70000        0.01620     48.00000              29   
4876   7146   MKC  6317.90000        0.01620     49.00000              25   
6255   8850   KWR  1761.15800        0.00474     49.00000              17   
366    1632   ADI  7318.28600        0.04129     49.00000              22   

      predicted_probability  
5181                0.83629  
1726                0.80570  
807                 0.79068  
768                 0.74323  
10

In [977]:
top_10_firms['tic']

5181     MMM
1726     CMC
807      BDX
768     BSET
10       AIR
219     SWKS
1318     STZ
4876     MKC
6255     KWR
366      ADI
Name: tic, dtype: object

**Final List:**
1. 3M (MMM): 3M operates across diverse sectors with complex supply chains and regulatory requirements. Given its involvement in various legal and environmental challenges over the years, the complexity of its operations could potentially lead to accounting errors or necessitate adjustments upon further review.

2. Skyworks Solutions (SWKS): As a semiconductor company, Skyworks is part of a highly dynamic and competitive industry. The rapid pace of technological change and complex revenue recognition issues, especially related to licensing and intellectual property, could lead to a higher likelihood of financial restatement.

3. Constellation Brands (STZ): Involved in alcohol and cannabis industries, Constellation Brands engages in significant M&A activities. The valuation and integration of these deals are complex and subject to significant judgment, which could lead to restatements, especially if market conditions change or assumptions prove inaccurate.

4. McCormick & Company (MKC): Though perhaps less likely than technology or manufacturing firms to need to restate financials due to the nature of its business, McCormick's global operations and any significant acquisitions could pose challenges in financial reporting, potentially leading to restatements.

5. Analog Devices (ADI): Analog Devices operates in the semiconductor industry, which is characterized by rapid innovation and significant R&D expenditure. The valuation of intangible assets, including patents and technology licenses, as well as revenue recognition practices, might lead to restatements.