## This notebook analyses the EPS data scraped by DataAssembly/EPS/parse_10q_filings.py
### The first analysis counts how many blanks we have (to decide whether I can do the rest by hand or not)
### The second looks for outliers, i.e. instances where the LLM screwed up.


In [74]:
# Setup for the EPS analysis: add the directory two levels up to sys.path so that `config` can be imported
import sys, os
notebook_dir = os.getcwd()
sys.path.append(os.path.abspath(os.path.join(notebook_dir, "../..")))
import config
import pandas as pd
import matplotlib.pyplot as plt

In [75]:
# Load the scraped EPS table; config.EPS_DATA_CSV is resolved two directories
# above the notebook's working directory.
df = pd.read_csv("../../" + config.EPS_DATA_CSV)
# Keep only the quarterly (10-Q) filings. `.copy()` gives an independent frame
# so later column assignments do not hit a view of `df`.
df_quarterly = df[df['Form Type'] == '10-Q (Quarterly report)'].copy()


In [76]:
def find_null_eps_10q_rows(df: pd.DataFrame) -> pd.DataFrame:
    """
    Find rows where all four EPS columns are null.

    The function does not filter on form type itself; pass in a frame that
    has already been restricted to 10-Q filings.

    Args:
        df: Frame containing 'Ticker', 'Accession Number', 'quarterly_raw_eps',
            'quarterly_diluted_eps', 'annual_raw_eps' and 'annual_diluted_eps'.

    Returns:
        A copy of the matching rows with the 'Ticker' and 'Accession Number'
        columns; the original index labels are preserved.

    Side effects:
        Prints the total row count and the number/percentage of matching rows.
    """
    initial_count = len(df)

    mask = (
        (df['quarterly_raw_eps'].isnull()) &
        (df['quarterly_diluted_eps'].isnull()) &
        (df['annual_raw_eps'].isnull()) &
        (df['annual_diluted_eps'].isnull())
    )

    result_df = df.loc[mask, ['Ticker', 'Accession Number']].copy()

    # Guard the percentage against an empty input frame, which would
    # otherwise raise ZeroDivisionError.
    missing_pct = len(result_df) * 100 / initial_count if initial_count else 0.0

    print(f"Total of {initial_count} rows.")
    print(f"[INFO] Found {len(result_df)} rows with all EPS values missing for 10-Q filings. ({missing_pct}%)")
    return result_df


In [77]:
# Collect the 10-Q rows that have no EPS value at all, then preview the first ten.
missing_df = find_null_eps_10q_rows(df_quarterly)
missing_df.iloc[:10]

Total of 6163 rows.
[INFO] Found 140 rows with all EPS values missing for 10-Q filings. (2.271620963816323%)


Unnamed: 0,Ticker,Accession Number
178,TER,0001193125-17-255654
260,WDC,0000106040-24-000040
261,WDC,0000106040-24-000022
262,WDC,0000106040-24-000014
263,WDC,0000106040-23-000034
264,WDC,0000106040-23-000017
265,WDC,0000106040-23-000010
269,WDC,0000106040-21-000053
506,ROST,0001206774-12-002608
799,TSLA,0001564590-18-019254


In [78]:
# Count the all-EPS-missing rows per ticker, most frequent first, as a
# two-column frame (Ticker, MissingCount).
ticker_missing_counts_df = (
    missing_df['Ticker']
    .value_counts()
    .rename_axis('Ticker')
    .reset_index(name='MissingCount')
)
print(len(ticker_missing_counts_df))
print(ticker_missing_counts_df)
# Tickers with no articles:
# [TEL,MCHP,BAC,EG,AJG(7),PTC]
# Their missing rows: 22 + 7 + 7 + 3 + 2 + 1 = 42
# Remaining rows to fill in: 140 - 42 = 98
# I can do this by hand

19
   Ticker  MissingCount
0      BX            23
1     TEL            22
2     KKR            20
3    TSLA            16
4      MS            11
5     WDC             7
6    MCHP             7
7     BAC             7
8     HAS             6
9    ULTA             4
10    AMD             4
11     EG             3
12    PGR             2
13   BKNG             2
14    AJG             2
15   ERIE             1
16   ROST             1
17    PTC             1
18    TER             1


In [85]:
def flag_eps_anomalies(
    df: pd.DataFrame,
    threshold_pct: float = 5.0,
    min_abs_diff: float = 0.02,
) -> pd.DataFrame:
    """
    Flag rows whose quarterly raw and diluted EPS look inconsistent.

    Only rows where both 'quarterly_raw_eps' and 'quarterly_diluted_eps' are
    present are considered. A row is flagged when at least one of these holds:
    - Raw and diluted EPS have opposite signs
    - |percentage_difference| exceeds threshold_pct AND |raw - diluted|
      exceeds min_abs_diff
    - Absolute raw EPS < absolute diluted EPS

    percentage_difference is (|raw| - |diluted|) / |raw| * 100 and is NaN when
    raw EPS is 0 (such rows can still be flagged by the other two conditions).

    Args:
        df: Frame containing 'Ticker', 'Query', 'quarterly_raw_eps' and
            'quarterly_diluted_eps'.
        threshold_pct: Minimum absolute percentage difference to flag.
        min_abs_diff: Minimum absolute raw/diluted difference that must also
            be exceeded for the percentage condition to fire.

    Returns:
        The flagged rows with columns 'Ticker', 'Query', 'quarterly_raw_eps',
        'quarterly_diluted_eps' and 'percentage_difference'. The input frame
        is not modified.
    """

    # Filter for rows with both EPS values present
    mask = df['quarterly_raw_eps'].notnull() & df['quarterly_diluted_eps'].notnull()
    eps_df = df.loc[mask].copy()

    raw_eps = eps_df['quarterly_raw_eps']
    diluted_eps = eps_df['quarterly_diluted_eps']

    abs_raw = raw_eps.abs()
    abs_diluted = diluted_eps.abs()

    # Absolute difference
    abs_diff = (raw_eps - diluted_eps).abs()

    # Percentage difference; a zero denominator becomes NaN instead of +/-inf
    eps_df['percentage_difference'] = ((abs_raw - abs_diluted) / abs_raw.replace(0, float('nan'))) * 100

    # Conditions
    sign_mismatch = (raw_eps * diluted_eps) < 0
    pct_difference = (eps_df['percentage_difference'].abs() > threshold_pct) & (abs_diff > min_abs_diff)
    raw_smaller = abs_raw < abs_diluted

    # Combined condition
    anomaly_mask = sign_mismatch | pct_difference | raw_smaller

    return eps_df.loc[anomaly_mask, [
        'Ticker', 'Query', 'quarterly_raw_eps', 'quarterly_diluted_eps', 'percentage_difference'
    ]]


In [86]:
# Flag suspicious raw/diluted EPS pairs using a 20% difference threshold,
# show the first fifty, and report how many were found in total.
anomalies = flag_eps_anomalies(df=df_quarterly, threshold_pct=20)
print(anomalies.iloc[:50])
print(f"Total anomalies found: {len(anomalies)}")

     Ticker                      Query  quarterly_raw_eps  \
815    TSLA  TSLA/0001193125-13-212354               0.10   
889    WYNN  WYNN/0001174922-23-000105               0.11   
938    FTNT  FTNT/0001262039-24-000020               0.39   
1021    CRM   CRM/0001193125-13-453124               0.21   
1023    CRM   CRM/0001193125-13-235664              -0.12   
1736   VRSN  VRSN/0001014473-18-000028               1.38   
1831    AON   AON/0001628280-21-008451               3.94   
1872    IBM   IBM/0000051143-24-000049              -0.34   
2031   SCHW  SCHW/0000316709-23-000062               2.04   
2526    BAC   BAC/0000070858-14-000139               0.20   
4533    WTW   WTW/0001564590-21-052630               7.01   
5535      V     V/0001403161-17-000011               0.80   
5856     GM    GM/0001467858-14-000125               0.08   
6431    PFG   PFG/0001104659-13-035720               0.61   
6903    LOW   LOW/0000060667-12-000149               0.43   
7491   AMZN  AMZN/000101