In [3]:
# Re-running the single-cell notebook solution (session was reset).

import os, re, glob, csv, html
from bs4 import BeautifulSoup
import pandas as pd

# Where the filings are read from, and where the result table is written.
INPUT_DIR = "./Training_Filings/"    # folder scanned for filing HTML files
OUTPUT_CSV = "./final_output.csv"    # CSV produced by this cell

def load_html_text(path: str) -> str:
    """Read an HTML file and return its visible text as one normalized line.

    Steps: read the file as UTF-8 (undecodable bytes are dropped), strip the
    markup with BeautifulSoup, collapse every whitespace run to a single
    space, and rewrite accounting-style bracketed numbers such as ``(1.23)``
    as ``-1.23``.

    Args:
        path: Path of the HTML file to load.

    Returns:
        The flattened text of the document.
    """
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        raw = f.read()
    # Parse the markup as written. BeautifulSoup decodes character entities
    # itself; un-escaping *before* parsing would turn "&lt;" / "&gt;" into
    # real angle brackets and let escaped text be parsed as tags.
    soup = BeautifulSoup(raw, "html.parser")
    text = soup.get_text(" ", strip=True)
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"\(\s*(\d+(?:\.\d+)?)\s*\)", r"-\1", text)  # bracket negatives
    return text

# A signed decimal number wrapped in its own capturing group. Besides "1",
# "-0.34" and "0.78" it also accepts a bare leading decimal point (".45");
# without that alternative "$.45" was captured as "45".
NUMBER = r"(-?(?:\d+(?:\.\d+)?|\.\d+))"

# Group layout relied upon by extract_eps_from_text:
#   "basic" / "diluted"  -> the number is group 2 (group 1 is the label text)
#   all other patterns   -> the number is group 1
PATTERNS = {
    "basic": re.compile(rf"(basic(?:\s+and\s+diluted)?\s+(?:earnings|loss)\s+per\s+share.*?)({NUMBER})", re.I),
    "diluted": re.compile(rf"(diluted\s+(?:earnings|loss)\s+per\s+share.*?){NUMBER}", re.I),
    "per_basic": re.compile(rf"({NUMBER})\s+per\s+basic\s+share", re.I),
    "per_diluted": re.compile(rf"({NUMBER})\s+per\s+diluted\s+share", re.I),
    # Up to 80 characters may separate "earnings"/"loss" from the number. The
    # optional "$" lets prose such as "earnings of $0.45 per share" match; the
    # previous form required whitespace directly before the digits.
    "generic": re.compile(rf"(?:earnings|loss).{{0,80}}?\s\$?\s*({NUMBER})\s+per\s+share", re.I),
}
# Tie-breaker used when two candidates have the same score (higher wins).
PREF_MAP = {"basic": 3, "basic_and_diluted": 2, "diluted": 1}

def score_candidate(val: float, context: str):
    """Score one EPS candidate from the words found in its surrounding text.

    Args:
        val: The numeric value captured by a pattern.
        context: Text window around the match; compared case-insensitively.

    Returns:
        A ``(score, value, label)`` tuple. ``value`` is ``val`` with its sign
        flipped to negative when the context mentions "loss" and ``val`` is
        positive. ``label`` is "basic", "diluted", "basic_and_diluted" or
        "unknown" when none of those words appear.
    """
    score = 0.0
    label = "unknown"
    c = context.lower()

    # The three checks are cumulative; a later one overrides the label.
    if "basic" in c and "diluted" not in c:
        score += 5
        label = "basic"
    if "diluted" in c:
        score += 3
        label = "diluted"
    if "basic and diluted" in c or "diluted and basic" in c:
        score += 4
        label = "basic_and_diluted"

    # Demote adjusted / non-GAAP figures.
    if "adjusted" in c or "non-gaap" in c or "non gaap" in c:
        score -= 3

    # Reward an explicit GAAP mention. "non-gaap" contains "gaap", so those
    # spellings are removed first; otherwise a non-GAAP context would also
    # collect this bonus.
    if "gaap" in c.replace("non-gaap", "").replace("non gaap", ""):
        score += 1

    # "quarter" already covers "first quarter" ... "fourth quarter".
    if any(q in c for q in ("quarter", "q1", "q2", "q3", "q4")):
        score += 1

    # A positive number in a "loss" context is treated as a loss.
    if "loss" in c and val > 0:
        val = -val
    return score, val, label

def extract_eps_from_text(text: str):
    """Scan ``text`` with every pattern in PATTERNS and return the best hit.

    Returns:
        ``(eps, label)`` where ``eps`` is rounded to two decimals, or
        ``(None, None)`` when no candidate survives the filters.
    """
    group_two_tags = ("basic", "diluted")                    # number sits in group 2
    keyword_tags = ("generic", "per_basic", "per_diluted")   # need earnings/loss nearby

    candidates = []
    for tag, pattern in PATTERNS.items():
        for match in pattern.finditer(text):
            # 160 characters of context on either side of the match.
            left = max(0, match.start() - 160)
            window = text[left: match.end() + 160]

            # Skip layout noise such as "12pt".
            if re.search(r"\b\d+\s*pt\b", window, re.I):
                continue

            window_lc = window.lower()
            mentions_loss = "loss" in window_lc
            if tag in keyword_tags and not ("earnings" in window_lc or mentions_loss):
                continue

            captured = match.group(2) if tag in group_two_tags else match.group(1)
            value = float(captured)

            # Magnitudes above 10 are only accepted in a loss context.
            if abs(value) > 10 and not mentions_loss:
                continue

            score, signed_value, label = score_candidate(value, window)
            if label == "unknown":
                label = tag
            candidates.append((score, signed_value, label, window))

    if not candidates:
        return None, None

    # Highest score wins; PREF_MAP breaks ties and the earliest candidate is
    # kept when both agree.
    winner = max(candidates, key=lambda cand: (cand[0], PREF_MAP.get(cand[2], 0)))
    return round(winner[1], 2), winner[2]

# Pick up every .html filing in INPUT_DIR. The previous mask "0000*.html"
# silently skipped filings whose accession number starts with "0001".
files = sorted(glob.glob(os.path.join(INPUT_DIR, "*.html")))
rows = []
for path in files:
    fname = os.path.basename(path)
    try:
        txt = load_html_text(path)
        eps, note = extract_eps_from_text(txt)
        rows.append({"filename": fname, "EPS": "" if eps is None else f"{eps:.2f}", "note": note or ""})
    except Exception as e:
        # Best effort: one unreadable filing must not abort the whole batch;
        # the failure is recorded in the note column instead.
        rows.append({"filename": fname, "EPS": "", "note": f"error: {e}"})

with open(OUTPUT_CSV, "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["filename","EPS","note"])
    writer.writeheader()
    writer.writerows(rows)

# Read the file back so `df` reflects exactly what was written to disk.
df = pd.read_csv(OUTPUT_CSV)

OUTPUT_CSV


'./final_output.csv'

In [8]:
# Single-cell EDGAR EPS parser (recursive, case-insensitive file discovery)

# If BeautifulSoup isn't installed:
# !pip install beautifulsoup4 lxml

import os, re, glob, csv, html, sys
from bs4 import BeautifulSoup
import pandas as pd
from IPython.display import display

# -------- Settings --------
# Change this to your folder with the filings:
INPUT_DIR = './Training_Filings/'
OUTPUT_CSV = './final_output.csv'

# -------- Discover files (recursive; case-insensitive .htm/.html) --------
# Glob everything once and filter on the lower-cased name. Listing the four
# casings "*.html", "*.htm", "*.HTML", "*.HTM" missed mixed-case names such as
# "x.Html" on case-sensitive file systems.
patterns = ["**/*"]
HTML_SUFFIXES = (".html", ".htm")
# The set removes duplicates; sorting makes the run order deterministic.
files = sorted({
    candidate
    for pattern in patterns
    for candidate in glob.glob(os.path.join(INPUT_DIR, pattern), recursive=True)
    if candidate.lower().endswith(HTML_SUFFIXES) and os.path.isfile(candidate)
})

print(f"Discovered {len(files)} files under {os.path.abspath(INPUT_DIR)}")
if len(files) == 0:
    raise FileNotFoundError("No .html/.htm files found. Check INPUT_DIR.")

# -------- Helpers --------
def load_html_text(path: str) -> str:
    """Load HTML, strip tags to text, normalize whitespace, convert bracketed negatives.

    Args:
        path: Path of the HTML file to load (read as UTF-8; undecodable bytes
            are dropped).

    Returns:
        The document text on a single line, with ``(1.23)`` rewritten as
        ``-1.23``.
    """
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        raw = f.read()
    # Parse the markup as written. BeautifulSoup decodes character entities
    # itself; un-escaping *before* parsing would turn "&lt;" / "&gt;" into
    # real angle brackets and let escaped text be parsed as tags.
    # lxml is robust; if not present, switch to 'html.parser'
    soup = BeautifulSoup(raw, "lxml")
    text = soup.get_text(" ", strip=True)
    text = re.sub(r"\s+", " ", text)
    # (1.23) -> -1.23
    text = re.sub(r"\(\s*(\d+(?:\.\d+)?)\s*\)", r"-\1", text)
    return text

# A signed decimal number wrapped in its own capturing group. Besides "1",
# "-0.34" and "0.78" it also accepts a bare leading decimal point (".45");
# without that alternative "$.45" was captured as "45".
NUMBER = r"(-?(?:\d+(?:\.\d+)?|\.\d+))"

# Group layout relied upon by extract_eps_from_text:
#   "basic" / "diluted"  -> the number is group 2 (group 1 is the label text)
#   all other patterns   -> the number is group 1
PATTERNS = {
    "basic": re.compile(rf"(basic(?:\s+and\s+diluted)?\s+(?:earnings|loss)\s+per\s+share.*?)({NUMBER})", re.I),
    "diluted": re.compile(rf"(diluted\s+(?:earnings|loss)\s+per\s+share.*?){NUMBER}", re.I),
    "per_basic": re.compile(rf"({NUMBER})\s+per\s+basic\s+share", re.I),
    "per_diluted": re.compile(rf"({NUMBER})\s+per\s+diluted\s+share", re.I),
    # Allow up to 80 chars between "earnings"/"loss" and the number. The
    # optional "$" lets prose such as "earnings of $0.45 per share" match; the
    # previous form required whitespace directly before the digits.
    "generic": re.compile(rf"(?:earnings|loss).{{0,80}}?\s\$?\s*({NUMBER})\s+per\s+share", re.I),
}

# tie-breaker: prefer basic over diluted when scores tie
PREF_MAP = {"basic": 3, "basic_and_diluted": 2, "diluted": 1}

def score_candidate(val: float, context: str):
    """Assign a score and canonical label based on GAAP-ish signals.

    Args:
        val: The numeric value captured by a pattern.
        context: Text window around the match; compared case-insensitively.

    Returns:
        A ``(score, value, label)`` tuple. ``value`` is ``val`` with its sign
        flipped to negative when the context mentions "loss" and ``val`` is
        positive. ``label`` is "basic", "diluted", "basic_and_diluted" or
        "unknown" when none of those words appear.
    """
    score = 0.0
    label = "unknown"
    c = context.lower()

    # Preference: basic > (basic and diluted) > diluted.
    # The three checks are cumulative; a later one overrides the label.
    if "basic" in c and "diluted" not in c:
        score += 5
        label = "basic"
    if "diluted" in c:
        score += 3
        label = "diluted"
    if "basic and diluted" in c or "diluted and basic" in c:
        score += 4
        label = "basic_and_diluted"

    # Demote adjusted / non-GAAP
    if "adjusted" in c or "non-gaap" in c or "non gaap" in c:
        score -= 3

    # Slight bump for explicit GAAP mentions. "non-gaap" contains "gaap", so
    # those spellings are removed first; otherwise a non-GAAP context would
    # also collect this bonus.
    if "gaap" in c.replace("non-gaap", "").replace("non gaap", ""):
        score += 1

    # Small bump for quarterly language ("quarter" already covers
    # "first quarter" ... "fourth quarter").
    if any(q in c for q in ("quarter", "q1", "q2", "q3", "q4")):
        score += 1

    # If explicitly a loss and value is positive, flip sign
    if "loss" in c and val > 0:
        val = -val

    return score, val, label

def extract_eps_from_text(text: str):
    """Return the best ``(eps_value, note_label)`` found in ``text``, or ``(None, None)``."""
    group_two_tags = ("basic", "diluted")                    # number sits in group 2
    keyword_tags = ("generic", "per_basic", "per_diluted")   # need earnings/loss nearby

    candidates = []
    for tag, pattern in PATTERNS.items():
        for match in pattern.finditer(text):
            # 160 characters of context on either side of the match; used to
            # spot GAAP vs adjusted, quarter wording and loss/earnings.
            left = max(0, match.start() - 160)
            window = text[left: match.end() + 160]

            # Drop layout noise like "12pt".
            if re.search(r"\b\d+\s*pt\b", window, re.I):
                continue

            window_lc = window.lower()
            mentions_loss = "loss" in window_lc
            if tag in keyword_tags and not ("earnings" in window_lc or mentions_loss):
                continue

            captured = match.group(2) if tag in group_two_tags else match.group(1)
            value = float(captured)

            # Magnitudes above 10 are only accepted in a loss context.
            if abs(value) > 10 and not mentions_loss:
                continue

            score, signed_value, label = score_candidate(value, window)
            if label == "unknown":
                label = tag
            candidates.append((score, signed_value, label, window))

    if not candidates:
        return None, None

    # Highest score wins; PREF_MAP breaks ties (basic > basic_and_diluted >
    # diluted) and the earliest candidate is kept when both agree.
    winner = max(candidates, key=lambda cand: (cand[0], PREF_MAP.get(cand[2], 0)))
    return round(winner[1], 2), winner[2]

# -------- Run on all discovered files --------
rows = []
for path in files:
    fname = os.path.basename(path)
    # Start from an empty record and fill in whatever could be extracted.
    row = {"filename": fname, "EPS": "", "note": ""}
    try:
        txt = load_html_text(path)
        eps, note = extract_eps_from_text(txt)
    except Exception as e:
        # Keep going: the failure is recorded in the note column.
        row["note"] = f"error: {e}"
    else:
        if eps is not None:
            row["EPS"] = f"{eps:.2f}"
        row["note"] = note or ""
    rows.append(row)

# Save CSV and show a preview
df = pd.DataFrame(rows)
df.to_csv(OUTPUT_CSV, index=False)
print(f"Wrote: {os.path.abspath(OUTPUT_CSV)}  |  Rows: {len(df)}")
display(df.head(20))


Discovered 50 files under c:\Users\karan\Documents\Projects\Trexquant\Training_Filings
Wrote: c:\Users\karan\Documents\Projects\Trexquant\final_output.csv  |  Rows: 50


Unnamed: 0,filename,EPS,note
0,0000004977-20-000054.html,0.78,diluted
1,0000008947-20-000044.html,-0.34,diluted
2,0000046080-20-000050.html,-0.51,diluted
3,0000066570-20-000013.html,1.11,diluted
4,0000314808-20-000062.html,,
5,0000706129-20-000012.html,-8.0,diluted
6,0000846617-20-000024.html,-0.47,diluted
7,0000874766-20-000033.html,0.74,diluted
8,0000875320-20-000014.html,,
9,0000892537-20-000010.html,0.71,diluted


In [9]:
# Single-cell EDGAR EPS parser (fixed)
# - Recursive file discovery (.html/.htm, any case)
# - Quarter-targeted extraction (prefers Q EPS over full-year lines)
# - Strong preference for BASIC over DILUTED
# - Demotes adjusted / non-GAAP
# - Outputs: final_output.csv (the path set in OUTPUT_CSV below)

# If BeautifulSoup isn't installed:
# !pip install beautifulsoup4 lxml

import os, re, glob, csv, html
from bs4 import BeautifulSoup
import pandas as pd
from IPython.display import display

# ====== SETTINGS ======
INPUT_DIR = './Training_Filings/'
OUTPUT_CSV = './final_output.csv'

# ====== DISCOVER FILES (recursive; case-insensitive) ======
# Glob everything once and filter on the lower-cased name. Listing the four
# casings "*.html", "*.htm", "*.HTML", "*.HTM" missed mixed-case names such as
# "x.Html" on case-sensitive file systems.
patterns = ["**/*"]
HTML_SUFFIXES = (".html", ".htm")
# The set removes duplicates; sorting makes the run order deterministic.
files = sorted({
    candidate
    for pattern in patterns
    for candidate in glob.glob(os.path.join(INPUT_DIR, pattern), recursive=True)
    if candidate.lower().endswith(HTML_SUFFIXES) and os.path.isfile(candidate)
})
print(f"Discovered {len(files)} files under {os.path.abspath(INPUT_DIR)}")
if not files:
    raise FileNotFoundError("No .html/.htm files found. Double-check INPUT_DIR.")

# ====== HELPERS ======
def load_html_text(path: str) -> str:
    """Load HTML, strip tags to text, normalize whitespace, convert bracketed negatives like (0.41)->-0.41.

    Args:
        path: Path of the HTML file to load (read as UTF-8; undecodable bytes
            are dropped).

    Returns:
        The document text on a single line.
    """
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        raw = f.read()
    # Parse the markup as written. BeautifulSoup decodes character entities
    # itself; un-escaping *before* parsing would turn "&lt;" / "&gt;" into
    # real angle brackets and let escaped text be parsed as tags.
    soup = BeautifulSoup(raw, "lxml")   # robust; if unavailable, switch to "html.parser"
    text = soup.get_text(" ", strip=True)
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"\(\s*(\d+(?:\.\d+)?)\s*\)", r"-\1", text)
    return text

# A signed decimal number wrapped in its own capturing group. Besides "1",
# "-0.34" and "0.78" it also accepts a bare leading decimal point (".45");
# without that alternative "$.45" was captured as "45".
NUMBER = r"(-?(?:\d+(?:\.\d+)?|\.\d+))"

# Quarter phrases we want to anchor on. The trailing lazy ".*?" is the last
# item in the pattern, so it always matches the empty string: each match spans
# only the "<ordinal> quarter" words.
QUARTER_BLOCK = re.compile(r"((?:first|second|third|fourth)\s+quarter.*?)", re.I)

# EPS patterns (general). Group layout relied upon by extract_eps_from_text:
#   "basic" / "diluted"  -> the number is group 2 (group 1 is the label text)
#   all other patterns   -> the number is group 1
PATTERNS = {
    "basic": re.compile(rf"(basic(?:\s+and\s+diluted)?\s+(?:earnings|loss)\s+per\s+share.*?)({NUMBER})", re.I),
    "diluted": re.compile(rf"(diluted\s+(?:earnings|loss)\s+per\s+share.*?){NUMBER}", re.I),
    "per_basic": re.compile(rf"({NUMBER})\s+per\s+basic\s+share", re.I),
    "per_diluted": re.compile(rf"({NUMBER})\s+per\s+diluted\s+share", re.I),
    # Up to 120 characters may separate "earnings"/"loss" from the number. The
    # optional "$" lets prose such as "earnings of $0.45 per share" match; the
    # previous form required whitespace directly before the digits.
    "generic": re.compile(rf"(?:earnings|loss).{{0,120}}?\s\$?\s*({NUMBER})\s+per\s+share", re.I),
}

# tie-breaker weight: prefer basic > (basic_and_diluted) > diluted
PREF_MAP = {"basic": 6, "basic_and_diluted": 3, "diluted": 1}

def score_candidate(val: float, context: str, base_label: str):
    """Score a candidate number based on GAAP-ish signals; return (score, adjusted_val, label).

    Args:
        val: The numeric value captured for this candidate.
        context: Text window around the candidate; compared case-insensitively.
        base_label: Label to report when the context contains neither
            "basic" nor "diluted".

    Returns:
        ``(score, value, label)``. ``value`` is ``val`` with its sign flipped
        to negative when the context mentions "loss" and ``val`` is positive.
    """
    score = 0.0
    label = base_label
    c = context.lower()

    # Strong preference for BASIC over DILUTED.
    # The three checks are cumulative; a later one overrides the label.
    if "basic" in c and "diluted" not in c:
        score += 7
        label = "basic"
    if "diluted" in c:
        score += 2
        label = "diluted"
    if "basic and diluted" in c or "diluted and basic" in c:
        score += 5
        label = "basic_and_diluted"

    # Demote adjusted / non-GAAP
    if "adjusted" in c or "non-gaap" in c or "non gaap" in c:
        score -= 5

    # Small bump for explicit GAAP. "non-gaap" contains "gaap", so those
    # spellings are removed first; otherwise a non-GAAP context would also
    # collect this bonus.
    if "gaap" in c.replace("non-gaap", "").replace("non gaap", ""):
        score += 1
    # Bump for quarter language ("quarter" already covers "first quarter" ...
    # "fourth quarter").
    if any(q in c for q in ("quarter", "q1", "q2", "q3", "q4")):
        score += 2

    # Flip sign if explicitly a loss and value is positive
    if "loss" in c and val > 0:
        val = -val

    return score, val, label

def nearest_number_to_phrase(text, phrase_span, window=220):
    """Find the EPS-like number closest to a quarter phrase.

    A window of ``window`` characters on each side of the phrase is searched.
    "<number> per (diluted|basic) share" hits take priority; only when none
    exists is "earnings/loss ... <number> per share" considered. Within the
    winning pattern the hit whose number lies closest to the phrase is
    returned. Previously the left-most hit in the window was returned, even
    when another hit sat right next to the phrase.

    Args:
        text: Full document text.
        phrase_span: ``(start, end)`` offsets of the phrase inside ``text``.
        window: Number of characters of context kept on each side.

    Returns:
        ``(value, ctx)`` where ``value`` is the number as a float, or ``None``
        when neither pattern matches; ``ctx`` is the searched window.
    """
    start, end = phrase_span
    offset = max(0, start - window)
    ctx = text[offset: min(len(text), end + window)]
    # Position of the phrase in ctx coordinates.
    phrase_start, phrase_end = start - offset, end - offset

    def _gap(match):
        """Characters between the captured number and the phrase (0 if they touch)."""
        num_start, num_end = match.span(1)
        if num_end <= phrase_start:
            return phrase_start - num_end
        if num_start >= phrase_end:
            return num_start - phrase_end
        return 0

    tiers = (
        # Prefer '... per (diluted|basic) share' near the quarter phrase
        rf"{NUMBER}\s+per\s+(?:diluted|basic)\s+share",
        # Otherwise accept 'earnings/loss ... per share'
        rf"(?:earnings|loss).{{0,80}}?\s({NUMBER})\s+per\s+share",
    )
    for pattern in tiers:
        hits = list(re.finditer(pattern, ctx, re.I))
        if hits:
            closest = min(hits, key=_gap)  # on equal distance the earlier hit wins
            return float(closest.group(1)), ctx
    return None, ctx

def extract_eps_from_text(text: str):
    """Collect EPS candidates from ``text`` and return the best one.

    Two passes feed one candidate list: numbers found around quarter phrases
    (these get 4 extra points) and matches of the general PATTERNS.

    Returns:
        ``(eps, label)`` with ``eps`` rounded to two decimals, or
        ``(None, None)`` when no candidate survives the filters.
    """
    group_two_tags = ("basic", "diluted")                    # number sits in group 2
    keyword_tags = ("generic", "per_basic", "per_diluted")   # need earnings/loss nearby
    candidates = []

    # Pass 1: quarter-anchored numbers.
    for quarter_match in QUARTER_BLOCK.finditer(text):
        value, window = nearest_number_to_phrase(text, quarter_match.span(), window=220)
        if value is None:
            continue
        # Magnitudes above 10 are only accepted in a loss context.
        if abs(value) > 10 and "loss" not in window.lower():
            continue
        score, signed_value, label = score_candidate(value, window, base_label="quarter")
        candidates.append((score + 4, signed_value, label, window))

    # Pass 2: general patterns.
    for tag, pattern in PATTERNS.items():
        for match in pattern.finditer(text):
            # 180 characters of context on either side of the match.
            left = max(0, match.start() - 180)
            window = text[left: match.end() + 180]

            # Drop formatting noise like "12pt".
            if re.search(r"\b\d+\s*pt\b", window, re.I):
                continue

            window_lc = window.lower()
            mentions_loss = "loss" in window_lc
            if tag in keyword_tags and not ("earnings" in window_lc or mentions_loss):
                continue

            captured = match.group(2) if tag in group_two_tags else match.group(1)
            value = float(captured)

            if abs(value) > 10 and not mentions_loss:
                continue

            score, signed_value, label = score_candidate(value, window, base_label=tag)
            candidates.append((score, signed_value, label, window))

    if not candidates:
        return None, None

    # Highest score wins; PREF_MAP breaks ties (basic > basic_and_diluted >
    # diluted) and the earliest candidate is kept when both agree.
    winner = max(candidates, key=lambda cand: (cand[0], PREF_MAP.get(cand[2], 0)))
    return round(winner[1], 2), winner[2]

# ====== RUN ======
rows = []
for path in files:
    fname = os.path.basename(path)
    # Start from an empty record and fill in whatever could be extracted.
    row = {"filename": fname, "eps": "", "note": ""}
    try:
        txt = load_html_text(path)
        eps, note = extract_eps_from_text(txt)
    except Exception as e:
        # Keep going: the failure is recorded in the note column.
        row["note"] = f"error: {e}"
    else:
        if eps is not None:
            row["eps"] = f"{eps:.2f}"
        row["note"] = note or ""
    rows.append(row)

# Persist the table and preview the first 25 rows.
df = pd.DataFrame(rows)
df.to_csv(OUTPUT_CSV, index=False)
print(f"Wrote: {os.path.abspath(OUTPUT_CSV)} | Rows: {len(df)}")
display(df.head(25))


Discovered 50 files under c:\Users\karan\Documents\Projects\Trexquant\Training_Filings
Wrote: c:\Users\karan\Documents\Projects\Trexquant\final_output.csv | Rows: 50


Unnamed: 0,filename,eps,note
0,0000004977-20-000054.html,0.78,diluted
1,0000008947-20-000044.html,0.34,diluted
2,0000046080-20-000050.html,-0.51,diluted
3,0000066570-20-000013.html,1.11,diluted
4,0000314808-20-000062.html,,
5,0000706129-20-000012.html,-8.0,diluted
6,0000846617-20-000024.html,0.47,diluted
7,0000874766-20-000033.html,0.74,diluted
8,0000875320-20-000014.html,,
9,0000892537-20-000010.html,0.71,diluted


In [1]:
from bs4 import BeautifulSoup

In [4]:
import os

def dump_html_to_txt(html_files, output_file):
    """
    Reads content from a list of HTML files and dumps it into a single text file.

    For every readable file a ``filename = <path>`` header is written, followed
    by the raw file content wrapped in double quotes. Missing or unreadable
    files are reported on stdout and skipped; the closing summary only claims
    full success when nothing was skipped.

    Args:
        html_files (list): A list of strings, where each string is a path to an HTML file.
        output_file (str): The path to the text file where content will be saved.
    """
    dumped = 0
    skipped = 0
    try:
        with open(output_file, 'w', encoding='utf-8') as outfile:
            for html_file in html_files:
                if not os.path.exists(html_file):
                    print(f"Warning: File not found at '{html_file}'. Skipping.")
                    skipped += 1
                    continue

                # Only the read is guarded here, so a failure while writing is
                # not misreported as a read error (the outer handler covers it).
                try:
                    with open(html_file, 'r', encoding='utf-8') as infile:
                        content = infile.read()
                except Exception as e:
                    print(f"Error reading file '{html_file}': {e}")
                    skipped += 1
                    continue

                outfile.write(f"filename = {html_file}\n\n")
                outfile.write(f'"{content}"\n\n\n\n')
                dumped += 1
                print(f"Successfully processed and dumped content from '{html_file}'.")

        if skipped:
            print(f"\nDumped {dumped} file(s) to '{output_file}'; {skipped} skipped or failed.")
        else:
            print(f"\nAll content successfully dumped to '{output_file}'.")

    except IOError as e:
        print(f"Error opening or writing to the output file '{output_file}': {e}")

if __name__ == '__main__':
    # List of HTML files you want to process.
    # Add your HTML file paths to this list.
    files_to_process = [
        "./Training_Filings/0000046080-20-000050.html",
        "./Training_Filings/0000874766-20-000033.html",
        "./Training_Filings/0000875320-20-000014.html",
        "./Training_Filings/0001008654-20-000048.html",
        "./Training_Filings/0001104659-20-052792.html",
        # A stray trailing space inside this literal was removed: whether such
        # a path resolves depends on the platform, so the entry was fragile.
        "./Training_Filings/0001140361-20-010070.html",
        "./Training_Filings/0001157523-20-000597.html",
        "./Training_Filings/0001165002-20-000083.html",
        "./Training_Filings/0001171843-20-003035.html",
        "./Training_Filings/0001193125-20-124288.html",
        "./Training_Filings/0001193125-20-126089.html",
        "./Training_Filings/0001193125-20-126683.html",
        "./Training_Filings/0001373715-20-000098.html",
        "./Training_Filings/0001423689-20-000040.html",
        "./Training_Filings/0001564590-20-019431.html",
        "./Training_Filings/0001564590-20-019442.html",
        "./Training_Filings/0001576427-20-000032.html",
        "./Training_Filings/0001620459-20-000067.html",
        "./Training_Filings/0001722482-20-000089.html",
    ]

    # The name of the text file where you want to dump the content.
    destination_file = 'output.txt'

    dump_html_to_txt(files_to_process, destination_file)


Successfully processed and dumped content from './Training_Filings/0000046080-20-000050.html'.
Successfully processed and dumped content from './Training_Filings/0000874766-20-000033.html'.
Successfully processed and dumped content from './Training_Filings/0000875320-20-000014.html'.
Successfully processed and dumped content from './Training_Filings/0001008654-20-000048.html'.
Successfully processed and dumped content from './Training_Filings/0001104659-20-052792.html'.
Successfully processed and dumped content from './Training_Filings/0001140361-20-010070.html '.
Successfully processed and dumped content from './Training_Filings/0001157523-20-000597.html'.
Successfully processed and dumped content from './Training_Filings/0001165002-20-000083.html'.
Successfully processed and dumped content from './Training_Filings/0001171843-20-003035.html'.
Successfully processed and dumped content from './Training_Filings/0001193125-20-124288.html'.
Successfully processed and dumped content from '.

In [1]:
import os

def dump_html_to_txt(html_files, output_file):
    """
    Reads content from a list of HTML files and dumps it into a single text file.

    For every readable file a ``filename = <path>`` header is written, followed
    by the raw file content wrapped in double quotes. Missing or unreadable
    files are reported on stdout and skipped; the closing summary only claims
    full success when nothing was skipped.

    Args:
        html_files (list): A list of strings, where each string is a path to an HTML file.
        output_file (str): The path to the text file where content will be saved.
    """
    dumped = 0
    skipped = 0
    try:
        with open(output_file, 'w', encoding='utf-8') as outfile:
            for html_file in html_files:
                if not os.path.exists(html_file):
                    print(f"Warning: File not found at '{html_file}'. Skipping.")
                    skipped += 1
                    continue

                # Only the read is guarded here, so a failure while writing is
                # not misreported as a read error (the outer handler covers it).
                try:
                    with open(html_file, 'r', encoding='utf-8') as infile:
                        content = infile.read()
                except Exception as e:
                    print(f"Error reading file '{html_file}': {e}")
                    skipped += 1
                    continue

                outfile.write(f"filename = {html_file}\n\n")
                outfile.write(f'"{content}"\n\n\n\n')
                dumped += 1
                print(f"Successfully processed and dumped content from '{html_file}'.")

        if skipped:
            print(f"\nDumped {dumped} file(s) to '{output_file}'; {skipped} skipped or failed.")
        else:
            print(f"\nAll content successfully dumped to '{output_file}'.")

    except IOError as e:
        print(f"Error opening or writing to the output file '{output_file}': {e}")

if __name__ == '__main__':
    # The folder containing the HTML files you want to process.
    # All files ending with .html or .htm (any letter case) in this folder
    # will be processed.
    source_folder = './Training_Filings/'

    # The name of the text file where you want to dump the content.
    destination_file = 'output_Files.txt'

    # Check if the source folder exists
    if not os.path.isdir(source_folder):
        print(f"Error: Source folder '{source_folder}' not found.")
        print("Please create the folder and place your HTML files inside it.")
    else:
        # os.listdir returns names in arbitrary order, so sort them to make
        # the layout of the dump reproducible from run to run.
        files_to_process = [
            os.path.join(source_folder, f)
            for f in sorted(os.listdir(source_folder))
            if f.lower().endswith(('.html', '.htm'))
        ]

        if not files_to_process:
            print(f"No HTML files found in '{source_folder}'.")
        else:
            dump_html_to_txt(files_to_process, destination_file)

Successfully processed and dumped content from './Training_Filings/0000004977-20-000054.html'.
Successfully processed and dumped content from './Training_Filings/0000008947-20-000044.html'.
Successfully processed and dumped content from './Training_Filings/0000046080-20-000050.html'.
Successfully processed and dumped content from './Training_Filings/0000066570-20-000013.html'.
Successfully processed and dumped content from './Training_Filings/0000314808-20-000062.html'.
Successfully processed and dumped content from './Training_Filings/0000706129-20-000012.html'.
Successfully processed and dumped content from './Training_Filings/0000846617-20-000024.html'.
Successfully processed and dumped content from './Training_Filings/0000874766-20-000033.html'.
Successfully processed and dumped content from './Training_Filings/0000875320-20-000014.html'.
Successfully processed and dumped content from './Training_Filings/0000892537-20-000010.html'.
Successfully processed and dumped content from './