In [2]:
import os
import re
from bs4 import BeautifulSoup
import pandas as pd

In [4]:
# Folder that holds the HTML filings to parse. The path is relative, so it is
# resolved against the notebook's current working directory.
input_dir = './Training_Filings/'

# CSV file that the extracted per-file results are written to.
output_file = './final_output.csv'

In [9]:
import re

def _parse_eps_cell(cell_text, number_regex):
    """Extract a signed number from the text of a single table cell.

    Args:
        cell_text: Stripped text of the cell.
        number_regex: Compiled pattern whose group 1 captures the digits.

    Returns:
        The value as a float, or None when the cell holds no usable number
        (no match, a bare "0" placeholder, or text such as "-" that matches
        the pattern but is not a valid float).
    """
    match = number_regex.search(cell_text)
    if match is None or cell_text == "0":  # Exclude placeholder zeros
        return None
    try:
        value = float(match.group(1).replace(',', ''))
    except ValueError:
        return None
    # Accounting notation marks negatives with an opening parenthesis *before*
    # the digits. Checking only the text ahead of the number handles a closing
    # ")" that sits in a separate cell, and avoids negating values that are
    # merely followed by a parenthesised footnote marker such as "1.25 (1)".
    if '(' in cell_text[:match.start(1)]:
        value = -abs(value)
    return value


def _score_eps_row(row_text):
    """Score a lower-cased row label; higher means a more likely EPS row."""
    score = 0
    if 'basic' in row_text:
        score += 10
    if 'diluted' in row_text:
        score -= 10
    # Only reward 'gaap' when it appears outside of 'non-gaap'; a plain
    # substring test would also reward the rows penalised on the next line.
    if 'gaap' in row_text.replace('non-gaap', ''):
        score += 5
    if 'non-gaap' in row_text or 'adjusted' in row_text:
        score -= 10
    if 'net' in row_text:
        score += 5
    if 'earnings' in row_text:
        score += 2
    if 'loss' in row_text:
        score += 1
    return score


def find_best_eps(soup):
    """Pick the most plausible earnings-per-share value from a parsed filing.

    Every <tr> whose text mentions 'per share' is inspected. The first cell
    containing 'earnings' or 'loss' is treated as the label, and the first
    parseable number in the cells after it becomes that row's candidate value.
    Candidates are ranked by keyword score and the highest-scoring one wins;
    on a tie the earliest row in document order is returned.

    Args:
        soup: Parsed document exposing ``find_all('tr')``; each row must
            support ``get_text`` and ``find_all(['td', 'th'])``.

    Returns:
        The selected value as a float, or None if no candidate row was found.
    """
    number_regex = re.compile(r'\(?\$?([\d,.-]+)\)?')
    candidates = []

    for row in soup.find_all('tr'):
        row_text = row.get_text(separator=' ', strip=True).lower()
        if 'per share' not in row_text:
            continue

        cells = row.find_all(['td', 'th'])

        # Index of the first cell that carries the descriptive keywords.
        keyword_cell_index = -1
        for i, cell in enumerate(cells):
            label = cell.get_text(strip=True).lower()
            if 'earnings' in label or 'loss' in label:
                keyword_cell_index = i
                break
        if keyword_cell_index == -1:
            continue

        # A label such as "Earnings (loss) per share" covers both signs, so the
        # word 'loss' alone must not flip a positive figure. Force a negative
        # only when the row talks about a loss and nothing else.
        loss_only_label = (
            'loss' in row_text
            and 'earnings' not in row_text
            and 'income' not in row_text
        )

        # Take the first usable number that follows the label cell.
        for cell in cells[keyword_cell_index + 1:]:
            value = _parse_eps_cell(cell.get_text(strip=True), number_regex)
            if value is None:
                continue
            if loss_only_label and value > 0:
                value = -value
            candidates.append({'score': _score_eps_row(row_text), 'value': value})
            break

    if not candidates:
        return None

    best_candidate = max(candidates, key=lambda c: c['score'])
    return best_candidate['value']

In [10]:
results = []
print(f"Starting processing for files in '{input_dir}'...")

# os.listdir() returns entries in arbitrary order; sort them so the row order
# of the results (and any row index quoted later) is the same on every run.
for filename in sorted(os.listdir(input_dir)):
    if not filename.endswith(".html"):
        continue

    file_path = os.path.join(input_dir, filename)

    # errors='ignore' drops bytes that are not valid UTF-8 instead of raising.
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        html_content = f.read()

    soup = BeautifulSoup(html_content, 'lxml')
    eps = find_best_eps(soup)

    # Files without any candidate row are recorded with the 'N/A' placeholder.
    results.append({'filename': filename, 'EPS': eps if eps is not None else 'N/A'})

print("Processing complete.")

Starting processing for files in './Training_Filings/'...
Processing complete.


In [11]:
# Build a DataFrame from the per-file result dicts collected in `results`
df = pd.DataFrame(results)

# Show the first 10 rows as this cell's output
df.head(10)

Unnamed: 0,filename,EPS
0,0000004977-20-000054.html,0.78
1,0000008947-20-000044.html,-10643.0
2,0000046080-20-000050.html,
3,0000066570-20-000013.html,
4,0000314808-20-000062.html,-15.19
5,0000706129-20-000012.html,0.26
6,0000846617-20-000024.html,
7,0000874766-20-000033.html,1.34
8,0000875320-20-000014.html,
9,0000892537-20-000010.html,


In [12]:
# Write the DataFrame to `output_file`; index=False leaves the row index out of the CSV
df.to_csv(output_file, index=False)

# Report the output path and the number of rows in the DataFrame
print(f"Successfully created '{output_file}' with {len(df)} entries.")

Successfully created './final_output.csv' with 50 entries.


In [12]:
import pandas as pd

# Compare the extracted EPS values in final_output_v11.csv against Test_Entries.csv.
df1 = pd.read_csv('final_output_v11.csv')
df2 = pd.read_csv('Test_Entries.csv')

# indicator=True adds a `_merge` column recording whether each filename came
# from the left file only, the right file only, or both.
merged_df = pd.merge(
    df1, df2, on='filename', suffixes=('_file1', '_file2'), how='outer', indicator=True
)

# NaN != NaN evaluates to True, so rows where neither file has an EPS would be
# reported as mismatches; exclude them explicitly.
both_missing = merged_df['EPS_file1'].isna() & merged_df['EPS_file2'].isna()
differences = merged_df[(merged_df['EPS_file1'] != merged_df['EPS_file2']) & ~both_missing]

different_rows = differences[['filename', 'EPS_file1', 'EPS_file2']].rename(
    columns={'EPS_file1': 'EPS_final_output_v11', 'EPS_file2': 'EPS_Test_Entries'}
)

print("Rows with different EPS values:")
print(different_rows)

# Use the merge indicator rather than a null EPS: a filename present in both
# files with an empty EPS is not "present only in" one of them.
file1_only = merged_df[merged_df['_merge'] == 'left_only']
file2_only = merged_df[merged_df['_merge'] == 'right_only']

print("\nFilenames present only in final_output_v11.csv:")
print(file1_only[['filename', 'EPS_file1']])

print("\nFilenames present only in Test_Entries.csv:")
print(file2_only[['filename', 'EPS_file2']])

Rows with different EPS values:
                     filename  EPS_final_output_v11  EPS_Test_Entries
7   0000874766-20-000033.html                  0.64              0.74
11  0000939057-20-000186.html                  0.20              0.60
19  0001140361-20-010070.html                  0.38              0.34
21  0001157523-20-000597.html                  0.03             -0.03
22  0001157523-20-000599.html                  0.57              0.56
24  0001165002-20-000083.html                  0.13              0.05
25  0001171843-20-003035.html                 -6.79              0.72
26  0001193125-20-124288.html                  2.01             -0.24
28  0001193125-20-126089.html                  1.41              0.54
31  0001299709-20-000078.html                  0.92              0.91
33  0001373715-20-000098.html                 -3.00              0.24
34  0001423689-20-000040.html                  0.57             -4.46
39  0001564590-20-019431.html                  1.08       

In [2]:
import pandas as pd

# Compare the extracted EPS values in final_output_v4.csv against Test_Entries.csv.
df1 = pd.read_csv('final_output_v4.csv')
df2 = pd.read_csv('Test_Entries.csv')

# indicator=True adds a `_merge` column recording whether each filename came
# from the left file only, the right file only, or both.
merged_df = pd.merge(
    df1, df2, on='filename', suffixes=('_file1', '_file2'), how='outer', indicator=True
)

# NaN != NaN evaluates to True, so rows where neither file has an EPS would be
# reported as mismatches; exclude them explicitly.
both_missing = merged_df['EPS_file1'].isna() & merged_df['EPS_file2'].isna()
differences = merged_df[(merged_df['EPS_file1'] != merged_df['EPS_file2']) & ~both_missing]

different_rows = differences[['filename', 'EPS_file1', 'EPS_file2']].rename(
    columns={'EPS_file1': 'EPS_final_output_v4', 'EPS_file2': 'EPS_Test_Entries'}
)

print("Rows with different EPS values:")
print(different_rows)

# Use the merge indicator rather than a null EPS: a filename present in both
# files with an empty EPS is not "present only in" one of them.
file1_only = merged_df[merged_df['_merge'] == 'left_only']
file2_only = merged_df[merged_df['_merge'] == 'right_only']

print("\nFilenames present only in final_output_v4.csv:")
print(file1_only[['filename', 'EPS_file1']])

print("\nFilenames present only in Test_Entries.csv:")
print(file2_only[['filename', 'EPS_file2']])

Rows with different EPS values:
                     filename  EPS_final_output_v4  EPS_Test_Entries
2   0000046080-20-000050.html                 0.76             -0.51
7   0000874766-20-000033.html                 0.08              0.74
8   0000875320-20-000014.html                 2.29              2.32
13  0001008654-20-000048.html                 0.16             -0.16
15  0001104659-20-052792.html                -0.04             -0.03
19  0001140361-20-010070.html                 0.38              0.34
21  0001157523-20-000597.html                 0.03             -0.03
24  0001165002-20-000083.html                 0.13              0.05
25  0001171843-20-003035.html                -6.79              0.72
26  0001193125-20-124288.html                 1.96             -0.24
28  0001193125-20-126089.html                 1.41              0.54
29  0001193125-20-126683.html                 1.51              1.52
33  0001373715-20-000098.html                -3.00              0.24
34