In [None]:
import os
import re
import requests
import pandas as pd
from datetime import datetime
import time

# --- Notebook configuration -------------------------------------------------
# NOTE(review): the two paths below are absolute, machine-specific Windows
# paths; the notebook will not run elsewhere until they are adjusted.

# Excel workbook that holds the list of tickers to process.
TICKER_SOURCE_PATH = r"C:\Users\surji\Desktop\Quant_Poject\CompanyList_File_Tickers.xlsx"
# Directory the downloaded filings are written to.
DOWNLOAD_DIR = r"C:\Users\surji\Desktop\Quant_Poject\Downloaded Filings"
# Sent as the 'User-Agent' header on every request made to sec.gov below.
# NOTE(review): this looks like a placeholder value — replace with real
# application/contact details before running.
USER_AGENT = "MyAppName your_email@example.com"
# Name of the column in the ticker workbook that contains the ticker symbols.
TICKER_COLUMN = "Ticker" 
# Path of an Excel output file inside DOWNLOAD_DIR (not used in the cells shown here).
OUTPUT_FILE = os.path.join(DOWNLOAD_DIR, "Parsed_DATA.xlsx")

def read_tickers_from_excel(path, column_name):
    """Read the unique, non-empty ticker symbols from one column of an Excel file.

    Values are converted to strings and stripped of surrounding whitespace so
    that cells such as ``"AAPL "`` and ``"AAPL"`` collapse to a single entry,
    and cells that are missing or blank are skipped.

    Args:
        path: Path of the Excel workbook, passed to ``pandas.read_excel``.
        column_name: Name of the column that holds the ticker symbols.

    Returns:
        list[str]: Unique tickers in order of first appearance.

    Raises:
        KeyError: If ``column_name`` is not a column of the workbook.
    """
    df = pd.read_excel(path)
    # Normalise to stripped strings: downstream lookups call str methods on
    # each ticker and compare against exact symbols.
    tickers = df[column_name].dropna().astype(str).str.strip()
    # Drop cells that contained only whitespace.
    return tickers[tickers != ""].unique().tolist()
# Load the ticker list once, when this cell runs (reads the Excel workbook).
TICKERS = read_tickers_from_excel(TICKER_SOURCE_PATH, TICKER_COLUMN)
# Make sure the download directory exists; exist_ok=True keeps re-runs of the cell safe.
os.makedirs(DOWNLOAD_DIR, exist_ok=True)

In [None]:

def get_cik_mapping(timeout=30):
    """Download the SEC ticker list and return it as a DataFrame.

    Fetches ``company_tickers.json`` from sec.gov, upper-cases the ``ticker``
    column and left-pads ``cik_str`` with zeros to 10 characters.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        pandas.DataFrame: One row per entry of the JSON document, with at
        least the ``ticker`` and ``cik_str`` columns normalised as above.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    url = "https://www.sec.gov/files/company_tickers.json"
    resp = requests.get(url, headers={'User-Agent': USER_AGENT}, timeout=timeout)
    resp.raise_for_status()
    # The JSON is loaded with its top-level keys as the row index.
    df = pd.DataFrame.from_dict(resp.json(), orient='index')
    df['ticker'] = df['ticker'].str.upper()
    # Zero-pad to 10 characters, the width used in the submissions URL.
    df['cik_str'] = df['cik_str'].astype(str).str.zfill(10)
    return df

def find_cik(ticker, cik_df):
    """Look up the CIK and company title for a ticker symbol.

    Matching is attempted in this order:
      1. exact match on the upper-cased ticker with '.' replaced by '-',
      2. exact match on the upper-cased ticker as given,
      3. first row whose ticker starts with the part of ``ticker`` before
         the first '.', reported with a printed warning.

    Args:
        ticker: Ticker symbol to resolve (case-insensitive).
        cik_df: DataFrame with ``ticker``, ``cik_str`` and ``title`` columns.

    Returns:
        tuple: ``(cik_str, title)`` of the matched row, or ``(None, None)``
        when nothing matches (a message is printed in that case).
    """
    ticker_upper = ticker.upper()

    # Exact matches: dash-normalised form first, then the raw symbol.
    for candidate in (ticker_upper.replace('.', '-'), ticker_upper):
        row = cik_df[cik_df['ticker'] == candidate]
        if not row.empty:
            return row.iloc[0]['cik_str'], row.iloc[0]['title']

    prefix = ticker.split('.')[0].upper()
    # An empty prefix would make startswith() match every row and silently
    # return the first company in the table, so the fallback is skipped.
    if prefix:
        # na=False keeps rows with a missing ticker out of the mask instead of
        # producing NaN entries that cannot be used for boolean indexing.
        partial_matches = cik_df[cik_df['ticker'].str.startswith(prefix, na=False)]
        if not partial_matches.empty:
            best_row = partial_matches.iloc[0]
            print(f"\u26a0\ufe0f Auto-fallback matched '{ticker}' → '{best_row['ticker']}' ({best_row['title']})")
            return best_row['cik_str'], best_row['title']

    print(f"\u274c No CIK for {ticker}")
    return None, None


Downloading the latest filing (10-K, 10-Q, 20-F, 6-K)

In [None]:
def download_latest_filing(cik, timeout=30):
    """Download the primary document of a company's preferred recent filing.

    Reads the company's submissions JSON from data.sec.gov and picks the
    first listed filing whose form type is, in order of preference, 10-K,
    20-F, 10-Q or 6-K. A lower-priority form is only used when no filing of
    any higher-priority form appears in the ``recent`` list.

    Args:
        cik: CIK as used in the submissions URL (``CIK{cik}.json``); it must
            also be convertible with ``int()`` for the archive URL.
        timeout: Seconds to wait for each HTTP response. Without a timeout
            ``requests`` can block indefinitely on a stalled connection.

    Returns:
        tuple: ``(text, form, filing_date)`` — the document body as text, the
        form type that was matched, and the filing date string from the JSON.

    Raises:
        requests.HTTPError: If either request returns an error status.
        requests.Timeout: If a response does not arrive within ``timeout``.
        Exception: If none of the preferred form types is present.
    """
    headers = {'User-Agent': USER_AGENT}
    url = f"https://data.sec.gov/submissions/CIK{cik}.json"
    resp = requests.get(url, headers=headers, timeout=timeout)
    resp.raise_for_status()
    filings = resp.json()['filings']['recent']
    forms = filings['form']

    form_priority = ['10-K', '20-F', '10-Q', '6-K']

    for form_type in form_priority:
        if form_type not in forms:
            continue
        # First occurrence in the list, i.e. the same entry the original
        # index scan would have stopped at.
        i = forms.index(form_type)
        # The archive path uses the accession number without dashes and the
        # CIK without leading zeros.
        accession = filings['accessionNumber'][i].replace('-', '')
        doc = filings['primaryDocument'][i]
        filing_date = filings['filingDate'][i]
        file_url = f"https://www.sec.gov/Archives/edgar/data/{int(cik)}/{accession}/{doc}"
        print(f"📥 Downloading {form_type} on {filing_date}")
        r = requests.get(file_url, headers=headers, timeout=timeout)
        r.raise_for_status()
        return r.text, form_type, filing_date
    raise Exception("No suitable filing found")

Saving the downloaded filing to disk

In [None]:

def save_filing(content, ticker, form, date):
    """Write a downloaded filing to DOWNLOAD_DIR as a UTF-8 HTML file.

    The file is named ``{ticker}_{form}_{date without dashes}.html``.
    Characters that are not valid in file names (for example the '/' in a
    form type such as ``10-K/A``) are replaced with '_' so the name can never
    point into another directory.

    Args:
        content: Text of the filing.
        ticker: Ticker symbol used as the file name prefix.
        form: Form type of the filing (e.g. ``10-K``).
        date: Filing date string; dashes are removed for the file name.

    Returns:
        str: Full path of the file that was written.
    """
    raw_name = f"{ticker}_{form}_{date.replace('-', '')}.html"
    # Replace path separators and other characters Windows rejects in names.
    filename = re.sub(r'[\\/:*?"<>|]', '_', raw_name)
    path = os.path.join(DOWNLOAD_DIR, filename)
    # newline='' disables newline translation so the document's own line
    # endings are written unchanged (otherwise '\r\n' becomes '\r\r\n' on
    # Windows).
    with open(path, 'w', encoding='utf-8', newline='') as f:
        f.write(content)
    return path