Data Extraction

In [1]:
import os
import zipfile
from pathlib import Path
import pandas as pd
from kaggle.api.kaggle_api_extended import KaggleApi

In [2]:
# CONFIG
# Kaggle dataset identifier handed to the download call.
DATASET_SLUG = "rohanpanda80/us-sec-financial-statement-2021-2024"

# Notebook-wide pandas display settings: no truncation of columns or rows,
# and 30 decimal places when rendering floats.
for _option, _value in (
    ("display.max_columns", None),
    ("display.max_rows", None),
    ("display.precision", 30),
):
    pd.set_option(_option, _value)

In [3]:
# ENV DETECTION
def detect_env():
    """
    Report which runtime the notebook is executing in.

    Returns:
        "kaggle" when the KAGGLE_URL_BASE environment variable is set,
        "colab" when the `google.colab` package can be imported,
        "local" otherwise.
    """
    # The Kaggle check wins over the Colab probe.
    if os.environ.get("KAGGLE_URL_BASE") is not None:
        return "kaggle"

    try:
        import google.colab  # noqa
    except ImportError:
        return "local"
    return "colab"

In [4]:
def _find_quarter_root(base):
    """
    Return the folder at or directly under `base` that holds quarter subfolders, or None.

    Immediate subfolders of `base` are checked first (Kaggle often wraps the
    data in one), then `base` itself, which covers a flat layout where the
    quarter folders sit directly in `base`.
    """
    def has_quarter_dirs(folder):
        # A "quarter" subfolder is any directory whose name contains "q".
        return any(child.is_dir() and "q" in child.name for child in folder.iterdir())

    # Sorted so the choice is deterministic when several folders qualify.
    wrappers = sorted(d for d in base.iterdir() if d.is_dir())
    for candidate in [*wrappers, base]:
        if has_quarter_dirs(candidate):
            return candidate
    return None


def get_sec_root(dest_path="kdata"):
    """
    Returns the path to the root SEC dataset folder containing the quarter subfolders.
    If already downloaded, uses existing files. Otherwise downloads from Kaggle.

    Args:
        dest_path: Folder (str or Path) that holds, or will receive, the
            dataset. It is created, including missing parents, if absent.

    Returns:
        Path of the folder whose immediate subfolders are the quarters. If no
        such folder can be identified after downloading, `dest_path` itself
        is returned.

    Raises:
        Errors from the Kaggle client (authentication, download) propagate
        unchanged.
    """
    dest_path = Path(dest_path)
    # parents=True so a nested destination such as "data/raw/kdata" also works.
    dest_path.mkdir(parents=True, exist_ok=True)

    # Reuse a previous download, whether wrapped in a subfolder or flat.
    sec_root = _find_quarter_root(dest_path)
    if sec_root is not None:
        print(f"Using existing dataset at {sec_root}")
        return sec_root

    # If not found, download from Kaggle
    print("Dataset not found locally. Downloading from Kaggle...")
    api = KaggleApi()
    api.authenticate()
    api.dataset_download_files(DATASET_SLUG, path=dest_path, unzip=True)

    # Locate the quarters in what was just extracted; fall back to dest_path
    # if the structure is unusual.
    sec_root = _find_quarter_root(dest_path)
    if sec_root is None:
        sec_root = dest_path
    print(f"Dataset ready at {sec_root}")
    return sec_root

In [5]:
def get_quarter_dirs(sec_root):
    """
    Returns a sorted list of quarter directories under the SEC root.

    Args:
        sec_root: Folder to scan, given as a Path or a plain string path.

    Returns:
        The immediate subdirectories of `sec_root` whose name contains "q",
        sorted by path. Files are ignored.
    """
    # Coerce so string paths work too, matching get_sec_root's dest_path handling.
    sec_root = Path(sec_root)
    return sorted(d for d in sec_root.iterdir() if d.is_dir() and "q" in d.name)

In [6]:
# LOAD ALL SUB & NUM FILES
def load_all_data(dest_path="kdata"):
    """
    Load every quarter's sub.txt and num.txt and return them stacked into two dataframes.

    Args:
        dest_path: Folder passed to get_sec_root() to locate (or download)
            the dataset. Defaults to "kdata".

    Returns:
        (sub_full, num_full): the row-wise concatenation of all sub.txt files
        and of all num.txt files. Each row carries a "quarter" column holding
        the name of the folder it was read from. The two frames are NOT
        joined to each other here.

    Raises:
        FileNotFoundError: if no quarter directories exist, or if none of
            them contains both sub.txt and num.txt.
    """
    sec_root = get_sec_root(dest_path)
    quarter_dirs = get_quarter_dirs(sec_root)

    if not quarter_dirs:
        raise FileNotFoundError(f"No quarter directories found in {sec_root}")

    all_sub, all_num = [], []

    for qdir in quarter_dirs:
        sub_path = qdir / "sub.txt"
        num_path = qdir / "num.txt"

        # A quarter is only usable when both files are present.
        if not (sub_path.exists() and num_path.exists()):
            print(f"Skipping {qdir.name}, files not found")
            continue

        print(f"Loading {qdir.name} ...")
        df_sub = pd.read_csv(sub_path, sep="\t", low_memory=False)
        df_num = pd.read_csv(num_path, sep="\t", low_memory=False)

        # Add quarter info
        df_sub["quarter"] = qdir.name
        df_num["quarter"] = qdir.name

        all_sub.append(df_sub)
        all_num.append(df_num)

    # pd.concat([]) would fail with an unhelpful "No objects to concatenate";
    # report the real problem instead.
    if not all_sub:
        raise FileNotFoundError(
            f"No quarter under {sec_root} contains both sub.txt and num.txt"
        )

    # Combine all quarters
    sub_full = pd.concat(all_sub, ignore_index=True)
    num_full = pd.concat(all_num, ignore_index=True)

    return sub_full, num_full

In [9]:
def merge_dataset(sub_df, num_df, key, how, validate=None):
    """
    Join two dataframes on a shared key.

    Args:
        sub_df: Left dataframe.
        num_df: Right dataframe.
        key: Column name (or list of names) present in both frames.
        how: Join type understood by pandas ("inner", "left", "right", "outer", ...).
        validate: Optional key-cardinality check forwarded to pandas, e.g.
            "one_to_many" to require that `key` is unique in `sub_df`.
            None (the default) performs no check.

    Returns:
        The merged dataframe. Non-key columns that exist in both inputs are
        disambiguated by pandas with the "_x" / "_y" suffixes.

    Raises:
        pandas.errors.MergeError: if `validate` is given and the check fails.
    """
    return pd.merge(sub_df, num_df, on=key, how=how, validate=validate)

In [8]:
# Read every quarter's sub.txt / num.txt into two combined frames; the dataset
# is looked up under kdata/ and downloaded from Kaggle if it is not there yet.
sub_df, num_df = load_all_data()

Dataset not found locally. Downloading from Kaggle...
Dataset URL: https://www.kaggle.com/datasets/rohanpanda80/us-sec-financial-statement-2021-2024
Dataset ready at kdata\Dataset
Loading 2022q1 ...
Loading 2022q2 ...
Loading 2022q3 ...
Loading 2022q4 ...
Loading 2023q1 ...
Loading 2023q2 ...
Loading 2023q3 ...
Loading 2023q4 ...
Loading 2024q1 ...
Loading 2024q2 ...
Loading 2024q3 ...
Loading 2024q4 ...


In [11]:
# Preview the first rows of the combined sub.txt table.
sub_df.head()

Unnamed: 0,adsh,cik,name,sic,countryba,stprba,cityba,zipba,bas1,bas2,baph,countryma,stprma,cityma,zipma,mas1,mas2,countryinc,stprinc,ein,former,changed,afs,wksi,fye,form,period,fy,fp,filed,accepted,prevrpt,detail,instance,nciks,aciks,quarter
0,0000002178-22-000033,2178,"ADAMS RESOURCES & ENERGY, INC.",5172.0,US,TX,HOUSTON,77027,17 S. BRIAR HOLLOW LN.,,713-881-3600,US,TX,HOUSTON,77001,P O BOX 844,,US,DE,741753147.0,ADAMS RESOURCES & ENERGY INC,19920703.0,4-NON,0,1231.0,10-K,20211231.0,2021.0,FY,20220309,2022-03-09 16:17:00.0,0,1,ae-20211231_htm.xml,1,,2022q1
1,0000002488-22-000016,2488,ADVANCED MICRO DEVICES INC,3674.0,US,CA,SANTA CLARA,95054,2485 AUGUSTINE DRIVE,,(408) 749-4000,US,CA,SANTA CLARA,95054,2485 AUGUSTINE DRIVE,,US,DE,941692300.0,,,1-LAF,1,1231.0,10-K,20211231.0,2021.0,FY,20220203,2022-02-03 17:23:00.0,0,1,amd-20211225_htm.xml,1,,2022q1
2,0000002969-22-000010,2969,AIR PRODUCTS & CHEMICALS INC /DE/,2810.0,US,PA,ALLENTOWN,18106-5500,1940 AIR PRODUCTS BLVD.,,6104814911,US,PA,ALLENTOWN,18106-5500,1940 AIR PRODUCTS BLVD.,,US,DE,231274455.0,,,1-LAF,0,930.0,10-Q,20211231.0,2022.0,Q1,20220204,2022-02-04 10:32:00.0,0,1,apd-20211231_htm.xml,1,,2022q1
3,0000003499-22-000004,3499,ALEXANDERS INC,6798.0,US,NJ,PARAMUS,07652,210 ROUTE 4 EAST,,201-587-8541,US,NJ,PARAMUS,07652,210 ROUTE 4 EAST,,US,DE,510100517.0,,,2-ACC,0,1231.0,10-K,20211231.0,2021.0,FY,20220214,2022-02-14 08:19:00.0,0,1,alx-20211231_htm.xml,1,,2022q1
4,0000003570-22-000024,3570,CHENIERE ENERGY INC,4924.0,US,TX,HOUSTON,77002,700 MILAM ST.,SUITE 1900,7133755000,US,TX,HOUSTON,77002,700 MILAM ST.,SUITE 1900,US,DE,954352386.0,CHENIERE ENERGY INC,19960827.0,1-LAF,1,1231.0,10-K,20211231.0,2021.0,FY,20220224,2022-02-23 21:37:00.0,0,1,lng-20211231_htm.xml,1,,2022q1


In [12]:
# Preview the first rows of the combined num.txt table.
num_df.head()

Unnamed: 0,adsh,tag,version,ddate,qtrs,uom,segments,coreg,value,footnote,quarter
0,0001126975-22-000070,OtherComprehensiveIncomeLossNetOfTax,us-gaap/2021,20201231,4,USD,PartnerCapitalComponents=AccumulatedOtherCompr...,,-21100000.0,,2022q1
1,0001748824-22-000018,AdjustmentsRelatedToTaxWithholdingForShareBase...,us-gaap/2021,20211231,4,USD,ConsolidatedEntities=ConsolidatedEntityExcludi...,,31300000.0,,2022q1
2,0000012927-22-000010,Revenues,us-gaap/2021,20211231,4,USD,BusinessSegments=GlobalServices;ConsolidationI...,,248000000.0,,2022q1
3,0001564590-22-006237,OtherAssetsNoncurrent,us-gaap/2021,20201231,0,USD,BusinessSegments=TruckPartsAndOther;,,998900000.0,,2022q1
4,0000107815-22-000116,DefinedBenefitPlanAssetsForPlanBenefitsNoncurrent,us-gaap/2021,20211231,0,USD,RetirementPlanType=PensionPlansDefinedBenefit;,,389000000.0,,2022q1


In [10]:
# Inner-join the sub and num tables on `adsh`.
# Both inputs carry a "quarter" column, so the result contains
# quarter_x (from sub_df) and quarter_y (from num_df).
df = merge_dataset(sub_df, num_df, 'adsh', 'inner')

In [10]:
# (rows, columns) of the merged frame.
df.shape

(41260371, 47)

In [11]:
# Per-column dtype and non-null count. show_counts=True forces the counts,
# which pandas omits by default for frames above its size threshold.
df.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41260371 entries, 0 to 41260370
Data columns (total 47 columns):
 #   Column      Non-Null Count     Dtype  
---  ------      --------------     -----  
 0   adsh        41260371 non-null  object 
 1   cik         41260371 non-null  int64  
 2   name        41260371 non-null  object 
 3   sic         37370274 non-null  float64
 4   countryba   41214170 non-null  object 
 5   stprba      36606706 non-null  object 
 6   cityba      41206440 non-null  object 
 7   zipba       41190746 non-null  object 
 8   bas1        41214170 non-null  object 
 9   bas2        18790405 non-null  object 
 10  baph        41194517 non-null  object 
 11  countryma   41115065 non-null  object 
 12  stprma      36705267 non-null  object 
 13  cityma      41114733 non-null  object 
 14  zipma       41086516 non-null  object 
 15  mas1        41119068 non-null  object 
 16  mas2        18763143 non-null  object 
 17  countryinc  37164640 non-null  object 
 18  

In [12]:
# Distinct company names; the number of them is the `shape` of the returned array
df['name'].unique()

array(['ADAMS RESOURCES & ENERGY, INC.', 'ADVANCED MICRO DEVICES INC',
       'AIR PRODUCTS & CHEMICALS INC /DE/', ..., 'DELTASOFT CORP',
       'FRONTVIEW REIT, INC.', 'STANDARDAERO, INC.'],
      shape=(9646,), dtype=object)

In [13]:
# Distinct values of `countryba`; NaN appears as its own entry when the field is missing.
df['countryba'].unique()

array(['US', 'CA', 'SG', 'GB', 'NL', 'BM', 'ES', 'IE', 'CH', 'GR', 'MH',
       'JE', 'KY', nan, 'CN', 'DK', 'PR', 'ZA', 'HK', 'SE', 'JP', 'DE',
       'BR', 'LU', 'FR', 'AR', 'BE', 'IL', 'MC', 'CY', 'NO', 'TW', 'AU',
       'MY', 'CL', 'KZ', 'TH', 'CZ', 'MX', 'MT', 'ID', 'CO', 'IN', 'AI',
       'VI', 'FI', 'GG', 'UY', 'BS', 'GU', 'PH', 'IT', 'GE', 'VG', 'RO',
       'LT', 'MO', 'PL', 'KR', 'AE', 'LV', 'RU', 'TR', 'PE', 'PA', 'GH',
       'JO', 'GI', 'DO', 'AL', 'RS', 'KG', 'AM', 'CR', 'NG', 'IM', 'KE',
       'NZ', 'ME'], dtype=object)

In [14]:
# Distinct `tag` values in the merged frame.
df['tag'].unique()

array(['PaymentsToAcquireBusinessesNetOfCashAcquired', 'Revenues',
       'StockIssuedDuringPeriodValueAcquisitions', ...,
       'CumulativeEffectAdjustmentUponAdoption',
       'AdvancesPaidForBrandAuthorization',
       'ProceedsFromPaymentForOfferingCosts'],
      shape=(198311,), dtype=object)

In [21]:
# Boolean mask over `df`: True where the tag contains "revenue" anywhere,
# ignoring case (despite the `_end` in the name, the match is not anchored
# to the end of the tag). Missing tags count as False.
rev_tag_end = df['tag'].str.contains(r"revenue", case=False, na=False)
# Distinct matching tag names; this is a NumPy array, not a DataFrame.
df_revenue_tags = df.loc[rev_tag_end, 'tag'].unique()
df_revenue_tags

array(['Revenues', 'RevenueFromContractWithCustomerExcludingAssessedTax',
       'IncreaseDecreaseInDeferredRevenue', ..., 'RevenueExpense',
       'NonCashRentalRevenueAdjustments',
       'CashReceivedFromDeferredRevenue'], shape=(2192,), dtype=object)

In [17]:
# Regex fragments; they are OR-ed together and matched case-insensitively
# against revenue-like tags. A tag matching any fragment is dropped.
excluded_patterns = [
    "Deferred",
    "Unearned",
    "Amortization",
    "Accrued",
    "Noncash",
    "ContraRevenue",
    # A bare "Tax" would also drop the revenue tags that end in
    # "...ExcludingAssessedTax" / "...IncludingAssessedTax" (e.g.
    # RevenueFromContractWithCustomerExcludingAssessedTax). The lookbehind
    # keeps those while still excluding every other tag mentioning tax.
    r"(?<!Assessed)Tax",
    "Adjustment",
    "Costs",
]

In [18]:
# Getting all the partial revenue match
df_rev = df[df['tag'].str.contains("Revenue", case=False, na=False)]

# Exclude all unwanted patterns
pattern = "|".join(excluded_patterns)
df_clean = df_rev[~df_rev['tag'].str.contains(pattern, case=False, na=False)]

In [20]:
# Renumber the filtered rows from 0, discarding the index values inherited from `df`.
df_revenue_clean = df_clean.reset_index(drop=True)
# Preview the first rows of the result.
df_revenue_clean.head()

Unnamed: 0,adsh,cik,name,sic,countryba,stprba,cityba,zipba,bas1,bas2,baph,countryma,stprma,cityma,zipma,mas1,mas2,countryinc,stprinc,ein,former,changed,afs,wksi,fye,form,period,fy,fp,filed,accepted,prevrpt,detail,instance,nciks,aciks,quarter_x,tag,version,ddate,qtrs,uom,segments,coreg,value,footnote,quarter_y
0,0000002178-22-000033,2178,"ADAMS RESOURCES & ENERGY, INC.",5172.0,US,TX,HOUSTON,77027,17 S. BRIAR HOLLOW LN.,,713-881-3600,US,TX,HOUSTON,77001,P O BOX 844,,US,DE,741753147.0,ADAMS RESOURCES & ENERGY INC,19920703.0,4-NON,0,1231.0,10-K,20211231.0,2021.0,FY,20220309,2022-03-09 16:17:00.0,0,1,ae-20211231_htm.xml,1,,2022q1,Revenues,us-gaap/2021,20211231,4,USD,BusinessSegments=CrudeOilMarketing;Consolidati...,,1930042000.0,,2022q1
1,0000002178-22-000033,2178,"ADAMS RESOURCES & ENERGY, INC.",5172.0,US,TX,HOUSTON,77027,17 S. BRIAR HOLLOW LN.,,713-881-3600,US,TX,HOUSTON,77001,P O BOX 844,,US,DE,741753147.0,ADAMS RESOURCES & ENERGY INC,19920703.0,4-NON,0,1231.0,10-K,20211231.0,2021.0,FY,20220309,2022-03-09 16:17:00.0,0,1,ae-20211231_htm.xml,1,,2022q1,Revenues,us-gaap/2021,20211231,4,USD,ConsolidationItems=CorporateNonSegment;,,0.0,,2022q1
2,0000002178-22-000033,2178,"ADAMS RESOURCES & ENERGY, INC.",5172.0,US,TX,HOUSTON,77027,17 S. BRIAR HOLLOW LN.,,713-881-3600,US,TX,HOUSTON,77001,P O BOX 844,,US,DE,741753147.0,ADAMS RESOURCES & ENERGY INC,19920703.0,4-NON,0,1231.0,10-K,20211231.0,2021.0,FY,20220309,2022-03-09 16:17:00.0,0,1,ae-20211231_htm.xml,1,,2022q1,Revenues,us-gaap/2021,20211231,4,USD,ProductOrService=PipelineAndStorage;,,664000.0,,2022q1
3,0000002178-22-000033,2178,"ADAMS RESOURCES & ENERGY, INC.",5172.0,US,TX,HOUSTON,77027,17 S. BRIAR HOLLOW LN.,,713-881-3600,US,TX,HOUSTON,77001,P O BOX 844,,US,DE,741753147.0,ADAMS RESOURCES & ENERGY INC,19920703.0,4-NON,0,1231.0,10-K,20211231.0,2021.0,FY,20220309,2022-03-09 16:17:00.0,0,1,ae-20211231_htm.xml,1,,2022q1,Revenues,us-gaap/2021,20201231,4,USD,BusinessSegments=Transportation;ConsolidationI...,,71724000.0,,2022q1
4,0000002178-22-000033,2178,"ADAMS RESOURCES & ENERGY, INC.",5172.0,US,TX,HOUSTON,77027,17 S. BRIAR HOLLOW LN.,,713-881-3600,US,TX,HOUSTON,77001,P O BOX 844,,US,DE,741753147.0,ADAMS RESOURCES & ENERGY INC,19920703.0,4-NON,0,1231.0,10-K,20211231.0,2021.0,FY,20220309,2022-03-09 16:17:00.0,0,1,ae-20211231_htm.xml,1,,2022q1,Revenues,us-gaap/2021,20210331,1,USD,,,325491000.0,,2022q1
