This file creates metadata from train_df and reports(pdf) collected online. Merges them to create combined_train.csv which will have address to the online collected files and also information about that from train_df

##### Will need orbit API

In [2]:
import pandas as pd
import boto3
import json
import os, json, boto3, requests
import re
import numpy as np

In [3]:
# Authenticate
s3_client = boto3.client(
    's3',
    aws_access_key_id="",      # need access key
    aws_secret_access_key="" # insert keys 
)

# Bucket and folder
bucket_name = "orbit-data-provider"
prefix = "clients/eagle-alpha/"  

# files (index JSONs) under the folder
response = s3_client.list_objects_v2(Bucket=bucket_name, Prefix=prefix)

json_keys = [
    obj["Key"]
    for obj in response.get("Contents", [])
    if obj["Key"].endswith(".json")
]

In [4]:
# Print number of JSONs we found
print(f"Found {len(json_keys)} JSON index files\n")

# fetch and print a few sample JSONs
for key in json_keys[:3]:  # just first 3 for inspection
    print(f"\nüîé Inspecting {key}...")
    obj = s3_client.get_object(Bucket=bucket_name, Key=key)
    index_data = json.loads(obj["Body"].read().decode("utf-8"))

    # Print top-level keys
    print("Top-level keys:", list(index_data.keys()))

    # Print company info and report id
    company_info = index_data.get("company_info", [{}])[0]
    print("Company:", company_info.get("company_name"))
    print("Report ID:", index_data.get("report_id"))

    # Print available file paths (if any)
    for rf in index_data.get("report_files", []):
        for s3_key in ["s3_path_file", "s3_path_pages", "s3_path_blocks"]:
            if rf.get(s3_key):
                print(f" - {s3_key}: {rf[s3_key]}")

# fetch and print a few sample JSONs 
for key in json_keys[:3]:  # first 3 JSONs
    print(f"\nüîé Inspecting {key}...")
    obj = s3_client.get_object(Bucket=bucket_name, Key=key)
    index_data = json.loads(obj["Body"].read().decode("utf-8"))

    # Print nicely formatted JSON (2 spaces indent)
    print(json.dumps(index_data, indent=2)[:1000], "...")  # truncate after 1000 chars

Found 996 JSON index files


üîé Inspecting clients/eagle-alpha/reports_annual_quarterly/20250821033756__f_0gxFdzT9qftZ2xKpUIDOwC@1.json...
Top-level keys: ['report_id', 'report_title', 'report_title_local', 'report_title_en', 'reported_at', 'report_type_id_list', 'language', 'company_info', 'report_files', 'report_link', 'x_version']
Company: SMURFIT KAPPA GROUP PLC
Report ID: f_0gxFdzT9qftZ2xKpUIDOwC
 - s3_path_file: s3://filing-reports/reports-data/stock_us/2024/06/07/edgar-data-2005951-000141057824000999-tmb-20240331x10q.htm.pdf
 - s3_path_pages: s3://filing-reports/txt-vector/reports-data/stock_us/2024/06/07/edgar-data-2005951-000141057824000999-tmb-20240331x10q.htm.pdf/pages.txt
 - s3_path_blocks: s3://filing-reports/txt-vector/reports-data/stock_us/2024/06/07/edgar-data-2005951-000141057824000999-tmb-20240331x10q.htm.pdf/blocks.txt

üîé Inspecting clients/eagle-alpha/reports_annual_quarterly/20250821033756__f_1u7DBoAn@2.json...
Top-level keys: ['report_id', 'report_title', 're

In [5]:
# Creating a metadata about files collected from API

base_dir = "data/reports"


# Paginate through all JSON indexes
def list_all_json_keys(bucket, prefix):
    keys = []
    continuation_token = None
    while True:
        if continuation_token:
            response = s3_client.list_objects_v2(
                Bucket=bucket, Prefix=prefix, ContinuationToken=continuation_token
            )
        else:
            response = s3_client.list_objects_v2(Bucket=bucket, Prefix=prefix)

        for obj in response.get("Contents", []):
            if obj["Key"].endswith(".json"):
                keys.append(obj["Key"])

        if response.get("IsTruncated"):
            continuation_token = response["NextContinuationToken"]
        else:
            break
    return keys


# Collect metadata from JSONs ---
def collect_report_metadata(bucket, keys, base_dir="data/reports"):
    records = []

    for i, key in enumerate(keys):
        try:
            resp = s3_client.get_object(Bucket=bucket, Key=key)
            index_data = json.loads(resp["Body"].read().decode("utf-8"))

            company = index_data["company_info"][0]
            report_id = index_data["report_id"]
            safe_name = company.get("company_name", "Unknown").replace(" ", "_").replace("/", "_")

            records.append({
                "report_id": report_id,
                "company_name": company.get("company_name"),
                "isin": company.get("isin", []),       # full list
                "ticker": company.get("ticker", []),   # full list
                "reported_at": index_data.get("reported_at"),
                "report_title": index_data.get("report_title"),
                "report_title_local": index_data.get("report_title_local"),
                "report_title_en": index_data.get("report_title_en"),
                "pdf_path": os.path.join(base_dir, safe_name, f"{report_id}_file.pdf"),
                "pages_path": os.path.join(base_dir, safe_name, f"{report_id}_pages.txt"),
                "blocks_path": os.path.join(base_dir, safe_name, f"{report_id}_blocks.txt")
            })


            # print progress
            print(f"{i}. {key}")

        except Exception as e:
            print(f"‚ùå Failed on {key}: {e}")

    return pd.DataFrame(records)

In [6]:
# Run
json_keys = list_all_json_keys(bucket_name, prefix)
print(f"Found {len(json_keys)} JSON index files\n")

meta_df = collect_report_metadata(bucket_name, json_keys, base_dir)

meta_df

Found 6981 JSON index files

0. clients/eagle-alpha/reports_annual_quarterly/20250821033756__f_0gxFdzT9qftZ2xKpUIDOwC@1.json
1. clients/eagle-alpha/reports_annual_quarterly/20250821033756__f_1u7DBoAn@2.json
2. clients/eagle-alpha/reports_annual_quarterly/20250821033756__f_4csLsjAhudR2mcDosUbKD5@1.json
3. clients/eagle-alpha/reports_annual_quarterly/20250821033756__f_7A8O2V6Ehh6RTO39HDPYVp@1.json
4. clients/eagle-alpha/reports_annual_quarterly/20250821033756__f_9tgeLS94nO4peRbBEIbvje@1.json
5. clients/eagle-alpha/reports_annual_quarterly/20250821033756__f_D2ugYCYH@1.json
6. clients/eagle-alpha/reports_annual_quarterly/20250821033756__f_OM7uNqpX@1.json
7. clients/eagle-alpha/reports_annual_quarterly/20250821033756__f_PDplDokn@1.json
8. clients/eagle-alpha/reports_annual_quarterly/20250821033756__f_TCL50Sz2@1.json
9. clients/eagle-alpha/reports_annual_quarterly/20250821033756__f_U87d66o1@1.json
10. clients/eagle-alpha/reports_annual_quarterly/20250821033756__f_VUN8jZBe@1.json
11. clients/

Unnamed: 0,report_id,company_name,isin,ticker,reported_at,report_title,report_title_local,report_title_en,pdf_path,pages_path,blocks_path
0,f_0gxFdzT9qftZ2xKpUIDOwC,SMURFIT KAPPA GROUP PLC,"[IE00028FXN24, IE00B1RR8406, US83272W1062]","[N4U, SMFKY, SW, SWR]",2024-06-07,Quarterly Report,SMURFIT KAPPA GROUP PLC - FORM 10-Q,SMURFIT KAPPA GROUP PLC - FORM 10-Q,data/reports/SMURFIT_KAPPA_GROUP_PLC/f_0gxFdzT...,data/reports/SMURFIT_KAPPA_GROUP_PLC/f_0gxFdzT...,data/reports/SMURFIT_KAPPA_GROUP_PLC/f_0gxFdzT...
1,f_1u7DBoAn,CARNIVAL PLC,"[AR0180391374, BRC1CLBDR004, GB0031215220, PA1...","[C1CL34, CCL, CCL1N, CCLC, CCLl, CCLm, CUK]",2023-09-29,Quarterly Report,"CARNIVAL PLC - EX-10.2, ebh1c10q22023.htm EX-10.2","CARNIVAL PLC - EX-10.2, ebh1c10q22023.htm EX-10.2",data/reports/CARNIVAL_PLC/f_1u7DBoAn_file.pdf,data/reports/CARNIVAL_PLC/f_1u7DBoAn_pages.txt,data/reports/CARNIVAL_PLC/f_1u7DBoAn_blocks.txt
2,f_4csLsjAhudR2mcDosUbKD5,LULULEMON ATHLETICA INC,"[BRL1ULBDR005, CA5499211046, US5500211090]","[33L, L1UL34, LULU, LULU *]",2024-03-21,Annual Report,LULULEMON ATHLETICA INC - Form 10-K,LULULEMON ATHLETICA INC - Form 10-K,data/reports/LULULEMON_ATHLETICA_INC/f_4csLsjA...,data/reports/LULULEMON_ATHLETICA_INC/f_4csLsjA...,data/reports/LULULEMON_ATHLETICA_INC/f_4csLsjA...
3,f_7A8O2V6Ehh6RTO39HDPYVp,SMURFIT KAPPA GROUP PLC,"[IE00028FXN24, IE00B1RR8406, US83272W1062]","[N4U, SMFKY, SW, SWR]",2024-11-08,Quarterly Report,SMURFIT KAPPA GROUP PLC - FORM 10-Q,SMURFIT KAPPA GROUP PLC - FORM 10-Q,data/reports/SMURFIT_KAPPA_GROUP_PLC/f_7A8O2V6...,data/reports/SMURFIT_KAPPA_GROUP_PLC/f_7A8O2V6...,data/reports/SMURFIT_KAPPA_GROUP_PLC/f_7A8O2V6...
4,f_9tgeLS94nO4peRbBEIbvje,LULULEMON ATHLETICA INC,"[BRL1ULBDR005, CA5499211046, US5500211090]","[33L, L1UL34, LULU, LULU *]",2024-08-29,Quarterly Report,LULULEMON ATHLETICA INC - Form 10-Q,LULULEMON ATHLETICA INC - Form 10-Q,data/reports/LULULEMON_ATHLETICA_INC/f_9tgeLS9...,data/reports/LULULEMON_ATHLETICA_INC/f_9tgeLS9...,data/reports/LULULEMON_ATHLETICA_INC/f_9tgeLS9...
...,...,...,...,...,...,...,...,...,...,...,...
6976,o_fhmH4CTO5pcEVOAoSj88tU,GE VERNOVA INC,"[BRG2EVBDR003, US36828A1016]","[G2EV34, GEV]",2024-03-06,Wind CEO Vic Abate Transcript,GE VERNOVA INC - Wind CEO Vic Abate Transcript,GE VERNOVA INC - Wind CEO Vic Abate Transcript,data/reports/GE_VERNOVA_INC/o_fhmH4CTO5pcEVOAo...,data/reports/GE_VERNOVA_INC/o_fhmH4CTO5pcEVOAo...,data/reports/GE_VERNOVA_INC/o_fhmH4CTO5pcEVOAo...
6977,o_fwe9aJYEVTFtxduTAdK6sd,GE VERNOVA INC,"[BRG2EVBDR003, US36828A1016]","[G2EV34, GEV]",2024-12-10,GE Vernova 2024 Investor Update,GE VERNOVA INC - GE Vernova 2024 Investor Update,GE VERNOVA INC - GE Vernova 2024 Investor Update,data/reports/GE_VERNOVA_INC/o_fwe9aJYEVTFtxduT...,data/reports/GE_VERNOVA_INC/o_fwe9aJYEVTFtxduT...,data/reports/GE_VERNOVA_INC/o_fwe9aJYEVTFtxduT...
6978,o_gAqntEyHWJXJDmnXkTAQEe,BANK OF AMERICA CORP,"[ARDEUT112851, BRBOACBDR004, CA06048X1087, US0...","[1BAC, BAC, BACCL, BAC_KZ, BACm, BAC_pe, BAC_p...",2024-09-30,Bank of America Third Quarter 2024 Earnings An...,BANK OF AMERICA CORP - Bank of America Third Q...,BANK OF AMERICA CORP - Bank of America Third Q...,data/reports/BANK_OF_AMERICA_CORP/o_gAqntEyHWJ...,data/reports/BANK_OF_AMERICA_CORP/o_gAqntEyHWJ...,data/reports/BANK_OF_AMERICA_CORP/o_gAqntEyHWJ...
6979,o_gdd49aPyFdkFEH34pnybsJ,CITIGROUP INC,"[ARDEUT110426, BRCTGPBDR000, CA17331G1081, US1...","[1C, C, CCL, CITI, C_KZ, Cm, C_pj, C_pk, CTGP3...",2024-06-18,Citi 2024 Services Investor Day Full Transcript,CITIGROUP INC - Citi 2024 Services Investor Da...,CITIGROUP INC - Citi 2024 Services Investor Da...,data/reports/CITIGROUP_INC/o_gdd49aPyFdkFEH34p...,data/reports/CITIGROUP_INC/o_gdd49aPyFdkFEH34p...,data/reports/CITIGROUP_INC/o_gdd49aPyFdkFEH34p...


In [7]:
#### Get Train DATA

In [8]:
# train_df file
train_df = pd.read_csv("TRAIN_DATA.csv", sep="|")

# DATE is datetime
train_df["DATE"] = pd.to_datetime(train_df["DATE"])
train_df

Unnamed: 0.1,Unnamed: 0,DATE,SYMBOL,COMPANY_NAME,TICKER,SECTOR,MOMENTUM,VALUE,SIZE,BETA,SEDOL,ISIN,RETURNS
0,0,2010-01-29,00101J10,ADT Corporation,ADT.XX11,Industrials,,,,,B7XWRM2,US00101J1060,
1,2,2010-01-29,00120410,"AGL Resources, Inc.",GAS,Utilities,24.41300,0.793,7.911439,0.539984,2060961,US0012041069,-0.032355
2,3,2010-01-29,00130H10,AES Corporation,AES,Utilities,64.02880,0.815,9.043690,1.422540,2002479,US00130H1059,-0.051089
3,5,2010-01-29,00206R10,AT&T Inc,T,Communication Services,20.41870,1.013,11.923123,0.713166,2831811,US00206R1023,-0.081492
4,6,2010-01-29,00282410,Abbott Laboratories,ABT,Health Care,0.46649,-0.332,11.331961,0.210722,2002305,US0028241000,-0.012354
...,...,...,...,...,...,...,...,...,...,...,...,...,...
112382,149781,2025-07-31,N5374510,LyondellBasell Industries NV,LYB,Materials,-37.48230,1.837,9.848451,0.830765,B3SPXZ3,NL0009434992,0.001210
112383,149783,2025-07-31,S0042089,"Discovery, Inc A",S0042089,Communication Services,,,,,,,
112384,149784,2025-07-31,S8112735,Apartment Investment & Mgmt,S8112735,Real Estate,,,,,,,
112385,149785,2025-07-31,S8338817,Amentum Holdings Inc Placeho,S8338817,Industrials,,,,,,,


In [9]:
# Days left in the month for each DATE
days_left = train_df["DATE"].dt.daysinmonth - train_df["DATE"].dt.day

# True if date is in last 5 days of month
mask = days_left < 5

print(mask.all())

True


So the date is end of the month

In [11]:
# remove NaNs in RETURNS or ISIN
train_df_clean = train_df.dropna(subset=["RETURNS", "ISIN"])

# keep only ISINs starting with 'US'
train_df = train_df_clean[train_df_clean["ISIN"].str.startswith("US")]

print("Original shape:", train_df.shape)
print("Cleaned shape:", train_df_clean.shape)

Original shape: (85643, 13)
Cleaned shape: (91577, 13)


#### Merge train_df and meta_df

In [15]:
meta_df["reported_at"] = pd.to_datetime(meta_df["reported_at"])

# Year-month columns
meta_df["year_month"] = meta_df["reported_at"].dt.to_period("M")

train_df = train_df.copy()
train_df["year_month"] = train_df["DATE"].dt.to_period("M")

In [16]:
# explode ISIN list into rows
meta_exploded = meta_df.explode("isin")

# merge on ISIN + year_month
merged = pd.merge(
    meta_exploded,
    train_df,
    left_on=["isin", "year_month"],
    right_on=["ISIN", "year_month"],
    how="inner"
)

# count matches
print("Total matches:", len(merged))

Total matches: 4719


In [17]:
# Drop unwanted columns 
cols_to_drop = ["ticker", "ticker_norm", "Unnamed: 0"]
merged = merged.drop(columns=[c for c in cols_to_drop if c in merged.columns])

merged

Unnamed: 0,report_id,company_name,isin,reported_at,report_title,report_title_local,report_title_en,pdf_path,pages_path,blocks_path,...,COMPANY_NAME,TICKER,SECTOR,MOMENTUM,VALUE,SIZE,BETA,SEDOL,ISIN,RETURNS
0,f_4csLsjAhudR2mcDosUbKD5,LULULEMON ATHLETICA INC,US5500211090,2024-03-21,Annual Report,LULULEMON ATHLETICA INC - Form 10-K,LULULEMON ATHLETICA INC - Form 10-K,data/reports/LULULEMON_ATHLETICA_INC/f_4csLsjA...,data/reports/LULULEMON_ATHLETICA_INC/f_4csLsjA...,data/reports/LULULEMON_ATHLETICA_INC/f_4csLsjA...,...,lululemon athletica inc.,LULU,Consumer Discretionary,44.9034,-0.609,10.840906,1.217020,B23FN39,US5500211090,-0.163652
1,f_9tgeLS94nO4peRbBEIbvje,LULULEMON ATHLETICA INC,US5500211090,2024-08-29,Quarterly Report,LULULEMON ATHLETICA INC - Form 10-Q,LULULEMON ATHLETICA INC - Form 10-Q,data/reports/LULULEMON_ATHLETICA_INC/f_9tgeLS9...,data/reports/LULULEMON_ATHLETICA_INC/f_9tgeLS9...,data/reports/LULULEMON_ATHLETICA_INC/f_9tgeLS9...,...,lululemon athletica inc.,LULU,Consumer Discretionary,-32.0468,-0.131,10.408057,0.576961,B23FN39,US5500211090,0.003132
2,f_PDplDokn,LULULEMON ATHLETICA INC,US5500211090,2023-08-31,Quarterly Report,LULULEMON ATHLETICA INC - Form 10-Q,LULULEMON ATHLETICA INC - Form 10-Q,data/reports/LULULEMON_ATHLETICA_INC/f_PDplDok...,data/reports/LULULEMON_ATHLETICA_INC/f_PDplDok...,data/reports/LULULEMON_ATHLETICA_INC/f_PDplDok...,...,lululemon athletica inc.,LULU,Consumer Discretionary,26.1935,-0.758,10.815705,1.478690,B23FN39,US5500211090,0.007212
3,f_WoQLjVEq,LULULEMON ATHLETICA INC,US5500211090,2023-12-07,Quarterly Report,LULULEMON ATHLETICA INC - Form 10-Q,LULULEMON ATHLETICA INC - Form 10-Q,data/reports/LULULEMON_ATHLETICA_INC/f_WoQLjVE...,data/reports/LULULEMON_ATHLETICA_INC/f_WoQLjVE...,data/reports/LULULEMON_ATHLETICA_INC/f_WoQLjVE...,...,lululemon athletica inc.,LULU,Consumer Discretionary,38.3593,-0.938,11.110417,1.365380,B23FN39,US5500211090,0.144338
4,f_cC5GK9NDnFMWH7KTPOIlox,LULULEMON ATHLETICA INC,US5500211090,2024-12-05,Quarterly Report,LULULEMON ATHLETICA INC - Form 10-Q,LULULEMON ATHLETICA INC - Form 10-Q,data/reports/LULULEMON_ATHLETICA_INC/f_cC5GK9N...,data/reports/LULULEMON_ATHLETICA_INC/f_cC5GK9N...,data/reports/LULULEMON_ATHLETICA_INC/f_cC5GK9N...,...,lululemon athletica inc.,LULU,Consumer Discretionary,-37.2841,-0.506,10.794169,-0.378504,B23FN39,US5500211090,0.192572
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4714,o_fJwqPxyinmaxzZNTErSOwf,GE VERNOVA INC,US36828A1016,2024-11-20,GE Vernova to host Investor Update event on De...,GE VERNOVA INC - GE Vernova to host Investor U...,GE VERNOVA INC - GE Vernova to host Investor U...,data/reports/GE_VERNOVA_INC/o_fJwqPxyinmaxzZNT...,data/reports/GE_VERNOVA_INC/o_fJwqPxyinmaxzZNT...,data/reports/GE_VERNOVA_INC/o_fJwqPxyinmaxzZNT...,...,GE Vernova Inc.,GEV,Industrials,,-0.843,11.434258,-0.458317,BP6H4Y1,US36828A1016,0.107605
4715,o_fwe9aJYEVTFtxduTAdK6sd,GE VERNOVA INC,US36828A1016,2024-12-10,GE Vernova 2024 Investor Update,GE VERNOVA INC - GE Vernova 2024 Investor Update,GE VERNOVA INC - GE Vernova 2024 Investor Update,data/reports/GE_VERNOVA_INC/o_fwe9aJYEVTFtxduT...,data/reports/GE_VERNOVA_INC/o_fwe9aJYEVTFtxduT...,data/reports/GE_VERNOVA_INC/o_fwe9aJYEVTFtxduT...,...,GE Vernova Inc.,GEV,Industrials,,-0.803,11.419023,0.218221,BP6H4Y1,US36828A1016,-0.014815
4716,o_gAqntEyHWJXJDmnXkTAQEe,BANK OF AMERICA CORP,US0605051046,2024-09-30,Bank of America Third Quarter 2024 Earnings An...,BANK OF AMERICA CORP - Bank of America Third Q...,BANK OF AMERICA CORP - Bank of America Third Q...,data/reports/BANK_OF_AMERICA_CORP/o_gAqntEyHWJ...,data/reports/BANK_OF_AMERICA_CORP/o_gAqntEyHWJ...,data/reports/BANK_OF_AMERICA_CORP/o_gAqntEyHWJ...,...,Bank of America Corp,BAC,Financials,51.9570,2.520,12.630449,1.355190,2295677,US0605051046,-0.019726
4717,o_gdd49aPyFdkFEH34pnybsJ,CITIGROUP INC,US1729674242,2024-06-18,Citi 2024 Services Investor Day Full Transcript,CITIGROUP INC - Citi 2024 Services Investor Da...,CITIGROUP INC - Citi 2024 Services Investor Da...,data/reports/CITIGROUP_INC/o_gdd49aPyFdkFEH34p...,data/reports/CITIGROUP_INC/o_gdd49aPyFdkFEH34p...,data/reports/CITIGROUP_INC/o_gdd49aPyFdkFEH34p...,...,Citigroup Inc.,C,Financials,40.8893,3.993,11.673367,1.276030,2297907,US1729674242,0.018456


In [46]:
# Check for duplicates based returns, company, date, report title
merged = merged.drop_duplicates(subset=["isin", "reported_at", "RETURNS", "report_title"], keep="first")

In [54]:
merged["report_type"] = np.where(
    merged["report_title"] == "Annual Report", "Annual",
    np.where(merged["report_title"] == "Quarterly Report", "Quarterly", None)
)

In [62]:
merged.isna().sum()

report_id                0
company_name             0
isin                     0
reported_at              0
report_title             0
report_title_local       0
report_title_en          0
pdf_path                 0
pages_path               0
blocks_path              0
year_month               0
DATE                     0
SYMBOL                   0
COMPANY_NAME             0
TICKER                   0
SECTOR                   0
MOMENTUM                27
VALUE                    0
SIZE                     0
BETA                     3
SEDOL                    0
ISIN                     0
RETURNS                  0
report_type           1401
dtype: int64

In [58]:
merged

Unnamed: 0,report_id,company_name,isin,reported_at,report_title,report_title_local,report_title_en,pdf_path,pages_path,blocks_path,...,TICKER,SECTOR,MOMENTUM,VALUE,SIZE,BETA,SEDOL,ISIN,RETURNS,report_type
0,f_4csLsjAhudR2mcDosUbKD5,LULULEMON ATHLETICA INC,US5500211090,2024-03-21,Annual Report,LULULEMON ATHLETICA INC - Form 10-K,LULULEMON ATHLETICA INC - Form 10-K,data/reports/LULULEMON_ATHLETICA_INC/f_4csLsjA...,data/reports/LULULEMON_ATHLETICA_INC/f_4csLsjA...,data/reports/LULULEMON_ATHLETICA_INC/f_4csLsjA...,...,LULU,Consumer Discretionary,44.9034,-0.609,10.840906,1.217020,B23FN39,US5500211090,-0.163652,Annual
1,f_9tgeLS94nO4peRbBEIbvje,LULULEMON ATHLETICA INC,US5500211090,2024-08-29,Quarterly Report,LULULEMON ATHLETICA INC - Form 10-Q,LULULEMON ATHLETICA INC - Form 10-Q,data/reports/LULULEMON_ATHLETICA_INC/f_9tgeLS9...,data/reports/LULULEMON_ATHLETICA_INC/f_9tgeLS9...,data/reports/LULULEMON_ATHLETICA_INC/f_9tgeLS9...,...,LULU,Consumer Discretionary,-32.0468,-0.131,10.408057,0.576961,B23FN39,US5500211090,0.003132,Quarterly
2,f_PDplDokn,LULULEMON ATHLETICA INC,US5500211090,2023-08-31,Quarterly Report,LULULEMON ATHLETICA INC - Form 10-Q,LULULEMON ATHLETICA INC - Form 10-Q,data/reports/LULULEMON_ATHLETICA_INC/f_PDplDok...,data/reports/LULULEMON_ATHLETICA_INC/f_PDplDok...,data/reports/LULULEMON_ATHLETICA_INC/f_PDplDok...,...,LULU,Consumer Discretionary,26.1935,-0.758,10.815705,1.478690,B23FN39,US5500211090,0.007212,Quarterly
3,f_WoQLjVEq,LULULEMON ATHLETICA INC,US5500211090,2023-12-07,Quarterly Report,LULULEMON ATHLETICA INC - Form 10-Q,LULULEMON ATHLETICA INC - Form 10-Q,data/reports/LULULEMON_ATHLETICA_INC/f_WoQLjVE...,data/reports/LULULEMON_ATHLETICA_INC/f_WoQLjVE...,data/reports/LULULEMON_ATHLETICA_INC/f_WoQLjVE...,...,LULU,Consumer Discretionary,38.3593,-0.938,11.110417,1.365380,B23FN39,US5500211090,0.144338,Quarterly
4,f_cC5GK9NDnFMWH7KTPOIlox,LULULEMON ATHLETICA INC,US5500211090,2024-12-05,Quarterly Report,LULULEMON ATHLETICA INC - Form 10-Q,LULULEMON ATHLETICA INC - Form 10-Q,data/reports/LULULEMON_ATHLETICA_INC/f_cC5GK9N...,data/reports/LULULEMON_ATHLETICA_INC/f_cC5GK9N...,data/reports/LULULEMON_ATHLETICA_INC/f_cC5GK9N...,...,LULU,Consumer Discretionary,-37.2841,-0.506,10.794169,-0.378504,B23FN39,US5500211090,0.192572,Quarterly
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4714,o_fJwqPxyinmaxzZNTErSOwf,GE VERNOVA INC,US36828A1016,2024-11-20,GE Vernova to host Investor Update event on De...,GE VERNOVA INC - GE Vernova to host Investor U...,GE VERNOVA INC - GE Vernova to host Investor U...,data/reports/GE_VERNOVA_INC/o_fJwqPxyinmaxzZNT...,data/reports/GE_VERNOVA_INC/o_fJwqPxyinmaxzZNT...,data/reports/GE_VERNOVA_INC/o_fJwqPxyinmaxzZNT...,...,GEV,Industrials,,-0.843,11.434258,-0.458317,BP6H4Y1,US36828A1016,0.107605,
4715,o_fwe9aJYEVTFtxduTAdK6sd,GE VERNOVA INC,US36828A1016,2024-12-10,GE Vernova 2024 Investor Update,GE VERNOVA INC - GE Vernova 2024 Investor Update,GE VERNOVA INC - GE Vernova 2024 Investor Update,data/reports/GE_VERNOVA_INC/o_fwe9aJYEVTFtxduT...,data/reports/GE_VERNOVA_INC/o_fwe9aJYEVTFtxduT...,data/reports/GE_VERNOVA_INC/o_fwe9aJYEVTFtxduT...,...,GEV,Industrials,,-0.803,11.419023,0.218221,BP6H4Y1,US36828A1016,-0.014815,
4716,o_gAqntEyHWJXJDmnXkTAQEe,BANK OF AMERICA CORP,US0605051046,2024-09-30,Bank of America Third Quarter 2024 Earnings An...,BANK OF AMERICA CORP - Bank of America Third Q...,BANK OF AMERICA CORP - Bank of America Third Q...,data/reports/BANK_OF_AMERICA_CORP/o_gAqntEyHWJ...,data/reports/BANK_OF_AMERICA_CORP/o_gAqntEyHWJ...,data/reports/BANK_OF_AMERICA_CORP/o_gAqntEyHWJ...,...,BAC,Financials,51.9570,2.520,12.630449,1.355190,2295677,US0605051046,-0.019726,
4717,o_gdd49aPyFdkFEH34pnybsJ,CITIGROUP INC,US1729674242,2024-06-18,Citi 2024 Services Investor Day Full Transcript,CITIGROUP INC - Citi 2024 Services Investor Da...,CITIGROUP INC - Citi 2024 Services Investor Da...,data/reports/CITIGROUP_INC/o_gdd49aPyFdkFEH34p...,data/reports/CITIGROUP_INC/o_gdd49aPyFdkFEH34p...,data/reports/CITIGROUP_INC/o_gdd49aPyFdkFEH34p...,...,C,Financials,40.8893,3.993,11.673367,1.276030,2297907,US1729674242,0.018456,


### Using Llama to categorize reports - Quaterly/Annually/Legal/Earnings

In [64]:
API_URL = "http://localhost:11434/api/generate"

def safe_parse_response(text, rows):
    """
    Parse model output, only keeping one of:
    Annual, Quarterly, Earnings, Financing/Legal, Event/Other
    If parsing fails or label is not valid, return None for that row.
    """
    match = re.search(r"\[.*\]", text, re.DOTALL)
    if not match:
        print("‚ö†Ô∏è No JSON found. Returning None for batch.")
        return [None] * len(rows)

    try:
        parsed = json.loads(match.group(0))
    except json.JSONDecodeError:
        print("‚ö†Ô∏è JSON decode failed. Returning None for batch.")
        return [None] * len(rows)

    valid_labels = {"annual": "Annual",
                    "quarter": "Quarterly",
                    "earning": "Earnings",
                    "financ": "Financing/Legal",
                    "event": "Event/Other"}  # collapse "Other" into "Event/Other"

    cleaned = []
    for label in parsed:
        if not isinstance(label, str):
            cleaned.append(None)
            continue

        low = label.strip().lower()
        mapped = None
        for k, v in valid_labels.items():
            if low.startswith(k):
                mapped = v
                break
        cleaned.append(mapped)

    # Adjust length to match rows
    if len(cleaned) < len(rows):
        cleaned.extend([None] * (len(rows) - len(cleaned)))
    elif len(cleaned) > len(rows):
        cleaned = cleaned[:len(rows)]

    return cleaned

def classify_batch(rows):
    prompt = "Classify each report strictly as one of: Quarterly, Annual, Earnings, Financing/Legal, Event/Other.\n\n"
    for i, r in enumerate(rows):
        prompt += f"{i+1}. {r['report_title_en']} | {r['report_title_local']} | {r['report_title']}\n"
    prompt += "\nReturn only a JSON list of categories in order, nothing else."

    response = requests.post(
        API_URL,
        json={"model": "llama3:8b", "prompt": prompt, "stream": False}
    )

    text = response.json()["response"]
    return safe_parse_response(text, rows)

In [65]:
# only for rows with missing report_type
mask = merged["report_type"].isna()
subset = merged[mask].copy()

batch_size = 1
labels = []

for i in range(0, len(subset), batch_size):
    batch = subset.iloc[i:i+batch_size].to_dict(orient="records")
    batch_labels = classify_batch(batch)
    labels.extend(batch_labels)

    if (i + batch_size) % 100 == 0 or i + batch_size >= len(subset):
        print(f"Processed {min(i+batch_size, len(subset))} rows out of {len(subset)}")

# Fill back into merged
subset["report_type"] = labels
merged.loc[mask, "report_type"] = subset["report_type"].values

‚ö†Ô∏è JSON decode failed. Returning None for batch.
Processed 100 rows out of 1401
Processed 200 rows out of 1401
Processed 300 rows out of 1401
Processed 400 rows out of 1401
Processed 500 rows out of 1401
Processed 600 rows out of 1401
Processed 700 rows out of 1401
Processed 800 rows out of 1401
Processed 900 rows out of 1401
Processed 1000 rows out of 1401
Processed 1100 rows out of 1401
Processed 1200 rows out of 1401
Processed 1300 rows out of 1401
Processed 1400 rows out of 1401
Processed 1401 rows out of 1401


In [66]:
# Unique labels
print(merged["report_type"].unique())

# Count per category
print(merged["report_type"].value_counts())

['Annual' 'Quarterly' 'Earnings' None 'Event/Other' 'Financing/Legal']
report_type
Quarterly          2653
Annual              857
Earnings            850
Event/Other         308
Financing/Legal       4
Name: count, dtype: int64


In [67]:
merged

Unnamed: 0,report_id,company_name,isin,reported_at,report_title,report_title_local,report_title_en,pdf_path,pages_path,blocks_path,...,TICKER,SECTOR,MOMENTUM,VALUE,SIZE,BETA,SEDOL,ISIN,RETURNS,report_type
0,f_4csLsjAhudR2mcDosUbKD5,LULULEMON ATHLETICA INC,US5500211090,2024-03-21,Annual Report,LULULEMON ATHLETICA INC - Form 10-K,LULULEMON ATHLETICA INC - Form 10-K,data/reports/LULULEMON_ATHLETICA_INC/f_4csLsjA...,data/reports/LULULEMON_ATHLETICA_INC/f_4csLsjA...,data/reports/LULULEMON_ATHLETICA_INC/f_4csLsjA...,...,LULU,Consumer Discretionary,44.9034,-0.609,10.840906,1.217020,B23FN39,US5500211090,-0.163652,Annual
1,f_9tgeLS94nO4peRbBEIbvje,LULULEMON ATHLETICA INC,US5500211090,2024-08-29,Quarterly Report,LULULEMON ATHLETICA INC - Form 10-Q,LULULEMON ATHLETICA INC - Form 10-Q,data/reports/LULULEMON_ATHLETICA_INC/f_9tgeLS9...,data/reports/LULULEMON_ATHLETICA_INC/f_9tgeLS9...,data/reports/LULULEMON_ATHLETICA_INC/f_9tgeLS9...,...,LULU,Consumer Discretionary,-32.0468,-0.131,10.408057,0.576961,B23FN39,US5500211090,0.003132,Quarterly
2,f_PDplDokn,LULULEMON ATHLETICA INC,US5500211090,2023-08-31,Quarterly Report,LULULEMON ATHLETICA INC - Form 10-Q,LULULEMON ATHLETICA INC - Form 10-Q,data/reports/LULULEMON_ATHLETICA_INC/f_PDplDok...,data/reports/LULULEMON_ATHLETICA_INC/f_PDplDok...,data/reports/LULULEMON_ATHLETICA_INC/f_PDplDok...,...,LULU,Consumer Discretionary,26.1935,-0.758,10.815705,1.478690,B23FN39,US5500211090,0.007212,Quarterly
3,f_WoQLjVEq,LULULEMON ATHLETICA INC,US5500211090,2023-12-07,Quarterly Report,LULULEMON ATHLETICA INC - Form 10-Q,LULULEMON ATHLETICA INC - Form 10-Q,data/reports/LULULEMON_ATHLETICA_INC/f_WoQLjVE...,data/reports/LULULEMON_ATHLETICA_INC/f_WoQLjVE...,data/reports/LULULEMON_ATHLETICA_INC/f_WoQLjVE...,...,LULU,Consumer Discretionary,38.3593,-0.938,11.110417,1.365380,B23FN39,US5500211090,0.144338,Quarterly
4,f_cC5GK9NDnFMWH7KTPOIlox,LULULEMON ATHLETICA INC,US5500211090,2024-12-05,Quarterly Report,LULULEMON ATHLETICA INC - Form 10-Q,LULULEMON ATHLETICA INC - Form 10-Q,data/reports/LULULEMON_ATHLETICA_INC/f_cC5GK9N...,data/reports/LULULEMON_ATHLETICA_INC/f_cC5GK9N...,data/reports/LULULEMON_ATHLETICA_INC/f_cC5GK9N...,...,LULU,Consumer Discretionary,-37.2841,-0.506,10.794169,-0.378504,B23FN39,US5500211090,0.192572,Quarterly
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4714,o_fJwqPxyinmaxzZNTErSOwf,GE VERNOVA INC,US36828A1016,2024-11-20,GE Vernova to host Investor Update event on De...,GE VERNOVA INC - GE Vernova to host Investor U...,GE VERNOVA INC - GE Vernova to host Investor U...,data/reports/GE_VERNOVA_INC/o_fJwqPxyinmaxzZNT...,data/reports/GE_VERNOVA_INC/o_fJwqPxyinmaxzZNT...,data/reports/GE_VERNOVA_INC/o_fJwqPxyinmaxzZNT...,...,GEV,Industrials,,-0.843,11.434258,-0.458317,BP6H4Y1,US36828A1016,0.107605,Event/Other
4715,o_fwe9aJYEVTFtxduTAdK6sd,GE VERNOVA INC,US36828A1016,2024-12-10,GE Vernova 2024 Investor Update,GE VERNOVA INC - GE Vernova 2024 Investor Update,GE VERNOVA INC - GE Vernova 2024 Investor Update,data/reports/GE_VERNOVA_INC/o_fwe9aJYEVTFtxduT...,data/reports/GE_VERNOVA_INC/o_fwe9aJYEVTFtxduT...,data/reports/GE_VERNOVA_INC/o_fwe9aJYEVTFtxduT...,...,GEV,Industrials,,-0.803,11.419023,0.218221,BP6H4Y1,US36828A1016,-0.014815,Event/Other
4716,o_gAqntEyHWJXJDmnXkTAQEe,BANK OF AMERICA CORP,US0605051046,2024-09-30,Bank of America Third Quarter 2024 Earnings An...,BANK OF AMERICA CORP - Bank of America Third Q...,BANK OF AMERICA CORP - Bank of America Third Q...,data/reports/BANK_OF_AMERICA_CORP/o_gAqntEyHWJ...,data/reports/BANK_OF_AMERICA_CORP/o_gAqntEyHWJ...,data/reports/BANK_OF_AMERICA_CORP/o_gAqntEyHWJ...,...,BAC,Financials,51.9570,2.520,12.630449,1.355190,2295677,US0605051046,-0.019726,Earnings
4717,o_gdd49aPyFdkFEH34pnybsJ,CITIGROUP INC,US1729674242,2024-06-18,Citi 2024 Services Investor Day Full Transcript,CITIGROUP INC - Citi 2024 Services Investor Da...,CITIGROUP INC - Citi 2024 Services Investor Da...,data/reports/CITIGROUP_INC/o_gdd49aPyFdkFEH34p...,data/reports/CITIGROUP_INC/o_gdd49aPyFdkFEH34p...,data/reports/CITIGROUP_INC/o_gdd49aPyFdkFEH34p...,...,C,Financials,40.8893,3.993,11.673367,1.276030,2297907,US1729674242,0.018456,Event/Other


In [69]:
merged.columns

Index(['report_id', 'company_name', 'isin', 'reported_at', 'report_title',
       'report_title_local', 'report_title_en', 'pdf_path', 'pages_path',
       'blocks_path', 'year_month', 'DATE', 'SYMBOL', 'COMPANY_NAME', 'TICKER',
       'SECTOR', 'MOMENTUM', 'VALUE', 'SIZE', 'BETA', 'SEDOL', 'ISIN',
       'RETURNS', 'report_type'],
      dtype='object')

In [70]:
# Drop unwanted columns 
cols_to_drop = ['report_title', 'report_title_local', 'report_title_en']
merged = merged.drop(columns=[c for c in cols_to_drop if c in merged.columns])

merged

Unnamed: 0,report_id,company_name,isin,reported_at,pdf_path,pages_path,blocks_path,year_month,DATE,SYMBOL,...,TICKER,SECTOR,MOMENTUM,VALUE,SIZE,BETA,SEDOL,ISIN,RETURNS,report_type
0,f_4csLsjAhudR2mcDosUbKD5,LULULEMON ATHLETICA INC,US5500211090,2024-03-21,data/reports/LULULEMON_ATHLETICA_INC/f_4csLsjA...,data/reports/LULULEMON_ATHLETICA_INC/f_4csLsjA...,data/reports/LULULEMON_ATHLETICA_INC/f_4csLsjA...,2024-03,2024-03-28,55002110,...,LULU,Consumer Discretionary,44.9034,-0.609,10.840906,1.217020,B23FN39,US5500211090,-0.163652,Annual
1,f_9tgeLS94nO4peRbBEIbvje,LULULEMON ATHLETICA INC,US5500211090,2024-08-29,data/reports/LULULEMON_ATHLETICA_INC/f_9tgeLS9...,data/reports/LULULEMON_ATHLETICA_INC/f_9tgeLS9...,data/reports/LULULEMON_ATHLETICA_INC/f_9tgeLS9...,2024-08,2024-08-30,55002110,...,LULU,Consumer Discretionary,-32.0468,-0.131,10.408057,0.576961,B23FN39,US5500211090,0.003132,Quarterly
2,f_PDplDokn,LULULEMON ATHLETICA INC,US5500211090,2023-08-31,data/reports/LULULEMON_ATHLETICA_INC/f_PDplDok...,data/reports/LULULEMON_ATHLETICA_INC/f_PDplDok...,data/reports/LULULEMON_ATHLETICA_INC/f_PDplDok...,2023-08,2023-08-31,55002110,...,LULU,Consumer Discretionary,26.1935,-0.758,10.815705,1.478690,B23FN39,US5500211090,0.007212,Quarterly
3,f_WoQLjVEq,LULULEMON ATHLETICA INC,US5500211090,2023-12-07,data/reports/LULULEMON_ATHLETICA_INC/f_WoQLjVE...,data/reports/LULULEMON_ATHLETICA_INC/f_WoQLjVE...,data/reports/LULULEMON_ATHLETICA_INC/f_WoQLjVE...,2023-12,2023-12-29,55002110,...,LULU,Consumer Discretionary,38.3593,-0.938,11.110417,1.365380,B23FN39,US5500211090,0.144338,Quarterly
4,f_cC5GK9NDnFMWH7KTPOIlox,LULULEMON ATHLETICA INC,US5500211090,2024-12-05,data/reports/LULULEMON_ATHLETICA_INC/f_cC5GK9N...,data/reports/LULULEMON_ATHLETICA_INC/f_cC5GK9N...,data/reports/LULULEMON_ATHLETICA_INC/f_cC5GK9N...,2024-12,2024-12-31,55002110,...,LULU,Consumer Discretionary,-37.2841,-0.506,10.794169,-0.378504,B23FN39,US5500211090,0.192572,Quarterly
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4714,o_fJwqPxyinmaxzZNTErSOwf,GE VERNOVA INC,US36828A1016,2024-11-20,data/reports/GE_VERNOVA_INC/o_fJwqPxyinmaxzZNT...,data/reports/GE_VERNOVA_INC/o_fJwqPxyinmaxzZNT...,data/reports/GE_VERNOVA_INC/o_fJwqPxyinmaxzZNT...,2024-11,2024-11-29,36828A10,...,GEV,Industrials,,-0.843,11.434258,-0.458317,BP6H4Y1,US36828A1016,0.107605,Event/Other
4715,o_fwe9aJYEVTFtxduTAdK6sd,GE VERNOVA INC,US36828A1016,2024-12-10,data/reports/GE_VERNOVA_INC/o_fwe9aJYEVTFtxduT...,data/reports/GE_VERNOVA_INC/o_fwe9aJYEVTFtxduT...,data/reports/GE_VERNOVA_INC/o_fwe9aJYEVTFtxduT...,2024-12,2024-12-31,36828A10,...,GEV,Industrials,,-0.803,11.419023,0.218221,BP6H4Y1,US36828A1016,-0.014815,Event/Other
4716,o_gAqntEyHWJXJDmnXkTAQEe,BANK OF AMERICA CORP,US0605051046,2024-09-30,data/reports/BANK_OF_AMERICA_CORP/o_gAqntEyHWJ...,data/reports/BANK_OF_AMERICA_CORP/o_gAqntEyHWJ...,data/reports/BANK_OF_AMERICA_CORP/o_gAqntEyHWJ...,2024-09,2024-09-30,06050510,...,BAC,Financials,51.9570,2.520,12.630449,1.355190,2295677,US0605051046,-0.019726,Earnings
4717,o_gdd49aPyFdkFEH34pnybsJ,CITIGROUP INC,US1729674242,2024-06-18,data/reports/CITIGROUP_INC/o_gdd49aPyFdkFEH34p...,data/reports/CITIGROUP_INC/o_gdd49aPyFdkFEH34p...,data/reports/CITIGROUP_INC/o_gdd49aPyFdkFEH34p...,2024-06,2024-06-28,17296742,...,C,Financials,40.8893,3.993,11.673367,1.276030,2297907,US1729674242,0.018456,Event/Other


In [71]:
merged.to_csv('combined_train.csv', index=False)