In [1]:
import os, glob, json
import pandas as pd

# Project root on the author's machine.
# A raw string literal cannot end in a single backslash, and ending it with
# "\\" leaves TWO literal backslashes in the value, which then show up as a
# doubled separator in every path built from it. Joining with "" appends
# exactly one trailing separator instead, so both os.path.join(...) and plain
# string concatenation on BASE_PATH keep working.
BASE_PATH = os.path.join(r"C:\Users\Impana\Downloads\invoice-classification", "")
SROIE_RAW = os.path.join(BASE_PATH, "data", "raw", "sroie_raw")

# Quick look at how the raw SROIE download is laid out on disk.
print("Top level:", os.listdir(SROIE_RAW))
print("sroie subfolder:", os.listdir(os.path.join(SROIE_RAW, "sroie")))
print("train subfolder sample:", os.listdir(os.path.join(SROIE_RAW, "train"))[:10])

Top level: ['layoutlm-base-uncased', 'sroie', 'test', 'train']
sroie subfolder: []
train subfolder sample: ['box', 'entities', 'img']


In [None]:
# The train split keeps three sibling folders; build all three paths from a
# single shared parent.
TRAIN_DIR = os.path.join(SROIE_RAW, "train")
ENTITIES_DIR, BOX_DIR, IMG_DIR = (
    os.path.join(TRAIN_DIR, subfolder) for subfolder in ("entities", "box", "img")
)

# Peek at the first few entity files, then dump the start of one of them to
# see what format the annotations are stored in.
entity_files = os.listdir(ENTITIES_DIR)[:5]
print("Sample entity files:", entity_files)

sample_path = os.path.join(ENTITIES_DIR, entity_files[0])
with open(sample_path, mode="r", encoding="utf-8") as f:
    txt = f.read()

preview = txt[:500]  # cap the preview at 500 characters
print("Raw entity file content:\n", preview)

Sample entity files: ['X00016469612.txt', 'X00016469619.txt', 'X00016469620.txt', 'X00016469622.txt', 'X00016469623.txt']
Raw entity file content:
 {
    "company": "BOOK TA .K (TAMAN DAYA) SDN BHD",
    "date": "25/12/2018",
    "address": "NO.53 55,57 & 59, JALAN SAGU 18, TAMAN DAYA, 81100 JOHOR BAHRU, JOHOR.",
    "total": "9.00"
}


In [None]:
# Column order of the assembled table; passing it to the DataFrame constructor
# keeps the columns present even when no record passes the filter below.
SROIE_COLUMNS = ["invoice_id", "text", "company", "date", "total"]


def _clean_field(entity, key):
    """Return ``entity[key]`` as stripped text.

    A missing key or a JSON ``null`` yields ``""``; non-string values
    (e.g. a numeric total) are converted with ``str`` before stripping,
    instead of raising ``AttributeError`` on ``.strip()``.
    """
    value = entity.get(key)
    return "" if value is None else str(value).strip()


records = []

# Each entity file is a JSON document stored with a .txt extension.
# Iterate in sorted order so the row order (and any file exported from this
# frame) does not depend on the order the filesystem returns entries in.
for fname in sorted(os.listdir(ENTITIES_DIR)):
    if not fname.endswith(".txt"):
        continue
    path = os.path.join(ENTITIES_DIR, fname)
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)

    company = _clean_field(data, "company")
    date = _clean_field(data, "date")
    address = _clean_field(data, "address")
    total = _clean_field(data, "total")

    # The file stem doubles as the invoice identifier.
    invoice_id = os.path.splitext(fname)[0]

    # Keep only receipts that have both a company and an address; the address
    # is stored under the "text" column.
    if company and address:
        records.append({
            "invoice_id": invoice_id,
            "text": address,
            "company": company,
            "date": date,
            "total": total
        })

sroie_df = pd.DataFrame(records, columns=SROIE_COLUMNS)
print("Rows:", len(sroie_df))
print("Unique companies:", sroie_df['company'].nunique())
sroie_df.head()

Rows: 625
Unique companies: 235


Unnamed: 0,invoice_id,text,company,date,total
0,X00016469612,"NO.53 55,57 & 59, JALAN SAGU 18, TAMAN DAYA, 8...",BOOK TA .K (TAMAN DAYA) SDN BHD,25/12/2018,9.0
1,X00016469619,"27, JALAN DEDAP 13, TAMAN JOHOR JAYA, 81100 JO...",INDAH GIFT & HOME DECO,19/10/2018,60.3
2,X00016469620,"LOT 1851-A & 1851-B, JALAN KPB 6, KAWASAN PERI...",MR D.I.Y. (JOHOR) SDN BHD,12-01-19,33.9
3,X00016469622,NO 122.124. JALAN DEDAP 13 81100 JOHOR BAHRU,YONGFATT ENTERPRISE,25/12/2018,80.9
4,X00016469623,"LOT 1851-A & 1851-B, JALAN KPB 6, KAWASAN PERI...",MR D.I.Y. (M) SDN BHD,18-11-18,30.9


In [4]:
# Destination for the assembled table: <BASE_PATH>/data/sroie/D01_sroie_raw.csv
OUT_DIR = os.path.join(BASE_PATH, "data", "sroie")
out_path = os.path.join(OUT_DIR, "D01_sroie_raw.csv")

# Create the folder on first run; exist_ok makes re-running the cell harmless.
os.makedirs(OUT_DIR, exist_ok=True)

# Write without the positional index so the CSV holds only the data columns.
sroie_df.to_csv(out_path, index=False)
print("Saved:", out_path)

Saved: C:\Users\Impana\Downloads\invoice-classification\\data\sroie\D01_sroie_raw.csv
