# Dataset exploration (lightweight)

This notebook reads a label file (train.txt / labels.csv) and computes:
- REAL vs FAKE counts
- Totals distribution (if OCR/text files exist)
- Image resolution / aspect ratio / file size
- Simple quality proxies (blur via variance of Laplacian, brightness/contrast)


In [None]:
from pathlib import Path
import sys
import pandas as pd
import matplotlib.pyplot as plt

# Locate the project root so that `src` can be imported whether Jupyter was
# started from the project root itself or from the notebooks/ directory:
# if the working directory has a `src` entry it is the root, otherwise its
# parent is used.
_here = Path.cwd()
if (_here / "src").exists():
    PROJECT_ROOT = _here
else:
    PROJECT_ROOT = _here.parent

# Put the root at the front of the import path, but only once, so that
# re-running this cell does not keep adding duplicates.
if sys.path.count(str(PROJECT_ROOT)) == 0:
    sys.path.insert(0, str(PROJECT_ROOT))

from src.dataset import extract_total_from_text, image_basic_stats, load_label_table
from src.features import blur_variance_of_laplacian, brightness_contrast

# Locations of the sample dataset, all resolved relative to the project root.
LABELS = PROJECT_ROOT.joinpath("data/sample/labels.csv")
IMAGE_DIR = PROJECT_ROOT.joinpath("data/sample/images")
OCR_DIR = PROJECT_ROOT.joinpath("data/sample/ocr")

# Load the label table; the bare expression on the last line makes the
# notebook display it as the cell output.
df = load_label_table(LABELS)
df

In [None]:
# Number of rows per distinct value of the `label` column; the bare
# expression on the last line displays the result as the cell output.
counts = df.loc[:, 'label'].value_counts()
counts

In [None]:
# Bar chart with one bar per label value; bar height is the row count.
plt.figure()
plt.bar(x=counts.index, height=counts.values)
plt.title(label='REAL vs FAKE count')
plt.show()

In [None]:
# Build one feature record per labelled image: the basic image stats,
# brightness/contrast, the Laplacian-variance blur score, and the receipt
# total parsed from the matching OCR text file when that file exists.
rows = []
for image_id, label in zip(df['image'], df['label']):
    img_path = IMAGE_DIR / image_id

    # Start from the basic stats dict and merge the quality proxies into it.
    stats = image_basic_stats(img_path)
    stats.update(brightness_contrast(img_path))
    blur_score = blur_variance_of_laplacian(img_path)

    # The OCR text file shares the image's file name, with a .txt suffix.
    ocr_path = OCR_DIR / Path(image_id).with_suffix('.txt').name
    if ocr_path.exists():
        total = extract_total_from_text(ocr_path.read_text(errors='ignore'))
    else:
        # No OCR output for this image: leave the total missing.
        total = None

    # Key order here determines the trailing column order of the table.
    stats.update({
        'blur_var_laplacian': blur_score,
        'total_amount': total,
        'image': image_id,
        'label': label,
    })
    rows.append(stats)

# One row per image; displayed as the cell output.
feat = pd.DataFrame(rows)
feat

In [None]:
# Histogram of the extracted receipt totals; when no row has a total,
# print a notice instead of drawing an empty plot.
totals = feat['total_amount'].dropna()
if totals.empty:
    print('No totals available (missing OCR/text)')
else:
    plt.figure()
    plt.hist(totals, bins=10)
    plt.title('Receipt totals distribution')
    plt.show()