# Compute & persist hashes (cleaned and raw data)

In [4]:
import hashlib, pathlib, json, os

def sha256_file(path: pathlib.Path) -> str:
    if not path.exists():
        return None
    h = hashlib.sha256()
    with path.open('rb') as f:
        for chunk in iter(lambda: f.read(8192), b''):
            h.update(chunk)
    return h.hexdigest()

# Input locations (edit these if the files are stored somewhere else)
CLEANED = pathlib.Path("datasets/cleaned_eta_logs.csv")
RAW_1 = pathlib.Path("datasets/time.txt")
RAW_2 = pathlib.Path("datasets/time6.txt")

# Fingerprint the cleaned file and both raw files, in that order
hash_cleaned, hash_raw_1, hash_raw_2 = map(sha256_file, (CLEANED, RAW_1, RAW_2))

# Emit the whole report in one call; a falsy hash is shown as NOT FOUND
print(
    "Data Integrity — SHA-256 fingerprints",
    "--------------------------------------",
    f"cleaned_eta_logs.csv : {hash_cleaned or 'NOT FOUND'}",
    f"time.txt             : {hash_raw_1 or 'NOT FOUND'}",
    f"time6.txt            : {hash_raw_2 or 'NOT FOUND'}",
    sep="\n",
)

# Build the provenance record linking raw sources -> cleaned file -> analysis
manifest = {
    "analysis_input": {"path": str(CLEANED), "sha256": hash_cleaned},
    "raw_sources": [
        {"path": str(src), "sha256": digest}
        for src, digest in ((RAW_1, hash_raw_1), (RAW_2, hash_raw_2))
    ],
    "notes": {
        "transformations": [
            "Cleaning (duplicate removal, normalization)",
            "Time ordering (sorting by timestamp)",
        ],
        "caveats": [
            "Hash is byte-exact; ordering/line endings will change the fingerprint.",
            "For authenticity, use a signing step in CI/CD.",
        ],
    },
}

# Serialize to indented JSON text, then write it next to the datasets
with open("datasets/artifact_data_manifest.json", "w") as f:
    f.write(json.dumps(manifest, indent=2))

print("\nSaved -> datasets/artifact_data_manifest.json")

Data Integrity — SHA-256 fingerprints
--------------------------------------
cleaned_eta_logs.csv : b7c4fe3646d472cf92b9221abd54302fb82712f5969e9dabb942b2f459dabacd
time.txt             : 1fcd364de88fe5b50e05351ff5db760896a9a39ad5093cc7fdbbf77a710da395
time6.txt            : 8bcb160b30e128f3e88fa7deea267ef08e9bf09b606b271b48cc84d49e151c55

Saved -> datasets/artifact_data_manifest.json
