# Sleep and Dream Database - Raw

Compress the complete raw export file of the Sleep and Dream Database.

In [1]:
import hashlib
import lzma
from datetime import datetime, timezone
from pathlib import Path

import pandas as pd
from pandas.testing import assert_frame_equal

In [2]:
# Raw CSV export, expected in the current working directory.
import_path = Path("dream-export.csv")
# The compressed copy keeps the full original name and gains an extra ".xz" suffix.
export_path = import_path.with_name(f"{import_path.name}.xz")

def file_hash(fname, alg, chunksize=65536):
    """Return the hex digest of a file, hashing it one chunk at a time.

    Args:
        fname: Path of the file to hash; it is opened in binary mode.
        alg: Name of a ``hashlib`` constructor, e.g. ``"md5"`` or ``"sha256"``.
        chunksize: Number of bytes requested from the file per read.

    Returns:
        The digest of the file contents as a hexadecimal string.
    """
    digest = getattr(hashlib, alg)()
    with open(fname, "rb") as stream:
        # iter() with a sentinel keeps calling read() until it returns b""
        # (end of file), so only one chunk is held in memory at a time.
        for block in iter(lambda: stream.read(chunksize), b""):
            digest.update(block)
    return digest.hexdigest()

# Stream the original file into the xz-compressed copy in fixed-size chunks,
# so the whole export never has to be held in memory at once.
# Mode "xb" creates the output exclusively: it raises FileExistsError rather
# than overwriting an existing compressed file.
with open(import_path, "rb") as f, lzma.open(export_path, "xb") as lzf:
    # read() returns b"" at end of file, which stops the iteration.
    for chunk in iter(lambda: f.read(1 << 20), b""):
        lzf.write(chunk)  # Append this chunk to the compressed file

# Print name, size, modification time (UTC) and checksums for the original
# and the compressed file, each report framed by blank lines.
for fp in (import_path, export_path):
    print("\n")
    print(f"file: {fp}")
    info = fp.stat()
    # st_size is in bytes; dividing by 1e6 reports it in units of 10**6 bytes.
    print(f"size: {info.st_size / 1e6} Mb")
    modified = datetime.fromtimestamp(info.st_mtime, tz=timezone.utc)
    print(f"timestamp: {modified.isoformat(timespec='seconds')}")
    # One line per digest, labelled with the algorithm name.
    for algorithm in ("md5", "sha256"):
        print(f"{algorithm}: {file_hash(fp, alg=algorithm)}")
    print("\n")

# Round-trip check: parse the original CSV and the compressed copy, then
# require the two resulting frames to match exactly.
# low_memory=False has pandas parse each file in one pass instead of in
# internal chunks, which avoids mixed-dtype column warnings.
read_options = {"low_memory": False}
import_df = pd.read_csv(import_path, **read_options)
export_df = pd.read_csv(export_path, **read_options)
assert_frame_equal(import_df, export_df, check_exact=True)



file: dream-export.csv
size: 37.127982 Mb
timestamp: 2024-03-15T18:37:37+00:00
md5: 59fc13c2eb78aeb0bdb9d5a157e45d28
sha256: 68e4de21cefce661a4148214e0a6454d56662cb430010fdda6ef7eab7fcfffe8




file: dream-export.csv.xz
size: 5.348736 Mb
timestamp: 2024-03-22T22:15:10+00:00
md5: 7a28c8f29584ab3375db2dc2bbc86d6d
sha256: 11518b215efdff596b70b5f150e8df673355297d210e36fb46ac0e08651dcbff


