# 01 · Data Inventory
        
This notebook takes inventory of the DNA sequence data: it checks that the raw files exist, previews `clusters.csv`, and reports the status of the processed artifacts.

## Project Structure
- **Raw Data**: `DNA-Data for Telhai/2023-05-11/`
- **Processed Data**: `artifacts/`
- **Logs**: `logs/`
- **Scratch Space**: `scratch/`


In [None]:
# Import libraries and set paths
from pathlib import Path
import os
import pandas as pd

# Project root and the sub-directories the rest of the notebook reads from.
root = Path("/home/mch/dna")
DATA_DIR = root.joinpath("DNA-Data for Telhai", "2023-05-11")
ARTIFACTS = root.joinpath("artifacts")
LOGS = root.joinpath("logs")

# Echo the configured locations and whether the raw-data folder is present.
for label, value in (
    ("Root directory", root),
    ("Data directory", DATA_DIR),
    ("Data exists", DATA_DIR.exists()),
):
    print(f"{label}: {value}")

In [None]:
# Check raw data files
# Reports whether the cluster table and the 7zip folder exist under DATA_DIR,
# and prints file sizes in MB for whatever is found.
clusters_csv = DATA_DIR / "clusters.csv"
zip_dir = DATA_DIR / "7zip"

print("\nClusters CSV:")
print(f"  Path: {clusters_csv}")
print(f"  Exists: {clusters_csv.exists()}")
if clusters_csv.exists():
    size_mb = clusters_csv.stat().st_size / (1024 * 1024)
    print(f"  Size: {size_mb:.2f} MB")

print("\n7zip directory:")
print(f"  Path: {zip_dir}")
# Mirror the CSV section: state existence explicitly so a missing directory
# is reported instead of being skipped silently.
print(f"  Exists: {zip_dir.exists()}")
# is_dir() rather than exists(): iterdir() raises if the path is a plain file.
if zip_dir.is_dir():
    # iterdir() yields entries in arbitrary order; sort for a stable listing.
    for entry in sorted(zip_dir.iterdir()):
        entry_mb = entry.stat().st_size / (1024 * 1024)
        print(f"  - {entry.name} ({entry_mb:.2f} MB)")

In [None]:
# Quick preview of clusters.csv
# Only the first 10 rows are read, so the preview never loads the whole file.
if clusters_csv.exists():
    df_sample = pd.read_csv(clusters_csv, nrows=10)

    sample_shape = df_sample.shape
    column_names = [*df_sample.columns]
    print(f"Shape (first 10 rows): {sample_shape}")
    print(f"Columns: {column_names}")

    print("\nFirst 5 rows:")
    display(df_sample.head(5))

    # Per-column count of missing values within the sampled rows
    null_counts = df_sample.isna().sum()
    print("\nNull values in sample:")
    print(null_counts)

In [None]:
# Check processed data
# Reports on the two processed outputs under ARTIFACTS: the folder of parquet
# files and the DuckDB database file.
parquet_dir = ARTIFACTS.joinpath("clusters_parquet")
duckdb_file = ARTIFACTS.joinpath("dna.duckdb")

print("Processed data status:")
print(f"  Parquet directory exists: {parquet_dir.exists()}")
if parquet_dir.exists():
    # Only *.parquet files directly inside the folder are counted.
    parquet_files = [p for p in parquet_dir.glob("*.parquet")]
    print(f"  Number of parquet files: {len(parquet_files)}")
    if len(parquet_files) > 0:
        total_bytes = sum(p.stat().st_size for p in parquet_files)
        total_size = total_bytes / (1024 * 1024)  # bytes -> MB
        print(f"  Total size: {total_size:.2f} MB")

print(f"\n  DuckDB file exists: {duckdb_file.exists()}")
if duckdb_file.exists():
    duckdb_mb = duckdb_file.stat().st_size / (1024 * 1024)
    print(f"  Size: {duckdb_mb:.2f} MB")