# Inspect Gene .bin.gz Files

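This notebook helps you inspect and verify the gene coordinate files (`.bin.gz`) generated by the single molecule viewer. Each file is read as a gzip-compressed stream of float32 values, interpreted as consecutive (x, y, z) triples — one triple per molecule.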

In [1]:
import gzip
import struct
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

In [2]:
# Path (as a string) to the downloaded gene .bin.gz file to inspect.
# This is machine-specific: update it to point at your own download.
gene_file = "/Users/kresnajenie/Downloads/testealkj/dataset_1760993538440_genes_LEF1.bin.gz"

# Template for pointing at a different file (uncomment and edit):
# gene_file = "/path/to/your/LEF1.bin.gz"

In [3]:
def read_gene_file(filepath):
    """Read a gzip-compressed gene coordinate file into an (N, 3) array.

    The decompressed payload is interpreted as a flat sequence of 32-bit
    floats grouped into consecutive (x, y, z) triples.

    Args:
        filepath: Path to the ``.bin.gz`` file (anything ``gzip.open`` accepts).

    Returns:
        A writable ``numpy.ndarray`` of dtype float64 and shape (N, 3), one
        row per molecule. An empty payload yields an array of shape (0, 3).

    Raises:
        ValueError: If the decompressed size is not a whole number of
            (x, y, z) float32 triples, i.e. the file is truncated or is not
            in the expected format.
        OSError: If the file cannot be opened or is not valid gzip data.
    """
    with gzip.open(filepath, 'rb') as f:
        data = f.read()

    # Three float32 values (4 bytes each) per molecule.
    bytes_per_molecule = 3 * 4
    if len(data) % bytes_per_molecule != 0:
        raise ValueError(
            f"{filepath}: decompressed size is {len(data)} bytes, which is not "
            f"a multiple of {bytes_per_molecule} (3 float32 values per molecule); "
            "the file may be truncated or in a different format"
        )

    if not data:
        return np.empty((0, 3), dtype=np.float64)

    # Decode straight from the byte buffer instead of building a Python tuple
    # of floats first. The byte order is pinned to little-endian so the result
    # does not depend on the machine reading the file.
    # NOTE(review): assumes the writer emits little-endian floats - confirm.
    coords32 = np.frombuffer(data, dtype='<f4').reshape(-1, 3)

    # Widen to float64 (exact for float32 inputs); astype also returns a
    # writable copy rather than a read-only view of the byte buffer.
    return coords32.astype(np.float64)

In [4]:
# Load every molecule position from the configured gene file
coords = read_gene_file(gene_file)

# Build the summary first, then emit it, followed by a preview of the leading rows
summary_lines = [
    f"📁 File: {Path(gene_file).name}",
    f"🧬 Total molecules: {len(coords):,}",
    f"📊 Shape: {coords.shape}",
    f"\n📍 First 10 molecules (x, y, z):",
]
for summary_line in summary_lines:
    print(summary_line)
print(coords[:10])

📁 File: dataset_1760993538440_genes_LEF1.bin.gz
🧬 Total molecules: 168,904
📊 Shape: (168904, 3)

📍 First 10 molecules (x, y, z):
[[0.00195838 0.04346269 0.00453295]
 [0.0022557  0.04578075 0.00542367]
 [0.00226408 0.06313464 0.00462616]
 [0.00250214 0.04624089 0.00560001]
 [0.0024152  0.06389669 0.00430054]
 [0.00258289 0.04379554 0.00561424]
 [0.00266235 0.0423978  0.00488866]
 [0.00277791 0.05484274 0.00429079]
 [0.00284535 0.06247249 0.00522181]
 [0.00308716 0.04430548 0.00572653]]


In [None]:
# Statistics: per-axis min / max / mean, followed by a check of the expected range
print("📈 Statistics:")

if len(coords) == 0:
    # min()/max() raise ValueError on an empty array, so report instead of crashing
    print("\n(no molecules loaded - nothing to summarize)")
else:
    # Column order in coords is x, y, z
    for axis_index, axis_label in enumerate(("X", "Y", "Z")):
        axis_values = coords[:, axis_index]
        print(f"\n{axis_label} coordinates:")
        print(f"  Min:  {axis_values.min():.6f}")
        print(f"  Max:  {axis_values.max():.6f}")
        print(f"  Mean: {axis_values.mean():.6f}")

print(f"\n⚠️  Coordinates should be normalized to [-1, 1] range")

# Actually verify the stated expectation instead of only printing it.
# NaN fails both comparisons, so it is counted as out of range too.
in_range_mask = (coords >= -1) & (coords <= 1)
out_of_range_count = int(coords.size - np.count_nonzero(in_range_mask))
if out_of_range_count == 0:
    print("✅ All values fall within [-1, 1]")
else:
    print(f"❌ {out_of_range_count:,} of {coords.size:,} values fall outside [-1, 1] (or are NaN)")

In [None]:
# Visualize the three 2D projections side by side: X vs Y, X vs Z, Y vs Z
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# (horizontal column, vertical column, x label, y label, title) for each panel
projection_panels = (
    (0, 1, 'X', 'Y', 'X vs Y'),
    (0, 2, 'X', 'Z', 'X vs Z'),
    (1, 2, 'Y', 'Z', 'Y vs Z'),
)

for panel, (h_col, v_col, h_label, v_label, panel_title) in zip(axes, projection_panels):
    panel.scatter(coords[:, h_col], coords[:, v_col], s=1, alpha=0.5)
    panel.set_xlabel(h_label)
    panel.set_ylabel(v_label)
    panel.set_title(panel_title)
    # Equal aspect so distances are not distorted between the two plotted axes
    panel.set_aspect('equal')
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# 3D scatter plot of a random subsample (plotting every point would be slow)

# NOTE(review): Axes3D is not referenced below. Presumably the import is kept
# because older Matplotlib releases needed it to register the '3d' projection;
# confirm against the Matplotlib version in use before removing it.
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401

# Upper bound on plotted points, and a fixed seed so that re-running the
# notebook draws the same subsample and therefore the same figure.
MAX_SAMPLE_POINTS = 10000
SAMPLE_SEED = 0

sample_size = min(MAX_SAMPLE_POINTS, len(coords))
rng = np.random.default_rng(SAMPLE_SEED)
# replace=False: each molecule appears at most once in the subsample
sample_indices = rng.choice(len(coords), size=sample_size, replace=False)
sample_coords = coords[sample_indices]

fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')

ax.scatter(sample_coords[:, 0], sample_coords[:, 1], sample_coords[:, 2],
           s=1, alpha=0.5)

ax.set_xlabel('X')
ax.set_ylabel('Y')
ax.set_zlabel('Z')
ax.set_title(f'3D Visualization (sampled {sample_size:,} molecules)')

plt.show()

In [None]:
# Distribution histograms, one panel per coordinate axis
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# (column index, x-axis label, panel title) for each coordinate
histogram_panels = (
    (0, 'X coordinate', 'X Distribution'),
    (1, 'Y coordinate', 'Y Distribution'),
    (2, 'Z coordinate', 'Z Distribution'),
)

# Shared look of the dashed red lines drawn at -1 and 1
range_line_style = dict(color='r', linestyle='--', alpha=0.5)

for column, x_label, panel_title in histogram_panels:
    panel = axes[column]
    is_first_panel = column == 0

    panel.hist(coords[:, column], bins=50, alpha=0.7, edgecolor='black')
    panel.set_xlabel(x_label)
    panel.set_ylabel('Frequency')
    panel.set_title(panel_title)

    # Only the first panel labels its lower marker and shows a legend
    if is_first_panel:
        panel.axvline(-1, label='Expected range', **range_line_style)
    else:
        panel.axvline(-1, **range_line_style)
    panel.axvline(1, **range_line_style)
    if is_first_panel:
        panel.legend()

plt.tight_layout()
plt.show()

In [None]:
# Export to CSV if needed
import pandas as pd

df = pd.DataFrame(coords, columns=['x', 'y', 'z'])

# Derive the CSV path from the input path. Only a trailing '.bin.gz' is
# swapped for '.csv'; if the input has any other name, '.csv' is appended.
# This guarantees output_csv differs from gene_file, so the export can never
# overwrite the source file, and a '.bin.gz' inside a directory name is left
# alone. str() keeps this working if gene_file was set to a pathlib.Path.
gene_path = str(gene_file)
gene_suffix = '.bin.gz'
if gene_path.endswith(gene_suffix):
    output_csv = gene_path[:-len(gene_suffix)] + '.csv'
else:
    output_csv = gene_path + '.csv'

df.to_csv(output_csv, index=False)
print(f"✅ Saved to {output_csv}")
print(f"📊 First few rows:")
print(df.head(10))