# Generate CNV Segments File
This notebook creates a formatted CNV segments file from CONGAS results and bin coordinates.


In [2]:
import numpy as np
import pandas as pd
import re


In [3]:
def parse_coordinates(bin_name):
    """Parse a genomic bin name into chromosome name and coordinate range.

    Parameters
    ----------
    bin_name : str
        Bin label of the form ``'<seqnames>:<start>-<end>'``, for example
        ``'chr7:100000001-100200000'``. Leading/trailing whitespace is ignored.

    Returns
    -------
    tuple of (str, int, int)
        ``(seqnames, start, end)`` with both coordinates converted to ``int``.

    Raises
    ------
    ValueError
        If ``bin_name`` is not a string, or does not match the expected
        format in full (e.g. extra characters after the end coordinate).
    """
    # A missing value read from CSV arrives as a float NaN; report it with the
    # same error as any other unparsable name instead of an opaque TypeError.
    if not isinstance(bin_name, str):
        raise ValueError(f"Cannot parse bin name: {bin_name}")

    # fullmatch (rather than match) rejects trailing garbage such as
    # 'chr7:100-200.5', which would otherwise be silently truncated to end=200.
    match = re.fullmatch(r'(.+):(\d+)-(\d+)', bin_name.strip())
    if match is None:
        raise ValueError(f"Cannot parse bin name: {bin_name}")

    seqnames, start, end = match.groups()
    return seqnames, int(start), int(end)


In [4]:
# Read the saved CONGAS parameter dictionary from disk.
# NOTE(review): allow_pickle=True unpickles arbitrary Python objects, so only
# load result files that come from a trusted source.
results_path = './congas_results.npy'
params = np.load(file=results_path, allow_pickle=True).item()

# Copy-number matrix: one row per cluster, one column per segment.
CNA = params['CNA']
(K, segments) = CNA.shape
print(f"Loaded CONGAS results: {K} clusters, {segments} segments")


Loaded CONGAS results: 3 clusters, 791 segments


In [5]:
# Read the bin table and keep the bin labels in file order
bin_segments = pd.read_csv('./bin_segments.csv')
bin_names = bin_segments.loc[:, 'bin'].tolist()

# Report how many bins were read and preview the first five labels
print(f"Loaded {len(bin_names)} bin segments")
print("First few bins:", bin_names[:5], sep="\n")


Loaded 791 bin segments
First few bins:
['chr7:100000001-100200000', 'chr7:1000001-1200000', 'chr7:100200001-100400000', 'chr7:100400001-100600000', 'chr7:100600001-100800000']


In [6]:
# Create CNV segments output
DIPLOID_CN = 2  # baseline copy number; segments at this value are not reported

# Every column of CNA must line up with exactly one bin name. Without this
# check a shorter bin list silently drops the trailing segments, and a longer
# one fails with an IndexError part-way through the loop.
if len(bin_names) != CNA.shape[1]:
    raise ValueError(
        f"Bin/segment mismatch: {len(bin_names)} bin names but "
        f"{CNA.shape[1]} CNA segments"
    )

cnv_segments = []

for cluster_idx in range(K):
    cluster_cn = CNA[cluster_idx, :]

    for seg_idx, bin_name in enumerate(bin_names):
        cn = cluster_cn[seg_idx]

        # Classify the segment relative to the diploid baseline
        if cn > DIPLOID_CN:
            cnv_type = "amp"
        elif cn < DIPLOID_CN:
            cnv_type = "del"
        else:
            # Diploid, or NaN (both comparisons above are False for NaN):
            # nothing to report for this segment.
            continue

        # Coordinates are only needed for segments that are actually emitted
        seqnames, start, end = parse_coordinates(bin_name)

        cnv_segments.append({
            'seqnames': seqnames,
            'start': start,
            'end': end,
            'cnv': cnv_type,
            'cluster': cluster_idx,
            'copy_number': cn
        })

print(f"Found {len(cnv_segments)} CNV segments")


Found 1729 CNV segments


In [7]:
# Create DataFrame and process
cnv_df = pd.DataFrame(cnv_segments)


def _chrom_sort_key(seqname):
    """Return a sort key that orders chromosome names naturally.

    Numbered chromosomes sort numerically (chr2 before chr10); every other
    name (e.g. chrX, chrY, scaffolds) follows in alphabetical order.
    """
    label = str(seqname)
    if label.lower().startswith('chr'):
        label = label[3:]
    if label.isdecimal():
        return (0, int(label), '')
    return (1, 0, label)


if len(cnv_df) > 0:
    # Sort by chromosome and position. A plain string sort on 'seqnames' would
    # put chr10 before chr2, so rank the chromosome names naturally first and
    # sort on that rank; the helper column is dropped again afterwards.
    chrom_order = sorted(cnv_df['seqnames'].unique(), key=_chrom_sort_key)
    chrom_rank = {name: rank for rank, name in enumerate(chrom_order)}
    cnv_df = (
        cnv_df
        .assign(_chrom_rank=cnv_df['seqnames'].map(chrom_rank))
        .sort_values(['_chrom_rank', 'start', 'cluster'])
        .drop(columns='_chrom_rank')
    )

    print(f"Generated CNV segments file with {len(cnv_df)} entries")
    print(f"Number of clusters: {K}")
    print(f"CNV types found: {cnv_df['cnv'].value_counts().to_dict()}")
    print("CNV by cluster:")
    for cluster in range(K):
        cluster_cnvs = cnv_df[cnv_df['cluster'] == cluster]
        # Clusters without any CNV call are omitted from the summary
        if len(cluster_cnvs) > 0:
            print(f"  Cluster {cluster}: {len(cluster_cnvs)} CNVs")

    print("\nFirst few entries:")
    display(cnv_df[['seqnames', 'start', 'end', 'cnv', 'cluster']].head(10))

else:
    print("No copy number variations found!")


Generated CNV segments file with 1729 entries
Number of clusters: 3
CNV types found: {'del': 1291, 'amp': 438}
CNV by cluster:
  Cluster 0: 573 CNVs
  Cluster 1: 565 CNVs
  Cluster 2: 591 CNVs

First few entries:


Unnamed: 0,seqnames,start,end,cnv,cluster
845,chr7,1,200000,del,1
1411,chr7,1,200000,amp,2
693,chr7,200001,400000,del,1
1271,chr7,200001,400000,del,2
741,chr7,400001,600000,del,1
769,chr7,600001,800000,del,1
804,chr7,800001,1000000,del,1
1379,chr7,800001,1000000,del,2
0,chr7,1000001,1200000,del,0
574,chr7,1000001,1200000,amp,1


In [8]:
# Write both output tables (skipped entirely when no CNVs were found)
if len(cnv_df) > 0:
    # Main output keeps only: seqnames start end cnv cluster
    main_columns = ['seqnames', 'start', 'end', 'cnv', 'cluster']
    cnv_df[main_columns].to_csv('./cnv_segments_formatted.tsv', sep='\t', index=False)

    # Detailed output keeps every column, including the copy numbers
    cnv_df.to_csv('./cnv_segments_detailed.tsv', sep='\t', index=False)

    print(
        "Files saved:",
        "- cnv_segments_formatted.tsv (main output)",
        "- cnv_segments_detailed.tsv (with copy numbers)",
        sep="\n",
    )


Files saved:
- cnv_segments_formatted.tsv (main output)
- cnv_segments_detailed.tsv (with copy numbers)
