# This is a step by step code for Breast cancer CNV analysis comparision


## Step1 is to load CSV files as dataframe

In [2]:
import pandas as pd

# Load CSV files
invasive_df = pd.read_csv('inv_tumor_call_table.csv')
non_invasive_df = pd.read_csv('noniv_tumor_call_table.csv')

print("invasive_df")
print(invasive_df.head(5))
print("non_invasive_df")
print(non_invasive_df.head(5))

invasive_df
   Unnamed: 0  CHR    START      END  352  354  357  358  359  360  363  371  \
0           1    1   840001  2250000    1    1   -1   -1    0    0   -1    0   
1           2    1  2250000  2520001    1    0   -1   -1    0    0   -1    0   
2           3    1  2520001  6240000    1    0   -1   -1    0   -1   -1    0   
3           4    1  6240000  7500001    1    0   -1   -1    0    0   -1    0   
4           5    1  7500001  7770001    1    0   -1   -1    0    0   -1    1   

   372  373  378  379  383  
0    0    0    0   -1   -1  
1    0    0    0   -1   -1  
2    0    0    0   -1   -1  
3    0    0    0   -1   -1  
4    0    0    0   -1   -1  
non_invasive_df
   Unnamed: 0  CHR    START      END  352_ADH  354_ADH  357_ADH  \
0           1    1   840001  2280001        0       -1        0   
1           2    1  2280001  2520001        0       -1        0   
2           3    1  2520001  2550001        0       -1       -1   
3           4    1  2550001  2700001       -1    

## Step2 is to define a method to find overlapped chromosome regions


In [1]:
def find_overlapping_cnv(invasive_sample, non_invasive_sample):
    # Filter invasive and non-invasive samples based on chromosome changes
    invasive_changes = invasive_sample[(invasive_sample['Value'] != 0)]
    non_invasive_changes = non_invasive_sample[(non_invasive_sample['Value'] != 0)]
    
    print(invasive_changes.head(10))
    print(non_invasive_changes.head(10))

    # Initialize list to store overlapping regions
    overlapping_regions = []
    
    # Loop over invasive changes and check for overlap with non-invasive changes
    for _, inv_row in invasive_changes.iterrows():
        inv_chr, inv_start, inv_end, inv_value = inv_row['CHR'], inv_row['start'], inv_row['end'], inv_row['Value']
        
        # Check if non-invasive data has overlaps with invasive changes on the same chromosome
        overlaps = non_invasive_changes[(non_invasive_changes['CHR'] == inv_chr) & 
                                        (non_invasive_changes['start'] <= inv_end) &
                                        (non_invasive_changes['end'] >= inv_start)]
        
        # Append overlapping regions with their CNV status
        for _, non_inv_row in overlaps.iterrows():
            overlap_start = max(inv_start, non_inv_row['start'])
            overlap_end = min(inv_end, non_inv_row['end'])
            region_data = {
                'CHR': inv_chr,
                'Overlap_Start': overlap_start,
                'Overlap_End': overlap_end,
                'Invasive_Value': inv_value,
                'Non_Invasive_Value': non_inv_row['Value'],
                'Region_Type': 'Gain' if inv_value > 0 else 'Loss' if inv_value < 0 else 'Normal'
            }
            overlapping_regions.append(region_data)
    
    return overlapping_regions

## Step 3 is to define a function compare samples.

In [7]:
# Compare all invasive and corresponding non-invasive samples
def compare_samples(invasive_df, non_invasive_df):
    results = []
    
    for sample in invasive_df.columns[3:]:  # Exclude CHR, start, end columns
        non_inv_sample = f"{sample}_ADH"
        
        if non_inv_sample in non_invasive_df.columns:
            invasive_data = invasive_df[['CHR', 'start', 'end', sample]].rename(columns={sample: 'Value'})
            non_invasive_data = non_invasive_df[['CHR', 'start', 'end', non_inv_sample]].rename(columns={non_inv_sample: 'Value'})
            
            overlapping_regions = find_overlapping_cnv(invasive_data, non_invasive_data)
            results.extend(overlapping_regions)
    
    # Convert results to DataFrame and save
    overlap_df = pd.DataFrame(results)
    overlap_df.to_csv('cnv_overlaps.csv', index=False)
    print("Comparison complete. Results saved to 'cnv_overlaps.csv'.")

# Run comparison

In [8]:
# Run comparison
compare_samples(invasive_df, non_invasive_df)

KeyError: "['start', 'end'] not in index"

In [None]:
# Test find_overlapping function
