## Comparing Structural Variant Calling between HiC-breakfinder and HiSV


### Process:
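1. Combine the HiSV intra- and inter-chromosomal SV calls.
2. Merge them with the HiC-breakfinder calls.
3. Subset the HiC-breakfinder calls by resolution (intended: 10 kb; the code below drops only the `100kb` calls).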

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np


In [None]:
# Column names assigned to the input tables when they are read.
hisv_inter_header = [
    "chrom1",
    "chrom1_start",
    "chrom1_end",
    "chrom2",
    "chrom2_start",
    "chrom2_end",
]
# The intra table has the same columns minus "chrom2" (both breakpoints share
# "chrom1").
hisv_intra_header = [name for name in hisv_inter_header if name != "chrom2"]
breakfinder_header = [
    "call_odds",
    "chrom1",
    "chrom1_start",
    "chrom1_end",
    "chrom1_strand",
    "chrom2",
    "chrom2_start",
    "chrom2_end",
    "chrom2_strand",
    "resolution",
]

In [None]:
# Read the data files. They are tab-separated and read with header=None, so the
# column names are supplied from the header lists.
breakfinder_raw_data = pd.read_csv("/Users/jkirkland/2023_chavez_rotation/data/RCMB56/HiC-breakfinder/RCMB56.breaks.txt",
                                  sep='\t', header=None, names=breakfinder_header)
breakfinder_raw_data['source'] = "breakfinder"
hisv_intra_raw = pd.read_csv("/Users/jkirkland/2023_chavez_rotation/data/RCMB56/HiSV/HiSV_intra_SV_result.txt",
                             sep='\t', header=None, names=hisv_intra_header)
hisv_inter_raw = pd.read_csv("/Users/jkirkland/2023_chavez_rotation/data/RCMB56/HiSV/HiSV_inter_SV_result.txt",
                             sep='\t', header=None, names=hisv_inter_header)

# format hisv data and merge
# The intra table has no chrom2 column; both breakpoints are on chrom1.
hisv_intra_raw['chrom2'] = hisv_intra_raw['chrom1']
# ignore_index avoids duplicate row labels from stacking two separately read tables.
hisv_merged = pd.concat([hisv_inter_raw, hisv_intra_raw], ignore_index=True)
# The HiSV tables carry no resolution column; every HiSV call is tagged "50kb".
# NOTE(review): confirm this matches the resolution HiSV was actually run at.
hisv_merged['resolution'] = "50kb"
hisv_merged['source'] = "hisv"

# merge breakfinder and hisv data
sv_calls_merged = pd.concat([breakfinder_raw_data, hisv_merged], ignore_index=True)
# Keep only the columns both callers provide.
sv_calls_merged = sv_calls_merged.drop(columns=['call_odds', 'chrom1_strand', 'chrom2_strand'])
# Drop the 100kb calls, then renumber the rows. drop=True keeps the old row
# labels from being added as a spurious "index" column, and resetting after
# the filter leaves a gap-free 0..n-1 index.
sv_calls_merged = sv_calls_merged[sv_calls_merged['resolution'] != '100kb'].reset_index(drop=True)


In [None]:
# Split the merged calls on the value of the chrom1 column.
ch1_sv_calls = sv_calls_merged.loc[sv_calls_merged['chrom1'].eq("chr1")]
sv_calls_no_chr7 = sv_calls_merged.loc[sv_calls_merged['chrom1'].ne('chr7')]
sv_calls_chr7 = sv_calls_merged.loc[sv_calls_merged['chrom1'].eq('chr7')]

# Count the calls per (chrom1, chrom2, source) combination; the tally is
# stored in a column named "count".
calls_per_pair = sv_calls_no_chr7.groupby(['chrom1', 'chrom2', 'source']).size()
grouped_sv_calls = calls_per_pair.reset_index(name='count')

chr7_calls_per_pair = sv_calls_chr7.groupby(['chrom1', 'chrom2', 'source']).size()
grouped_chr_7 = chr7_calls_per_pair.reset_index(name='count')



In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Grouped bar chart of call counts per chromosome pair, one bar per source.
# grouped_sv_calls holds one row per (chrom1, chrom2, source) with its count.
# Pivot to one row per chromosome pair and one column per source; a pair a
# source never called gets 0.
pivot_df = grouped_sv_calls.pivot(index=['chrom1', 'chrom2'], columns='source', values='count').fillna(0)

# Order the chromosomes
chrom_order = [f"chr{i}" for i in range(1, 23)] + ['chrX', 'chrY']
pivot_df = pivot_df.reset_index()
# Names outside the canonical list are appended after it; pd.Categorical
# would otherwise turn them into NaN.
extra_chroms = sorted((set(pivot_df['chrom1']) | set(pivot_df['chrom2'])) - set(chrom_order), key=str)
chrom_categories = chrom_order + extra_chroms
# Both columns are made categorical so chrom2 sorts chr2 before chr10 as well.
for chrom_col in ('chrom1', 'chrom2'):
    pivot_df[chrom_col] = pd.Categorical(pivot_df[chrom_col], categories=chrom_categories, ordered=True)
pivot_df = pivot_df.sort_values(by=['chrom1', 'chrom2'])
pivot_df.set_index(['chrom1', 'chrom2'], inplace=True)

# Plotting
fig, ax = plt.subplots(figsize=(20,7))

bar_width = 0.35
positions = np.arange(len(pivot_df))
spacing = 0.05
# Distance between the centres of neighbouring bars within one group.
bar_step = bar_width + spacing

# Create bars for each source and include total count in the legend label
for i, source in enumerate(pivot_df.columns):
    total = pivot_df[source].sum()
    label_with_total = f"{source} (Total: {int(total)})"
    bar_positions = positions + i * bar_step
    bars = ax.bar(bar_positions, pivot_df[source], bar_width, label=label_with_total)

    # Place counts above bars but hide 0 values
    for bar in bars:
        height = bar.get_height()
        if height != 0:
            ax.annotate(f'{int(height)}',
                        xy=(bar.get_x() + bar.get_width() / 2, height),
                        xytext=(0, 3),
                        textcoords="offset points",
                        ha='center', va='bottom')

# Adjust the x-ticks
# ax.bar centres each bar on its x position, so the middle of a group lies
# halfway between the first and last bar centres.
ax.set_xticks(positions + (len(pivot_df.columns) - 1) * bar_step / 2)
ax.set_xticklabels(pivot_df.index, rotation=45)
ax.yaxis.set_major_locator(plt.MaxNLocator(integer=True))

plt.ylabel('Count')
# NOTE(review): grouped_sv_calls excludes calls with chrom1 == chr7, which the
# title does not mention.
plt.title('Comparison of Calls Between Sources')
plt.legend()
plt.tight_layout()

plt.show()


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Grouped bar chart of call counts per chromosome pair, one bar per source.
# grouped_chr_7 holds one row per (chrom1, chrom2, source) with its count.
# Pivot to one row per chromosome pair and one column per source; a pair a
# source never called gets 0.
pivot_df = grouped_chr_7.pivot(index=['chrom1', 'chrom2'], columns='source', values='count').fillna(0)

# Order the chromosomes
chrom_order = [f"chr{i}" for i in range(1, 23)] + ['chrX', 'chrY']
pivot_df = pivot_df.reset_index()
# Names outside the canonical list are appended after it; pd.Categorical
# would otherwise turn them into NaN.
extra_chroms = sorted((set(pivot_df['chrom1']) | set(pivot_df['chrom2'])) - set(chrom_order), key=str)
chrom_categories = chrom_order + extra_chroms
# Both columns are made categorical so chrom2 sorts chr2 before chr10 as well.
for chrom_col in ('chrom1', 'chrom2'):
    pivot_df[chrom_col] = pd.Categorical(pivot_df[chrom_col], categories=chrom_categories, ordered=True)
pivot_df = pivot_df.sort_values(by=['chrom1', 'chrom2'])
pivot_df.set_index(['chrom1', 'chrom2'], inplace=True)

# Plotting
fig, ax = plt.subplots(figsize=(20,7))

bar_width = 0.35
positions = np.arange(len(pivot_df))
spacing = 0.05
# Distance between the centres of neighbouring bars within one group.
bar_step = bar_width + spacing

# Create bars for each source and include total count in the legend label
for i, source in enumerate(pivot_df.columns):
    total = pivot_df[source].sum()
    label_with_total = f"{source} (Total: {int(total)})"
    bar_positions = positions + i * bar_step
    bars = ax.bar(bar_positions, pivot_df[source], bar_width, label=label_with_total)

    # Place counts above bars but hide 0 values
    for bar in bars:
        height = bar.get_height()
        if height != 0:
            ax.annotate(f'{int(height)}',
                        xy=(bar.get_x() + bar.get_width() / 2, height),
                        xytext=(0, 3),
                        textcoords="offset points",
                        ha='center', va='bottom')

# Adjust the x-ticks
# ax.bar centres each bar on its x position, so the middle of a group lies
# halfway between the first and last bar centres.
ax.set_xticks(positions + (len(pivot_df.columns) - 1) * bar_step / 2)
ax.set_xticklabels(pivot_df.index, rotation=45)
ax.yaxis.set_major_locator(plt.MaxNLocator(integer=True))

plt.ylabel('Count')
# NOTE(review): grouped_chr_7 holds only calls with chrom1 == chr7, which the
# title does not mention.
plt.title('Comparison of Calls Between Sources')
plt.legend()
plt.tight_layout()

plt.show()
