In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

repo_dir = '../'

seq_length = 2**19*2
bin_size = 2048

# Scoring dnSVs using SuPreMo-Akita

In [27]:
# Generate input for SuPreMo
dnSVs = pd.read_csv(f'{repo_dir}data/SFARI_SSC_dnSVs.csv').rename(columns = {'#chrom':'chrom'})
dnSVs_for_supremo = dnSVs[[x in ['DEL', 'DUP', 'INV'] for x in dnSVs.svtype]][['chrom', 'pos', 'end', 'svtype', 'svlen']].drop_duplicates().reset_index(drop = True)
dnSVs_for_supremo.insert(2, 'REF', '-')
dnSVs_for_supremo.insert(3, 'ALT', '-')
dnSVs_for_supremo.columns = ['CHROM', 'POS', 'REF', 'ALT', 'END', 'SVTYPE', 'SVLEN']
dnSVs_for_supremo.to_csv(f'{repo_dir}variant_scoring/supremo-akita_input/dnSVs_for_SuPreMo.txt', sep = '\t', index = False)


Run SuPreMo:

python SuPreMo/scripts/SuPreMo.py variant_scoring/supremo-akita_input/dnSVs_for_SuPreMo.txt \
--get_Akita_scores \
--dir variant_scoring/supremo-akita_output \
--file dnSV \
--fa SuPreMo/data/hg38.fa

In [28]:
# Read SuPreMo output and save to file

supremo_scores = pd.concat([dnSVs_for_supremo,
                            pd.read_csv(f'{repo_dir}variant_scoring/supremo-akita_output/dnSV_scores', 
                                        sep = '\t')],
                           axis = 1).drop(['REF', 'ALT', 'var_index'], axis = 1)

supremo_scores.columns = ['chrom', 'pos', 'end', 'svtype', 'svlen', 'MSE', 'correlation']

dnSVs_scored_noCPX = supremo_scores.merge(dnSVs, on = ['chrom', 'pos', 'end', 'svtype', 'svlen'], how = 'left')

dnSVs_scored_noCPX                                      

Unnamed: 0,chrom,pos,end,svtype,svlen,MSE,correlation,project,sample,father,father_age_birth_years,mosaic,mother,mother_age_birth_years,role,sex
0,chr1,6364588,6365496,DEL,908,0.010807,0.975127,SFARI,SSC06730,12707.fa,32.833333,False,12707.mo,31.750000,sib,female
1,chr1,8652435,8658053,DEL,5618,0.025499,0.921123,SFARI,SSC08663,13776.fa,29.000000,False,13776.mo,31.666667,proband,female
2,chr1,10817401,10817860,DEL,459,0.007486,0.988214,SFARI,SSC08564,12834.fa,32.916667,False,12834.mo,29.666667,proband,male
3,chr1,15687875,15696221,DEL,8346,0.012714,0.961685,SFARI,SSC10147,13259.fa,20.666667,False,13259.mo,22.000000,proband,male
4,chr1,22181932,22182865,DEL,933,0.015066,0.976237,SFARI,SSC12167,14545.fa,34.250000,False,14545.mo,30.500000,proband,male
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
676,chrX,122209300,122276000,DEL,66700,0.004793,0.873522,SFARI,SSC12799,14685.fa,30.500000,False,14685.mo,30.083333,proband,male
677,chrX,131051086,131056406,INV,5320,0.000055,0.999905,SFARI,SSC09117,SSC07740,26.083333,False,SSC09122,30.166667,proband,male
678,chrX,154338501,154589000,DUP,250499,0.108324,0.846746,SFARI,SSC00875,11092.fa,29.000000,False,11092.mo,25.083333,proband,male
679,chrY,5605700,6038500,DUP,432800,0.005075,0.615437,SFARI,SSC12095,14541.fa,33.083333,False,14541.mo,28.750000,proband,male


## Score CPX variants

To score CPX variants, the vcf file is need as it contains variant information that is necessary for generating mutated sequences
Since that information requires SFARI permissions, we are not sharing it in this repo

We use the following variables from the INFO column of the vcf file: CHR2, CPX_TYPE, CPX_INTERVALS.

In [29]:
# Get previously calculated scores for CPX variants
dnSVs_scored_CPX = pd.read_csv(f'{repo_dir}data/dnSVs_scored_CPX.tsv', sep = '\t')


# Combine scores for all dnSVs into one df
dnSVs_scored = pd.concat([dnSVs_scored_noCPX, dnSVs_scored_CPX]).reset_index(drop = True)
dnSVs_scored['index'] = dnSVs_scored.index

dnSVs_scored.to_csv(f'{repo_dir}data/dnSVs_scored', sep = '\t', index = False)    

In [None]:
# To score CPX, the following funciton was used to get the REF and ALT allele within SuPreMo

def get_alleles_CPX(CHR, POS, END, CHR2, CPX_TYPE, CPX_INTERVALS):
    
    
    # Get reference and alternate sequence from REF and ALT allele using reference genome 


    # Get reference allele
    REF_var_seq = fasta_open.fetch(CHR, POS - 1, END - 1).upper()


    # Get alternate allele


    if CPX_TYPE in ['delINVdel', 'INVdup', 'delINVdup', 'dupINVdup', 'dupINV', 'dupINVdel', 
                    'delINVdel', 'dupINV', 'dupINVdel', 'dupINVdup', 'INVdup', 'delINVdup']:
        # get sequence for inversion

        # get sequence to invert
        INV_coordinates = CPX_INTERVALS.split('INV_' + CHR2.split('chr')[1] + ':')[1].split(',')[0]
        INV_start = int(INV_coordinates.split('-')[0])
        INV_end = int(INV_coordinates.split('-')[1])

        inv_revcomp = fasta_open.fetch(CHR2, INV_start - 1, INV_end - 1).upper()
        inv_seq = str(Seq(inv_revcomp).reverse_complement())



        # get sequences left and right of inversion


        # Get variant sequence before inversion
        if CPX_TYPE in ['delINVdel', 'INVdup', 'delINVdup']:
            ALT_left = ''

        if CPX_TYPE in ['dupINVdup', 'dupINV', 'dupINVdel']:
            DUP1_coordinates = CPX_INTERVALS.split('DUP_' + CHR2.split('chr')[1] + ':')[1].split(',')[0]
            DUP1_start = int(DUP1_coordinates.split('-')[0])
            DUP1_end = int(DUP1_coordinates.split('-')[1])
            ALT_left = fasta_open.fetch(CHR2, DUP1_start - 1, DUP1_end - 1).upper()


        # Get variant sequence after inversion
        if CPX_TYPE in ['delINVdel', 'dupINV', 'dupINVdel']:
            ALT_right = ''

        if CPX_TYPE in ['dupINVdup', 'INVdup', 'delINVdup']:

            if CPX_TYPE != 'dupINVdup':
                num = 1
            # if there are 2 duplications, get coordinates for the seconds one
            else:
                num = 2

            DUP2_coordinates = CPX_INTERVALS.split('DUP_' + CHR2.split('chr')[1] + ':')[num].split(',')[0]
            DUP2_start = int(DUP2_coordinates.split('-')[0])
            DUP2_end = int(DUP2_coordinates.split('-')[1])

            ALT_right = fasta_open.fetch(CHR2, DUP2_start - 1, DUP2_end - 1).upper()


        # Assemble ALT variant sequence
        ALT_var_seq = ALT_left + inv_seq + ALT_right
        
    else:
        raise ValueError('Currently cannot handle complex variants without inversions')


    return REF_var_seq, ALT_var_seq
    

# Scoring dnSVs near CREints using SuPreMo-Akita with weighted scoring

## Process PLACseq data into CREints for weighting

### Get data

In [7]:
from pybedtools import BedTool

In [8]:
# Get hg38 promoters of protein-coding genes (Kallisto https://pachterlab.github.io/kallisto/)

ensembl = pd.read_csv(f'{repo_dir}data/Homo_sapiens.GRCh38.96.gtf',
                     sep='\t', skiprows=5, header=None, 
                      names=['chrom','source','desc','start','end','score','strand','score2','desc_further'],
                     low_memory=False)
ensembl['chr'] = 'chr' + ensembl['chrom'].astype(str)
ensembl['name'] = ensembl['desc_further'].str.split('gene_name ').str[1].str.split(';').str[0].str.strip('""')
ensembl['biotype'] = ensembl['desc_further'].str.split('biotype ').str[1].str.split(';').str[0].str.strip('""')

# Remove nonsense chromosomes
chromosomes = []
for i in list(range(23))[1:]:
    chrom = 'chr' + str(i)
    chromosomes.append(chrom)
chromosomes.append('chrX')
chromosomes.append('chrY')
ensembl = ensembl[[x in chromosomes for x in ensembl.chr]]

ensembl = ensembl[ensembl.biotype == "protein_coding"]
ensembl_genes = ensembl.query('desc == "gene"').copy()
n_ens_genes = pd.DataFrame(ensembl_genes['name'].value_counts())
n_ens_genes['gene'] = n_ens_genes.index
ensembl_genes_single = n_ens_genes.query('name == 1')['gene'].tolist()
ensembl_genes_single_df = ensembl_genes.query('(name in @ensembl_genes_single)').copy()

# Get ensemble gene ids
ensembl_genes_single_df['gene_id'] = ensembl_genes_single_df['desc_further'].str.split('"').str[1]
ensembl_genes_single_df = ensembl_genes_single_df.rename(columns = {"name":"gene"})
gene_annot = ensembl_genes_single_df[['gene', 'gene_id', 'chr', 'start', 'end', 'strand']]

promoter_annot = gene_annot.copy()

def label_promoter_start(row):
    if row['strand'] == '+' :
        return row['start'] - 1000
    if row['strand'] == '-' :
        return row['end']

promoter_annot['Start'] = promoter_annot.apply(lambda row: label_promoter_start(row), axis=1)
promoter_annot['Start'][promoter_annot['Start'] < 1] = 1
promoter_annot['End'] = promoter_annot['Start'] + 1000

promoter_annot_PC_BED = BedTool.from_dataframe(promoter_annot[['chr', 'Start', 'End', 'gene']])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  promoter_annot['Start'][promoter_annot['Start'] < 1] = 1


In [9]:
# Get excitatory neuron RNAseq data (Song et al 2021)

eN_exp = pd.read_csv(f'{repo_dir}data/eN_avg', sep = '\t').dropna() 

In [10]:
# Get excitatory neuron ATACseq data (Song et al 2021)

ATAC = pd.read_csv(f'{repo_dir}data/eN.ATAC-seq.narrowPeak', sep = '\t', 
                   names = ['chrom', 'start', 'end', 'peak_id', 'score', 'strand', 'signalValue', 'pvalue', 'qvalue', 'peak'])
ATAC_BED = BedTool.from_dataframe(ATAC[['chrom', 'start', 'end']]).sort()

### Get loops and filter into CREints

In [11]:
# Get excitatory neuron H3K4me3 PLACseq loops (Song et al 2021)
loops = pd.read_csv(f'{repo_dir}data/eN.MAPS.peaks.txt', sep = '\t').rename(columns = {'chr1':'chrom'})[['chrom', 'start1', 'end1', 'start2', 'end2']]


In [12]:
# Get loops within predictive window 

dist_cutoff = 900000 #pixel_size*448

center_coord1 = [(x+y)/2 for x,y in zip(loops.start1, loops.end1)]
center_coord2 = [(x+y)/2 for x,y in zip(loops.start2, loops.end2)]

before = len(loops)

distance = [abs(x-y) for x,y in zip(center_coord1, center_coord2)]
loops = loops[[x < dist_cutoff for x in distance]]

print(len(loops)/before*100, '% of loops are within predictive window')


97.77350192413414 % of loops are within predictive window


In [13]:
# Get left and right loops

left_loops = loops[['chrom', 'start1', 'end1']].drop_duplicates().reset_index(drop = True)
left_loops['left_index'] = left_loops.index
loops = loops.merge(left_loops, on = ['chrom', 'start1', 'end1'])

right_loops = loops[['chrom', 'start2', 'end2']].drop_duplicates().reset_index(drop = True)
right_loops['right_index'] = right_loops.index
loops = loops.merge(right_loops, on = ['chrom', 'start2', 'end2'])

In [14]:
# For all loops with the same left anchor, combine right anchors that are within d

d = 10000

for i in left_loops.left_index:
    
    loops_i = loops.query('left_index == @i')[['chrom', 'start2', 'end2', 'right_index']]
    right_loop_BED = BedTool.from_dataframe(loops_i)

    if len(right_loop_BED) > 1:
        right_loop_keep = (right_loop_BED
                           .sort()
                           .merge(d = d, c = 4, o = 'collapse') # collapse indexes of the anchors merged
                           .to_dataframe()
                           .rename(columns = {'name':'right_index'}))
        
        if len(right_loop_keep) < len(loops_i):
            
            right_loop_keep['right_index'] = right_loop_keep['right_index'].str.split(',')
            right_loop_keep = right_loop_keep.explode('right_index')

            for ii in right_loop_keep.right_index.unique():

                loops.loc[(loops.left_index == i) & 
                          (loops.right_index == int(ii)),
                          'start2b'] = int(right_loop_keep.loc[right_loop_keep.right_index == ii,'start'])
                loops.loc[(loops.left_index == i) & 
                          (loops.right_index == int(ii)),
                          'end2b'] = int(right_loop_keep.loc[right_loop_keep.right_index == ii,'end'])

In [15]:
# Repeat for the right anchor

for i in right_loops.right_index:
    
    loops_i = loops.query('right_index == @i')[['chrom', 'start1', 'end1', 'left_index']]
    left_loop_BED = BedTool.from_dataframe(loops_i)

    if len(left_loop_BED) > 1:
        left_loop_keep = (left_loop_BED
                           .sort()
                           .merge(d = 10000, c = 4, o = 'collapse')
                           .to_dataframe()
                           .rename(columns = {'name':'left_index'}))
        
        if len(left_loop_keep) < len(loops_i):
            
            left_loop_keep['left_index'] = left_loop_keep['left_index'].str.split(',')
            left_loop_keep = left_loop_keep.explode('left_index')

            for ii in left_loop_keep.left_index.unique():

                loops.loc[(loops.right_index == i) & 
                          (loops.left_index == int(ii)),
                          'start1b'] = int(left_loop_keep.loc[left_loop_keep.left_index == ii,'start'])
                loops.loc[(loops.right_index == i) & 
                          (loops.left_index == int(ii)),
                          'end1b'] = int(left_loop_keep.loc[left_loop_keep.left_index == ii,'end'])

In [16]:
# Fill in the coordinates of anchors that don't change

loops.loc[np.isnan(loops.start1b), "start1b"] = loops.loc[np.isnan(loops.start1b), "start1"]
loops.loc[np.isnan(loops.end1b), "end1b"] = loops.loc[np.isnan(loops.end1b), "end1"]

loops.loc[np.isnan(loops.start2b), "start2b"] = loops.loc[np.isnan(loops.start2b), "start2"]
loops.loc[np.isnan(loops.end2b), "end2b"] = loops.loc[np.isnan(loops.end2b), "end2"]

In [17]:
# Save the new combined anchors

before = len(loops)
loops = (loops[['chrom', 'start1b', 'end1b', 'start2b', 'end2b']]
         .drop_duplicates()
         .rename(columns = {'start1b':'start1', 'end1b':'end1','start2b':'start2', 'end2b':'end2'})
         .reset_index(drop = True))

loops['Index'] = loops.index

print((100 - len(loops)/before*100), "% of loops were combined")

46.76342423390497 % of loops were combined


In [18]:
# Get window to look for variants so that the E-P loop and variant can fit in the ~1Mb predictive window

dist_cutoff = 900000

center_coord1 = [(x+y)/2 for x,y in zip(loops.start1, loops.end1)]
center_coord2 = [(x+y)/2 for x,y in zip(loops.start2, loops.end2)]

remaining_dist = [dist_cutoff - abs(x-y) for x,y in zip(center_coord1, center_coord2)]

# left and right most positions for variant to be in
loops['left_cutoff'] = [int(x - y) for x,y in zip(center_coord1, remaining_dist)]
loops.loc[loops.left_cutoff < 1,'left_cutoff'] = 1
loops['right_cutoff'] = [int(x + y) for x,y in zip(center_coord2, remaining_dist)]


In [20]:
# Get loops that overlap a promoter
E_or_P = pd.concat(
            [loops[['chrom', 'start1', 'end1', 'Index']].rename(columns = {'start1':'start', 'end1':'end'}),
            loops[['chrom', 'start2', 'end2', 'Index']].rename(columns = {'start2':'start', 'end2':'end'})],
            axis = 0)

E_or_P.start = E_or_P.start.astype('int')
E_or_P.end = E_or_P.end.astype('int')

E_or_P_BED = BedTool.from_dataframe(E_or_P)

promoter_loops = (E_or_P_BED
                  .intersect(promoter_annot_PC_BED, wa = True, wb = True)
                  .to_dataframe()
                  .rename(columns = {'name':'loop_index', 'thickEnd':'gene'})
                  [['loop_index', 'gene']])

before = len(loops)

# Only keep promoter loops
loops = loops[[x in promoter_loops.loop_index for x in loops.Index]]

print(len(loops)/before*100, '% of loops have an anchor at a promoter')

100.0 % of loops have an anchor at a promoter


In [21]:
# Get expression data

tpm_cutoff = 0.5

before = len(eN_exp)

# Get only expressed genes
eN_exp = eN_exp[eN_exp.TPM >= tpm_cutoff]

print((before - len(eN_exp))/before*100, '% of genes are not expressed')

69.78465563837295 % of genes are not expressed


In [22]:
#Filter only loops that overlap promoters of expressed genes
before = len(loops)

loops_to_keep = promoter_loops.loop_index[[x in eN_exp.gene.values for x in promoter_loops.gene]].unique()

loops = loops[[x in loops_to_keep for x in loops.Index]]

print((before - len(loops))/before*100, '% of loops are on promoters that are not expressed')

33.031883292626574 % of loops are on promoters that are not expressed


In [30]:
# find loops near variants
loops_windows_BED = BedTool.from_dataframe(loops[['chrom', 'left_cutoff', 'right_cutoff', 'Index']])

dnSVs_BED = BedTool.from_dataframe(dnSVs_scored[['chrom', 'pos', 'end', 'index']])

loop_var_pairs = (loops_windows_BED
                  .intersect(dnSVs_BED, wa = True, wb = True)
                  .to_dataframe()
                  .rename(columns = {'name':'loop_index', 'thickEnd':'var_index'})
                  [['loop_index', 'var_index']])

loop_var_pairs = loop_var_pairs.drop_duplicates() # If there are duplicates, they would be removed in the next step

In [31]:
# Get E/Ps that each variant overlaps
E_or_P = pd.concat(
            [loops[['chrom', 'start1', 'end1', 'Index']].rename(columns = {'start1':'start', 'end1':'end'}),
            loops[['chrom', 'start2', 'end2', 'Index']].rename(columns = {'start2':'start', 'end2':'end'})],
            axis = 0)
E_or_P.start = E_or_P.start.astype('int')
E_or_P.end = E_or_P.end.astype('int')

E_or_P_BED = BedTool.from_dataframe(E_or_P)

var_on_E_or_P = (E_or_P_BED
                  .intersect(dnSVs_BED, wa = True, wb = True)
                  .to_dataframe()
                  .rename(columns = {'name':'loop_index', 'thickEnd':'var_index'})
                  [['loop_index', 'var_index']])

before = len(loop_var_pairs)

# Filter out variants that overlap with E or P of loop they're in a trio with
loop_var_pairs = pd.concat([loop_var_pairs, var_on_E_or_P, var_on_E_or_P]).drop_duplicates(keep=False)

print(len(loop_var_pairs)/before*100, '% of E-P-variant trios: variant doesnt overlap E/P')

75.80801551389786 % of E-P-variant trios: variant doesnt overlap E/P


In [32]:
# Get all loop and var info in one
loop_var_coord = loop_var_pairs.copy()

# Add info on loops to loop-variant pairs
loop_var_coord = (loops
                  .rename(columns = {'Index':'loop_index'})
                  .drop(['left_cutoff', 'right_cutoff'], axis = 1)
                  .merge(loop_var_coord, how = 'right')
                 )

# Add info on variants to loop-variant pairs
loop_var_coord = (dnSVs_scored
                  .rename(columns = {'index':'var_index'})
                  [['chrom', 'pos', 'end', 'svlen', 'svtype', 'role', 'var_index', 'MSE', 'correlation']]
                  .merge(loop_var_coord, how = 'right')
                 )

loop_var_coord.insert(6, 'center_coord_var', [round((x+y)/2) for x,y in zip(loop_var_coord.end, loop_var_coord.pos)])

# Get leftmost and rightmost coordinates of E, P and variant
loop_var_coord['start'] = [min(x,y) for x,y in zip(loop_var_coord.start1, 
                                                    loop_var_coord.pos)]

loop_var_coord['end'] = [max(x,y) for x,y in zip(loop_var_coord.end2, 
                                                    loop_var_coord.end)]


In [33]:
# For duplications, center the duplicated sequence not the reference sequence
# Extend the variant coordinates on the opposite side of the loop

# Annotate variants based on whether they are on the left or right of the loop
loop_var_coord.loc[loop_var_coord.pos < loop_var_coord.start1,'var_position'] = 'left'
loop_var_coord.loc[loop_var_coord.pos > loop_var_coord.end2,'var_position'] = 'right'

# Var to the left of the loop
loop_var_coord.loc[(loop_var_coord.svtype == 'DUP') & 
                   (loop_var_coord.var_position == 'left'),
                   'start'] -= loop_var_coord.loc[(loop_var_coord.svtype == 'DUP') & 
                                                  (loop_var_coord.var_position == 'left'),
                                                   'svlen']

# Var to the right of the loop
loop_var_coord.loc[(loop_var_coord.svtype == 'DUP') & 
                   (loop_var_coord.var_position == 'right'),
                   'end'] += loop_var_coord.loc[(loop_var_coord.svtype == 'DUP') & 
                                                (loop_var_coord.var_position == 'right'),
                                                 'svlen']

# Remove trios that exceed prediction window
loop_var_coord = loop_var_coord[loop_var_coord.end - loop_var_coord.start < seq_length - (bin_size*64)]


In [34]:
# For duplications, the alternate allele should be able to fit with the loop in the predictive window
loop_var_coord.loc[(loop_var_coord.svtype == 'DUP') & 
                   (loop_var_coord.end - loop_var_coord.start + loop_var_coord.svlen > seq_length-(bin_size*64)),
                   'remove'] = 'yes'
loop_var_coord = loop_var_coord[loop_var_coord.remove != 'yes'].drop('remove', axis = 1)


In [35]:
# % of variants in predictive window with loop
len(loop_var_coord.var_index.unique())/len(dnSVs_scored)*100

50.072150072150066

### Get variant-specific shift values

In [36]:

# Add shift value

# when variant is left of the loop
loop_var_coord.loc[(loop_var_coord.var_position == 'left'),
                   'shift_by'] = [round((loop_end-var_end)/2) for loop_end,var_end in 
                               zip(loop_var_coord[(loop_var_coord.var_position == 'left')].end2, 
                                   loop_var_coord[(loop_var_coord.var_position == 'left')].end)]

# For duplications, consider the variant as the duplicated sequence so it's not out of the window after shifting
loop_var_coord.loc[(loop_var_coord.var_position == 'left') & (loop_var_coord.svtype == 'DUP'),
                   'shift_by'] = [round((loop_end-var_end)/2+svlen/2) for loop_end,var_end,svlen in 
                               zip(loop_var_coord[(loop_var_coord.var_position == 'left') & (loop_var_coord.svtype == 'DUP')].end2, 
                                   loop_var_coord[(loop_var_coord.var_position == 'left') & (loop_var_coord.svtype == 'DUP')].end,
                                   loop_var_coord[(loop_var_coord.var_position == 'left') & (loop_var_coord.svtype == 'DUP')].svlen)]


# when variant is right of the loop
loop_var_coord.loc[(loop_var_coord.var_position == 'right'),
                   'shift_by'] = [-round((var_start-loop_start)/2) for var_start,loop_start in 
                               zip(loop_var_coord[(loop_var_coord.var_position == 'right')].pos, 
                                   loop_var_coord[(loop_var_coord.var_position == 'right')].start1)]

# For duplications, consider the variant as the duplicated sequence so it's not out of the window after shifting
loop_var_coord.loc[(loop_var_coord.var_position == 'right') & (loop_var_coord.svtype == 'DUP'),
                   'shift_by'] = [-round((var_start-loop_start)/2+svlen/2) for var_start,loop_start,svlen in 
                               zip(loop_var_coord[(loop_var_coord.var_position == 'right') & (loop_var_coord.svtype == 'DUP')].pos, 
                                   loop_var_coord[(loop_var_coord.var_position == 'right') & (loop_var_coord.svtype == 'DUP')].start1,
                                   loop_var_coord[(loop_var_coord.var_position == 'right') & (loop_var_coord.svtype == 'DUP')].svlen)]




### Get input and read output

In [58]:
# Get input files for SuPreMo

dnSVs_for_supremo_weighted = loop_var_coord[~np.isnan(loop_var_coord.shift_by)][['chrom', 'pos', 'end', 'svtype', 'svlen', 'shift_by', 'var_index']]
dnSVs_for_supremo_weighted.insert(2, 'REF', '-')
dnSVs_for_supremo_weighted.insert(3, 'ALT', '-')
dnSVs_for_supremo_weighted.columns = ['CHROM', 'POS', 'REF', 'ALT', 'END', 'SVTYPE', 'SVLEN', 'shift_by', 'var_index']

# input file
dnSVs_for_supremo_weighted.drop(['shift_by', 'var_index'], axis = 1).to_csv(f'{repo_dir}variant_scoring/supremo-akita_input_weighted/EP_for_SuPreMo.txt', 
                                           sep = '\t', index = False)

# shift file
dnSVs_for_supremo_weighted[['shift_by']].to_csv(f'{repo_dir}variant_scoring/supremo-akita_input_weighted/EP_for_SuPreMo_shifts.txt', 
                                                sep = '\t', index = False)

# Weights file
weights_file = pd.concat([loop_var_coord[['chrom', 'start1', 'end1']]
                          .rename(columns = {'chrom':'chr', 'start1':'start', 'end1':'end'}),
                          loop_var_coord[['chrom', 'start2', 'end2']]
                          .rename(columns = {'chrom':'chr', 'start2':'start', 'end2':'end'})],
                         axis = 0).reset_index(drop = True)
weights_file.to_csv(f'{repo_dir}variant_scoring/supremo-akita_input_weighted/EP_for_SuPreMo_weights.txt', 
                    sep = '\t', index = False)


Run SuPreMo:

python SuPreMo/scripts/SuPreMo.py variant_scoring/supremo-akita_input_weighted/EP_for_SuPreMo.txt \
--get_Akita_scores \
--shifts_file variant_scoring/supremo-akita_input_weighted/EP_for_SuPreMo_shifts.txt \
--roi variant_scoring/supremo-akita_input_weighted/EP_for_SuPreMo_weights.txt \
--roi_scales 10 1000000 \
--dir variant_scoring/supremo-akita_output_weighted \
--file EP \
--fa SuPreMo/data/hg38.fa

In [48]:
loop_var_coord[~np.isnan(loop_var_coord.shift_by)].drop(['center_coord_var', 'start1', 'end1', 'start2', 'end2', 'start', ], axis = 1)

Unnamed: 0,chrom,pos,end,svlen,svtype,role,var_index,MSE,correlation,start1,end1,start2,end2,loop_index,start,var_position,shift_by
0,chr1,212552883,212955000.0,80,DUP,proband,35,0.002183,0.992207,212855000.0,212860000.0,212950000.0,212955000.0,11,212552803.0,left,40.0
2,chr1,27972501,28114499.0,70999,DUP,proband,5,0.024572,0.939746,27285000.0,27455000.0,27600000.0,27610000.0,13,27285000.0,right,-379250.0
4,chr1,27972501,28114499.0,70999,DUP,proband,5,0.024572,0.939746,27285000.0,27455000.0,27600000.0,27630000.0,15,27285000.0,right,-379250.0
5,chr1,27972501,28114499.0,70999,DUP,proband,5,0.024572,0.939746,27285000.0,27455000.0,27565000.0,27615000.0,16,27285000.0,right,-379250.0
6,chr1,27972501,28114499.0,70999,DUP,proband,5,0.024572,0.939746,27285000.0,27455000.0,27600000.0,27615000.0,17,27285000.0,right,-379250.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4686,chr22,28295850,29190000.0,217774,DEL,sib,644,0.065464,0.858959,29070000.0,29075000.0,29185000.0,29190000.0,15142,28295850.0,left,0.0
4687,chr22,28196671,28720000.0,17812,DEL,proband,643,0.014642,0.975664,28680000.0,28685000.0,28700000.0,28720000.0,15144,28196671.0,left,0.0
4688,chr22,29206135,29234175.0,14020,DUP,sib,645,0.013520,0.965884,28680000.0,28685000.0,28700000.0,28720000.0,15144,28680000.0,right,-270078.0
4689,chr22,28295850,28720000.0,217774,DEL,sib,644,0.065464,0.858959,28680000.0,28685000.0,28700000.0,28720000.0,15144,28295850.0,left,0.0


In [44]:
pd.read_csv(f'{repo_dir}variant_scoring/supremo-akita_output_weighted/EP_scores', 
                                        sep = '\t')

Unnamed: 0,var_index,mse_HFF_shifted,mse_10-weighted_HFF_shifted,mse_1000000-weighted_HFF_shifted,corr_HFF_shifted,corr_unweighted_HFF_shifted,corr_10-weighted_HFF_shifted,corr_1000000-weighted_HFF_shifted
0,0,0.004695,0.004988,0.006108,0.991122,0.011486,0.012042,0.014172
1,1,0.015719,0.018119,0.018889,0.958439,0.045866,0.037107,0.034296
2,2,0.015719,0.018119,0.018889,0.958439,0.045866,0.037107,0.034296
3,3,0.015719,0.018119,0.018889,0.958439,0.045866,0.037107,0.034296
4,4,0.015719,0.018119,0.018889,0.958439,0.045866,0.037107,0.034296
...,...,...,...,...,...,...,...,...
3181,3181,0.016165,0.015632,0.015117,0.956797,0.047436,0.048968,0.050456
3182,3182,0.017810,0.017139,0.015773,0.943461,0.087236,0.071013,0.037968
3183,3183,0.019672,0.019084,0.018457,0.971667,0.038643,0.037469,0.036219
3184,3184,0.031670,0.026324,0.017858,0.936336,0.070892,0.058551,0.039010


In [45]:
pd.concat([loop_var_coord[~np.isnan(loop_var_coord.shift_by)],
                                    pd.read_csv(f'{repo_dir}variant_scoring/supremo-akita_output_weighted/EP_scores', 
                                                sep = '\t')],
                                   axis = 1).columns

Index(['chrom', 'pos', 'end', 'svlen', 'svtype', 'role', 'center_coord_var',
       'var_index', 'MSE', 'correlation', 'start1', 'end1', 'start2', 'end2',
       'loop_index', 'start', 'var_position', 'shift_by', 'var_index',
       'mse_HFF_shifted', 'mse_10-weighted_HFF_shifted',
       'mse_1000000-weighted_HFF_shifted', 'corr_HFF_shifted',
       'corr_unweighted_HFF_shifted', 'corr_10-weighted_HFF_shifted',
       'corr_1000000-weighted_HFF_shifted'],
      dtype='object')

In [61]:
dnSVs_for_supremo_weighted

Unnamed: 0,CHROM,POS,REF,ALT,END,SVTYPE,SVLEN,shift_by,var_index
0,chr1,212552883,-,-,212955000.0,DUP,80,40.0,35
2,chr1,27972501,-,-,28114499.0,DUP,70999,-379250.0,5
4,chr1,27972501,-,-,28114499.0,DUP,70999,-379250.0,5
5,chr1,27972501,-,-,28114499.0,DUP,70999,-379250.0,5
6,chr1,27972501,-,-,28114499.0,DUP,70999,-379250.0,5
...,...,...,...,...,...,...,...,...,...
4686,chr22,28295850,-,-,29190000.0,DEL,217774,0.0,644
4687,chr22,28196671,-,-,28720000.0,DEL,17812,0.0,643
4688,chr22,29206135,-,-,29234175.0,DUP,14020,-270078.0,645
4689,chr22,28295850,-,-,28720000.0,DEL,217774,0.0,644


In [63]:
# Read SuPreMo output and save to file

dnSVs_scored_weighted = pd.concat([dnSVs_for_supremo_weighted.reset_index(drop = True),
                                   pd.read_csv(f'{repo_dir}variant_scoring/supremo-akita_output_weighted/EP_scores', 
                                               sep = '\t').drop('var_index', axis = 1)], 
                                  axis = 1).drop(['REF', 'ALT'], axis = 1)

dnSVs_scored_weighted.columns = ['chrom', 'pos', 'end', 'svtype', 'svlen', 'shift_by', 'var_index',
                                 'MSE', 'MSE_weighted', 'MSE_CREint',
                                 'correlation', 'correlation_unweighted',
                                 'correlation_weighted', 'correlation_CREint']

dnSVs_scored_weighted.to_csv(f'{repo_dir}data/dnSVs_scored_weighted', sep = '\t', index = False)