In [2]:
import pandas as pd
import numpy as np
from pybedtools import BedTool

repo_dir = '../'

# Get data

In [3]:
# Get variants

# all variants
SSC_dnSV_all = pd.read_csv(f'{repo_dir}data/SFARI_SSC_dnSVs.csv').rename(columns = {'#chrom':'chrom'})
SSC_dnSV_all['index_all'] = SSC_dnSV_all.index

# scored variants
SSC_dnSV_scored = pd.read_csv(f'{repo_dir}data/dnSVs_scored', sep = '\t', low_memory=False)

indexes = SSC_dnSV_all.merge(SSC_dnSV_scored, how = 'left')[['index', 'index_all']]

# Add both indexes to both dfs
SSC_dnSV_all = SSC_dnSV_all.merge(indexes, how = 'left')
SSC_dnSV_scored = SSC_dnSV_scored.merge(indexes, how = 'left')

# Get BED files
SSC_dnSV_all_BED = BedTool.from_dataframe(SSC_dnSV_all[['chrom', 'pos', 'end', 'index', 'index_all']])
SSC_dnSV_scored_BED = BedTool.from_dataframe(SSC_dnSV_scored[['chrom', 'pos', 'end', 'index', 'index_all']])

In [44]:
# Variant scores
scores = SSC_dnSV_scored[~np.isnan(SSC_dnSV_scored.correlation)]
weighted_scores = pd.read_csv(f'{repo_dir}data/dnSVs_scored_weighted', sep = '\t')

In [4]:
# Get window around variants
dist = 500000

SV_window = SSC_dnSV_scored[['chrom', 'pos', 'end', 'index']]
SV_window.pos = SV_window.pos - dist
SV_window.loc[SV_window.pos < 0, "pos"] = 0
SV_window.end = SV_window.end + dist
SV_window_BED = BedTool.from_dataframe(SV_window)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  SV_window.pos = SV_window.pos - dist
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  SV_window.end = SV_window.end + dist


In [6]:
# Get exon annotations
gencode_annotations = pd.read_csv(f'{repo_dir}data/gencode.v46.annotation.gtf.gz', 
                                  sep='\t', skiprows=5, header=None, 
                                  names=['chrom','source','desc','start','end','score','strand','score2','desc_further'],
                                  low_memory=False)
gencode_annotations['gene'] = gencode_annotations['desc_further'].str.split('gene_name ').str[1].str.split(';').str[0].str.strip('""')
gencode_annotations['annotation'] = gencode_annotations.gene + '_' + gencode_annotations.desc

gene_exons = gencode_annotations[['exon' in x for x in gencode_annotations.annotation]]

gene_exons_BED = BedTool.from_dataframe(gene_exons[['chrom', 'start', 'end', 'gene']])


# Get variants that fall on exons
vars_on_exons = (SSC_dnSV_all_BED
                 .intersect(gene_exons_BED, wa = True, wb = True)
                 .to_dataframe()
                .rename(columns = {'name':'index', 'score':'index_all', 'itemRgb':'Gene'})
                [['index', 'index_all', 'Gene']]
                .drop_duplicates())

In [11]:
# Get ASD genes (SFARI, Satterstrom, and An)
ASD_genes = pd.read_csv(f'{repo_dir}data/ASD_genes', sep = '\t').Gene.values

# All gene coordinates
gene_coords = pd.read_csv('/pollard/data/projects/kgjoni/genome/hg38/gene_annot_PC')[['chr', 'start', 'end', 'gene']]


In [12]:
ASD_dir = '/pollard/data/projects/kgjoni/Akita/ASD_project/'

# RNAseq
eN_exp = pd.read_csv(f'{ASD_dir}Song2020/shen_bdbag_2019/data/embargo/shen/transcriptome/processed/counts/eN_avg', sep = '\t').dropna() 


# Excitatory neuron regulatory regions

# Poised promoters
H3K27me3 = pd.read_csv(f'{ASD_dir}GSE177154_ENCFF154YOB_pseudoreplicated_peaks_GRCh38.bed', sep = '\t',
                       names = ['chrom', 'start', 'end', 'peak_id', 'score', 'strand', 'signalValue', 'pvalue', 'qvalue', 'peak'])
# Poised enhancers
H3K4me1 = pd.read_csv(f'{ASD_dir}ENCFF904VQK.bed', sep = '\t',
                       names = ['chrom', 'start', 'end', 'peak_id', 'score', 'strand', 'signalValue', 'pvalue', 'qvalue', 'peak'])
# Active enhancers
H3K27Ac = pd.read_csv(f'{ASD_dir}ENCFF857GOV.bed', sep = '\t',
                       names = ['chrom', 'start', 'end', 'peak_id', 'score', 'strand', 'signalValue', 'pvalue', 'qvalue', 'peak'])

# All regulatory_marks
eN_reg_marks = pd.concat([H3K27me3[['chrom', 'start', 'end']],
                          H3K4me1[['chrom', 'start', 'end']],
                          H3K27Ac[['chrom', 'start', 'end']]], axis = 0)

eN_reg_marks_BED = BedTool.from_dataframe(eN_reg_marks).sort().merge()

# CTCF
CTCF = pd.read_csv(f'{ASD_dir}ENCFF579WDY.bed', sep = '\t',
                       names = ['chrom', 'start', 'end', 'peak_id', 'score', 'strand', 'signalValue', 'pvalue', 'qvalue', 'peak'])
CTCF_BED = BedTool.from_dataframe(CTCF[['chrom', 'start', 'end']]).sort()


# Get variants that pass each criteria

We used the following criteria for prioritizing candidate dnSVs.

Required:

1. Not on ASD gene: Variant does not overlap ASD gene exon.
2. No causal dnSV: Proband does not have dnSV overlapping ASD gene exon.
3. Good prediction: Predicted maps for the reference genome around dnSV are similar to NPC experimental HiC maps. This includes dnSVs for which the mean squared error (MSE) between the Akita predicted and the experimental contact maps is less than the 85th percentile of MSE across all scored variants.
4. No sibling dnSV: There are no similar variants in siblings. This includes proband dnSVs that don’t overlap by more than half of the variant region with more than half of any sibling dnSV region.
5. Disrupts CREint: Variant results in changed contact at CREints. This includes dnSVs with weighted disruption scores at CREints (Fig. 2Biv) larger than the 65th percentile across all standard scores (Fig. 1B).
6. Near ASD gene: Variant is within 500 kb of an ASD gene.
7. Disruptive: Variant is disruptive to 3D genome folding of surrounding regions. This includes dnSVs with disruption scores above the 65th percentile across all scores.

Optional:

8. Not on expressed gene: Variant does not overlap expressed gene (TPM > 0.5) exon.
9. Not on ExN RE: Variant does not overlap ExN regulatory elements, namely active enhancers (H3K27Ac), poised enhancers (H3K4me1), and poised promoters (H3K27me3) as defined by their respective ChIP-Seq peaks. 
10. Change on CREint: Variant disruption focused on CREints, meaning weighted score (Fig. 2Biii) is higher than unweighted score (Fig. 2Bii).
11. On CTCF: Variant overlaps less than half of any ExN CTCF ChIP-Seq peak.
12. Deletion: Variant is a deletion. These are the most straightforward to edit in cells.

In [27]:
variant_criteria = SSC_dnSV_scored[['index']]
var_list = {}
criteria = ['disruptive', 'not_ASD_gene', 'no_causal_var', 'good_pred', 'no_sib_var', 
            'disrupt_CREint', 'not_expr_gene', 'not_ExN_RE', 'near_ASD_gene', 'change_on_CREint', 'on_CTCF', 'is_deletion']

In [28]:
# Criteria 1: Variant is disruptive to 3D genome folding of surrounding regions

# Get percentile
# from scipy import stats
# stats.percentileofscore(1 - SSC_results.correlation, 
#                         min(1 - SSC_results[[x in selected_vars for x in SSC_results['index']]].correlation), 
#                         kind='rank')

# Get SVs with disruption scores in the top half
disruption_cutoff = np.nanquantile(1 - scores.correlation, 0.65)
var_list['disruptive'] = scores[1 - scores.correlation > disruption_cutoff]['index'].values

len(var_list['disruptive']), len(scores), len(var_list['disruptive'])/len(scores)*100

(209, 598, 34.94983277591973)

In [29]:
# Criteria 2: Variant is near known ASD genes

ASD_gene_coords_BED = BedTool.from_dataframe(gene_coords[[x in ASD_genes for x in gene_coords.gene]])

var_list['near_ASD_gene'] = np.unique(SV_window_BED
                               .intersect(ASD_gene_coords_BED, wa = True, wb = True)
                               .to_dataframe()
                               .name
                               .values)

len(var_list['near_ASD_gene']), len(scores), len(var_list['near_ASD_gene'])/len(scores)*100

(320, 598, 53.51170568561873)

In [30]:
# Criteria 3: Variant does not overlap exon of ASD-associated gene

# Get variants that don't overlap exons of ASD genes
vars_on_exons_ = vars_on_exons[vars_on_exons['index'] != '.']
vars_on_exons_['index'] = vars_on_exons_['index'].astype('float').astype('int')
exclude = np.unique(vars_on_exons_[[x in ASD_genes for x in vars_on_exons_.Gene]]['index'].values)
var_list['not_ASD_gene'] = variant_criteria[[x not in exclude for x in variant_criteria['index']]]['index'].values

len(var_list['not_ASD_gene']), len(scores), len(var_list['not_ASD_gene'])/len(scores)*100

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vars_on_exons_['index'] = vars_on_exons_['index'].astype('float').astype('int')


(563, 598, 94.14715719063545)

In [31]:
# Criteria 4: Individual does not have exonic de novo variant on ASD gene

# Get individuals with vars on ASD exons
individuals_to_exclude = (vars_on_exons
                          [[x in ASD_genes for x in vars_on_exons.Gene]]
                          [['index_all', 'Gene']]
                          .merge(SSC_dnSV_all[['index_all', 'index', 'sample']], 
                                 how = 'left')
                          ['sample']
                          .values)

# Get variants from individuals without ASD exonic dn vars
var_list['no_causal_var'] = SSC_dnSV_scored[[x not in individuals_to_exclude for x in SSC_dnSV_scored['sample']]]['index'].values

len(var_list['no_causal_var']), len(variant_criteria), len(var_list['no_causal_var'])/len(variant_criteria)*100

(541, 693, 78.06637806637806)

In [36]:
# Criteria 5: Predicted maps for the reference look similar to NPC experimental HiC maps

predicted_vs_experimental_scores = pd.read_table(f'{repo_dir}data/predicted_vs_experimental_scores', sep = '\t')

pred_cutoff = np.nanquantile(predicted_vs_experimental_scores.MSE_HFF_MicroC, 0.85)

var_list['good_pred'] = predicted_vs_experimental_scores[predicted_vs_experimental_scores.MSE_HFF_MicroC < pred_cutoff]['index'].values

len(var_list['good_pred']), len(variant_criteria), len(var_list['good_pred'])/len(variant_criteria)*100

(521, 693, 75.18037518037518)

In [37]:
# Criteria 6: There are no similar variants with similar disruption in siblings

# Proband variants that don't overlap a sibling variant by more than half of each (include sibling not scored variants)
exclude = (BedTool.from_dataframe(SSC_dnSV_scored
                                  [SSC_dnSV_scored.role == 'proband']
                                  [['chrom', 'pos', 'end', 'index']])
           .intersect(BedTool.from_dataframe(SSC_dnSV_all
                                             [SSC_dnSV_all.role == 'sib']
                                             [['chrom', 'pos', 'end']]), f = 0.5, r = True)
           .to_dataframe()
           .name
           .values)

var_list['no_sib_var'] = SSC_dnSV_scored[(SSC_dnSV_scored.role == 'proband') & [x not in exclude for x in SSC_dnSV_scored['index']]]['index'].values

len(var_list['no_sib_var']), len(variant_criteria), len(var_list['no_sib_var'])/len(variant_criteria)*100

(411, 693, 59.307359307359306)

In [41]:
weighted_scores

Unnamed: 0,chrom,pos,end,svtype,svlen,shift_by,MSE,MSE_weighted,MSE_CREint,correlation,correlation_unweighted,correlation_weighted,correlation_CREint
0,chr1,212552883,212955000.0,DUP,80,40.0,0.004695,0.004988,0.006108,0.991122,0.011486,0.012042,0.014172
1,chr1,27972501,28114499.0,DUP,70999,-379250.0,0.015719,0.018119,0.018889,0.958439,0.045866,0.037107,0.034296
2,chr1,27972501,28114499.0,DUP,70999,-379250.0,0.015719,0.018119,0.018889,0.958439,0.045866,0.037107,0.034296
3,chr1,27972501,28114499.0,DUP,70999,-379250.0,0.015719,0.018119,0.018889,0.958439,0.045866,0.037107,0.034296
4,chr1,27972501,28114499.0,DUP,70999,-379250.0,0.015719,0.018119,0.018889,0.958439,0.045866,0.037107,0.034296
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3181,chr22,28295850,29190000.0,DEL,217774,0.0,0.016165,0.015632,0.015117,0.956797,0.047436,0.048968,0.050456
3182,chr22,28196671,28720000.0,DEL,17812,0.0,0.017810,0.017139,0.015773,0.943461,0.087236,0.071013,0.037968
3183,chr22,29206135,29234175.0,DUP,14020,-270078.0,0.019672,0.019084,0.018457,0.971667,0.038643,0.037469,0.036219
3184,chr22,28295850,28720000.0,DEL,217774,0.0,0.031670,0.026324,0.017858,0.936336,0.070892,0.058551,0.039010


In [45]:
# Criteria 7: Variant results in changed contact between CRE pair(s)

var_list['disrupt_CREint'] = np.unique(weighted_scores[1 - weighted_scores['correlation_CREint'] > disruption_cutoff]['var_index'].values)

len(var_list['disrupt_CREint']), len(variant_criteria), len(var_list['disrupt_CREint'])/len(variant_criteria)*100


(312, 693, 45.02164502164502)

In [46]:
# Criteria 8: Variant does not overlap exon of expressed genes

# Get variants that don't overlap exons or promoters of expressed genes
expressed_genes = eN_exp[eN_exp.TPM > 0.5].gene.values
vars_on_exons_ = vars_on_exons[vars_on_exons['index'] != '.']
vars_on_exons_['index'] = vars_on_exons_['index'].astype('float').astype('int')
exclude = np.unique(vars_on_exons_[[x in expressed_genes for x in vars_on_exons_.Gene]]['index'].values)

var_list['not_expr_gene'] = variant_criteria[[x not in exclude for x in variant_criteria['index']]]['index'].values

len(var_list['not_expr_gene']), len(variant_criteria), len(var_list['not_expr_gene'])/len(variant_criteria)*100

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vars_on_exons_['index'] = vars_on_exons_['index'].astype('float').astype('int')


(440, 693, 63.49206349206349)

In [47]:
# Criteria 9: Variant does not overlap excitatory neuron regulatory regions

exclude = np.unique(SSC_dnSV_scored_BED.intersect(eN_reg_marks_BED).to_dataframe().name.values)
var_list['not_ExN_RE'] = variant_criteria[[x not in exclude for x in variant_criteria['index']]]['index'].values

len(var_list['not_ExN_RE']), len(variant_criteria), len(var_list['not_ExN_RE'])/len(variant_criteria)*100

(330, 693, 47.61904761904761)

In [48]:
# Criteria 10: Variant disruption score up-weighting CREints is higher than unweighted score

var_list['change_on_CREint'] = np.unique(weighted_scores[[x>y for x,y in zip(weighted_scores['correlation_weighted'], 
                                                                             weighted_scores['correlation_unweighted'])]]['var_index'].values)

len(var_list['change_on_CREint']), len(variant_criteria), len(var_list['change_on_CREint'])/len(variant_criteria)*100


(205, 693, 29.58152958152958)

In [49]:
# Criteria 11: Variant overlaps CTCF binding site by at least half, on_CTCF

var_list['on_CTCF'] = np.unique(SSC_dnSV_scored_BED.intersect(CTCF_BED, F = 0.5).to_dataframe().name.values)

len(var_list['on_CTCF']), len(variant_criteria), len(var_list['on_CTCF'])/len(variant_criteria)*100

(201, 693, 29.004329004329005)

In [50]:
# Criteria 12: Variant is a deletion

var_list['is_deletion'] = SSC_dnSV_scored[SSC_dnSV_scored.svtype == 'DEL']['index'].values

len(var_list['is_deletion']), len(variant_criteria), len(var_list['is_deletion'])/len(variant_criteria)*100

(440, 693, 63.49206349206349)

In [51]:
# Add vars that corresponds to each criteria to df

for criteria_ in criteria:
    variant_criteria[criteria_] = 0
    variant_criteria.loc[[x in var_list[criteria_] for x in variant_criteria['index']], criteria_] = 1

variant_criteria

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  variant_criteria[criteria_] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  variant_criteria[criteria_] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  variant_criteria[criteria_] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = 

Unnamed: 0,index,disruptive,not_ASD_gene,no_causal_var,good_pred,no_sib_var,disrupt_CREint,not_expr_gene,not_ExN_RE,near_ASD_gene,change_on_CREint,on_CTCF,is_deletion
0,0,0,1,1,0,0,1,1,0,0,1,0,1
1,1,1,0,0,1,1,1,0,0,1,1,0,1
2,2,0,1,1,1,1,1,1,1,1,1,0,1
3,3,0,1,1,1,1,1,1,0,1,1,0,1
4,4,0,1,1,1,1,1,1,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
688,688,0,1,1,0,0,0,1,1,0,0,0,0
689,689,0,1,1,0,0,0,1,0,1,0,1,0
690,690,1,1,1,0,1,0,1,0,0,0,0,0
691,691,0,1,1,0,1,0,0,0,0,0,0,0


In [52]:
variant_criteria.to_csv(f'{repo_dir}variant_prioritization/variant_criteria_met', sep = '\t', index = False)