# Sweep enrichments

We look for enrichments in all the 90%-regions

In [1]:
import re, os, sys, pickle, pickle
from pathlib import Path
import numpy
import scipy
import pandas
from pandas import DataFrame, Series
from sklearn.decomposition import PCA
from collections import Counter, defaultdict
import random, bisect

random.seed(7)

import pyfaidx

# my own libraries
from GenomicWindows import window
import GenomicIntervals

numpy.random.seed(7)

Plotting setup:

In [2]:
%matplotlib inline

# Make inline plots vector graphics instead of raster graphics
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('pdf', 'svg')

import matplotlib
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from matplotlib.patches import Rectangle
from matplotlib.lines import Line2D 
from mpl_toolkits.basemap import Basemap
#matplotlib.rcParams['figure.figsize'] = (20.0, 10.0)

import mpld3

import seaborn as sns
sns.set() # apply seaborn's default "prettiness" settings
sns.set_style("whitegrid")
sns.set_context("paper")

# lowess for plotting
from statsmodels.nonparametric.smoothers_lowess import lowess

# Named hex colors for plots (the values match the ColorBrewer "Set1" palette).
set1 = {'red': '#e41a1c', 'blue': '#377eb8', 'green': '#4daf4a',
        'purple': '#984ea3', 'orange': '#ff7f00', 
        'yellow': '#ffff33', 'brown': '#a65628'}

Ignore deprecation warnings, mainly from seaborn:

In [3]:
# silence deprecation warnings (lots from seaborn)
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 
warnings.filterwarnings("ignore", category=numpy.VisibleDeprecationWarning)

Analysis dirs:

In [4]:
# Project root and shared meta data, both located below the user's home directory
root_dir = Path(os.environ['HOME']) / 'simons' / 'faststorage' / 'people' / 'kmt'
meta_data_dir = Path(os.environ['HOME']) / 'simons' / 'faststorage' / 'data' / 'metadata'

# Top-level project directories
steps_dir = root_dir / 'steps'
results_dir = root_dir / 'results'
figures_dir = root_dir / 'figures'
data_dir = root_dir / 'data'

# Output directories of individual workflow steps
argweaver_dir = steps_dir / 'argweaver' / 'output'
pi_dir = steps_dir / 'pi_stores'  # (alternative kept for reference: root_dir / 'old_pi_stores')
dist_dir = steps_dir / 'dist_stores'
male_x_haploid_dir = steps_dir / 'male_x_haploids'

Local code in the scripts dir on the cluster:

In [5]:
# Make the project's scripts directory importable. The membership check keeps
# sys.path free of duplicates when this cell is run more than once.
scripts_dir = root_dir / 'scripts'
if str(scripts_dir) not in sys.path:
    sys.path.append(str(scripts_dir))

import simons_meta_data
import hg19_chrom_sizes

from toggle_code_and_errors import toggle_code_html, toggle_errors_html

Import variables global to the entire analysis:

In [6]:
import analysis_globals

## Convenience functions

In [7]:
def silent_nanmean(x):
    """Return numpy.nanmean(x) with all warnings suppressed.

    numpy.nanmean emits a warning when there are no non-NaN values to
    average (and returns nan); this wrapper keeps that result but hides
    the warning. The warning filters are restored on exit.
    """
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        mean = numpy.nanmean(x)
    return mean
    
def ident_scalar(s):
    """Return the single distinct value held by the Series s.

    Raises AssertionError (carrying the distinct values) when s does not
    contain exactly one distinct value.
    """
    distinct = s.unique()
    assert len(distinct) == 1, distinct
    return distinct[0]

## Load meta data

In [8]:
# easy loading of meta data in a consistent manner across code
individuals, populations, regions = simons_meta_data.get_meta_data(meta_data_dir=meta_data_dir)

pop_categories = pandas.read_hdf(str(results_dir / 'population_categories.store'), 'sr')
region_categories = pandas.read_hdf(str(results_dir / 'region_categories.store'), 'sr')

# One color per region, paired with the regions in the order that
# region_categories yields them. An earlier version used the same colors
# with '#4daf4a' and '#984ea3' in swapped positions:
# region_colors = dict(zip(list(region_categories), 
#                          ['#e41a1c', '#377eb8', '#4daf4a', '#984ea3', 
#                           '#ff7f00', '#ffff33', '#a65628']))
region_colors = dict(zip(list(region_categories), 
                         ['#e41a1c', '#377eb8',  '#984ea3', '#4daf4a',
                          '#ff7f00', '#ffff33', '#a65628']))

# hg19 chromosome sizes keyed by chromosome name without the 'chr' prefix
chromosome_lengths = dict((k.replace('chr', ''), v) for k, v in hg19_chrom_sizes.hg19_chrom_sizes.items())

# Derived from $HOME like root_dir and meta_data_dir, rather than from a
# hard-coded /home/kmt, so the notebook does not depend on one user's account.
reference_genome_file = Path(os.environ['HOME'], 'simons', 
                        'faststorage', 'cteam_lite_public3', 'FullyPublic', 'Href.fa')

In [9]:
def genes_overlapping_intervals(genes, peaks):    
    """Return the genes that overlap each interval in peaks.

    Parameters
    ----------
    genes : pandas.DataFrame
        Must have `start` and `end` columns.
    peaks : pandas.DataFrame
        Must have `start` and `end` columns; one interval per row.

    Returns
    -------
    pandas.DataFrame
        The rows of `genes` overlapping a peak, grouped peak by peak, with
        two extra columns `peak_start` and `peak_end` identifying the peak.
        A gene that overlaps several peaks appears once per peak. If
        `peaks` has no rows, an empty frame with the same columns is
        returned. `genes` itself is never modified.
    """
    hits = []
    for peak in peaks.itertuples():
        # Filter first, then copy only the matching rows, so that the added
        # columns never touch the caller's frame.
        overlapping = genes.loc[(genes.start < peak.end) & (genes.end > peak.start)].copy()
        overlapping['peak_start'] = peak.start
        overlapping['peak_end'] = peak.end
        hits.append(overlapping)
    if not hits:
        # pandas.concat raises ValueError on an empty list
        empty = genes.iloc[:0].copy()
        empty['peak_start'] = pandas.Series(dtype='int64', index=empty.index)
        empty['peak_end'] = pandas.Series(dtype='int64', index=empty.index)
        return empty
    return pandas.concat(hits)
        
def genes_in_intervals(genes, peaks):
    """Return a copy of the rows of genes whose midpoint lies inside a peak.

    The test relies on numpy.searchsorted, so peaks.start and peaks.end
    must be sorted (the logic presumes non-overlapping intervals).
    """
    midpoints = genes.start + (genes.end - genes.start) / 2
    # Number of peak starts, respectively peak ends, strictly below each midpoint.
    starts_below = numpy.searchsorted(peaks.start, midpoints)
    ends_below = numpy.searchsorted(peaks.end, midpoints, side='left')
    # Exactly one more start than end below the midpoint: inside a peak.
    inside = (starts_below - 1) == ends_below
    return genes.copy().iloc[inside]

## Sweep peak and region data

In [10]:
# Sweep peak table, read from the results directory
sweep_peaks = pandas.read_hdf(results_dir / 'sweep_peaks.hdf')

In [11]:
# All extended 90% peak regions; the min_prop_swept filter is currently disabled.
extended_peak_regions_subset = (pandas.read_hdf(results_dir / 'extended_peak_regions_90%.hdf')
#                         .loc[lambda df: df.prop_swept >= analysis_globals.min_prop_swept]
                        )
# Expose the coordinates under the start/end names that the interval helpers
# in this notebook read (e.g. genes_overlapping_intervals).
extended_peak_regions_subset['start'] = extended_peak_regions_subset.start_pos
extended_peak_regions_subset['end'] = extended_peak_regions_subset.end_pos
# Only the last expression of a cell is displayed, so the preview goes last
# (it was a silent no-op in the middle of the cell).
extended_peak_regions_subset.head()

## Biomart gene annotation on chrX

In [12]:
# Biomart gene annotation restricted to genes on chromosome X
biomart_genes_x = pandas.read_hdf(results_dir / 'biomart_genes.hdf').loc[lambda df: df.chrom == 'X']
print(len(biomart_genes_x))
biomart_genes_x.head()

2392


Unnamed: 0,Gene stable ID,chrom,start,end,strand,Gene type,name
3215,ENSG00000102309,X,71401203,71522776,1,protein_coding,PIN4
3325,ENSG00000186871,X,71424510,71458897,-1,protein_coding,ERCC6L
3382,ENSG00000198034,X,71475529,71497150,-1,protein_coding,RPS4X
3594,ENSG00000102100,X,48760459,48769235,-1,protein_coding,SLC35A2
3815,ENSG00000102096,X,48770459,48776301,-1,protein_coding,PIM2


## Testis genes from human protein atlas

2237 genes show elevated expression in the testis compared to other tissues:

In [13]:
# Human Protein Atlas list of testis-elevated genes; chrX biomart genes are
# matched to it by gene name (biomart `name` against the HPA `Gene` column).
hpatlas_testis_elevated = pandas.read_table(data_dir / 'tissue_specificity_rna_testis_elevated.tsv')
testis_elevated_genes = biomart_genes_x.loc[biomart_genes_x.name.isin(hpatlas_testis_elevated.Gene)]
len(testis_elevated_genes)

191

1079 of these genes are testis-enriched: they have at least five-fold higher mRNA levels in testis compared to all other tissues:

In [14]:
# Human Protein Atlas list of testis-enriched genes; chrX biomart genes are
# matched to it by gene name (biomart `name` against the HPA `Gene` column).
hpatlas_testis_enriched = pandas.read_table(data_dir / 'tissue_specificity_rna_testis_Tissue.tsv')
testis_enriched_genes = biomart_genes_x.loc[biomart_genes_x.name.isin(hpatlas_testis_enriched.Gene)]
len(testis_enriched_genes)

136

In [15]:
# Testis-elevated chrX genes overlapping any of the extended peak regions
# (one row per gene/region pair, with the region given in peak_start/peak_end).
overlap = genes_overlapping_intervals(testis_elevated_genes, extended_peak_regions_subset)
overlap

Unnamed: 0,Gene stable ID,chrom,start,end,strand,Gene type,name,peak_start,peak_end
28692,ENSG00000147081,X,49955406,49965664,-1,protein_coding,AKAP4,49500000,50000000
29201,ENSG00000147082,X,49967364,50094909,1,protein_coding,CCNB3,49500000,50000000
30740,ENSG00000122824,X,51075083,51080377,1,protein_coding,NUDT10,50800000,51300000
30795,ENSG00000187690,X,51149767,51151687,1,protein_coding,CXorf67,50800000,51300000
30809,ENSG00000196368,X,51232863,51239448,-1,protein_coding,NUDT11,50800000,51300000
47112,ENSG00000196632,X,54219256,54385075,-1,protein_coding,WNK3,54000000,54400000
50027,ENSG00000189299,X,55649833,55652621,1,protein_coding,FOXR2,55300000,55800000
50784,ENSG00000204071,X,101395448,101397942,-1,protein_coding,TCEAL6,100900000,101400000
58085,ENSG00000123496,X,114238538,114254540,-1,protein_coding,IL13RA2,114000000,114300000
40148,ENSG00000123165,X,127184943,127186382,-1,protein_coding,ACTRT1,126800000,127400000


Fisher's exact test shows no enrichment:

In [16]:
# All annotated chrX genes (any gene type) overlapping an extended peak region
overlap_all = genes_overlapping_intervals(biomart_genes_x, extended_peak_regions_subset)

# Protein-coding genes: number overlapping a region, and number on chrX in total
nr_prot_overlapping = len(overlap_all.loc[lambda df: df['Gene type'] == 'protein_coding'])
nr_prot_biomart = len(biomart_genes_x.loc[lambda df: df['Gene type'] == 'protein_coding'])

# 2x2 table with columns [overlapping, not overlapping]; row 1 holds the
# testis-elevated genes and row 2 the protein-coding genes.
# NOTE(review): row 2 counts all protein-coding chrX genes, which presumably
# include the testis-elevated genes of row 1, so the two rows are not
# disjoint groups -- confirm that this is the intended comparison.
# NOTE(review): len(overlap) counts rows of the overlap table, i.e. gene/region
# pairs, so a gene overlapping more than one region is counted more than once.
table = [[len(overlap), len(testis_elevated_genes)-len(overlap)],
         [nr_prot_overlapping, nr_prot_biomart-nr_prot_overlapping]]
print(table)

#scipy.stats.fisher_exact([[20, 100],[100, 1000]], alternative='greater')
# One-sided test (alternative='greater') on the table above.
# NOTE(review): the import cell only has `import scipy`; this line relies on
# scipy.stats having been loaded as a side effect of another import -- confirm.
scipy.stats.fisher_exact(table, alternative='greater')

[[13, 178], [88, 742]]


(0.61580694586312568, 0.96189309344982155)



**Entrez Gene Summary for AKAP4 Gene:** 
The A-kinase anchor proteins (AKAPs) are a group of structurally diverse proteins, which have the common function of binding to the regulatory subunit of protein kinase A (PKA) and confining the holoenzyme to discrete locations within the cell. This gene encodes a member of the AKAP family. The encoded protein is localized to the sperm flagellum and may be involved in the regulation of sperm motility. Alternative splicing of this gene results in two transcript variants encoding different isoforms. [provided by RefSeq, Jul 2008]

**GeneCards Summary for AKAP4 Gene:**
AKAP4 (A-Kinase Anchoring Protein 4) is a Protein Coding gene. Diseases associated with AKAP4 include Retinitis Pigmentosa 70. Among its related pathways are Activation of cAMP-Dependent PKA and Signal transduction_PKA signaling. Gene Ontology (GO) annotations related to this gene include protein kinase A binding. An important paralog of this gene is AKAP3.

----

**Entrez Gene Summary for NUDT10 Gene:**
This gene is a member of the nudix (nucleoside diphosphate linked moiety X)-type motif containing family. The encoded protein is a phosphohydrolase and may regulate the turnover of diphosphoinositol polyphosphates. The turnover of these high-energy diphosphoinositol polyphosphates represents a molecular switching activity with important regulatory consequences. Molecular switching by diphosphoinositol polyphosphates may contribute to the regulation of intracellular trafficking. In some populations putative prostate cancer susceptibility alleles have been identified for this gene. Alternatively spliced transcript variants, which differ only in the 5' UTR, have been found for this gene. [provided by RefSeq, Feb 2015]

**GeneCards Summary for NUDT10 Gene:**
NUDT10 (Nudix Hydrolase 10) is a Protein Coding gene. Diseases associated with NUDT10 include Autoimmune Disease Of Endocrine System. Among its related pathways are Metabolism and Inositol phosphate metabolism (REACTOME). Gene Ontology (GO) annotations related to this gene include hydrolase activity and inositol diphosphate tetrakisphosphate diphosphatase activity. An important paralog of this gene is NUDT11.

----

**GeneCards Summary for CXorf67 Gene:**
CXorf67 (Chromosome X Open Reading Frame 67) is a Protein Coding gene. Diseases associated with CXorf67 include Endometrial Stromal Sarcoma and Endometrial Stromal Nodule.

----

**Entrez Gene Summary for WNK3 Gene:**
This gene encodes a protein belonging to the 'with no lysine' family of serine-threonine protein kinases. These family members lack the catalytic lysine in subdomain II, and instead have a conserved lysine in subdomain I. This family member functions as a positive regulator of the transcellular Ca2+ transport pathway, and it plays a role in the increase of cell survival in a caspase-3-dependent pathway. Alternative splicing results in multiple transcript variants. [provided by RefSeq, May 2010]

**GeneCards Summary for WNK3 Gene:**
WNK3 (WNK Lysine Deficient Protein Kinase 3) is a Protein Coding gene. Diseases associated with WNK3 include Syndromic X-Linked Intellectual Disability Siderius Type and Lung Large Cell Carcinoma. Among its related pathways are Diuretics Pathway, Pharmacodynamics and Ion channel transport. Gene Ontology (GO) annotations related to this gene include transferase activity, transferring phosphorus-containing groups and protein tyrosine kinase activity. An important paralog of this gene is WNK2.

**UniProtKB/Swiss-Prot for WNK3 Gene:**
WNK3_HUMAN,Q9BYP7
Serine/threonine kinase which plays an important role in the regulation of electrolyte homeostasis, cell signaling, survival and proliferation. Acts as an activator and inhibitor of sodium-coupled chloride cotransporters and potassium-coupled chloride cotransporters respectively (PubMed:16275913, PubMed:16275911, PubMed:16357011). Phosphorylates WNK4. Regulates the phosphorylation of SLC12A1 and SLC12A2. Increases Ca(2+) influx mediated by TRPV5 and TRPV6 by enhancing their membrane expression level via a kinase-dependent pathway (PubMed:18768590). Inhibits the activity of KCNJ1 by decreasing its expression at the cell membrane in a non-catalytic manner.

WNK3 is associated with the GO term: GO:0043066	**negative regulation of apoptotic process**

----

**GeneCards Summary for TCEAL6 Gene:**
TCEAL6 (Transcription Elongation Factor A Like 6) is a Protein Coding gene. An important paralog of this gene is TCEAL3.

**UniProtKB/Swiss-Prot for TCEAL6 Gene:**
May be involved in transcriptional regulation.

----

**Entrez Gene Summary for IL13RA2 Gene:**
The protein encoded by this gene is closely related to Il13RA1, a subunit of the interleukin 13 receptor complex. This protein binds IL13 with high affinity, but lacks cytoplasmic domain, and does not appear to function as a signal mediator. It is reported to play a role in the internalization of IL13. [provided by RefSeq, Jul 2008]

**GeneCards Summary for IL13RA2 Gene:**
IL13RA2 (Interleukin 13 Receptor Subunit Alpha 2) is a Protein Coding gene. Diseases associated with IL13RA2 include Malignant Glioma and Glioblastoma Multiforme. Among its related pathways are Cytokine Signaling in Immune system and Akt Signaling. Gene Ontology (GO) annotations related to this gene include signal transducer activity and cytokine receptor activity. An important paralog of this gene is IL5RA.

----

**Entrez Gene Summary for ACTRT1 Gene:**
This gene encodes a protein related to the cytoskeletal protein beta-actin. This protein is a major component of the calyx in the perinuclear theca of mammalian sperm heads, and it therefore likely functions in spermatid formation. This gene is intronless and is similar to a related gene located on chromosome 1. A related pseudogene has also been identified approximately 75 kb downstream of this gene on chromosome X. [provided by RefSeq, May 2010]

**GeneCards Summary for ACTRT1 Gene:**
ACTRT1 (Actin Related Protein T1) is a Protein Coding gene. An important paralog of this gene is ACTRT2.

----

**Entrez Gene Summary for H2AFB1 Gene:**
Histones are basic nuclear proteins that are responsible for the nucleosome structure of the chromosomal fiber in eukaryotes. Nucleosomes consist of approximately 146 bp of DNA wrapped around a histone octamer composed of pairs of each of the four core histones (H2A, H2B, H3, and H4). The chromatin fiber is further compacted through the interaction of a linker histone, H1, with the DNA between the nucleosomes to form higher order chromatin structures. This gene encodes a replication-independent histone that is a member of the histone H2A family. **This gene is part of a region that is repeated three times on chromosome X, once in intron 22 of the F8 gene and twice closer to the Xq telomere. This record represents the most centromeric copy which is in intron 22 of the F8 gene.** [provided by RefSeq, Oct 2015]

- H2AFB1 at chrX:154689080-154689596 - (NM_001017990) histone H2A-Bbd type 1
- H2AFB1 at chrX:154610428-154610944 - (NM_001017990) histone H2A-Bbd type 1
- H2AFB1 at chrX:154113317-154113833 - (NM_001017990) histone H2A-Bbd type 1

**GeneCards Summary for H2AFB1 Gene:**
H2AFB1 (H2A Histone Family Member B1) is a Protein Coding gene. Among its related pathways are Mitotic Prophase and Meiosis. Gene Ontology (GO) annotations related to this gene include protein heterodimerization activity. An important paralog of this gene is H2AFB2.

**UniProtKB/Swiss-Prot for H2AFB1 Gene:**
***Atypical histone H2A*** which can replace conventional H2A in some nucleosomes and is associated with active transcription and mRNA processing (PubMed:22795134). Nucleosomes wrap and compact DNA into chromatin, limiting DNA accessibility to the cellular machineries which require DNA as a template. Histones thereby play a central role in transcription regulation, DNA repair, DNA replication and chromosomal stability (PubMed:15257289, PubMed:16287874, PubMed:16957777, PubMed:17591702, PubMed:17726088, PubMed:18329190, PubMed:22795134). Nucleosomes containing this histone are less rigid and organize less DNA than canonical nucleosomes in vivo (PubMed:15257289, PubMed:16957777, PubMed:17591702, PubMed:24336483). They are enriched in actively transcribed genes and associate with the elongating form of RNA polymerase (PubMed:17591702, PubMed:24753410). They associate with spliceosome components and are required for mRNA splicing (PubMed:22795134).




## All overlapping biomart genes

In [17]:
# Number of overlapping gene/region rows per biomart gene type
overlap_all['Gene type'].value_counts()

protein_coding          88
pseudogene              72
miRNA                   21
snRNA                   12
lincRNA                 11
misc_RNA                 8
antisense                7
snoRNA                   5
sense_intronic           4
processed_transcript     2
rRNA                     1
sense_overlapping        1
Name: Gene type, dtype: int64

One overlapping protein coding gene is:

**MAGEH1**: This gene belongs to the non-CT (non cancer/testis) subgroup of the melanoma-associated antigen (MAGE) superfamily. The encoded protein is likely associated with apoptosis, cell cycle arrest, growth inhibition or cell differentiation. The protein may be involved in the atRA (all-trans retinoic acid) signaling through the STAT1-alpha (signal transducer and activator of transcription 1-alpha) pathway. [provided by RefSeq, Aug 2013]


Another is **AR**:

**Entrez Gene Summary for AR Gene:**
The androgen receptor gene is more than 90 kb long and codes for a protein that has 3 major functional domains: the N-terminal domain, DNA-binding domain, and androgen-binding domain. The protein functions as a steroid-hormone activated transcription factor. Upon binding the hormone ligand, the receptor dissociates from accessory proteins, translocates into the nucleus, dimerizes, and then stimulates transcription of androgen responsive genes. This gene contains 2 polymorphic trinucleotide repeat segments that encode polyglutamine and polyglycine tracts in the N-terminal transactivation domain of its protein. Expansion of the polyglutamine tract from the normal 9-34 repeats to the pathogenic 38-62 repeats causes spinal bulbar muscular atrophy (SBMA, also known as Kennedy's disease). Mutations in this gene are also associated with complete androgen insensitivity (CAIS). Alternative splicing results in multiple transcript variants encoding different isoforms. [provided by RefSeq, Jan 2017]


Also associated with: GO:2001237, **negative regulation of extrinsic apoptotic signaling pathway**


## All overlapping biomart *non-coding* genes

In [18]:
# Show every overlapping gene that is neither protein coding nor a pseudogene,
# ordered by start coordinate. The display limits are lifted inside the
# context manager so that no row or column is truncated.
with pandas.option_context('display.max_rows', None, 'display.max_columns', None):
    display(
        overlap_all
        .loc[lambda df: ~df['Gene type'].isin(['protein_coding', 'pseudogene'])]
        .sort_values(['start'])
    )


Unnamed: 0,Gene stable ID,chrom,start,end,strand,Gene type,name,peak_start,peak_end
47346,ENSG00000234129,X,10865996,11129261,-1,lincRNA,RP11-120D5.1,11100000,11500000
47352,ENSG00000207151,X,11134212,11134313,-1,misc_RNA,Y_RNA,11100000,11500000
47356,ENSG00000263652,X,11336734,11336806,-1,miRNA,MIR548AX,11100000,11500000
47660,ENSG00000221278,X,14783097,14783239,-1,miRNA,AC003658.1,14700000,14900000
48163,ENSG00000206663,X,20003176,20003277,-1,misc_RNA,Y_RNA,19600000,20200000
48165,ENSG00000264566,X,20035206,20035305,-1,miRNA,MIR23C,19600000,20200000
48168,ENSG00000201882,X,20154184,20154253,-1,snoRNA,snoU2-30,19600000,20200000
48170,ENSG00000201592,X,20154424,20154503,-1,snoRNA,snoU2_19,19600000,20200000
48171,ENSG00000225037,X,20158086,20158562,1,antisense,EIF1AX-AS1,19600000,20200000
48173,ENSG00000206716,X,21232755,21232861,1,snRNA,RNU6-133P,21100000,21600000



X chromosome inactivation: lincRNA. X-inactivation center: **XIST, TSIX, JPX, FTX**.


----

**Y RNA:** Two functions have been described for Y RNAs in the literature: As a repressor of Ro60, and as an initiation factor for DNA replication. Mutant human Y RNAs lacking the conserved binding site for Ro60 protein still support DNA replication,[3] indicating that binding to Ro protein and promoting DNA replication are two separable functions of Y RNAs. Although Y RNA-derived small RNAs are similar in size to microRNAs, it has been shown that these Y RNA fragments are not involved in the microRNA pathway.[8]

Y RNAs are overexpressed in some human tumours and are required for cell proliferation[11] and small, microRNA-sized breakdown products may be involved in autoimmunity and other pathological conditions.[12] Recent work has demonstrated that Y RNAs are modified at their 3' end by the non-canonical poly(A) polymerase PAPD5, and the short oligo(A) tail added by PAPD5 is a marker for 3' end processing by the ribonuclease PARN/EXOSC10 or for degradation by the exonuclease DIS3L.[13] Since PARN deficiency causes a severe form of the bone marrow disease Dyskeratosis Congenita as well as pulmonary fibrosis,[14][15] it is possible that defects in Y RNA processing contribute to the severe pathology observed in these patients

[From wikipedia](https://en.wikipedia.org/wiki/Y_RNA)

## Overlapping Chalmel genes

In [19]:
# Chalmel gene table, read from the results directory
chalmel_genes = pandas.read_hdf(results_dir / 'chalmel_genes.hdf')

In [20]:
# chrX genes whose Chalmel expression Pattern is 9-13 (the Pattern values are
# matched as strings), keeping only the columns shown in the overlap table.
# NOTE(review): the module-level name `df` is also the lambda's parameter name;
# inside the lambda the parameter is the one that is used.
df = chalmel_genes.loc[lambda df: (df.Pattern.isin(['9', '10', '11', '12', '13'])) & (df.chrom == 'X'), 
                  ['name', 'chrom', 'start', 'end', 'Pattern', 'Expression in Testis']]

# Display the selected genes that overlap an extended peak region
genes_overlapping_intervals(df, extended_peak_regions_subset)

Unnamed: 0,name,chrom,start,end,Pattern,Expression in Testis,peak_start,peak_end
2638,MAP7D2,X,20024831,20135035,9,PET,19600000,20200000
3357,XK,X,37545012,37591383,10,IE,37200000,37700000
26,AKAP4,X,49955406,49965664,13,SEHET,49500000,50000000
3018,PAGE4,X,49593863,49598576,10,IE,49500000,50000000
2879,CXorf67,X,51149767,51151687,9,SET,50800000,51300000
5740,XIST,X,73040486,73072588,9,UE,73000000,73500000
379,IL13RA2,X,114238538,114254540,11,IE,114000000,114300000


**Entrez Gene Summary for PAGE4 Gene:**
This gene is a member of the GAGE family. The GAGE genes are expressed in a variety of tumors and in some fetal and reproductive tissues. This gene is strongly expressed in prostate and prostate cancer. It is also expressed in other male and female reproductive tissues including testis, fallopian tube, uterus, and placenta, as well as in testicular cancer and uterine cancer. The protein encoded by this gene shares sequence similarity with other GAGE/PAGE proteins, and also belongs to a family of CT (cancer-testis) antigens. The protein may play a role in benign and malignant prostate diseases. A related pseudogene is located on chromosome 7. Alternate splicing results in multiple transcript variants. [provided by RefSeq, Jan 2016]

**GeneCards Summary for PAGE4 Gene:**
PAGE4 (PAGE Family Member 4) is a Protein Coding gene. Diseases associated with PAGE4 include Testicular Cancer and Prostate Cancer.


## GO enrichment analysis on Gorilla web server

In [21]:
# Write the two gene lists for the GO analysis, one gene name per line.
# Candidate list: protein-coding genes overlapping an extended peak region.
# (overlap_all has one row per gene/region pair, so a name can occur twice.)
with open(str(results_dir / 'candidate_gene_list.txt'), 'w') as f:
    for name in overlap_all.loc[lambda df: df['Gene type'] == 'protein_coding'].name:
        print(name, file=f)
        
# Background list: all protein-coding genes on chrX.
# NOTE(review): the file name is spelled 'bacground_gene_list.txt'; it is left
# unchanged here because other steps may read the file under this name.
with open(str(results_dir / 'bacground_gene_list.txt'), 'w') as f:
    for name in biomart_genes_x.loc[lambda df: df['Gene type'] == 'protein_coding'].name:
        print(name, file=f)

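Enrichment for one cellular process was found, but no enrichment for cellular functions or components. Note that the FDR q-value of this term is 1 (see the table below), so the enrichment does not survive correction for multiple testing.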

| GO term | Description | P-value | FDR q-value | Enrichment (N, B, n, b) |
|:----|:----|:----|:----|:----|
| GO:0051054 | positive regulation of DNA metabolic process | 2.41E-4 | 1E0 | 6.55 (734,7,80,5) |
    

- DKC1: dyskeratosis congenita 1, dyskerin
- ATRX: alpha thalassemia/mental retardation syndrome x-linked
- FANCB: fanconi anemia, complementation group b
- PAK3: p21 protein (cdc42/rac)-activated kinase 3
- BRCC3: brca1/brca2-containing complex, subunit 3


BRCC3 sits in the same sweep region as the three copies of the H2AFB1 Gene that encodes an atypical histone H2A.

**Entrez Gene Summary for BRCC3 Gene:**
This gene encodes a subunit of the BRCA1-BRCA2-containing complex (BRCC), which is an E3 ubiquitin ligase. This complex plays a role in the DNA damage response, where it is responsible for the stable accumulation of BRCA1 at DNA break sites. The component encoded by this gene can specifically cleave Lys 63-linked polyubiquitin chains, and it regulates the abundance of these polyubiquitin chains in chromatin. The loss of this gene results in abnormal angiogenesis and is associated with syndromic moyamoya, a cerebrovascular angiopathy. Alternative splicing results in multiple transcript variants. A related pseudogene has been identified on chromosome 5. [provided by RefSeq, Jun 2011]

**GeneCards Summary for BRCC3 Gene:**
BRCC3 (BRCA1/BRCA2-Containing Complex Subunit 3) is a Protein Coding gene. Diseases associated with BRCC3 include Moyamoya Disease 4 With Short Stature, Hypergonadotropic Hypogonadism, And Facial Dysmorphism and T-Cell Prolymphocytic Leukemia. Among its related pathways are Metabolism of proteins and DNA Double-Strand Break Repair. Gene Ontology (GO) annotations related to this gene include metallopeptidase activity and obsolete ubiquitin thiolesterase activity.

**UniProtKB/Swiss-Prot for BRCC3 Gene:**
Metalloprotease that specifically cleaves Lys-63-linked polyubiquitin chains (PubMed:19214193, PubMed:20656690, PubMed:24075985, PubMed:26344097). Does not have activity toward Lys-48-linked polyubiquitin chains. Component of the BRCA1-A complex, a complex that specifically recognizes Lys-63-linked ubiquitinated histones H2A and H2AX at DNA lesions sites, leading to target the BRCA1-BARD1 heterodimer to sites of DNA damage at double-strand breaks (DSBs). In the BRCA1-A complex, it specifically removes Lys-63-linked ubiquitin on histones H2A and H2AX, antagonizing the RNF8-dependent ubiquitination at double-strand breaks (DSBs) (PubMed:20656690). Catalytic subunit of the BRISC complex, a multiprotein complex that specifically cleaves Lys-63-linked ubiquitin in various substrates (PubMed:20656690, PubMed:24075985, PubMed:26344097, PubMed:26195665). Mediates the specific Lys-63-specific deubiquitination associated with the COP9 signalosome complex (CSN), via the interaction of the BRISC complex with the CSN complex (PubMed:19214193). The BRISC complex is required for normal mitotic spindle assembly and microtubule attachment to kinetochores via its role in deubiquitinating NUMA1 (PubMed:26195665). Plays a role in interferon signaling via its role in the deubiquitination of the interferon receptor IFNAR1; deubiquitination increases IFNAR1 activity by enhancing its stability and cell surface expression (PubMed:24075985, PubMed:26344097). Down-regulates the response to bacterial lipopolysaccharide (LPS) via its role in IFNAR1 deubiquitination (PubMed:24075985).


## Overlap to ampliconic genes

In [22]:
# Ampliconic regions table (has a `chrom` column; later cells select chrX)
ampliconic_regions = pandas.read_hdf(results_dir / 'ampliconic_regions.hdf')

In [23]:
# Jaccard overlap between the extended peak regions and the chrX ampliconic
# regions. The peak-region table gets a constant chrom='X' column for the call.
# NOTE(review): the result is presumably (Jaccard statistic, p-value estimated
# from `samples` random samples) -- confirm in GenomicIntervals.
GenomicIntervals.interval_jaccard(extended_peak_regions_subset.assign(chrom = 'X'), 
                                  ampliconic_regions.loc[lambda df: df.chrom == 'X'], 
                                  chromosome_sizes=chromosome_lengths, 
                                  samples=1000)

(0.0061704338859588752, 0.86)

Non-random distance to non-overlapping ampliconic regions: are ampliconic regions that do not overlap the low-pi regions closer to the low-pi regions than expected?

In [24]:
# interval_diff of the chrX ampliconic regions and the extended peak regions
# (presumably the ampliconic intervals with the peak regions removed -- confirm
# in GenomicIntervals), followed by a distance statistic between the two sets.
# NOTE(review): the result is presumably (statistic, p-value) -- confirm.
ampl_reg_not_overlapping = GenomicIntervals.interval_diff(ampliconic_regions.loc[lambda df: df.chrom == 'X'],
                                                          extended_peak_regions_subset.assign(chrom = 'X'))
GenomicIntervals.distance_stat(ampl_reg_not_overlapping, extended_peak_regions_subset.assign(chrom = 'X'))

(0.24316666666666689, 0.036)

## Overlap to low ILS regions

In [25]:
# Low-ILS regions on chrX for the human-chimp and human-orang comparisons
human_chimp_low_ils_regions_chrX = pandas.read_hdf(results_dir / 'human_chimp_low_ils_regions_chrX.hdf')
human_orang_low_ils_regions_chrX = pandas.read_hdf(results_dir / 'human_orang_low_ils_regions_chrX.hdf')

Human-chimp ILS:

In [26]:
# Jaccard overlap between the extended peak regions and the human-chimp
# low-ILS regions. This call uses samples=100000, where the other Jaccard
# tests in this notebook use 1000.
# NOTE(review): the result is presumably (Jaccard statistic, p-value) -- confirm.
GenomicIntervals.interval_jaccard(extended_peak_regions_subset.assign(chrom = 'X'), 
                                  human_chimp_low_ils_regions_chrX, 
                                  chromosome_sizes=chromosome_lengths, 
                                  samples=100000)

p-value is zero smaller than 1e-05. Increase nr samples to get actual p-value.


(0.17391304347826086, 0.0)

Human-orang ILS:

In [27]:
# Jaccard overlap between the extended peak regions and the human-orang
# low-ILS regions.
# NOTE(review): the result is presumably (Jaccard statistic, p-value) -- confirm.
GenomicIntervals.interval_jaccard(extended_peak_regions_subset.assign(chrom = 'X'), 
                                  human_orang_low_ils_regions_chrX, 
                                  chromosome_sizes=chromosome_lengths, 
                                  samples=1000)

(0.072555205047318619, 0.258)

## Total swept with annotations

In [28]:
sweep_data = pandas.read_hdf(results_dir / 'sweep_data.hdf')
missing_regions = pandas.read_hdf(results_dir / 'missing_regions.hdf')

# Per window (start, end) and region: the sum of `swept` (nr_swept) and the
# number of rows (total). The group keys are moved back into columns.
plot_df = (sweep_data
           .groupby(['start', 'end', 'region_1', 'region_label_1'])['swept']
           .aggregate(['sum', 'size'])
           .rename(columns={'sum': 'nr_swept', 'size': 'total'})
           .reset_index(level=['start', 'end', 'region_1', 'region_label_1'])
          )

# The cumulative sum below follows row order, so sort by window and region first.
plot_df.sort_values(by=['start', 'region_1'], inplace=True)

# Running sum of nr_swept across the non-African regions within each window.
# The Africa rows are excluded before grouping, so they receive NaN when the
# result is aligned back onto plot_df by index.
plot_df['cum_nr_swept'] = (plot_df
                           .loc[plot_df.region_label_1 != 'Africa']
                           .groupby(['start', 'end'])['nr_swept']
                           .transform('cumsum')    
                           )
# Total over all non-African regions in each window (the same value on every
# non-African row of a window; NaN on the Africa rows).
plot_df['cum_total'] = (plot_df
                        .loc[plot_df.region_label_1 != 'Africa']
                        .groupby(['start', 'end'])['total']
                        .transform('sum')    
                        )
plot_df.head(7)

Unnamed: 0,start,end,region_1,region_label_1,nr_swept,total,cum_nr_swept,cum_total
0,0,100000,Africa,Africa,0.0,22,,
1,0,100000,WestEurasia,WestEurasia,0.0,48,0.0,140.0
2,0,100000,SouthAsia,SouthAsia,0.0,31,0.0,140.0
3,0,100000,CentralAsiaSiberia,CentralAsiaSiberia,0.0,10,0.0,140.0
4,0,100000,Oceania,Oceania,0.0,16,0.0,140.0
5,0,100000,EastAsia,EastAsia,0.0,27,0.0,140.0
6,0,100000,America,America,0.0,8,0.0,140.0


In [29]:
class ClickInfo(mpld3.plugins.PluginBase):
    """mpld3 plugin that opens a URL in a new browser window/tab on mouse-down.

    Parameters
    ----------
    points : matplotlib artist
        The plotted elements to make clickable, e.g. the collection returned by
        ``ax.scatter`` or a ``Line2D`` (for a ``Line2D`` the "pts" sub-element
        id is used).
    urls : sequence of str
        URLs indexed like the elements of ``points``: a mouse-down on element
        ``i`` opens ``urls[i]``.
    """

    # NOTE: `urls` must be declared with `var` inside `draw`. Without it the
    # assignment creates a page-wide global shared by every ClickInfo instance,
    # so with several plugins/figures on one page all clicks would use the URL
    # list of whichever plugin was drawn last.
    JAVASCRIPT = """
    mpld3.register_plugin("clickinfo", ClickInfo);
    ClickInfo.prototype = Object.create(mpld3.Plugin.prototype);
    ClickInfo.prototype.constructor = ClickInfo;
    ClickInfo.prototype.requiredProps = ["id", "urls"];
    function ClickInfo(fig, props){
        mpld3.Plugin.call(this, fig, props);
    };

    ClickInfo.prototype.draw = function(){
        var obj = mpld3.get_element(this.props.id);
        var urls = this.props.urls;
        obj.elements().on("mousedown",
                          function(d, i){ 
                            window.open(urls[i], '_blank')});
    }
    """
    def __init__(self, points, urls):
        self.points = points
        self.urls = urls
        # Line2D objects expose their markers under a "pts" sub-element id;
        # other artists are addressed by their plain id.
        if isinstance(points, matplotlib.lines.Line2D):
            suffix = "pts"
        else:            
            suffix = None
        # Properties handed to the JavaScript side (see requiredProps above).
        self.dict_ = {"type": "clickinfo",
                      "id": mpld3.utils.get_id(points, suffix),
                      "urls": urls}

# URL template for a UCSC genome browser (European mirror) search; the `{}` placeholder
# in `position=` is filled with the search term via str.format.
# NOTE(review): `db=hg37` does not look like a UCSC assembly name (GRCh37 is `hg19` there),
# and the hard-coded `hgsid` is a session id that may no longer be valid -- confirm that
# the generated links still resolve to the intended assembly.
ucsc_search = "https://genome-euro.ucsc.edu/cgi-bin/hgTracks?hgsid=226837763_FKVw0jsAcbutCxMf8luSHlzwx2xW&org=Human&db=hg37&position={}&pix=1361"

# fig, ax = plt.subplots()
# points = ax.scatter(numpy.random.rand(50), numpy.random.rand(50),
#                     s=500, alpha=0.3)
# urls = [ucsc_search.format('CCR5') for i in range(50)]

# # mpld3.plugins.connect(fig, ClickInfo(points, urls))
# # mpld3.display(fig)

In [30]:
# Interactive overview of chrX: gene markers with tooltips and click-through links, bars showing
# the cumulative swept proportion for each non-African region, and grey shading over the regions
# flagged as missing. Layers are separated by an increasing zorder, so statement order matters.
with sns.axes_style("whitegrid", {'axes.grid' : False}):
#     fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True, sharey=True, figsize=(13, 9),                         
#                                    subplot_kw={'xlim':(0, chromosome_lengths['X']), 'ylim':(0, 1)})

    # Single panel spanning the whole X chromosome; y runs to 1.1 to leave room for the
    # gene markers above the bars.
    fig, ax1 = plt.subplots(1, 1, sharex=True, sharey=True, figsize=(13, 9),                         
                                   subplot_kw={'xlim':(0, chromosome_lengths['X']), 'ylim':(0, 1.1)})


    # zorder is bumped before each new layer so that later layers are drawn on top.
    zorder = 1

    # Gene layer: a full-height light-grey vertical line at the midpoint of every gene.
    zorder += 1
    genes = biomart_genes_x
    pos_list = list()
    labels = list()
    for tup in genes.itertuples():
        x = tup.start + (tup.end - tup.start) / 2
        pos_list.append(x)
        labels.append("{}".format(tup.name))
        ax1.add_line(Line2D([x, x], [0, 1.1], color='lightgrey', zorder=zorder))
    zorder += 1        
    # One marker per gene at y=1.05; the markers carry the tooltip (gene name) ...
    scatter = ax1.scatter(pos_list, [1.05 for x in pos_list], c='lightgrey', s=50, zorder=zorder)
    tooltip = mpld3.plugins.PointLabelTooltip(scatter, labels=labels)
    mpld3.plugins.connect(fig, tooltip)

    # ... and the click handler, which opens the UCSC search URL built from the gene name.
    urls = [ucsc_search.format(tup.name) for tup in genes.itertuples()]
    mpld3.plugins.connect(fig, ClickInfo(scatter, urls))


#     zorder += 1        
#     genes = chalmel_genes_subset
#     pos_list = list()
#     labels = list()
#     for tup in genes.itertuples():
#         x = tup.start + (tup.end - tup.start) / 2
#         pos_list.append(x)
#         labels.append("{} {}".format(tup.name, tup.Pattern))
#         ax1.add_line(Line2D([x, x], [0, 1.1], color='red', zorder=zorder))
#     zorder += 1        
#     scatter = ax1.scatter(pos_list, [1.05 for x in pos_list], c='red', s=50, zorder=zorder)
#     tooltip = mpld3.plugins.PointLabelTooltip(scatter, labels=labels)
#     mpld3.plugins.connect(fig, tooltip)
    
#     urls = [ucsc_search.format(tup.name) for tup in genes.itertuples()]
#     mpld3.plugins.connect(fig, ClickInfo(scatter, urls))

    
#     zorder += 1        
#     genes = trine_line_x_genes
#     pos_list = list()
#     labels = list()
#     for tup in genes.itertuples():
#         x = tup.start + (tup.end - tup.start) / 2
#         pos_list.append(x)
#         labels.append("{}".format(tup.ovlpRepExon))
#         ax1.add_line(Line2D([x, x], [0, 1.1], color='pink', zorder=zorder))
#     zorder += 1        
#     scatter = ax1.scatter(pos_list, [1.05 for x in pos_list], c='pink', s=50, zorder=zorder)
#     tooltip = mpld3.plugins.PointLabelTooltip(scatter, labels=labels)
#     mpld3.plugins.connect(fig, tooltip)
    
#     urls = [ucsc_search.format(tup.ovlpRepExon) for tup in genes.itertuples()]
#     mpld3.plugins.connect(fig, ClickInfo(scatter, urls))

    
    # Region layers: non-African regions are visited in reverse category order and each one
    # gets a higher zorder than the previous, so regions earlier in the category order are
    # painted over later ones. Bar height is the cumulative swept count divided by the
    # non-African total for that window.
    # NOTE(review): presumably this makes the bars read as a stacked chart, which relies on
    # cum_nr_swept having been accumulated in the same category order -- confirm.
    regs = [x for x in plot_df.region_1.cat.categories if x != 'Africa'][::-1]
    for reg in regs:
        df = plot_df.loc[plot_df.region_label_1 == reg]
        zorder += 1        
        for tup in df.itertuples():
            # skip windows where nr_swept is zero for this region
            if tup.nr_swept:
                g = ax1.add_patch(Rectangle((tup.start, 0), tup.end-tup.start, tup.cum_nr_swept/tup.cum_total, 
                                  facecolor=region_colors[reg], 
                                  linewidth=0,
                                  edgecolor=None,#region_colors[reg], 
                                  zorder=zorder))

#     df = plot_df.loc[plot_df.region_label_1 == 'Africa']
#     for tup in df.itertuples():
#         if tup.nr_swept:
#             g = ax2.add_patch(Rectangle((tup.start, 0), tup.end-tup.start, tup.nr_swept/tup.total, 
#                               facecolor=region_colors['Africa'], 
#                               edgecolor=None,#region_colors[reg], 
#                               ))

    # Missing-data layer: semi-transparent light-grey boxes of height 1 over every region
    # flagged is_missing, drawn with the highest zorder.
    zorder += 1        
    for tup in missing_regions.loc[missing_regions.is_missing == True].itertuples():
        g = ax1.add_patch(Rectangle((tup.start, 0), tup.end-tup.start, 1, 
                 facecolor='lightgray', 
                 #edgecolor=None,
                  linewidth=0,
                 alpha=0.5,
                 zorder=zorder))
#         g = ax2.add_patch(Rectangle((tup.start, 0), tup.end-tup.start, 1, 
#                  facecolor='lightgray', 
#                  edgecolor=None,
#                  alpha=0.5,
#                  zorder=zorder))

  
    # Static PDF copy of the figure.
    plt.savefig(str(figures_dir / "tmp2.pdf"))
    #plt.close() # closing the plot suppresses automatic plotting without plt.show()

# Render the figure through mpld3 so the connected tooltip and click plugins are active.
mpld3.display(fig)