# Complilation of auxiliary data

In [1]:
import re, os, sys#, math, pickle, subprocess, resource, random
from pathlib import Path
import numpy
import pandas
from pandas import DataFrame, Series

numpy.random.seed(7)

### Analysis dirs

In [2]:
root_dir = Path(os.environ['HOME'], 'simons/faststorage/people/kmt')
#meta_data_dir = Path(os.environ['HOME'], 'simons/faststorage/data/metadata')
#steps_dir = root_dir / 'steps'
#argweaver_dir = steps_dir / 'argweaver/output'
results_dir = root_dir / 'results'
#figures_dir = root_dir / 'figures'
data_dir = root_dir / 'data'
#pi_dir = steps_dir / 'pi_stores'
#dist_dir = steps_dir / 'dist_stores'
#male_x_haploid_dir = steps_dir / 'male_x_haploids'

### Local code in the scripts dir on the cluster

In [3]:
scripts_dir = root_dir / 'scripts'
if str(scripts_dir) not in sys.path:
    sys.path.append(str(scripts_dir))

import simons_meta_data
import hg19_chrom_sizes

from ChromosomeWindows import window
import genominterv

  import pandas.util.testing as tm


### Convenience functions

In [4]:
def silent_nanmean(x):
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        return numpy.nanmean(x)

### Plotting setup

In [5]:
%matplotlib inline

# Make inline plots vector graphics instead of raster graphics
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('pdf', 'svg')

import matplotlib
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from matplotlib.patches import Rectangle
# from mpl_toolkits.basemap import Basemap
#matplotlib.rcParams['figure.figsize'] = (20.0, 10.0)

import mpld3

import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("paper")

# lowess for plotting
from statsmodels.nonparametric.smoothers_lowess import lowess
from scipy.interpolate import UnivariateSpline

In /home/kmt/anaconda3/envs/simons/lib/python3.6/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: 
The text.latex.preview rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.
In /home/kmt/anaconda3/envs/simons/lib/python3.6/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: 
The mathtext.fallback_to_cm rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.
In /home/kmt/anaconda3/envs/simons/lib/python3.6/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: Support for setting the 'mathtext.fallback_to_cm' rcParam is deprecated since 3.3 and will be removed two minor releases later; use 'mathtext.fallback : 'cm' instead.
In /home/kmt/anaconda3/envs/simons/lib/python3.6/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: 
The validate_bool_maybe_none function was deprecated in Matplotlib 3.3 and will be removed two minor releases later.
In /home/kmt/anaconda3/e

In [6]:
# silence deprecation warnings (lots from seaborn)
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 
warnings.filterwarnings("ignore", category=numpy.VisibleDeprecationWarning)

# Load genome annotation

### Ampliconic regions and Shriram's regions

In [7]:
ampliconic_regions = (pandas.read_table(str(data_dir / 'coordinates_hg18_hg19_hg38_Amplicons_Gap.txt'))
                      .assign(start=lambda df: df.hg19_start, 
                              end=lambda df: df.hg19_end,
                              chrom='X')
                      .loc[:, ['chrom', 'start', 'end']]
                     )
ampliconic_regions.head()

# ampliconic_regions = (pandas.read_table(str(data_dir / 'AmpliconicRegions_hg19.csv'),
#                                        header=0,
#                                         names=['chrom', 'start', 'end', 'region', 'midpoint',
#                                                         'size', 'alignment', 'pnilsr']
#                                        )
#                       .loc[:, ['chrom', 'start', 'end']]
#                       .assign(chrom = lambda df: [x.replace('chr', '') for x in df.chrom])
#                      )

ampliconic_regions.to_hdf(results_dir / 'ampliconic_regions.hdf', 'df', mode='w')

no_neanderthal_asn = (pandas.read_table(str(data_dir / 'Neanderthal_ASN.csv'), sep=' ', 
                                       header=0, names=['chrom', 'start', 'end'])
                      .assign(chrom = lambda df: [x.replace('chr', '') for x in df.chrom])
                     )
no_neanderthal_asn.to_hdf(results_dir / 'no_neanderthal_asn.hdf', 'df', mode='w')


no_neanderthal_eur = (pandas.read_table(str(data_dir / 'Neanderthal_EUR.csv'), sep=' ',
                                       header=0, names=['chrom', 'start', 'end'])
                      .assign(chrom = lambda df: [x.replace('chr', '') for x in df.chrom])
                     )
no_neanderthal_eur.to_hdf(results_dir / 'no_neanderthal_eur.hdf', 'df', mode='w')

### RefSeq genes

In [8]:
refseq_genes = (pandas.read_table(str(data_dir / 'refseq_genes_hg19_table.txt')))
refseq_genes.chrom = [x.replace('chr', '') for x in refseq_genes.chrom]
refseq_genes.rename(columns={"txStart": "start", "txEnd": 'end'}, inplace=True)
refseq_genes.sort_values(by=['chrom', 'start', 'end'], inplace=True)

refseq_genes.to_hdf(results_dir / 'refseq_genes.hdf', 'df', mode='w')

Density of transcribed Refseq gene

In [9]:
window_size = 100000
@window(size=window_size, fill='hg19')
def gene_density(df):
    if not len(df):
        return 0
    df = genominterv.interval_collapse(df)
    return sum(df.end - df.start) / window_size

gene_density_df = refseq_genes.groupby('chrom').apply(gene_density).reset_index(level=['chrom'])
gene_density_df.head()

Unnamed: 0,chrom,start,end,gene_density
0,1,0,100000,0.0809
1,1,100000,200000,0.05794
2,1,200000,300000,0.0
3,1,300000,400000,0.05629
4,1,400000,500000,0.0


Density of refseq exons

In [10]:
def get_exons(df):
    exon_starts = list()
    exon_ends = list()
    for row in df.itertuples():
        chrom = row.chrom
        exon_starts.extend(list(map(int, row.exonStarts.split(',')[:-1])))
        exon_ends.extend(list(map(int, row.exonEnds.split(',')[:-1])))
    all_exons = DataFrame({'chrom': chrom, 'start': exon_starts, 'end': exon_ends})
    all_exons.sort_values(['start', 'end'], inplace=True)
    collapsed_exons = genominterv.interval_collapse(all_exons)
    return collapsed_exons

# data frame with exon start end coordinates
refseq_exons = refseq_genes.groupby('chrom').apply(get_exons).reset_index(drop=True)

window_size = 100000
@window(size=window_size)
def exon_density(df):
    if not len(df):
        return 0
    return sum(df.end - df.start) / window_size

# windowed exon density
exon_density_df = refseq_exons.groupby('chrom').apply(exon_density).reset_index(level=['chrom'])
exon_density_df.head()

Unnamed: 0,chrom,start,end,exon_density
0,1,0,100000,0.05627
1,1,100000,200000,0.05474
2,1,200000,300000,0.0
3,1,300000,400000,0.05309
4,1,400000,500000,0.0


Merge both into one data frame

In [11]:
refseq_density = pandas.merge(exon_density_df, gene_density_df, on=['chrom', 'start', 'end'])

refseq_density.to_hdf(results_dir / 'refseq_density.hdf', 'df', mode='w')

## mirBase

GFF2 in hg19 ftp://mirbase.org/pub/mirbase/20/genomes/hsa.gff2

In [12]:
mir_base = pandas.read_table(data_dir / 'hsa.gff2.txt', skiprows=8, names=['chrom', 'source', 
                                                                           'method', 'start', 'end', 
                                                                           'score', 'strand',
                                                                          'phase', 'group'])
# turn zero based:
mir_base['start'] -= 1
mir_base['end'] -= 2
mir_base['chrom'] = mir_base.chrom.str.replace('chr', '')
mir_base.to_hdf(results_dir / 'mir_base.hdf', 'df', format='table', mode='w')
mir_base.head()

Unnamed: 0,chrom,source,method,start,end,score,strand,phase,group
0,1,.,miRNA,17368,17434,.,-,.,"ACC=""MI0022705""; ID=""hsa-mir-6859-1"";"
1,1,.,miRNA,30365,30501,.,+,.,"ACC=""MI0006363""; ID=""hsa-mir-1302-2"";"
2,1,.,miRNA,567704,567791,.,-,.,"ACC=""MI0022558""; ID=""hsa-mir-6723"";"
3,1,.,miRNA,1102483,1102576,.,+,.,"ACC=""MI0000342""; ID=""hsa-mir-200b"";"
4,1,.,miRNA,1103242,1103330,.,+,.,"ACC=""MI0000737""; ID=""hsa-mir-200a"";"


### All Ensembl biomart genes:

In [13]:
biomart_genes = pandas.read_table(data_dir / 'biomart_genes_hg19.tsv')
biomart_genes.rename(columns={'Gene name': 'name', 
                   'Chromosome/scaffold name': 'chrom',
                   'Gene start (bp)': 'start', 
                   'Gene end (bp)': 'end', 
                   'Strand': 'strand'}, inplace=True)
biomart_genes.head()

Unnamed: 0,Gene stable ID,chrom,start,end,strand,Gene type,name
0,ENSG00000261657,HG991_PATCH,66119285,66465398,1,protein_coding,SLC25A26
1,ENSG00000223116,13,23551994,23552136,-1,miRNA,AL157931.1
2,ENSG00000233440,13,23708313,23708703,1,pseudogene,HMGA1P6
3,ENSG00000207157,13,23726725,23726825,-1,misc_RNA,RNY3P4
4,ENSG00000229483,13,23743974,23744736,-1,lincRNA,LINC00362


In [14]:
biomart_genes.to_hdf(results_dir / 'biomart_genes.hdf', 'df', format='table', mode='w')

### Ensembl biomart genes with relevant GO ontologies:

In [15]:
names = ['stable_id', 'start', 'end',  'strand',  'name', 'go_acc', 'go_term', 'chrom']
    
biomart_chrX_hg19_go_meiosis = pandas.read_table(str(data_dir / 'biomart_chrX_hg19_go_meiosis.txt'),
                        header=0, names=names)
biomart_chrX_hg19_go_meiosis.to_hdf(results_dir / 'biomart_chrX_hg19_go_meiosis.hdf', 'df', format='table', mode='w')
                                          
biomart_chrX_hg19_go_spermatogenesis = pandas.read_table(str(data_dir / 'biomart_chrX_hg19_go_spermatogenesis.txt'),
                        header=0, names=names)
biomart_chrX_hg19_go_spermatogenesis.to_hdf(results_dir / 'biomart_chrX_hg19_go_spermatogenesis.hdf', 'df', format='table', mode='w')
    
biomart_chrX_hg19_go_apoptosis = pandas.read_table(str(data_dir / 'biomart_chrX_hg19_go_apoptosis.txt'),
                        header=0, names=names)
biomart_chrX_hg19_go_apoptosis.to_hdf(results_dir / 'biomart_chrX_hg19_go_apoptosis.hdf', 'df', format='table', mode='w')
    
# biomart_genes_with_go = pandas.read_table(str(data_dir / 'biomart_genes_hg19_genes_with_releveant_go.txt'),
#                         header=0, names=['stable_id', 'chrom', 'start', 'end', 'strand', 'name', 'go_acc', 'go_term'])
# biomart_genes_with_go.to_hdf(results_dir / 'biomart_genes_with_go.hdf', 'df', format='table', mode='w')

### Phastcons

In [16]:
phastcons_windows = pandas.read_table(data_dir / 'phastCons46way.primates.100kb.tsv',
                 header=0, dtype={'chrom': str}, names=['chrom', 'start', 'end', 'phastcons', 'var', 'skew'])
phastcons_windows.chrom = [x.replace('chr', '') for x in phastcons_windows.chrom]
phastcons_windows = phastcons_windows[['chrom', 'start', 'end', 'phastcons']]

phastcons_windows.to_hdf(results_dir / 'phastcons_windows.hdf', 'df', mode='w')

### Akey 2009 sweep meta analysis

In [17]:
akey_regions = (pandas.read_table(str(data_dir / 'Akey09_TableS1_hg19.tsv'),
                                        header=0,
                                        names=['chrom', 'start', 'end', 'nr_scans', 'references']
                                       )
                      .loc[:, ['chrom', 'start', 'end']]
                      .assign(chrom = lambda df: [x.replace('chr', '') for x in df.chrom])
                     )

akey_regions.to_hdf(results_dir / 'akey_regions.hdf', 'df', mode='w')

# Load great ape pi and ILS

### Pi data from Kiwoong's project

In [18]:
df_list = [pandas.read_table(str(data_dir / 'KiwoongsPiTables/PFW_C.txt')),
           pandas.read_table(str(data_dir / 'KiwoongsPiTables/PFW_G.txt')),
           pandas.read_table(str(data_dir / 'KiwoongsPiTables/PFW_O.txt')),
           pandas.read_table(str(data_dir / 'KiwoongsPiTables/PFW_B.txt'))]
df = pandas.concat(df_list)[['species', 'chr', 'pos', 'n.total', 'pi']]
df.columns = ['species', 'chrom', 'start', 'total', 'pi']
df.chrom = [x.replace('chr', '') for x in df.chrom]
df.start *= 100000
df['end'] = df.start + 100000


# def call_low_ape_pi_regions(df, maxpi, min_analyzed):
#     df = (df
#           .assign(pi = lambda df: df.pi.where(df.total >= min_analyzed)) 
#           .loc[:, ['chrom', 'start', 'end', 'species', 'pi']]
#           .groupby(['chrom', 'start', 'end', 'species'])
#           .aggregate(silent_nanmean)
#           .reset_index()
#           )
    
#     group_df_list = list()
#     for name, group in df.groupby(['chrom', 'species']):
#         group_df = (group
#                   .reset_index()
#                   .assign(run = 1, 
#                           islow = lambda df: df.pi <= maxpi) # turn masked into nan to break runs at missing data
#                   .assign(test = lambda df: (df.islow != df.islow.shift()).cumsum())
#                   .groupby(['chrom', 'species', 'test'])
#                   .aggregate({'start': 'min', 'end': 'max', 'run': 'sum', 'pi': 'mean'})
#                   .reset_index()
#                   .loc[lambda df: df.pi <= maxpi, :]
#              )
#         group_df_list.append(group_df)

#     return pandas.concat(group_df_list)
    
    
def call_low_ape_pi_regions(df, min_analyzed):
    df = (df
          .assign(pi = lambda df: df.pi.where(df.total >= min_analyzed)) 
          .loc[:, ['chrom', 'start', 'end', 'species', 'pi']]
          .groupby(['chrom', 'start', 'end', 'species'])
          .aggregate(silent_nanmean)
          .reset_index()
          )
    
    group_df_list = list()
    for name, group in df.groupby(['chrom', 'species']):
        
        maxpi = group.pi.mean() * 0.2
        
        group_df = (group
                  .reset_index()
                  .assign(run = 1, 
                          islow = lambda df: df.pi <= maxpi) # turn masked into nan to break runs at missing data
                  .assign(test = lambda df: (df.islow != df.islow.shift()).cumsum())
                  .groupby(['chrom', 'species', 'test'])
                  .aggregate({'start': 'min', 'end': 'max', 'run': 'sum', 'pi': 'mean'})
                  .reset_index()
                  .loc[lambda df: df.pi <= maxpi, :]
             )
        group_df_list.append(group_df)

    return pandas.concat(group_df_list)

df = (df.loc[df.chrom == 'X']
          .groupby(['chrom', 'species', 'start', 'end'])
          .aggregate(numpy.mean)
         )    
df = df.reset_index()

#max_pi_fraction_in_low_regions_apes = 0.2
#min_fraction_analyzed_apes = 30000

#low_ape_pi_regions = call_low_ape_pi_regions(df, max_pi_fraction_in_low_regions_apes, min_fraction_analyzed_apes)

max_pi_in_low_regions_apes = 0.0002
min_fraction_analyzed_apes = 30000

#low_ape_pi_regions = call_low_ape_pi_regions(df, max_pi_in_low_regions_apes, min_fraction_analyzed_apes)
low_ape_pi_regions = call_low_ape_pi_regions(df, min_fraction_analyzed_apes)

low_ape_pi_regions.to_hdf(results_dir / 'low_ape_pi_regions.hdf', 'df', mode='w')

In [19]:
low_ape_pi_regions.head()

Unnamed: 0,chrom,species,test,start,end,run,pi
1,X,BO,2,4300000,4500000,2,0.000103
3,X,BO,4,8800000,8900000,1,3.8e-05
5,X,BO,6,9000000,9100000,1,0.000112
7,X,BO,8,10900000,11000000,1,8.8e-05
9,X,BO,10,16000000,16400000,4,6.4e-05


In [20]:
low_ape_pi_regions.species.unique()

array(['BO', 'CC', 'EC', 'ELG', 'NC', 'SO', 'WC', 'WLG'], dtype=object)

## Compute total superset of regions (excl. EWG that was excluded in Nam et al.)

In [21]:
from genominterv import interval_collapse

species_included = ['BO', 'CC', 'EC', 'NC', 'SO', 'WC', 'WLG']#, 'ELG']

df = interval_collapse(low_ape_pi_regions.loc[lambda df: df.species.isin(species_included)]
                       .sort_values(['chrom', 'start', 'end']))

df.assign(length=df.end-df.start).length.sum() / 155270560


0.18805883098508822

### Proportions of ILS in 1Mb windows from great ape ILS project

In [22]:
# great_ape_ils_analysis_dir = Path('/home/kmt/projects/great_ape_ils_maps/analyses')
great_ape_ils_analysis_dir = data_dir / 'great_ape_ils_maps'

window_size = 1000000
col_names = ["start", "end", 'top1', 'top2', 'top3', 'top4']

human_chimp_chrX_ils_path = great_ape_ils_analysis_dir / "multiz_hg19_panTro3_gorGor3_ponAbe2/chr1-22_X.statesBin1000000chrX.import.coalhmm.list_chrHASH4563609074768454572.hdfmerge.minmaf500_maxgap100_maxn02_junkNn.tbl"
human_chimp_chrX_ils = (pandas.read_table(human_chimp_chrX_ils_path, names=col_names)
                        .assign(chrom = 'X',
                                prop_ils = lambda df: (df.top3 + df.top4) / (df.top1 + df.top2 + df.top3 + df.top4),
                                fraction_analyzed = lambda df: (df.top1 + df.top2 + df.top3 + df.top4) / window_size)
                       )

human_orang_chrX_ils_path = great_ape_ils_analysis_dir / "multiz_hg19_ponAbe2_nomLeu1_rheMac2/chr1-22_X.statesBin1000000chrX.import.coalhmm.list_chrHASH-5473120441099014340.hdfmerge.minmaf500_maxgap100_maxn02_junkNn.tbl"
human_orang_chrX_ils = (pandas.read_table(human_orang_chrX_ils_path, names=col_names)
                        .assign(chrom = 'X',
                                prop_ils = lambda df: (df.top3 + df.top4) / (df.top1 + df.top2 + df.top3 + df.top4),
                                fraction_analyzed = lambda df: (df.top1 + df.top2 + df.top3 + df.top4) / window_size)
                       )


human_chimp_chr7_ils_path = great_ape_ils_analysis_dir / "multiz_hg19_panTro3_gorGor3_ponAbe2/chr1-22_X.statesBin1000000chr7.import.coalhmm.list_chrHASH4563609074768454572.hdfmerge.minmaf500_maxgap100_maxn02_junkNn.tbl"
human_chimp_chr7_ils = (pandas.read_table(human_chimp_chr7_ils_path, names=col_names)
                        .assign(chrom = '7',
                                prop_ils = lambda df: (df.top3 + df.top4) / (df.top1 + df.top2 + df.top3 + df.top4),
                                fraction_analyzed = lambda df: (df.top1 + df.top2 + df.top3 + df.top4) / window_size)
                       )

human_orang_chr7_ils_path = great_ape_ils_analysis_dir / "multiz_hg19_ponAbe2_nomLeu1_rheMac2/chr1-22_X.statesBin1000000chr7.import.coalhmm.list_chrHASH-5473120441099014340.hdfmerge.minmaf500_maxgap100_maxn02_junkNn.tbl"
human_orang_chr7_ils = (pandas.read_table(human_orang_chr7_ils_path, names=col_names)
                        .assign(chrom = '7',
                                prop_ils = lambda df: (df.top3 + df.top4) / (df.top1 + df.top2 + df.top3 + df.top4),
                                fraction_analyzed = lambda df: (df.top1 + df.top2 + df.top3 + df.top4) / window_size)
                       )

def call_low_ils_regions(df, max_ils, min_analyzed):
    df = (df
          .assign(prop_ils = lambda df: df.prop_ils.where(df.fraction_analyzed >= min_analyzed)) 
          .loc[:, ['chrom', 'start', 'end', 'prop_ils']]
          .assign(run = 1, 
                  islow = lambda df: df.prop_ils <= max_ils) # turn masked into nan to break runs at missing data
          .assign(test = lambda df: (df.islow != df.islow.shift()).cumsum())
          .groupby(['chrom', 'test'])
                  .aggregate({'start': 'min', 'end': 'max', 'run': 'sum', 'prop_ils': 'mean'})
                  .reset_index()
          .loc[lambda df: df.prop_ils <= max_ils, :]
         )
    return df

max_ils_in_low_regions = 0.05
minimal_fraction_analyzed_ils = 0.3

human_chimp_low_ils_regions_chrX = call_low_ils_regions(human_chimp_chrX_ils, 
                                                        max_ils_in_low_regions, 
                                                        minimal_fraction_analyzed_ils)
human_chimp_low_ils_regions_chrX.to_hdf(results_dir / 'human_chimp_low_ils_regions_chrX.hdf', 'df', mode='w')

human_orang_low_ils_regions_chrX = call_low_ils_regions(human_orang_chrX_ils, 
                                                        max_ils_in_low_regions, 
                                                        minimal_fraction_analyzed_ils)
human_orang_low_ils_regions_chrX.to_hdf(results_dir / 'human_orang_low_ils_regions_chrX.hdf', 'df', mode='w')


human_chimp_low_ils_regions_chr7 = call_low_ils_regions(human_chimp_chr7_ils, 
                                                        max_ils_in_low_regions, 
                                                        minimal_fraction_analyzed_ils)
human_chimp_low_ils_regions_chr7.to_hdf(results_dir / 'human_chimp_low_ils_regions_chr7.hdf', 'df', mode='w')

human_orang_low_ils_regions_chr7 = call_low_ils_regions(human_orang_chr7_ils, 
                                                        max_ils_in_low_regions, 
                                                        minimal_fraction_analyzed_ils)
human_orang_low_ils_regions_chr7.to_hdf(results_dir / 'human_orang_low_ils_regions_chr7.hdf', 'df', mode='w')


-----------------

In [23]:
# human_chimp_chr3_ils_path = great_ape_ils_analysis_dir / "multiz_hg19_panTro3_gorGor3_ponAbe2/results/chr1-22_X.statesBin1000000chr3.import.coalhmm.list_chrHASH4563609074768454572.hdfmerge.minmaf500_maxgap100_maxn02_junkNn.tbl"
# human_chimp_chr3_ils = (pandas.read_table(human_chimp_chr3_ils_path, names=col_names)
#                         .assign(chrom = '3',
#                                 prop_ils = lambda df: (df.top3 + df.top4) / (df.top1 + df.top2 + df.top3 + df.top4),
#                                 fraction_analyzed = lambda df: (df.top1 + df.top2 + df.top3 + df.top4) / window_size)
#                        )
# df = human_chimp_chr3_ils#.loc[(human_chimp_chr9_ils.start > 80e6) & (human_chimp_chr9_ils.start < 100e6)]
# plt.plot(df.start, df.prop_ils)

In [24]:
# human_chimp_chr9_ils_path = great_ape_ils_analysis_dir / "multiz_hg19_panTro3_gorGor3_ponAbe2/results/chr1-22_X.statesBin1000000chr9.import.coalhmm.list_chrHASH4563609074768454572.hdfmerge.minmaf500_maxgap100_maxn02_junkNn.tbl"
# human_chimp_chr9_ils = (pandas.read_table(human_chimp_chr9_ils_path, names=col_names)
#                         .assign(chrom = '9',
#                                 prop_ils = lambda df: (df.top3 + df.top4) / (df.top1 + df.top2 + df.top3 + df.top4),
#                                 fraction_analyzed = lambda df: (df.top1 + df.top2 + df.top3 + df.top4) / window_size)
#                        )
# df = human_chimp_chr9_ils#.loc[(human_chimp_chr9_ils.start > 80e6) & (human_chimp_chr9_ils.start < 100e6)]
# plt.plot(df.start, df.prop_ils) ;

In [25]:
# human_orang_chr9_ils_path = great_ape_ils_analysis_dir / "multiz_hg19_ponAbe2_nomLeu1_rheMac2/results/chr1-22_X.statesBin1000000chr9.import.coalhmm.list_chrHASH-5473120441099014340.hdfmerge.minmaf500_maxgap100_maxn02_junkNn.tbl"
# human_orang_chr9_ils = (pandas.read_table(human_orang_chr9_ils_path, names=col_names)
#                         .assign(chrom = '9',
#                                 prop_ils = lambda df: (df.top3 + df.top4) / (df.top1 + df.top2 + df.top3 + df.top4),
#                                 fraction_analyzed = lambda df: (df.top1 + df.top2 + df.top3 + df.top4) / window_size)
#                        )
# df = human_orang_chr9_ils.loc[(human_chimp_chr9_ils.start > 80e6) & (human_chimp_chr9_ils.start < 110e6)]
# plt.plot(df.start, df.prop_ils) ; 

-----------------

In [26]:
# df = call_low_ils_regions(human_orang_chrX_ils, 
#                            max_ils_in_low_regions, 
#                            minimal_fraction_analyzed_ils)

# df#.loc[(df.start > 50e6) & (df.end < 60e6)]

In [27]:
human_chimp_chrX_ils.loc[human_chimp_chrX_ils.fraction_analyzed >= minimal_fraction_analyzed_ils].prop_ils.mean()

0.1333229928153677

In [28]:
df = (human_chimp_chrX_ils
      .loc[human_chimp_chrX_ils.fraction_analyzed >= minimal_fraction_analyzed_ils]
      .loc[human_chimp_chrX_ils.prop_ils < 0.02]
     )
len(df) / 155.270560

0.11592667663464343

## Load Trine Line's circRNA results

In [29]:
trine_line_genes = pandas.read_csv(data_dir / 'trine_line_data/AnnoTable.csv')
trine_line_genes.drop(trine_line_genes.columns[[0]], axis=1, inplace=True)
trine_line_x_genes = trine_line_genes.loc[trine_line_genes.chr == 'chrX']
trine_line_x_genes.to_hdf(results_dir / 'trine_line_x_genes.hdf', 'df', format='table', mode='w')

# Chalmel genes

Read gene table from Chalmel paper SOM (I exported csv from the excel table):

In [30]:
chalmel_genes = pandas.read_csv(data_dir / 'chalmel_genes.csv', sep=';')
chalmel_genes.head()

Unnamed: 0,ID,GeneName,Description,biological_process terms,molecular_function terms,cellular_component terms,Human_Phenotype,Pattern,Expression in Testis,TF,...,tonsil-non_germinal_center_cells,tonsil-squamous_epithelial_cells,urinary_bladder-urothelial_cells,uterus_post_menopause-cells_in_endometrial_stroma,uterus_post_menopause-glandular_cells,uterus_pre_menopause-cells_in_endometrial_stroma,uterus_pre_menopause-glandular_cells,vagina-squamous_epithelial_cells,vulva_anal_skin-epidermal_cells,Unnamed: 180
0,210262_at,CRISP2,cysteine-rich secretory protein 2,-,-,extracellular space (0005615),-,12,PEHET,-,...,0,0,0,0,0,0,0,0,0,
1,206112_at,ANKRD7,ankyrin repeat domain 7,male gonad development (0008584),-,-,-,11,SEHET,-,...,-,-,-,-,-,-,-,-,-,
2,1558262_at,-,-,-,-,-,-,13,SEHET,-,...,-,-,-,-,-,-,-,-,-,
3,207116_s_at,GAPDHS,"glyceraldehyde-3-phosphate dehydrogenase, sper...",gluconeogenesis (0006094);glycolysis (0006096)...,glyceraldehyde-3-phosphate dehydrogenase (NAD+...,cytosol (0005829);microtubule-based flagellum ...,-,13,SEHET,-,...,-,-,-,-,-,-,-,-,-,
4,231612_at,C4orf35,chromosome 4 open reading frame 35,-,-,-,-,13,SEHET,-,...,-,-,-,-,-,-,-,-,-,


In [31]:
chalmel_genes = chalmel_genes.merge(biomart_genes, left_on=['GeneName'], right_on=['name'], how='inner')
chalmel_genes.head()

Unnamed: 0,ID,GeneName,Description,biological_process terms,molecular_function terms,cellular_component terms,Human_Phenotype,Pattern,Expression in Testis,TF,...,vagina-squamous_epithelial_cells,vulva_anal_skin-epidermal_cells,Unnamed: 180,Gene stable ID,chrom,start,end,strand,Gene type,name
0,210262_at,CRISP2,cysteine-rich secretory protein 2,-,-,extracellular space (0005615),-,12,PEHET,-,...,0,0,,ENSG00000124490,6,49660073,49681274,-1,protein_coding,CRISP2
1,206112_at,ANKRD7,ankyrin repeat domain 7,male gonad development (0008584),-,-,-,11,SEHET,-,...,-,-,,ENSG00000106013,7,117854727,117882785,1,protein_coding,ANKRD7
2,207116_s_at,GAPDHS,"glyceraldehyde-3-phosphate dehydrogenase, sper...",gluconeogenesis (0006094);glycolysis (0006096)...,glyceraldehyde-3-phosphate dehydrogenase (NAD+...,cytosol (0005829);microtubule-based flagellum ...,-,13,SEHET,-,...,-,-,,ENSG00000105679,19,36024314,36036218,1,protein_coding,GAPDHS
3,243208_x_at,ACTL9,actin-like 9,-,protein binding (0005515),cytoplasm (0005737);cytoskeleton (0005856),-,13,SEHET,-,...,2,2,,ENSG00000263000,HG729_PATCH,8762287,8763706,-1,protein_coding,ACTL9
4,243208_x_at,ACTL9,actin-like 9,-,protein binding (0005515),cytoplasm (0005737);cytoskeleton (0005856),-,13,SEHET,-,...,2,2,,ENSG00000181786,19,8807751,8809172,-1,protein_coding,ACTL9


In [32]:
#chalmel_genes.loc[(chalmel_genes.chrom == '3') & (chalmel_genes.start > 45e6) & (chalmel_genes.end < 55e6)].sort_values('start')

In [33]:
chalmel_genes.to_hdf(results_dir / 'chalmel_genes.hdf', 'df', format='table', mode='w')