In [1]:
import sys
sys.path.append('../src')

import pandas as pd

# Table S1 - Sample overview

In [2]:
# Read the tab-separated sample overview.
samples = pd.read_csv('../data/raw/samples.txt', sep='\t')

# Rename T2onc strains: replace the numeric strain ids by donor labels.
# Ids that are not keys of the mapping become NaN (Series.map semantics).
strain_map = {1868: 'chr15-donor',
              1869: 'chr1-donor'}
samples['t2onc_type'] = samples['t2onc_type'].map(strain_map)

# Remove columns that should not appear in the supplemental table.
samples = samples.drop(['rnaseq_full_name', 'tumor_type'], axis=1)

# Read the metastasis annotations and keep only rows for mice that are
# present in the sample overview. Boolean-mask selection uses `.loc`;
# the `.ix` indexer used previously is no longer available in pandas >= 1.0.
metastases = pd.read_csv('../data/raw/metastases.txt', sep='\t')
metastases = metastases.loc[metastases['mouse'].isin(samples['mouse'])]

# Summarize metastases per mouse: join all sites of a mouse into a single
# comma-separated string.
# NOTE(review): ', '.join raises if a site value is missing/non-string;
# assumes 'metastasis_site' is fully populated -- confirm against the data.
met_sites = (metastases.groupby('mouse')
                       .agg({'metastasis_site': lambda s: ', '.join(s)})
                       .reset_index())

# Merge metastasis data with samples. The left join keeps every sample;
# mice without an entry in met_sites get NaN for 'metastasis_site'.
samples = pd.merge(samples, met_sites, on='mouse', how='left')
samples.to_excel('../reports/supplemental/tables/table_s1_samples.xlsx', index=False)

# Table S2 - Candidate overview

In [4]:
import pybiomart
from nbsupport import insertions as nb_ins


def _read_annotated_insertions(path):
    """Read a tab-separated insertion table and annotate it with clonality.

    The file at ``path`` is loaded with ``pd.read_csv`` and the resulting
    frame is passed through ``nb_ins.annotate_with_clonality``, whose
    return value is returned unchanged.
    """
    return (pd.read_csv(path, sep='\t')
              .pipe(nb_ins.annotate_with_clonality))


# Read insertions for the combined analysis ('all') and for the two
# per-strain analyses ('1868' and '1869').
insertions = _read_annotated_insertions(
    '../data/processed/shear_splink/all/insertions.cis.rbm.txt')
insertions_1868 = _read_annotated_insertions(
    '../data/processed/shear_splink/1868/insertions.cis.rbm.txt')
insertions_1869 = _read_annotated_insertions(
    '../data/processed/shear_splink/1869/insertions.cis.rbm.txt')

# Summarize various statistics (including donor bias) for genes
# that were identified in the overall CIS analysis. The three results are
# aligned on their index and placed side by side; rows without an
# 'n_samples' value are dropped afterwards.
# NOTE(review): presumably 'n_samples' is produced by gene_statistics on
# the overall insertion set, so the dropna removes genes that only occur
# in a per-strain analysis -- confirm against nbsupport.
ranked_summary = (
    pd.concat([
        nb_ins.gene_statistics(insertions),
        nb_ins.gene_sample_count(insertions_1868, name='n_samples_chr15_donor'),
        nb_ins.gene_sample_count(insertions_1869, name='n_samples_chr1_donor')], axis=1)
    .dropna(subset=['n_samples']))

# Turn the index into a regular 'gene_name' column for merging.
ranked_summary.index.name = 'gene_name'
ranked_summary = ranked_summary.reset_index()

# Annotate location using biomart.
bm_dataset = pybiomart.Dataset(name='mmusculus_gene_ensembl',
                               host='http://www.ensembl.org')

bm_annotation = bm_dataset.query(
    attributes=['external_gene_name', 'chromosome_name',
                'start_position', 'end_position', 'strand'],
    use_attr_names=True)

bm_annotation = bm_annotation.rename(
    columns={'external_gene_name': 'gene_name'})

# Left join keeps every summarized gene, also those without annotation.
# NOTE(review): if a gene name occurs more than once in the biomart
# result, the merge repeats that gene's row -- verify for this dataset.
ranked_summary = pd.merge(ranked_summary, bm_annotation, on='gene_name', how='left')

# Re-order columns for legibility.
ranked_summary = ranked_summary[
    ['gene_name', 'chromosome_name', 'start_position', 'end_position',
     'strand', 'n_samples', 'n_samples_chr15_donor', 'n_samples_chr1_donor',
     'mean_clonality', 'sense_fraction_weighted']]

# Sort by frequency/clonality (both descending). The sorted frame is
# reassigned instead of sorting in place: the frame above is the result of
# a column selection, and mutating it in place can trigger pandas'
# SettingWithCopyWarning.
ranked_summary = ranked_summary.sort_values(['n_samples', 'mean_clonality'],
                                            ascending=False)

ranked_summary.to_excel('../reports/supplemental/tables/table_s2_candidate_overview.xlsx', index=False)

In [5]:
from IPython.display import display

# Show, per chromosome, the rows of the candidate summary whose
# 'chromosome_name' equals that chromosome (compared as a string).
for donor_chrom in ('1', '15'):
    print('Chromosome {}'.format(donor_chrom))
    genes_on_chrom = ranked_summary.query(
        'chromosome_name == {!r}'.format(donor_chrom))
    display(genes_on_chrom)

Chromosome 1


Unnamed: 0,gene_name,chromosome_name,start_position,end_position,strand,n_samples,n_samples_chr15_donor,n_samples_chr1_donor,mean_clonality,sense_fraction_weighted
28,Trp53bp2,1,182409172,182462432,1,17.0,3.0,14.0,0.723605,0.866325
18,Ppp1r12b,1,134754658,134955942,-1,9.0,4.0,5.0,0.461452,0.210206


Chromosome 15


Unnamed: 0,gene_name,chromosome_name,start_position,end_position,strand,n_samples,n_samples_chr15_donor,n_samples_chr1_donor,mean_clonality,sense_fraction_weighted
29,Trps1,15,50654752,50890463,-1,56.0,36.0,20.0,0.598344,0.0489
14,Myh9,15,77760587,77842175,-1,32.0,21.0,10.0,0.774573,0.175493


# Table S5 - Expression

In [3]:
# SB expression table: first column is used as the index; columns are then
# selected by the values of samples['sample'], in that order.
sb_counts = pd.read_csv(
    '../data/processed/expression/expression.sb.txt',
    sep='\t', index_col=0)[samples['sample']]

# Expression tables of the external mouse models, loaded as-is.
kb1p_counts = pd.read_csv(
    '../data/external/mouse-models/expression.kb1p.txt',
    sep='\t', index_col=0)
pten_counts = pd.read_csv(
    '../data/external/mouse-models/expression.pten.txt',
    sep='\t', index_col=0)

# Write each table to its own sheet of one workbook (index included).
expression_sheets = [
    ('SB samples', sb_counts),
    ('KB1P samples', kb1p_counts),
    ('EcadPten samples', pten_counts),
]

with pd.ExcelWriter('../reports/supplemental/tables/table_s5_expression.xlsx') as writer:
    for sheet_title, counts in expression_sheets:
        counts.to_excel(writer, sheet_name=sheet_title)

# Table S6 - Insertions

In [4]:
# Load the insertion tables of the combined ('all') analysis.
# Note: this rebinds `insertions`, which the Table S2 cell assigned from
# the 'insertions.cis.rbm.txt' file.
insertions = pd.read_csv(
    '../data/processed/shear_splink/all/insertions.txt', sep='\t')
insertions_annotated = pd.read_csv(
    '../data/processed/shear_splink/all/insertions.cis.rbm.txt', sep='\t')

# CIS sites plus the insertion table that carries the 'cis_id' column.
cis_sites = pd.read_csv(
    '../data/processed/shear_splink/all/insertions.cis.sites.txt', sep='\t')
cis_insertions = pd.read_csv(
    '../data/processed/shear_splink/all/insertions.cis.txt', sep='\t')

# Two-column table linking 'id' to 'cis_id'.
mapping_columns = ['id', 'cis_id']
cis_mapping = cis_insertions[mapping_columns]

# One sheet per table, written in this order and without the index.
insertion_sheets = [
    ('insertions', insertions),
    ('cis_sites', cis_sites),
    ('cis_mapping', cis_mapping),
    ('insertions_annotated', insertions_annotated),
]

with pd.ExcelWriter('../reports/supplemental/tables/table_s6_insertions.xlsx') as writer:
    for sheet_title, table in insertion_sheets:
        table.to_excel(writer, sheet_name=sheet_title, index=False)