# Figshare dataset

This notebook produces the gene expression and insertion tables included in the Figshare dataset.

In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Load the autoreload extension and set it to mode 2, so edited modules are
# re-imported automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

import sys
# Make modules located under ../src (relative to the notebook's working
# directory) importable.
sys.path.append('../src')

import pandas as pd
import seaborn as sns

# Apply seaborn's 'white' style to any figures drawn in this session.
sns.set_style('white')

First we copy over the sample overview from the supplemental tables, which describes the samples in both datasets.

In [2]:
# Create the Figshare output directory; -p makes this a no-op when it already exists.
! mkdir -p ../reports/figshare
# Copy supplemental table S1 (samples) into the Figshare directory as sample_overview.xlsx.
! cp ../reports/supplemental/tables/table_s1_samples.xlsx ../reports/figshare/sample_overview.xlsx

Next, we create an additional Excel file that contains the raw RNA-seq counts for the three different mouse datasets (SB, EcadPten and KB1P).

In [None]:
def _read_counts(path):
    """Read a tab-separated count table, using its first column as the index."""
    return pd.read_csv(path, sep='\t', index_col=0)


# Load the gene count table of each dataset.
sb_counts = _read_counts('../data/processed/sb/rnaseq/gene_counts.txt')
kb1p_counts = _read_counts('../data/processed/kb1p/gene_counts.txt')
pten_counts = _read_counts('../data/processed/pten/gene_counts.txt')

# (sheet name, table) pairs, listed in the order the sheets are written.
count_sheets = [
    ('SB samples', sb_counts),
    ('KB1P samples', kb1p_counts),
    ('EcadPten samples', pten_counts),
]

# Write every table to its own sheet of a single workbook; the index
# (first column of the input file) is kept as the first sheet column.
with pd.ExcelWriter('../reports/figshare/expression.xlsx') as writer:
    for sheet_name, counts in count_sheets:
        counts.to_excel(writer, sheet_name=sheet_name)

Finally, we create an Excel file for the SB insertion dataset that contains the insertions, the cis sites, the insertion-to-cis mapping (derived from the cis insertions) and the annotated insertions.

In [None]:
# Directory holding all insertion-related tables read below.
insertion_dir = '../data/processed/sb/shear_splink/subset/all/'


def _read_tsv(file_name):
    """Read a tab-separated table from the insertion directory."""
    return pd.read_csv(insertion_dir + file_name, sep='\t')


# Load the insertion tables.
insertions = _read_tsv('insertions.txt')
insertions_annotated = _read_tsv('insertions.cis.rbm.txt')

cis_sites = _read_tsv('insertions.cis.sites.txt')
cis_insertions = _read_tsv('insertions.cis.txt')

# Keep only the two columns that link an insertion id to a cis id.
mapping_columns = ['id', 'cis_id']
cis_mapping = cis_insertions[mapping_columns]

# (sheet name, table) pairs, listed in the order the sheets are written.
insertion_sheets = [
    ('insertions', insertions),
    ('cis_sites', cis_sites),
    ('cis_mapping', cis_mapping),
    ('insertions_annotated', insertions_annotated),
]

# Write every table to its own sheet of a single workbook, without the
# DataFrame index.
with pd.ExcelWriter('../reports/figshare/insertions.xlsx') as writer:
    for sheet_name, table in insertion_sheets:
        table.to_excel(writer, sheet_name=sheet_name, index=False)