In [1]:
import os
import sys
import pandas as pd
import matplotlib
import seaborn as sns
import pickle
import numpy as np
import matplotlib as mpl
mpl.style.use('fivethirtyeight')
from matplotlib import pyplot as plt
from itertools import product
import glob
import re
from matplotlib_venn import venn2
from matplotlib import rcParams
import inspect
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
sys.setrecursionlimit(3000)
sns.set_context('notebook')
%load_ext autoreload
%autoreload 2
rcParams['figure.figsize'] = 8, 6
#rcParams['font.family'] = 'sans-serif'
#rcParams['font.sans-serif'] = ['Tahoma']
import tqdm
### notebook specific configuration ###
from os.path import basename
from tqdm import tqdm_notebook

from Modules.Homer import *

sys.path.append('/home/isshamie/software/NGS-Pipeline')

data_folder = '/data/isshamie/CH_tissue_TSS/'

ref_fa = '/data/genome/hamster/picr/picr.fa'
annotation = '/data/genome/hamster/picr/updated_final_sort.gff3'
tss_annotation = '/data/isshamie/genome/start_site_mRNA_updated_final_sort.tsv'
mrna_peak = '/data/isshamie/TSS_CHO/mRNA.peak'

tissues = ['BloondnegSpleen','BMDM1hKLA1','BMDMwt','Brain','Brain7neg1',
          'FemaleReproductive','Heart','Intestine','Kidney','Liver','Lung',
           'MiscOrgans','Muscle','Pancreas','Skin','Spleen','Total','CHO']

len(tissues)

### Load in meta_sampe

meta_samples = pd.read_csv('Results/meta_samples.csv',index_col=0)

long_meta_samples = pd.read_csv('Results/long_meta_samples.csv',index_col=0)


This call to matplotlib.use() has no effect because the backend has already
been chosen; matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.



In [2]:
txn_df = pickle.load(open('Results/txn_df.p','rb'))

In [3]:
gene_df = pickle.load(open('Results/gene_df.p','rb'))

In [4]:
gene_id_df = pickle.load(open('Results/gene_id_df.p','rb'))

In [5]:
## Only use genes with a peak
gene_df = gene_df[gene_df['hasGene']]

### Load gene_tissue_matrix

In [6]:
gene_tissue_matrix = pd.read_csv('Results/merged_samples/genes_tissues.csv',index_col=0)
gene_tissue_matrix.fillna(0,inplace=True)
## Remove the genes with no peaks, if any
gene_tissue_matrix = gene_tissue_matrix[~(gene_tissue_matrix==0).all(axis=1)]
gene_tissue_matrix.head()

Unnamed: 0,BloondnegSpleen/GRO/f04_peaks/merge_bg_2.peak,BMDM1hKLA1/GRO/f04_peaks/merge_bg_2.peak,BMDMwt/GRO/f04_peaks/merge_bg_2.peak,Brain/GRO/f04_peaks/merge_bg_2.peak,Brain/mSTART/f04_peaks/merge_bg_2.peak,FemaleReproductive/mSTART/f04_peaks/merge_bg_2.peak,Heart/mSTART/f04_peaks/merge_bg_2.peak,Kidney/GRO/f04_peaks/merge_bg_2.peak,Kidney/mSTART/f04_peaks/merge_bg_2.peak,Liver/GRO/f04_peaks/merge_bg_2.peak,Liver/mSTART/f04_peaks/merge_bg_2.peak,Lung/GRO/f04_peaks/merge_bg_2.peak,Lung/mSTART/f04_peaks/merge_bg_2.peak,MiscOrgans/mSTART/f04_peaks/merge_bg_2.peak,Muscle/mSTART/f04_peaks/merge_bg_2.peak,Pancreas/mSTART/f04_peaks/merge_bg_2.peak,Spleen/mSTART/f04_peaks/merge_bg_2.peak,CHO/GRO/f04_peaks/merge_bg_2.peak,CHO/mSTART/f04_peaks/merge_bg_2.peak
SERHL,188.199997,13.3,90.199997,333.5,501.799988,1089.349976,777.75,453.299988,18793.900391,374.600006,337.950012,294.100006,282.199982,697.300049,593.450012,529.299988,201.0,971.449951,819.599976
RRP7A,0.0,0.0,0.0,25.299999,27.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,29.65,0.0,0.0,91.099998,0.0,34.299999
POLDIP3,135.199997,31.799999,97.699997,122.599998,367.649994,148.149994,79.5,84.199997,0.0,85.099998,0.0,132.199997,0.0,174.800003,105.100006,0.0,135.5,317.799988,324.100006
CYB5R3,0.0,91.400002,45.099998,0.0,0.0,14.16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.85
A4GALT,30.799999,324.600006,257.299988,59.900002,0.0,136.25,345.149994,85.300003,48.950001,0.0,0.0,95.199997,0.0,98.949997,291.75,0.0,37.400002,0.0,0.0


### Different ways to separate in CHO and not in CHO
1. Genes expressed in at least one tissue and not in CHO
2. Genes that were expressed in every other tissue and not in CHO samples.    
3. Genes expressed in >= 50% of the tissues and not in CHO samples.  
4. Genes with counts greater than 10 in every other tissue and CHO has counts less than 10.  
5. Genes with counts greater than 10 in >= 50% of the tissuea and CHO has counts less than 10.  

In [7]:
#######################################
## Merge the tissues together to take the max. 
def merge_tissues(tissue_df,tissues):
    ''' Merging samples with the same tissue'''
    gene_tissue_df = pd.DataFrame(index=tissue_df.index,columns=tissues)
    for t in tissues:
        curr_cols = tissue_df.loc[:,tissue_df.columns.str.contains(t)]
        gene_tissue_df.at[:,t] = curr_cols.max(axis=1)

    gene_tissue_df = gene_tissue_df.loc[:,~((gene_tissue_df.isnull()).all(axis=0))] #if any tissues were not processed
    
    return gene_tissue_df


#######################################
def at_least_one_not_cho(tissue_df,cho_df):
    '''Parameters:
        tissue_df: The gene-by-tissue expression matrix. Sample names should have tissue in name
        tissues: List of tissues to consider.
        cho_df: Same rows, but columns are the cho samples.
        A tissue could have more than one sample, and if at least one is present then it is ok.'''
    
    in_cho = cho_df[((cho_df>0).any(axis=1))].index.values
    not_in_cho = tissue_df[((tissue_df>0).any(axis=1)) & ((cho_df==0).all(axis=1))].index.values
    
    return in_cho, not_in_cho


#######################################
def all_but_cho(tissue_df,cho_df):
    '''Parameters:
        tissue_df: The gene-by-tissue expression matrix. Sample names should have tissue in name
        tissues: List of tissues to consider.
        cho_df: Same rows, but columns are the cho samples.
        A tissue could have more than one sample, and if at least one is present then it is ok.'''
    
    in_cho = cho_df[((cho_df>0).any(axis=1))].index.values
    not_in_cho = tissue_df[((tissue_df>0).all(axis=1)) & ((cho_df==0).all(axis=1))].index.values
    
    return in_cho, not_in_cho


#######################################
def half_and_not_cho(tissue_df,cho_df):
    '''Parameters:
        tissue_df: The genexsample expression matrix. 
        A tissue could have more than one sample, and if at least one is present then it is ok.
    '''
    
    fifty = len(tissues)/2
    in_cho = cho_df[((cho_df>0).any(axis=1))].index.values
    not_in_cho = tissue_df[(np.sum(tissue_df>0,axis=1)>fifty) & ((cho_df==0).all(axis=1))].index.values
    
    return in_cho, not_in_cho



#######################################
def count_n_all_but_cho(tissue_df,cho_df,n=10):
    '''Parameters:
        tissue_df: The genexsample expression matrix. 
        A tissue could have more than one sample, and if at least one is present then it is ok.
    '''
    
    in_cho = cho_df[((cho_df>n).any(axis=1))].index.values
    not_in_cho = tissue_df[(np.sum(tissue_df>n,axis=1)==tissue_df.shape[1]) & ((cho_df<=n).all(axis=1))].index.values
    
    return in_cho, not_in_cho

#######################################
def count_ten_half_and_not_cho(tissue_df,cho_df,n=10):
    '''Parameters:
        tissue_df: The genexsample expression matrix. 
        A tissue could have more than one sample, and if at least one is present then it is ok.
    '''
    
    fifty = len(tissues)/2
    in_cho = cho_df[((cho_df>n).any(axis=1))].index.values
    not_in_cho = tissue_df[(np.sum(tissue_df>n,axis=1) >fifty) & ((cho_df <= 0).all(axis=1))].index.values
    
    return in_cho, not_in_cho

In [8]:
gene_tissue_df = merge_tissues(gene_tissue_matrix,tissues)
cho = gene_tissue_df.loc[:,'CHO']
cho = pd.DataFrame(cho)
gene_tissue_df = gene_tissue_df.drop('CHO',axis=1) #Remove CHO columns

In [9]:
in_cho1, not_in_cho1 = at_least_one_not_cho(gene_tissue_df,cho)
print('1')
print(len(in_cho1))
print(len(not_in_cho1))

1
7175
4744


In [10]:
in_cho2, not_in_cho2 = all_but_cho(gene_tissue_df,cho)
print('2')
print(len(in_cho2))
print(len(not_in_cho2))

2
7175
42


In [11]:
in_cho3, not_in_cho3 = half_and_not_cho(gene_tissue_df,cho)
print('3')
print(len(in_cho3))
print(len(not_in_cho3))

3
7175
318


In [12]:
in_cho4, not_in_cho4 = count_n_all_but_cho(gene_tissue_df,cho)
print('4')
print(len(in_cho4))
print(len(not_in_cho4))

4
6834
44


In [13]:
in_cho5, not_in_cho5 = count_ten_half_and_not_cho(gene_tissue_df,cho)
print('5')
print(len(in_cho5))
print(len(not_in_cho5))

5
6834
304


### Apoptotic genes

In [31]:
import xml.etree.ElementTree as et

In [48]:
'100682525 cge'.split()

['100682525', 'cge']

In [58]:
## Load in apoptotic genes
apop1_f  = '/data/isshamie/CH_tissue_TSS/Supplemental_Data/gene_sets/cge04210.xml'
apop2_f = '/data/isshamie/CH_tissue_TSS/Supplemental_Data/gene_sets/cge04215.xml'

apop1 = set()
tree = et.parse(apop1_f)
root = tree.getroot()
for child in root:
    #print child.tag, child.attrib
    if 'name' in child.attrib and 'cge:' in child.attrib['name'] : 
        apop1.add(child.attrib['name'].split(':')[1].split()[0]) #Take the first cge:number 
        
apop2 = set()
tree = et.parse(apop2_f)
root = tree.getroot()
for child in root:
    #print child.tag, child.attrib
    if 'name' in child.attrib and 'cge:' in child.attrib['name']: 
        apop2.add(child.attrib['name'].split(':')[1].split()[0])

In [62]:
tree = et.parse(apop1_f)
root = tree.getroot()
for child in root:
    print child.tag, child.attrib

entry {'type': 'map', 'link': 'http://www.kegg.jp/dbget-bin/www_bget?cge04115', 'id': '1', 'name': 'path:cge04115'}
entry {'type': 'map', 'link': 'http://www.kegg.jp/dbget-bin/www_bget?cge04668', 'id': '2', 'name': 'path:cge04668'}
entry {'type': 'gene', 'link': 'http://www.kegg.jp/dbget-bin/www_bget?cge:100689371', 'id': '3', 'name': 'cge:100689371'}
entry {'type': 'gene', 'link': 'http://www.kegg.jp/dbget-bin/www_bget?cge:100762210', 'id': '4', 'name': 'cge:100762210'}
entry {'type': 'gene', 'link': 'http://www.kegg.jp/dbget-bin/www_bget?cge:100754226', 'id': '5', 'name': 'cge:100754226'}
entry {'type': 'gene', 'link': 'http://www.kegg.jp/dbget-bin/www_bget?cge:100682525', 'id': '6', 'name': 'cge:100682525'}
entry {'type': 'gene', 'link': 'http://www.kegg.jp/dbget-bin/www_bget?cge:100770499', 'id': '7', 'name': 'cge:100770499'}
entry {'type': 'gene', 'link': 'http://www.kegg.jp/dbget-bin/www_bget?cge:100771367', 'id': '8', 'name': 'cge:100771367'}
entry {'type': 'gene', 'link': 'http

In [61]:
print(len(apop2))
print(len(apop1))

22
81


In [60]:
apop2.intersection(apop1)

{'100689032',
 '100689061',
 '100689366',
 '100689368',
 '100689369',
 '100689370',
 '100689371',
 '100753932',
 '100754043',
 '100754361',
 '100754606',
 '100754770',
 '100758500',
 '100760629',
 '100765017',
 '100766857',
 '100774599'}

#### Convert from id to symbol

In [70]:
gene2refseq = pd.read_csv('/data/isshamie/CH_tissue_TSS/Supplemental_Data/gene2refseq',sep='\t',header=0,
                         usecols=['GeneID','Symbol'])
gene2refseq.head()

Unnamed: 0,GeneID,Symbol
0,1246500,repA1
1,1246501,repA2
2,1246502,leuA
3,1246503,leuB
4,1246504,leuC


In [76]:
gene2refseq.shape

(5122505, 2)

In [74]:
apop1_symbol = set()
for i in apop1:
    print(i)
    print(gene2refseq[gene2refseq['GeneID'] == int(i)])

100751775
Empty DataFrame
Columns: [GeneID, Symbol]
Index: []
100689386
Empty DataFrame
Columns: [GeneID, Symbol]
Index: []
100768492
Empty DataFrame
Columns: [GeneID, Symbol]
Index: []
100759228
Empty DataFrame
Columns: [GeneID, Symbol]
Index: []
100750778
Empty DataFrame
Columns: [GeneID, Symbol]
Index: []
100762099
Empty DataFrame
Columns: [GeneID, Symbol]
Index: []
100762210
Empty DataFrame
Columns: [GeneID, Symbol]
Index: []
100682526
Empty DataFrame
Columns: [GeneID, Symbol]
Index: []
100771367
Empty DataFrame
Columns: [GeneID, Symbol]
Index: []
100758620
Empty DataFrame
Columns: [GeneID, Symbol]
Index: []
100774599
Empty DataFrame
Columns: [GeneID, Symbol]
Index: []
100763514
Empty DataFrame
Columns: [GeneID, Symbol]
Index: []
100753147
Empty DataFrame
Columns: [GeneID, Symbol]
Index: []
100772477
Empty DataFrame
Columns: [GeneID, Symbol]
Index: []
100689403
Empty DataFrame
Columns: [GeneID, Symbol]
Index: []
100754226
Empty DataFrame
Columns: [GeneID, Symbol]
Index: []
10077049

### Get list of SecM genes

In [14]:
secM = pd.read_csv('070870-1_SecretoryMachinery.csv',index_col=1,skiprows=4)
secM.head()

Unnamed: 0_level_0,ensgid,Subsystem,category.HPA,tissues.HPA,category.GTEx,tissues.GTEx,fpkm.HPA.adrenal,fpkm.HPA.appendices,fpkm.HPA.bonemarrow,fpkm.HPA.brain,...,fpkm.GTEx.prostate,fpkm.GTEx.skeletalmuscle,fpkm.GTEx.skin,fpkm.GTEx.smallintestine,fpkm.GTEx.spleen,fpkm.GTEx.stomach,fpkm.GTEx.testis,fpkm.GTEx.thyroid,fpkm.GTEx.uterus,fpkm.GTEx.vagina
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
RAB44,ENSG00000255587,Post-Golgi trafficking,Tissue enriched,bonemarrow,Enhanced,blood,0.04,0.49,27.67,0.01,...,0.46,0.06,0.68,0.56,0.89,0.37,0.13,0.05,0.39,0.54
B4GALNT1,ENSG00000135454,Golgi glycosylation,Tissue enriched,brain,Group enriched,"brain.cerebellum,brain.cerebrum,nerve",0.56,0.62,0.03,12.69,...,1.73,2.63,0.26,0.95,0.22,0.48,0.56,0.83,1.21,1.09
NAPB,ENSG00000125814,COPII,Tissue enriched,brain,Group enriched,"brain.cerebellum,brain.cerebrum",21.39,4.8,5.52,156.18,...,5.64,2.05,4.75,5.05,5.22,3.72,7.49,6.72,6.05,6.5
NSF,ENSG00000073969,COPII,Tissue enriched,brain,Expressed in all,,14.46,13.96,4.66,117.52,...,9.25,11.03,9.75,9.38,9.92,8.49,14.16,8.88,6.93,7.71
AGAP2,ENSG00000135439,Trafficking regulation,Tissue enriched,brain,Group enriched,"brain.cerebellum,brain.cerebrum",1.47,7.62,7.22,65.72,...,1.59,3.04,1.03,5.51,9.41,1.36,0.95,1.96,1.05,0.96


#### secM silenced in CHO

In [30]:
secM[secM.index.isin(not_in_cho1)]

Unnamed: 0_level_0,ensgid,Subsystem,category.HPA,tissues.HPA,category.GTEx,tissues.GTEx,fpkm.HPA.adrenal,fpkm.HPA.appendices,fpkm.HPA.bonemarrow,fpkm.HPA.brain,...,fpkm.GTEx.prostate,fpkm.GTEx.skeletalmuscle,fpkm.GTEx.skin,fpkm.GTEx.smallintestine,fpkm.GTEx.spleen,fpkm.GTEx.stomach,fpkm.GTEx.testis,fpkm.GTEx.thyroid,fpkm.GTEx.uterus,fpkm.GTEx.vagina
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
B4GALNT1,ENSG00000135454,Golgi glycosylation,Tissue enriched,brain,Group enriched,"brain.cerebellum,brain.cerebrum,nerve",0.56,0.62,0.03,12.69,...,1.73,2.63,0.26,0.95,0.22,0.48,0.56,0.83,1.21,1.09
AGAP2,ENSG00000135439,Trafficking regulation,Tissue enriched,brain,Group enriched,"brain.cerebellum,brain.cerebrum",1.47,7.62,7.22,65.72,...,1.59,3.04,1.03,5.51,9.41,1.36,0.95,1.96,1.05,0.96
AP3B2,ENSG00000103723,Post-Golgi trafficking,Tissue enriched,brain,Group enriched,"brain.cerebellum,brain.cerebrum,pituitary",2.37,0.28,0.05,17.70,...,1.38,0.18,0.74,0.63,0.83,0.64,4.90,1.22,0.78,0.75
DNAJC6,ENSG00000116675,Post-Golgi trafficking,Tissue enriched,brain,Group enriched,"brain.cerebellum,brain.cerebrum,pituitary",10.31,0.83,2.53,83.68,...,0.46,0.16,0.66,0.52,0.74,0.88,0.94,0.61,0.29,0.64
RAB6B,ENSG00000154917,Post-Golgi trafficking,Tissue enriched,brain,Enhanced,brain.cerebrum,9.89,2.38,6.01,106.17,...,0.91,0.37,0.53,1.27,2.16,1.72,2.63,2.40,1.54,1.07
SNAP25,ENSG00000132639,Post-Golgi trafficking,Tissue enriched,brain,Group enriched,"brain.cerebellum,brain.cerebrum",39.99,3.75,0.03,916.53,...,4.32,0.27,0.57,1.45,0.05,2.23,3.48,0.70,5.51,1.23
CRYAA,ENSG00000160202,Protein folding,Tissue enriched,kidney,Tissue enriched,kidney,0.00,0.00,0.00,0.05,...,0.01,0.00,0.02,0.02,0.03,0.01,0.20,0.01,0.02,0.03
HSPA4L,ENSG00000164070,ERAD,Tissue enriched,testis,Tissue enriched,testis,9.51,0.96,1.03,7.31,...,2.17,0.64,5.85,0.99,0.57,1.44,57.43,1.62,0.87,4.79
ABO,ENSG00000175164,Golgi glycosylation,Mixed,,Enhanced,colon,3.85,4.92,0.40,0.26,...,2.02,0.12,0.74,7.80,0.39,1.29,1.60,7.94,0.76,4.02
PSD4,ENSG00000125637,Trafficking regulation,Mixed,,Mixed,,7.73,16.61,7.83,1.77,...,17.57,0.20,10.51,24.51,35.05,13.41,4.87,16.69,1.13,4.91


In [20]:
secM[secM.index.isin(not_in_cho2)]

Unnamed: 0_level_0,ensgid,Subsystem,category.HPA,tissues.HPA,category.GTEx,tissues.GTEx,fpkm.HPA.adrenal,fpkm.HPA.appendices,fpkm.HPA.bonemarrow,fpkm.HPA.brain,...,fpkm.GTEx.prostate,fpkm.GTEx.skeletalmuscle,fpkm.GTEx.skin,fpkm.GTEx.smallintestine,fpkm.GTEx.spleen,fpkm.GTEx.stomach,fpkm.GTEx.testis,fpkm.GTEx.thyroid,fpkm.GTEx.uterus,fpkm.GTEx.vagina
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HSPA4L,ENSG00000164070,ERAD,Tissue enriched,testis,Tissue enriched,testis,9.51,0.96,1.03,7.31,...,2.17,0.64,5.85,0.99,0.57,1.44,57.43,1.62,0.87,4.79


In [25]:
secM[secM.index.isin(not_in_cho3)]

Unnamed: 0_level_0,ensgid,Subsystem,category.HPA,tissues.HPA,category.GTEx,tissues.GTEx,fpkm.HPA.adrenal,fpkm.HPA.appendices,fpkm.HPA.bonemarrow,fpkm.HPA.brain,...,fpkm.GTEx.prostate,fpkm.GTEx.skeletalmuscle,fpkm.GTEx.skin,fpkm.GTEx.smallintestine,fpkm.GTEx.spleen,fpkm.GTEx.stomach,fpkm.GTEx.testis,fpkm.GTEx.thyroid,fpkm.GTEx.uterus,fpkm.GTEx.vagina
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SNAP25,ENSG00000132639,Post-Golgi trafficking,Tissue enriched,brain,Group enriched,"brain.cerebellum,brain.cerebrum",39.99,3.75,0.03,916.53,...,4.32,0.27,0.57,1.45,0.05,2.23,3.48,0.7,5.51,1.23
HSPA4L,ENSG00000164070,ERAD,Tissue enriched,testis,Tissue enriched,testis,9.51,0.96,1.03,7.31,...,2.17,0.64,5.85,0.99,0.57,1.44,57.43,1.62,0.87,4.79
RAB40B,ENSG00000141542,Post-Golgi trafficking,Mixed,,Mixed,,5.37,1.85,0.42,26.88,...,5.74,7.39,5.88,2.93,2.49,6.62,3.46,3.7,4.54,4.87
RAB11FIP1,ENSG00000156675,Trafficking regulation,Expressed in all,,Expressed in all,,4.35,16.61,30.77,0.88,...,4.8,1.85,13.69,18.17,7.1,10.22,4.41,6.65,2.25,6.86


In [26]:
secM[secM.index.isin(not_in_cho4)]

Unnamed: 0_level_0,ensgid,Subsystem,category.HPA,tissues.HPA,category.GTEx,tissues.GTEx,fpkm.HPA.adrenal,fpkm.HPA.appendices,fpkm.HPA.bonemarrow,fpkm.HPA.brain,...,fpkm.GTEx.prostate,fpkm.GTEx.skeletalmuscle,fpkm.GTEx.skin,fpkm.GTEx.smallintestine,fpkm.GTEx.spleen,fpkm.GTEx.stomach,fpkm.GTEx.testis,fpkm.GTEx.thyroid,fpkm.GTEx.uterus,fpkm.GTEx.vagina
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HSPA4L,ENSG00000164070,ERAD,Tissue enriched,testis,Tissue enriched,testis,9.51,0.96,1.03,7.31,...,2.17,0.64,5.85,0.99,0.57,1.44,57.43,1.62,0.87,4.79


In [27]:
secM[secM.index.isin(not_in_cho5)]

Unnamed: 0_level_0,ensgid,Subsystem,category.HPA,tissues.HPA,category.GTEx,tissues.GTEx,fpkm.HPA.adrenal,fpkm.HPA.appendices,fpkm.HPA.bonemarrow,fpkm.HPA.brain,...,fpkm.GTEx.prostate,fpkm.GTEx.skeletalmuscle,fpkm.GTEx.skin,fpkm.GTEx.smallintestine,fpkm.GTEx.spleen,fpkm.GTEx.stomach,fpkm.GTEx.testis,fpkm.GTEx.thyroid,fpkm.GTEx.uterus,fpkm.GTEx.vagina
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SNAP25,ENSG00000132639,Post-Golgi trafficking,Tissue enriched,brain,Group enriched,"brain.cerebellum,brain.cerebrum",39.99,3.75,0.03,916.53,...,4.32,0.27,0.57,1.45,0.05,2.23,3.48,0.7,5.51,1.23
HSPA4L,ENSG00000164070,ERAD,Tissue enriched,testis,Tissue enriched,testis,9.51,0.96,1.03,7.31,...,2.17,0.64,5.85,0.99,0.57,1.44,57.43,1.62,0.87,4.79
RAB40B,ENSG00000141542,Post-Golgi trafficking,Mixed,,Mixed,,5.37,1.85,0.42,26.88,...,5.74,7.39,5.88,2.93,2.49,6.62,3.46,3.7,4.54,4.87
RAB11FIP1,ENSG00000156675,Trafficking regulation,Expressed in all,,Expressed in all,,4.35,16.61,30.77,0.88,...,4.8,1.85,13.69,18.17,7.1,10.22,4.41,6.65,2.25,6.86


### Golgi glycosylation genes

In [15]:
np.sum(secM.loc[secM_silenced_in_CHO.index,'Subsystem'] == 'Golgi glycosylation')

15

In [16]:
### machinery expressed in CHO
len(secM_in_CHO.index)

318

In [20]:
print('SecM genes not in CHO but in all other tissues')
secM.loc[only_not_in_cho[only_not_in_cho.index.isin(secM.index)].index]

SecM genes not in CHO but in all other tissues


Unnamed: 0,ensgid,Subsystem,category.HPA,tissues.HPA,category.GTEx,tissues.GTEx,fpkm.HPA.adrenal,fpkm.HPA.appendices,fpkm.HPA.bonemarrow,fpkm.HPA.brain,...,fpkm.GTEx.prostate,fpkm.GTEx.skeletalmuscle,fpkm.GTEx.skin,fpkm.GTEx.smallintestine,fpkm.GTEx.spleen,fpkm.GTEx.stomach,fpkm.GTEx.testis,fpkm.GTEx.thyroid,fpkm.GTEx.uterus,fpkm.GTEx.vagina
ERLEC1,ENSG00000068912,Protein folding,Expressed in all,,Expressed in all,,44.69,29.09,16.54,38.64,...,27.4,11.73,14.55,20.24,22.7,21.47,30.54,32.03,25.53,22.53
HSPA4L,ENSG00000164070,ERAD,Tissue enriched,testis,Tissue enriched,testis,9.51,0.96,1.03,7.31,...,2.17,0.64,5.85,0.99,0.57,1.44,57.43,1.62,0.87,4.79
RAB11FIP1,ENSG00000156675,Trafficking regulation,Expressed in all,,Expressed in all,,4.35,16.61,30.77,0.88,...,4.8,1.85,13.69,18.17,7.1,10.22,4.41,6.65,2.25,6.86
