In [1]:
import os
import sys
import pandas as pd
import matplotlib
import seaborn as sns
import pickle
import numpy as np
import matplotlib as mpl
mpl.style.use('fivethirtyeight')
from matplotlib import pyplot as plt
from itertools import product
import glob
import re
from matplotlib_venn import venn2
from matplotlib import rcParams
import inspect
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
sys.setrecursionlimit(3000)
sns.set_context('notebook')
%load_ext autoreload
%autoreload 2
rcParams['figure.figsize'] = 8, 6
#rcParams['font.family'] = 'sans-serif'
#rcParams['font.sans-serif'] = ['Tahoma']
import tqdm
### notebook specific configuration ###
from os.path import basename
from tqdm import tqdm_notebook

from Modules.Homer import *

# Make the local NGS-Pipeline checkout importable. The membership check keeps
# sys.path from accumulating duplicate entries when this cell is re-run.
ngs_pipeline_dir = '/home/isshamie/software/NGS-Pipeline'
if ngs_pipeline_dir not in sys.path:
    sys.path.append(ngs_pipeline_dir)

# Root folder of the tissue TSS data set.
data_folder = '/data/isshamie/CH_tissue_TSS/'

# Reference genome / annotation inputs (PICR hamster assembly, judging by the paths).
ref_fa = '/data/genome/hamster/picr/picr.fa'
annotation = '/data/genome/hamster/picr/updated_final_sort.gff3'
tss_annotation = '/data/isshamie/genome/start_site_mRNA_updated_final_sort.tsv'
mrna_peak = '/data/isshamie/TSS_CHO/mRNA.peak'

# Sample / tissue labels. Spellings such as 'BloondnegSpleen' are kept exactly
# as they are: the same spelling appears in the column names of the
# gene-by-sample matrix, so "fixing" it here would break lookups.
tissues = [
    'BloondnegSpleen', 'BMDM1hKLA1', 'BMDMwt', 'Brain', 'Brain7neg1',
    'FemaleReproductive', 'Heart', 'Intestine', 'Kidney', 'Liver', 'Lung',
    'MiscOrgans', 'Muscle', 'Pancreas', 'Skin', 'Spleen', 'Total', 'CHO',
]

### Load the sample metadata tables
# Both CSVs use their first column as the row index.
meta_samples = pd.read_csv(
    'Results/meta_samples.csv',
    index_col=0,
)
long_meta_samples = pd.read_csv(
    'Results/long_meta_samples.csv',
    index_col=0,
)


This call to matplotlib.use() has no effect because the backend has already
been chosen; matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.



In [2]:
# Load the pickled transcript-level table. The `with` block closes the file
# handle deterministically instead of leaking it until garbage collection.
# NOTE: pickle can execute arbitrary code on load -- only load files produced
# by this project.
with open('Results/txn_df.p', 'rb') as txn_df_file:
    txn_df = pickle.load(txn_df_file)

In [3]:
# Load the pickled gene-level table. The `with` block closes the file handle
# deterministically instead of leaking it until garbage collection.
# NOTE: pickle can execute arbitrary code on load -- only load files produced
# by this project.
with open('Results/gene_df.p', 'rb') as gene_df_file:
    gene_df = pickle.load(gene_df_file)

In [4]:
# Load the pickled gene-id table. The `with` block closes the file handle
# deterministically instead of leaking it until garbage collection.
# NOTE: pickle can execute arbitrary code on load -- only load files produced
# by this project.
with open('Results/gene_id_df.p', 'rb') as gene_id_df_file:
    gene_id_df = pickle.load(gene_id_df_file)

In [5]:
## Only use genes with a peak
# The 'hasGene' column is used directly as the row selector
# (presumably a boolean flag per row -- TODO confirm).
has_gene_mask = gene_df['hasGene']
gene_df = gene_df[has_gene_mask]

### Load gene_tissue_matrix

In [6]:
# Read the gene-by-sample matrix (first CSV column becomes the row index) and
# replace missing entries with 0 in the same expression.
gene_tissue_matrix = pd.read_csv(
    'Results/merged_samples/genes_tissues.csv',
    index_col=0,
).fillna(0)
gene_tissue_matrix.head()

Unnamed: 0,BloondnegSpleen/GRO/f04_peaks/merge_bg_2.peak,BMDM1hKLA1/GRO/f04_peaks/merge_bg_2.peak,BMDMwt/GRO/f04_peaks/merge_bg_2.peak,Brain/GRO/f04_peaks/merge_bg_2.peak,Brain/mSTART/f04_peaks/merge_bg_2.peak,FemaleReproductive/mSTART/f04_peaks/merge_bg_2.peak,Heart/mSTART/f04_peaks/merge_bg_2.peak,Kidney/GRO/f04_peaks/merge_bg_2.peak,Kidney/mSTART/f04_peaks/merge_bg_2.peak,Liver/GRO/f04_peaks/merge_bg_2.peak,Liver/mSTART/f04_peaks/merge_bg_2.peak,Lung/GRO/f04_peaks/merge_bg_2.peak,Lung/mSTART/f04_peaks/merge_bg_2.peak,MiscOrgans/mSTART/f04_peaks/merge_bg_2.peak,Muscle/mSTART/f04_peaks/merge_bg_2.peak,Pancreas/mSTART/f04_peaks/merge_bg_2.peak,Spleen/mSTART/f04_peaks/merge_bg_2.peak,CHO/GRO/f04_peaks/merge_bg_2.peak,CHO/mSTART/f04_peaks/merge_bg_2.peak
SERHL,188.199997,13.3,90.199997,333.5,501.799988,1089.349976,777.75,453.299988,18793.900391,374.600006,337.950012,294.100006,282.199982,697.300049,593.450012,529.299988,201.0,971.449951,819.599976
RRP7A,0.0,0.0,0.0,25.299999,27.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,29.65,0.0,0.0,91.099998,0.0,34.299999
POLDIP3,135.199997,31.799999,97.699997,122.599998,367.649994,148.149994,79.5,84.199997,0.0,85.099998,0.0,132.199997,0.0,174.800003,105.100006,0.0,135.5,317.799988,324.100006
CYB5R3,0.0,91.400002,45.099998,0.0,0.0,14.16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.85
A4GALT,30.799999,324.600006,257.299988,59.900002,0.0,136.25,345.149994,85.300003,48.950001,0.0,0.0,95.199997,0.0,98.949997,291.75,0.0,37.400002,0.0,0.0


### Split genes into those with a peak in CHO and those without

In [7]:
## Remove the genes with no peaks
# Keep a row as soon as at least one sample column is non-zero.
has_any_peak = (gene_tissue_matrix != 0).any(axis=1)
gene_tissue_matrix = gene_tissue_matrix[has_any_peak]

In [8]:
# Preview the first five rows after the all-zero rows were dropped.
gene_tissue_matrix.head(5)

Unnamed: 0,BloondnegSpleen/GRO/f04_peaks/merge_bg_2.peak,BMDM1hKLA1/GRO/f04_peaks/merge_bg_2.peak,BMDMwt/GRO/f04_peaks/merge_bg_2.peak,Brain/GRO/f04_peaks/merge_bg_2.peak,Brain/mSTART/f04_peaks/merge_bg_2.peak,FemaleReproductive/mSTART/f04_peaks/merge_bg_2.peak,Heart/mSTART/f04_peaks/merge_bg_2.peak,Kidney/GRO/f04_peaks/merge_bg_2.peak,Kidney/mSTART/f04_peaks/merge_bg_2.peak,Liver/GRO/f04_peaks/merge_bg_2.peak,Liver/mSTART/f04_peaks/merge_bg_2.peak,Lung/GRO/f04_peaks/merge_bg_2.peak,Lung/mSTART/f04_peaks/merge_bg_2.peak,MiscOrgans/mSTART/f04_peaks/merge_bg_2.peak,Muscle/mSTART/f04_peaks/merge_bg_2.peak,Pancreas/mSTART/f04_peaks/merge_bg_2.peak,Spleen/mSTART/f04_peaks/merge_bg_2.peak,CHO/GRO/f04_peaks/merge_bg_2.peak,CHO/mSTART/f04_peaks/merge_bg_2.peak
SERHL,188.199997,13.3,90.199997,333.5,501.799988,1089.349976,777.75,453.299988,18793.900391,374.600006,337.950012,294.100006,282.199982,697.300049,593.450012,529.299988,201.0,971.449951,819.599976
RRP7A,0.0,0.0,0.0,25.299999,27.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,29.65,0.0,0.0,91.099998,0.0,34.299999
POLDIP3,135.199997,31.799999,97.699997,122.599998,367.649994,148.149994,79.5,84.199997,0.0,85.099998,0.0,132.199997,0.0,174.800003,105.100006,0.0,135.5,317.799988,324.100006
CYB5R3,0.0,91.400002,45.099998,0.0,0.0,14.16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.85
A4GALT,30.799999,324.600006,257.299988,59.900002,0.0,136.25,345.149994,85.300003,48.950001,0.0,0.0,95.199997,0.0,98.949997,291.75,0.0,37.400002,0.0,0.0


In [9]:
# A gene counts as "in CHO" when either CHO sample column is > 0.
cho_columns = [
    'CHO/GRO/f04_peaks/merge_bg_2.peak',
    'CHO/mSTART/f04_peaks/merge_bg_2.peak',
]
has_cho_peak = (gene_tissue_matrix[cho_columns] > 0).any(axis=1)

# Complementary subsets: every row lands in exactly one of the two frames.
in_cho = gene_tissue_matrix[has_cho_peak]
not_in_cho = gene_tissue_matrix[~has_cho_peak]

In [10]:
# Sanity check: the two subset sizes, then their sum, then the shape of the
# full matrix (the sum should equal the matrix row count).
n_in_cho = len(in_cho)
n_not_in_cho = len(not_in_cho)
for row_count in (n_in_cho, n_not_in_cho, n_not_in_cho + n_in_cho):
    print(row_count)
print(gene_tissue_matrix.shape)

7175
4744
11919
(11919, 19)


### Get list of SecM genes

In [11]:
# Secretory-machinery gene table: the first 4 lines of the file are skipped
# and the second column (position 1) is used as the row index.
secM = pd.read_csv(
    '070870-1_SecretoryMachinery.csv',
    skiprows=4,
    index_col=1,
)
secM.head()

Unnamed: 0_level_0,ensgid,Subsystem,category.HPA,tissues.HPA,category.GTEx,tissues.GTEx,fpkm.HPA.adrenal,fpkm.HPA.appendices,fpkm.HPA.bonemarrow,fpkm.HPA.brain,...,fpkm.GTEx.prostate,fpkm.GTEx.skeletalmuscle,fpkm.GTEx.skin,fpkm.GTEx.smallintestine,fpkm.GTEx.spleen,fpkm.GTEx.stomach,fpkm.GTEx.testis,fpkm.GTEx.thyroid,fpkm.GTEx.uterus,fpkm.GTEx.vagina
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
RAB44,ENSG00000255587,Post-Golgi trafficking,Tissue enriched,bonemarrow,Enhanced,blood,0.04,0.49,27.67,0.01,...,0.46,0.06,0.68,0.56,0.89,0.37,0.13,0.05,0.39,0.54
B4GALNT1,ENSG00000135454,Golgi glycosylation,Tissue enriched,brain,Group enriched,"brain.cerebellum,brain.cerebrum,nerve",0.56,0.62,0.03,12.69,...,1.73,2.63,0.26,0.95,0.22,0.48,0.56,0.83,1.21,1.09
NAPB,ENSG00000125814,COPII,Tissue enriched,brain,Group enriched,"brain.cerebellum,brain.cerebrum",21.39,4.8,5.52,156.18,...,5.64,2.05,4.75,5.05,5.22,3.72,7.49,6.72,6.05,6.5
NSF,ENSG00000073969,COPII,Tissue enriched,brain,Expressed in all,,14.46,13.96,4.66,117.52,...,9.25,11.03,9.75,9.38,9.92,8.49,14.16,8.88,6.93,7.71
AGAP2,ENSG00000135439,Trafficking regulation,Tissue enriched,brain,Group enriched,"brain.cerebellum,brain.cerebrum",1.47,7.62,7.22,65.72,...,1.59,3.04,1.03,5.51,9.41,1.36,0.95,1.96,1.05,0.96


#### secM silenced in CHO

In [12]:
# Rows of the not-in-CHO matrix whose index label also appears in the secM index.
silenced_is_secM = not_in_cho.index.isin(secM.index)
secM_silenced_in_CHO = not_in_cho[silenced_is_secM]

In [13]:
# Rows of the in-CHO matrix whose index label also appears in the secM index.
expressed_is_secM = in_cho.index.isin(secM.index)
secM_in_CHO = in_cho[expressed_is_secM]

In [14]:
# Show the secM annotation rows for the genes silenced in CHO.
silenced_gene_names = secM_silenced_in_CHO.index
secM.loc[silenced_gene_names]

Unnamed: 0,ensgid,Subsystem,category.HPA,tissues.HPA,category.GTEx,tissues.GTEx,fpkm.HPA.adrenal,fpkm.HPA.appendices,fpkm.HPA.bonemarrow,fpkm.HPA.brain,...,fpkm.GTEx.prostate,fpkm.GTEx.skeletalmuscle,fpkm.GTEx.skin,fpkm.GTEx.smallintestine,fpkm.GTEx.spleen,fpkm.GTEx.stomach,fpkm.GTEx.testis,fpkm.GTEx.thyroid,fpkm.GTEx.uterus,fpkm.GTEx.vagina
GALNT6,ENSG00000139629,Golgi glycosylation,Enhanced,stomach,Mixed,,2.63,7.72,4.53,3.23,...,0.78,0.07,2.79,2.88,3.75,7.73,0.46,1.04,0.28,1.01
COPZ1,ENSG00000111481,COPI,Expressed in all,,Expressed in all,,54.86,45.58,28.14,39.07,...,35.75,17.96,32.27,27.99,32.14,32.24,18.72,44.80,31.41,29.72
PGAP1,ENSG00000197121,GPI biosynthesis,Mixed,,Mixed,,8.36,1.15,1.00,7.35,...,2.65,0.59,4.29,2.07,0.52,1.27,1.68,1.75,2.38,2.97
RAB31,ENSG00000168461,Post-Golgi trafficking,Expressed in all,,Expressed in all,,89.24,38.21,30.58,87.18,...,15.42,1.66,7.37,15.76,16.94,18.60,21.74,18.51,30.39,26.52
SMCHD1,ENSG00000101596,Trafficking regulation,Expressed in all,,Expressed in all,,6.45,23.04,19.12,5.89,...,9.68,4.55,5.94,12.57,16.69,6.99,15.16,7.67,11.72,10.36
RABGAP1L,ENSG00000152061,Trafficking regulation,Expressed in all,,Expressed in all,,7.77,20.21,16.30,20.52,...,2.06,0.72,3.12,4.07,3.62,2.38,3.29,2.47,2.33,2.29
MGAT4C,ENSG00000182050,Golgi glycosylation,Enhanced,"brain,thyroid",Enhanced,"thyroid,testis",2.79,0.18,0.00,6.53,...,0.07,0.00,0.02,0.05,0.00,0.04,2.42,2.58,0.02,0.06
RAB29,ENSG00000117280,Post-Golgi trafficking,Expressed in all,,Expressed in all,,11.73,17.98,3.77,13.63,...,8.87,2.22,3.53,8.90,14.33,7.00,2.42,14.61,5.14,4.77
PIGN,ENSG00000197563,GPI biosynthesis,Expressed in all,,Expressed in all,,7.88,7.65,3.19,6.50,...,4.33,1.88,5.01,2.66,2.43,4.73,4.42,3.78,2.68,4.66
RAB2B,ENSG00000129472,COPI,Expressed in all,,Expressed in all,,11.15,7.29,10.85,10.08,...,12.24,2.40,9.00,8.37,9.58,6.77,11.35,14.07,9.26,8.66


### Golgi glycosylation genes

In [15]:
# Count the CHO-silenced secM genes whose 'Subsystem' is Golgi glycosylation.
silenced_subsystems = secM.loc[secM_silenced_in_CHO.index, 'Subsystem']
(silenced_subsystems == 'Golgi glycosylation').sum()

15

In [16]:
### machinery expressed in CHO
# Number of rows in the secM-in-CHO subset.
secM_in_CHO.shape[0]

318

## List of genes active in all other GRO samples but not in CHO

In [17]:
# Positional split of the sample columns: the last two columns are taken as
# the CHO samples (matches the column order in the header displayed above),
# everything before them as the non-CHO samples.
gene_CHO_matrix = gene_tissue_matrix.iloc[:, -2:]
gene_tissue_matrix_not_CHO = gene_tissue_matrix.iloc[:, :-2]

# Of the non-CHO samples, keep only the columns whose name contains 'GRO'.
is_gro_column = gene_tissue_matrix_not_CHO.columns.str.contains('GRO')
GRO_gene_tissue_matrix_not_CHO = gene_tissue_matrix_not_CHO.loc[:, is_gro_column]

# Genes that are > 0 in every non-CHO GRO column and exactly 0 in both CHO columns.
peak_in_every_gro_sample = (GRO_gene_tissue_matrix_not_CHO > 0).all(axis=1)
no_peak_in_cho = (gene_CHO_matrix == 0).all(axis=1)
only_not_in_cho = gene_tissue_matrix[peak_in_every_gro_sample & no_peak_in_cho]

In [18]:
# Format the count into a single string so the line prints the same way under
# Python 2 (where `print(a, b)` prints a tuple) and Python 3.
# Note: "all other tissues" here means every non-CHO GRO sample column.
print('Not in CHO but in all other tissues: {}'.format(len(only_not_in_cho)))

('Not in CHO but in all other tissues: ', 178)


In [19]:
# Print one gene name (row index label) per line.
for gene_name in only_not_in_cho.index:
    print(gene_name)

TTLL1
MAPK12
KRT18
NCKAP1L
ERBB4
DOCK10
SP140L
SH2D1B
DNM3
AXDND1
MGAT4A
PI4K2B
HS3ST1
ABAT
SEC14L5
MAZ
ARHGAP26
GPR155
KIF5C
ENPP5
RASGEF1B
PLEKHF1
ZNF570
GAS6
ERLEC1
UGP2
GM11992
NACAD
PGAP3
ZCCHC12
FAM213B
PLSCR1
ZNF717
PLEKHA4
CD37
ZNF160
HA2J_MOUSE
H2-EB1
ATG9B
AKR1C9
ECHDC3
CAMK1D
MAP3K8
MPP7
GVIN1
SNX20
BBS2
TPPP3
NUDT7
MGLL
LILRA5
NILR1
LILRB3
SLC37A2
NXPE2
ARHGAP4
TSEN34
FAM213A
IL17RD
CLEC4A
CLIP3
ENTPD6
SLC4A11
MYEF2
GATM
INO80
DDB2
SPI1
PPP1R3C
MPEG1
MS4A6B
PNMA8B
SAMD9L
ABLIM1
GK
TMEM47
GJA6
GPM6B
ITM2A
ZBTB25
TMEM156
RASL11B
KDR
STARD5
CTSC
UCP2
MAATS1
FAM105A
GPC2
CREB3L2
SLCO3A1
NTRK3
STK32C
TUBB2A
IRF4
CMAH
HIST1H1A
HIST1H2AF
HIST1H1B
F11R
ID4
HSPA4L
CPM
ABCA17
ENPP2
MYSM1
PKD2L2
VGLL4
RASSF4
WISP1
FAM49B
ALOX5AP
N4BP2L1
PDE7B
ARFGEF3
FUCA2
SEPT9
RGS9
TBX3
IL1RN
SLC16A10
TAL1
RBM38
SLC2A10
ZBTB7A
NUDT16
MANF
RTP3
TRMT10B
UNC13B
EPHA7
OGFRL1
GUCY1B3
RBM46
NES
GON4L
MYO9A
SEMA4F
MID1IP1
CLCN5
IDS
CCR1
OPLAH
CH082_RAT
NCF4
LRRC8D
RASGRP3
ZNF280C
CLEC7A
WBP11
PCDH9
TOM1L1


In [20]:
print('SecM genes not in CHO but in all other tissues')
# Restrict the CHO-silenced gene names to those present in the secM index,
# then display their secM annotation rows.
only_not_in_cho_is_secM = only_not_in_cho.index.isin(secM.index)
secM.loc[only_not_in_cho.index[only_not_in_cho_is_secM]]

SecM genes not in CHO but in all other tissues


Unnamed: 0,ensgid,Subsystem,category.HPA,tissues.HPA,category.GTEx,tissues.GTEx,fpkm.HPA.adrenal,fpkm.HPA.appendices,fpkm.HPA.bonemarrow,fpkm.HPA.brain,...,fpkm.GTEx.prostate,fpkm.GTEx.skeletalmuscle,fpkm.GTEx.skin,fpkm.GTEx.smallintestine,fpkm.GTEx.spleen,fpkm.GTEx.stomach,fpkm.GTEx.testis,fpkm.GTEx.thyroid,fpkm.GTEx.uterus,fpkm.GTEx.vagina
ERLEC1,ENSG00000068912,Protein folding,Expressed in all,,Expressed in all,,44.69,29.09,16.54,38.64,...,27.4,11.73,14.55,20.24,22.7,21.47,30.54,32.03,25.53,22.53
HSPA4L,ENSG00000164070,ERAD,Tissue enriched,testis,Tissue enriched,testis,9.51,0.96,1.03,7.31,...,2.17,0.64,5.85,0.99,0.57,1.44,57.43,1.62,0.87,4.79
RAB11FIP1,ENSG00000156675,Trafficking regulation,Expressed in all,,Expressed in all,,4.35,16.61,30.77,0.88,...,4.8,1.85,13.69,18.17,7.1,10.22,4.41,6.65,2.25,6.86


### Active in at least 5 GRO tissues (more than 4) but not CHO

In [21]:
# List the non-CHO GRO sample columns as a plain array.
np.asarray(GRO_gene_tissue_matrix_not_CHO.columns)

array(['BloondnegSpleen/GRO/f04_peaks/merge_bg_2.peak',
       'BMDM1hKLA1/GRO/f04_peaks/merge_bg_2.peak',
       'BMDMwt/GRO/f04_peaks/merge_bg_2.peak',
       'Brain/GRO/f04_peaks/merge_bg_2.peak',
       'Kidney/GRO/f04_peaks/merge_bg_2.peak',
       'Liver/GRO/f04_peaks/merge_bg_2.peak',
       'Lung/GRO/f04_peaks/merge_bg_2.peak'], dtype=object)

In [24]:
# Minimum number of non-CHO GRO sample columns in which a gene must be > 0.
# (The original test was `> 4`, i.e. at least 5; the variable name
# `four_gene_tissue_matrix` is kept because other cells may refer to it.)
min_gro_tissues = 5

# Per gene: how many non-CHO GRO columns are > 0.
n_gro_tissues_with_peak = (GRO_gene_tissue_matrix_not_CHO > 0).sum(axis=1)

# Genes meeting the threshold that are exactly 0 in both CHO columns.
active_in_enough_gro = n_gro_tissues_with_peak >= min_gro_tissues
absent_from_cho = (gene_CHO_matrix == 0).all(axis=1)
four_gene_tissue_matrix = gene_tissue_matrix[active_in_enough_gro & absent_from_cho]

print(len(four_gene_tissue_matrix))

590


### Apoptotic genes