In [1]:
#Download dependencies before proceeding

import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
from arboreto.algo import grnboost2
import os
import umap
from matplotlib_venn import venn2

sns.set(rc={'figure.figsize':(12,8)})
sns.set(style='whitegrid')

  from .autonotebook import tqdm as notebook_tqdm


Load in files for analysis

In [2]:
wd = '/home/users/jvs15/MachineLearningProstate'
d = {}
for name in os.listdir(f'{wd}/data/timeseries/'):
    nametag = name.split('_')[1]
    nametag = nametag.split('.')[0]
    if 'Clone' in nametag:
        d[nametag] = pd.read_csv(f'{wd}/data/timeseries/{name}')
        print(d[nametag].head())

      Gene  LNCaP_Clone07_04uM  LNCaP_Clone07_16uM  LNCaP_Clone07_50uM
0      LBH            0.833386            0.000000           14.909345
1     HPGD            0.000000            3.364998           29.818690
2   CHRDL2            0.833386            3.364998           79.516507
3    KRT75            1.666772            3.364998          120.517207
4  CACNA1A            0.000000            0.000000            1.242445
      Gene  LNCaP_Clone10_04uM  LNCaP_Clone10_16uM  LNCaP_Clone10_50uM
0      LBH                 0.0            0.000000           82.704033
1     HPGD                 0.0            0.000000            4.531728
2   CHRDL2                 0.0            1.988285           21.525707
3    KRT75                 0.0            4.970713           60.045394
4  CACNA1A                 0.0            0.994143            0.000000
      Gene  LNCaP_Clone03_04uM  LNCaP_Clone03_16uM  LNCaP_Clone03_50uM
0      LBH            0.776808            4.322279            3.427525
1     

Compare gene expression by time point and generate lists of genes that are continuously up or down

In [33]:
concs = ['04uM', '16uM', '50uM']

# Make a new column called 'Trajectory' that keeps track of gene expression changes between states

# NOTE: This excludes values that have 0 values at two different time points (for now)

genelist = {}
for k, v in d.items():
    cols = [f'LNCaP_{k}_{conc}' for conc in concs]
    v['Trajectory'] = np.select([
                        (v[cols[0]] < v[cols[1]]) & (v[cols[1]] < v[cols[2]]),
                        (v[cols[0]] > v[cols[1]]) & (v[cols[1]] > v[cols[2]]),
                        (v[cols[0]] < v[cols[1]]) & (v[cols[1]] > v[cols[2]]),
                        (v[cols[0]] > v[cols[1]]) & (v[cols[1]] < v[cols[2]])
                        ],
                        ['up', 'down', 'updown', 'downup'])
    # NOTE: Likely important to adjust log math in the future or on case by case basis
    v['FC'] = np.log2((v[cols[2]] + 1) / (v[cols[0]] + 1))
    genelist[k] = {}
    genelist[k]['up'] = v[['Gene', 'FC']].loc[v['Trajectory'] == 'up']
    genelist[k]['down'] = v[['Gene', 'FC']].loc[v['Trajectory'] == 'down']
    

             Gene        FC
148         PARP6 -0.642347
360       GALNT13 -0.353495
399         AP3B2 -0.142725
499    AP000472.2 -2.294442
1108         GIPR -0.697651
...           ...       ...
16958       GSDMC -2.772668
16960       INHBE -2.757845
16961      GPR126 -5.071248
16962       CNTN5 -6.756760
16964     SLITRK3 -7.246436

[2940 rows x 2 columns]
           Gene         FC
370        ATF3  -1.240730
457     NOSTRIN  -0.456668
494       DDIT3  -0.297587
541        LMO2  -1.406958
779    MIR4477A  -0.267455
...         ...        ...
16960     INHBE  -6.211818
16961    GPR126  -6.359012
16962     CNTN5  -8.645336
16963      DSC3  -5.976801
16964   SLITRK3 -10.347906

[3044 rows x 2 columns]
              Gene        FC
20            HES7 -0.572174
321          PDGFC -1.188774
443    RP1-199J3.5 -1.188774
465          PGBD5 -1.203482
523          KCND1 -1.751591
...            ...       ...
16956        CNTN2 -5.155262
16960        INHBE -3.014480
16961       GPR126 -6.001924


Make .rnk files for running GSEA -- should also be helpful for making Venn diagrams in R?

Grab lists of genes that are continuously up or down

In [22]:
def print_genes(genelist, sample, trajectory):
    """
    genelist: nested dictionary. Keys are samples, keys of keys are 'up' or 'down' gene lists
    sample: string representing sample type
    trajectory: string representing geneset type
    """
    for gene in genelist[sample][trajectory]:
        print(gene)
    return

print_genes(genelist, 'Clone03', 'up')

HPGD
CHRDL2
KRT75
CCBE1
ADRA2A
THSD7A
ACP5
MIR34A
RP11-11N9.4
PTGER3
CTAG2
SERPINH1
PCGEM1
SPOCK1
SHISA2
CTNNA2
CTD-2334D19.1
VCX2
CPNE4
C15orf56
COX7B2
RP11-431M3.1
CTC-499J9.1
RP11-368M16.3
MRGPRE
CLEC2B
APOA5
SPRY4
BVES-AS1
TMEFF2
IQSEC3
TRPA1
PRR16
AC011298.2
DIO1
TMEM178A
POPDC3
GLRB
RP13-492C18.2
OAS1
FGF13
COL23A1
TRIM22
RP11-86H7.7
CLEC7A
ZNF626
KCNC2
MAT1A
RP11-760D2.5
SAMD9
LINC00842
VIM
RP11-62C7.2
MID1
DNASE1L3
RPL29P11
HERC5
TRGC1
ASGR1
MIR3189
SLC44A5
HSD17B14
KIRREL
COL3A1
CYP3A5
PSMB8
ASTN1
CRISP3
RP11-214K3.18
AF131217.1
C3orf52
BRDT
WWTR1
TAPSAR1
MAF
DIO3
IL12A
RFX6
MAFF
AKR1C3
ASIP
PARP6
UBA7
FANCD2OS
ORM1
DUSP15
HLA-J
FAM13C
MDK
ZNF385B
ENTPD1
RP11-459C13.1
RHOBTB1
RP11-317N12.1
RSPO4
NT5DC4
ORM2
BDH2
EMID1
LY96
GPNMB
AC090043.1
ERC2
MT-CO1
CXXC4
VEGFB
NPR3
PDE6B
DAB1
AC147651.3
CHN2
MACROD1
JAKMIP1
MYBPC1
PIFO
FABP6
SYT13
RP11-112N23.1
MT-CO2
MIR24-2
RP11-294O2.2
hsa-mir-6723
BRSK1
EFNB3
RP11-475J5.4
DIO3OS
PTPRR
AZGP1
ZNF728
RPL39P38
RP13-258O15.1
MTRNR2L12
BEX5
P