In [2]:
#Download dependencies before proceeding

import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
from arboreto.algo import grnboost2
import os
import umap

sns.set(rc={'figure.figsize':(12,8)})
sns.set(style='whitegrid')

Load in files for analysis

In [3]:
wd = '/home/users/jvs15/MachineLearningProstate'
d = {}
for name in os.listdir(f'{wd}/data/timeseries/'):
    nametag = name.split('_')[1]
    nametag = nametag.split('.')[0]
    if 'Clone' in nametag:
        d[nametag] = pd.read_csv(f'{wd}/data/timeseries/{name}')

      Gene  LNCaP_Clone07_04uM  LNCaP_Clone07_16uM  LNCaP_Clone07_50uM
0      LBH            0.833386            0.000000           14.909345
1     HPGD            0.000000            3.364998           29.818690
2   CHRDL2            0.833386            3.364998           79.516507
3    KRT75            1.666772            3.364998          120.517207
4  CACNA1A            0.000000            0.000000            1.242445
      Gene  LNCaP_Clone10_04uM  LNCaP_Clone10_16uM  LNCaP_Clone10_50uM
0      LBH                 0.0            0.000000           82.704033
1     HPGD                 0.0            0.000000            4.531728
2   CHRDL2                 0.0            1.988285           21.525707
3    KRT75                 0.0            4.970713           60.045394
4  CACNA1A                 0.0            0.994143            0.000000
      Gene  LNCaP_Clone03_04uM  LNCaP_Clone03_16uM  LNCaP_Clone03_50uM
0      LBH            0.776808            4.322279            3.427525
1     

Compare gene expression by time point and generate lists of genes that are continuously up or down

In [4]:
concs = ['04uM', '16uM', '50uM']

# Make a new column called 'Trajectory' that keeps track of gene expression changes between states

# NOTE: This excludes values that have 0 values at two different time points (for now)

genelist = {}
for k, v in d.items():
    cols = [f'LNCaP_{k}_{conc}' for conc in concs]
    v['Trajectory'] = np.select([
                        (v[cols[0]] < v[cols[1]]) & (v[cols[1]] < v[cols[2]]),
                        (v[cols[0]] > v[cols[1]]) & (v[cols[1]] > v[cols[2]]),
                        (v[cols[0]] < v[cols[1]]) & (v[cols[1]] > v[cols[2]]),
                        (v[cols[0]] > v[cols[1]]) & (v[cols[1]] < v[cols[2]])
                        ],
                        ['up', 'down', 'updown', 'downup'])
    # NOTE: Likely important to adjust log math in the future or on case by case basis
    v['FC'] = np.log2((v[cols[2]] + 1) / (v[cols[0]] + 1))
    genelist[k] = {}
    genelist[k]['up'] = v[['Gene', 'FC']].loc[v['Trajectory'] == 'up']
    genelist[k]['down'] = v[['Gene', 'FC']].loc[v['Trajectory'] == 'down']
    

Make .rnk files for running GSEA -- should also be helpful for making Venn diagrams in R?

In [10]:
def make_rnk(genelist, sample, trajectory):
    df = genelist[sample][trajectory].sort_values(by = 'FC')
    df.to_csv(f'{sample}_{trajectory}.rnk', sep='\t', index=False, header=False)
    return

make_rnk(genelist, 'Clone03', 'up')

Grab lists of genes that are continuously up or down

In [7]:
def print_genes(genelist, sample, trajectory):
    """
    genelist: nested dictionary. Keys are samples, keys of keys are 'up' or 'down' gene lists
    sample: string representing sample type
    trajectory: string representing geneset type
    """
    for gene in genelist[sample][trajectory]:
        print(gene)
    return



Gene
FC
