In [1]:
'''
We have a set of ~300 genes with names and scores.

Given an RPKM file, normalize, extract the genes of interest,
and determine score.
'''

'\nWe have a set of ~300 genes with names and scores.\n\nGiven an RPKM file, normalize, extract the genes of interest,\nand determine score.\n'

In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import subprocess

In [5]:
# Colour palettes available to the figures in this notebook.
colorset = [
    '#763C87',
    '#1FA2F4',
    '#0569CE',
    '#F48C36',
    '#E54028',
]
greens = [
    '#37535e',
    '#3b748a',
    '#4095b5',
    '#52aec9',
    '#72bfc4',
    '#93d0bf',
]

# Global matplotlib styling, applied once and inherited by every figure
# created afterwards.
# NOTE(review): the backend is pinned to 'MacOSX'; confirm this is intended
# when the notebook is run on another platform or with inline rendering.
# NOTE(review): 'Myriad Pro' must be installed for the font setting to apply.
mpl.rcParams.update({
    # --- backend / text rendering ---
    'backend': 'MacOSX',
    'text.usetex': 'false',

    # --- fonts ---
    'font.family': 'Myriad Pro',
    'font.size': 18,
    'font.weight': 'bold',
    'axes.labelweight': 'bold',
    'axes.titlesize': 36,

    # --- figure geometry and background (RGBA with alpha 0) ---
    'figure.figsize': (15.0, 10.0),
    'figure.subplot.hspace': 0.4,
    'figure.subplot.bottom': 0.125,
    'figure.edgecolor': (1, 1, 1, 0),
    'figure.facecolor': (1, 1, 1, 0),

    # --- saved output ---
    'savefig.dpi': 600,
    'savefig.format': 'pdf',
    'pdf.fonttype': 42,

    # --- lines and markers ---
    'lines.linewidth': 2.8000000000000003,
    'lines.markersize': 11.200000000000001,
    'lines.markeredgewidth': 0.0,

    # --- ticks and grid ---
    'xtick.major.width': 1.6,
    'xtick.major.pad': 11.200000000000001,
    'ytick.major.width': 1.6,
    'ytick.major.pad': 11.200000000000001,
    'ytick.minor.width': 0.8,
    'grid.linewidth': 1.6,
})

In [8]:
# Short identifier for this dataset; later cells use it to name derived files.
gse = 'and_sorted'
filename = '../../data/sorted/rpkm.txt'

# Tab-separated table: the first row is the header and the first column
# (transcript ID) becomes the index. Uses the public pd.read_csv entry point
# rather than the internal pd.io.parsers module path.
data = pd.read_csv(filename, sep='\t', header=0, index_col=0)

print(data.shape)

# Get gene column: the gene symbol is the text before the first '|' in the
# annotation string. The vectorised .str accessor propagates missing
# annotations as NaN instead of raising on a non-string value.
data['Gene symbol'] = data['Annotation/Divergence'].str.split('|').str[0]

# Sample columns to score, and the matching display labels in the same order.
cols = [
    'Tcell-CD69pos-K99A10-22h-ES FPKM',
    'Tcell-CD69pos-K99A100-22h-ES FPKM',
    'Tcell-CD69pos-PCC01-22h-ES FPKM',
    'Tcell-CD69pos-PCC1-22h-ES FPKM',
]

labels = [
    'CD69+ 10uM K99A',
    'CD69+ 100uM K99A',
    'CD69+ 0.1uM PCC',
    'CD69+ 1uM PCC',
]
data.head()

(24453, 15)


Unnamed: 0_level_0,chr,start,end,strand,Length,Copies,Annotation/Divergence,Tcell-CD69neg-K99A10-22h-ES FPKM,Tcell-CD69neg-nopept-22h-ES FPKM,Tcell-CD69neg-PCC01-22h-ES FPKM,Tcell-CD69neg-PCC1-22h-ES FPKM,Tcell-CD69pos-K99A100-22h-ES FPKM,Tcell-CD69pos-K99A10-22h-ES FPKM,Tcell-CD69pos-PCC01-22h-ES FPKM,Tcell-CD69pos-PCC1-22h-ES FPKM,Gene symbol
Transcript/RepeatID (cmd=analyzeRepeats.pl rna mm10 -strand both -count exons -d Tcell-CD69neg-K99A10-22h-ES Tcell-CD69neg-nopept-22h-ES Tcell-CD69neg-PCC01-22h-ES Tcell-CD69neg-PCC1-22h-ES Tcell-CD69pos-K99A100-22h-ES Tcell-CD69pos-K99A10-22h-ES Tcell-CD69pos-PCC01-22h-ES Tcell-CD69pos-PCC1-22h-ES -rpkm -condenseGenes),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
NM_001038996,chr6,41354105,41357944,+,883,1,Try10|-|-|6|6 B1|protein-coding,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,Try10
NM_001013751,chr5,134557254,134560171,-,2915,1,Syna|Gm453|Gm52|syncytin-A|-|5|5 G2|protein-co...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.049,Syna
NM_009838,chr5,129787356,129846443,+,2156,1,Cct6a|CCT-zeta|CCT-zeta-1|Cct6|Cctz-1|TCP-1-ze...,18.312,0.231,112.499,0,52.881,0.168,117.648,146.131,Cct6a
NM_001252516,chr5,106901889,106925890,-,1426,1,Hfm1|A330009G12Rik|Gm1046|Mer3|Sec63d1|-|5|5 E...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,Hfm1
NM_172779,chrX,56454839,56507843,+,3789,1,Ddx26b|4930535D10Rik|6330505F04Rik|D130066O12|...,7.933,0.0,0.062,0,0.973,0.0,2.995,2.934,Ddx26b


In [9]:
# Sample columns, in order, that are selected for the gene-value export.
# An earlier variant interleaved these with a second list (`meki_cols`, which
# is not defined in the visible notebook); that interleaving is disabled.
cols_ordered = cols # alias of `cols`: same list object, no copy or reordering
cols_ordered

['Tcell-CD69pos-K99A10-22h-ES FPKM',
 'Tcell-CD69pos-K99A100-22h-ES FPKM',
 'Tcell-CD69pos-PCC01-22h-ES FPKM',
 'Tcell-CD69pos-PCC1-22h-ES FPKM']

In [10]:
# Export the gene symbol plus the selected sample columns as a tab-separated
# file (header row, no index column) for the scoring script to read.
# NOTE: the 'data/' directory must already exist; to_csv does not create it.
subdata = data[['Gene symbol'] + cols_ordered]
filename = 'data/{}_gene_values.txt'.format(gse)
subdata.to_csv(filename, header=True, index=False, sep='\t')

# Run the scoring script as a child process and capture its stdout.
# The command is passed as an argument list without a shell, so the file name
# is handed over verbatim (no quoting problems with spaces or shell
# metacharacters). universal_newlines=True returns decoded text, so the
# print below shows readable output instead of a bytes repr.
# check_output raises CalledProcessError if the script exits non-zero.
scores = subprocess.check_output(
    ['python', '../calculate_activation_signature_score.py', '-f', filename],
    universal_newlines=True)
print(scores)



In [11]:
# Compute the scores in-process by importing the scoring module from the
# parent directory. The path entry is added only once, so re-running this
# cell does not keep growing sys.path with duplicates.
import sys
if '../' not in sys.path:
    sys.path.append('../')
from calculate_activation_signature_score import ActivationOptionParser

parser = ActivationOptionParser()
filename = 'data/{}_gene_values.txt'.format(gse)
# `scores` is rebound here to whatever scores_from_file returns; the plotting
# cell below takes len() of it and iterates it as one score per bar.
scores = parser.scores_from_file(filename)




In [14]:
# Bar chart of the activation signature score for each sample.
#
# Layout: bar i starts at x = i + 0.2 and is 0.8 wide, so its centre sits at
# x = i + 0.6, which is where the score text and the tick label are placed.
x_range = range(0, len(scores))
fig, ax = plt.subplots()

# align='edge' anchors the LEFT edge of each bar at i + 0.2. Without it,
# matplotlib >= 2.0 centres bars on the given x, which would shift every bar
# 0.4 to the left of its score label and tick. width is spelled out so the
# layout does not depend on the library default.
# Colours: the palette minus its first entry.
bars = ax.bar([x + .2 for x in x_range], scores,
              width=.8, align='edge', color=list(colorset[1:]))

# Label scores above bars (or below them, for negative scores).
for i, score in enumerate(scores):
    is_negative = score < 0
    offset = -.05 if is_negative else .05
    ax.text(i + .6, score + offset, '%.2f' % score,
            ha='center', va='top' if is_negative else 'bottom', fontsize=14)

ax.set_ylim([-1.1, 1.1])
ax.set_xlim([0, len(scores) + .2])
ax.set_ylabel('Activation Signature Score')

# One tick per bar, centred under it, labelled with the sample description.
ax.set_xticks([x + .6 for x in x_range])
ax.set_xticklabels(labels)

title_str = 'Activation Signature Scores across Peptide Conditions for Sorted Cells'
ax.set_title(title_str, y=1.01)
# The file name is the title with spaces replaced by underscores. No extension
# is given, so the format comes from the 'savefig.format' rcParam.
# NOTE: the 'figures/' directory must already exist.
fig.savefig('figures/{}'.format(title_str.replace(' ', '_')), bbox_inches='tight')