# GMS Intro to Stats
## Exercise 2a: Dealing with uncertainty from raw experimental data
### Comparing TNF gene expression across different doses of LPS in CRISPR-edited cell lines.

#### Background on the data
+ Ankylosing Spondylitis is a chronic inflammatory disorder.
+ Through GWAS, the gene *TNFRSF1A* has been implicated in this disease.
+ We would like to understand the regulatory elements around this gene.
+ CRISPR/Cas9 was used to delete putative enhancers, and TNF expression was then measured by qPCR in the resulting cell lines for different doses of LPS.

In [None]:
# Import the packages we need
# Manipulate the data
import pandas as pd
import numpy as np
import scipy as sp
# Plot the data
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [None]:
# Load the replicate-1 summary table; the first two CSV columns become
# a two-level row index.
rep1 = pd.read_csv(
    'TNF Dose response Rep 1 summary.csv',
    index_col=[0, 1],
)
rep1

In [None]:
# Average the two Cq readings recorded for each gene.
# Maps each new mean column to the pair of Cq columns it summarises.
cq_pairs = {
    'TNF_mean': ['TNF Cq 1', 'TNF Cq 2'],
    'b-actin_mean': ['b-actin Cq 1', 'b-actin Cq 2'],
}
for mean_col, cq_cols in cq_pairs.items():
    rep1[mean_col] = rep1[cq_cols].mean(axis=1)
rep1

In [None]:
# Normalise TNF against the housekeeping gene (b-actin):
# DCT = mean TNF Cq - mean b-actin Cq.
rep1['DCT'] = rep1['TNF_mean'].sub(rep1['b-actin_mean'])
rep1

In [None]:
# Subtract the unstimulated baseline from every row's DCT, then convert
# the result to a fold change (2 ** -DDCT).
#
# The baseline is whatever the most recent row labelled 'unstimulated' held,
# so rows are processed strictly in table order.
# NOTE(review): this assumes each group of rows starts with its
# 'unstimulated' row; any row seen before the first one is compared against
# a baseline of 0 - confirm against the CSV layout.
dct = rep1['DCT']
baseline = 0
ddct_values = []
for row_key in rep1.index.values:
    row_dct = dct.loc[row_key]
    if 'unstimulated' in row_key:
        # New baseline: this row's DDCT becomes exactly 0.
        baseline = row_dct
    ddct_values.append(row_dct - baseline)

rep1['DDCT'] = ddct_values
rep1['ExDDCT_rep1'] = 2 ** (-rep1['DDCT'])
rep1

In [None]:
# Replicate 2: load the table, average the paired Cq readings, normalise to
# b-actin (DCT), subtract the unstimulated baseline (DDCT) and convert to a
# fold change (2 ** -DDCT).
rep2 = pd.read_csv(
    'TNF Dose response Rep 2 summary.csv',
    index_col=[0, 1],
)

# Each average column summarises the two Cq readings for one gene.
for avg_col, cq_cols in {
    'TNF_average': ['TNF Cq 1', 'TNF Cq 2'],
    'b-actin_average': ['b-actin Cq 1', 'b-actin Cq 2'],
}.items():
    rep2[avg_col] = rep2[cq_cols].mean(axis=1)

rep2['DCT'] = rep2['TNF_average'].sub(rep2['b-actin_average'])

# Walk the rows in table order, remembering the most recent 'unstimulated'
# DCT as the baseline for the rows that follow it.
# NOTE(review): rows seen before the first 'unstimulated' row use a baseline
# of 0 - confirm the CSV always lists the unstimulated row first.
unstimulated_value = 0
ddct = []
for row_key in rep2.index.values:
    row_dct = rep2['DCT'].loc[row_key]
    if 'unstimulated' in row_key:
        unstimulated_value = row_dct
    ddct.append(row_dct - unstimulated_value)

rep2['DDCT'] = ddct
rep2['ExDDCT_rep2'] = 2 ** (-rep2['DDCT'])
rep2

In [None]:
# Replicate 3: identical processing to the earlier replicates -
# paired-Cq averages -> DCT (vs b-actin) -> DDCT (vs unstimulated) -> 2 ** -DDCT.
rep3 = pd.read_csv(
    'TNF Dose response Rep 3 summary.csv',
    index_col=[0, 1],
)

tnf_cq = rep3[['TNF Cq 1', 'TNF Cq 2']]
rep3['TNF_average'] = tnf_cq.mean(axis=1)
actin_cq = rep3[['b-actin Cq 1', 'b-actin Cq 2']]
rep3['b-actin_average'] = actin_cq.mean(axis=1)
rep3['DCT'] = rep3['TNF_average'].sub(rep3['b-actin_average'])

# The baseline is the DCT of the latest 'unstimulated' row encountered while
# scanning the table top to bottom.
# NOTE(review): a baseline of 0 is used until the first 'unstimulated' row is
# reached - verify the row ordering in the CSV.
unstimulated_value = 0
ddct = []
for row_key in rep3.index.values:
    row_dct = rep3['DCT'].loc[row_key]
    if 'unstimulated' in row_key:
        unstimulated_value = row_dct
    ddct.append(row_dct - unstimulated_value)

rep3['DDCT'] = ddct
rep3['ExDDCT_rep3'] = 2 ** (-rep3['DDCT'])
rep3

In [None]:
# Put the fold-change column of each replicate side by side, aligned on the
# shared row index.
fold_changes = [
    rep1['ExDDCT_rep1'],
    rep2['ExDDCT_rep2'],
    rep3['ExDDCT_rep3'],
]
all_reps = pd.concat(fold_changes, axis=1)
all_reps

In [None]:
# Calculate what we need for error bars: per-row mean, sample standard
# deviation, standard error of the mean and the 95% confidence half-width.
from scipy import stats  # local import: only this cell needs the t-distribution

rep_cols = ['ExDDCT_rep1', 'ExDDCT_rep2', 'ExDDCT_rep3']

# Number of replicates actually present in each row. mean() and std() skip
# NaN, so the sample size must be counted the same way instead of assuming 3.
n_reps = all_reps[rep_cols].count(axis=1)

all_reps['Mean'] = all_reps[rep_cols].mean(axis=1)
all_reps['StDev'] = all_reps[rep_cols].std(axis=1)  # sample SD (ddof=1)
all_reps['StErr'] = all_reps['StDev'] / np.sqrt(n_reps)

# Two-sided 95% interval: t quantile at 0.975 with n-1 degrees of freedom.
# For 3 replicates (df=2) this is ~4.303, the value previously hard-coded.
# Rows with fewer than 2 replicates come out as NaN.
t_crit = stats.t.ppf(0.975, df=n_reps - 1)
all_reps['CI_95'] = all_reps['StErr'] * t_crit
all_reps

In [None]:
# Grouped bar chart: one group per index entry under 'Intron enh C5', one bar
# per cell line, with the standard error as the error bar.
fig, ax = plt.subplots()

# Cell line -> bar colour.
cell_lines = {'Intron enh C5': 'aqua',
              'Intergenic enh C8.4': 'orange',
              'Intergenic enh G1': 'springgreen',
              'SFC840-03-03 Ctrl': 'hotpink'}

w = 0.2
# The x positions and tick labels come from the rows of the first cell line.
group_labels = all_reps.loc['Intron enh C5'].index
ind = np.arange(len(group_labels))

# Shift each bar so the whole group is centred on its tick: with n bars the
# centres sit at (c - (n - 1) / 2) * w for c = 0..n-1. The previous fixed
# 1.33 left every group slightly right of its tick.
centre = (len(cell_lines) - 1) / 2
for c, (cell_line, colour) in enumerate(cell_lines.items()):
    ax.bar(ind + (c - centre) * w,
           all_reps['Mean'].loc[cell_line].values,
           width=w,
           yerr=all_reps['StErr'].loc[cell_line].values,
           align='center',
           color=colour,
           label=cell_line)

ax.set_xticks(ind)
ax.set_xticklabels(group_labels.values)
ax.legend(loc=2)