In [3]:
import glob
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
import numpy as np
import seaborn as sns
import scipy
from Bio import SeqIO

In [4]:
pwd

'/Users/katharineshalvarjian/Documents/gtdb/3-notebooks'

### 1. Make dataframes of FPKM values for UAG-containing hits and the Pyl machinery 
`genomic_out.tsv` is a table of all TAG-containing gene hits generated by our `pyl_id.py` script. Using the locus IDs from this hit table, we filter the expression matrix from the RNA-Seq experiments to keep only those hits. Note that the expression matrix stores log2(FPKM) values, not raw FPKM.

In [5]:
# tab-separated table of TAG-containing gene hits; only the 'gene' column
# is needed downstream, as the list of IDs to filter the expression matrix by
hits_path = '../test/genome_extension/genomic_out.tsv'
hits = pd.read_csv(hits_path, sep='\t')
genes = hits.loc[:, 'gene'].tolist()

In [7]:
# load in expression matrix (read from the Excel workbook two directories up)
expression_path = '../../expression_matrix.xlsx'
transcripts = pd.read_excel(expression_path)

In [8]:
# make uag-containing transcript dataframe (print shape)
# keep only the expression rows whose feature ID appears in the hit list
is_uag_hit = transcripts['feature_ids'].isin(genes)
uag_df = transcripts[is_uag_hit]
print(uag_df.shape)
uag_df.head()

(241, 13)


Unnamed: 0,feature_ids,WWM60_TMA_Rep1_expression,WWM60_TMA_Rep2_expression,WWM60_TMA_Rep3_expression,WWM60_MeOH_Rep1_expression,WWM60_MeOH_Rep2_expression,WWM60_MeOH_Rep3_expression,DDN121_Rep1_expression,DDN121_Rep2_expression,DDN121_Rep3_expression,DDN146_Rep1_expression,DDN146_Rep2_expression,DDN146_Rep3_expression
11,MA_RS00060,2.991921,3.04646,3.051049,3.495374,3.097134,3.68779,3.044368,3.911412,4.515122,3.505683,3.070746,3.344965
141,MA_RS00790,0.742586,0.76311,0.907807,0.862242,0.923554,0.965543,1.036992,1.513853,1.704193,1.083109,1.159477,0.920982
146,MA_RS00820,5.443322,5.730322,5.462516,4.224349,3.93154,4.105108,2.032303,2.247768,1.996783,9.05013,9.082222,9.603266
164,MA_RS00920,1.274057,1.552652,1.652701,1.957446,1.435616,1.490524,1.906173,2.221504,2.067296,2.178587,1.590837,1.5392
223,MA_RS01240,0.824632,0.796562,0.509461,0.688995,0.823883,0.432366,1.414651,1.428151,1.969847,0.788444,1.039891,0.38154


In [9]:
# make pyl dataframe (print shape)
# locus tags treated as the Pyl machinery in this notebook
pyl = ['MA_RS00820', 'MA_RS00815', 'MA_RS00810', 'MA_RS00825']
is_pyl_gene = transcripts['feature_ids'].isin(pyl)
pyl_df = transcripts[is_pyl_gene]
print(pyl_df.shape)
pyl_df.head()

(4, 13)


Unnamed: 0,feature_ids,WWM60_TMA_Rep1_expression,WWM60_TMA_Rep2_expression,WWM60_TMA_Rep3_expression,WWM60_MeOH_Rep1_expression,WWM60_MeOH_Rep2_expression,WWM60_MeOH_Rep3_expression,DDN121_Rep1_expression,DDN121_Rep2_expression,DDN121_Rep3_expression,DDN146_Rep1_expression,DDN146_Rep2_expression,DDN146_Rep3_expression
144,MA_RS00810,4.139353,4.47692,4.29857,3.161213,2.034482,2.892552,2.663174,2.395765,2.652914,7.457873,7.683467,8.268377
145,MA_RS00815,4.987418,5.226584,5.232328,3.716822,3.329227,3.517945,0.0,0.0,0.0,8.694005,8.778274,9.278483
146,MA_RS00820,5.443322,5.730322,5.462516,4.224349,3.93154,4.105108,2.032303,2.247768,1.996783,9.05013,9.082222,9.603266
147,MA_RS00825,5.221569,5.437206,5.385096,5.03357,4.579803,4.671564,7.280291,7.906289,7.905697,5.133447,4.80576,5.073752


### 2. Generate expression ratio 
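For each replicate in the RNA-Seq data, generate an expression ratio (i.e. Pyl supply / Pyl demand), where supply is the highest FPKM among the Pyl machinery genes and demand is the summed FPKM of the UAG-containing transcripts. Note that values are first back-transformed from log2(FPKM) to FPKM (2^x).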

In [16]:
# TMA replicates
reps_tma = {}        # expression column name -> supply/demand ratio
reps_tma_list = []   # the same ratios as plain Python floats, in replicate order

for i in range(3):
    name = f"WWM60_TMA_Rep{i+1}_expression"

    # 2**x takes the stored values back to linear scale before combining them
    pyl_fpkm = {gene: fpkm for gene, fpkm in zip(pyl_df['feature_ids'], 2**pyl_df[name])}
    uag_fpkm = {gene: fpkm for gene, fpkm in zip(uag_df['feature_ids'], 2**uag_df[name])}

    # supply = highest-expressed Pyl gene, demand = total over UAG-containing hits
    supply = max(pyl_fpkm.values())
    demand = sum(uag_fpkm.values())
    ratio = supply/demand

    print(f"{name} ratio:\t {ratio}")
    reps_tma[name] = ratio
    reps_tma_list.append(float(ratio))

WWM60_TMA_Rep1_expression ratio:	 0.006413164655245085
WWM60_TMA_Rep2_expression ratio:	 0.006308769762690296
WWM60_TMA_Rep3_expression ratio:	 0.005547893700122758


In [17]:
# MeOH replicates
reps_meoh = {}        # expression column name -> supply/demand ratio
reps_meoh_list = []   # the same ratios as plain Python floats, in replicate order

for i in range(3):
    name = f"WWM60_MeOH_Rep{i+1}_expression"

    # back-transform the stored values (2**x) and key them by feature ID
    pyl_by_gene = dict(zip(pyl_df['feature_ids'], 2**pyl_df[name]))
    tx_by_gene = dict(zip(uag_df['feature_ids'], 2**uag_df[name]))

    # ratio of the top Pyl gene to the summed UAG-containing transcripts
    top_pyl = max(pyl_by_gene.values())
    tx_total = sum(tx_by_gene.values())
    ratio = top_pyl/tx_total

    print(f"{name} ratio:\t {ratio}")
    reps_meoh[name] = ratio
    reps_meoh_list.append(float(ratio))


WWM60_MeOH_Rep1_expression ratio:	 0.01280020701970208
WWM60_MeOH_Rep2_expression ratio:	 0.011156487009686992
WWM60_MeOH_Rep3_expression ratio:	 0.010294800936306785


In [21]:
# t-test on the ratios for meoh v. tma
from scipy.stats import ttest_ind

tma = np.array(reps_tma_list)
meoh = np.array(reps_meoh_list)

# equal_var=False: the two groups are not assumed to share a variance
t_statistic, p_value = ttest_ind(tma, meoh, equal_var=False)
print(t_statistic, p_value)

-6.796125993918543 0.01082949664447211


### 3. Calculate supply ratio by demand ratio 
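For each replicate pair, this multiplies the MeOH/TMA ratio of Pyl supply by the TMA/MeOH ratio of UAG demand, i.e. the MeOH supply/demand ratio divided by the TMA supply/demand ratio.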

In [49]:
# for MeOH and TMA
def _supply_and_demand(column):
    """Return (max Pyl-gene value, summed UAG-hit value) for one expression column.

    Values are back-transformed with 2**x before the max / sum are taken.
    Reads the notebook-level `pyl_df` and `uag_df` frames.
    """
    pyl_fpkm = dict(zip(pyl_df['feature_ids'], 2**pyl_df[column]))
    tx_fpkm = dict(zip(uag_df['feature_ids'], 2**uag_df[column]))
    return max(pyl_fpkm.values()), sum(tx_fpkm.values())


for i in range(3):
    prod_meoh, uag_meoh = _supply_and_demand(f"WWM60_MeOH_Rep{i+1}_expression")
    prod_tma, uag_tma = _supply_and_demand(f"WWM60_TMA_Rep{i+1}_expression")

    # supply is compared as MeOH/TMA and demand as TMA/MeOH, so their product
    # is (supply/demand in MeOH) divided by (supply/demand in TMA)
    prod_ratio = prod_meoh/prod_tma
    tx_ratio = uag_tma/uag_meoh

    print(f"{i+1}:\tproduction: {prod_ratio}\ttranscripts: {tx_ratio}\tratio: {prod_ratio*tx_ratio}")


1:	production: 0.7527528753719643	transcripts: 2.651503349779745	ratio: 1.9959267706050985
2:	production: 0.4504630152692365	transcripts: 3.9257591024940925	ratio: 1.7684092825301405
3:	production: 0.577962361972435	transcripts: 3.210630040640744	ratio: 1.8556233217083795
