In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

# Construct strain-by-gene table

In [35]:
# Load the long-format count table; the code below uses its 'Strain', 'Replicate',
# 'Gene' and 'Count' columns.
counts_long = pd.read_csv('processed_count_table_0915.csv', index_col=0)

# Make every replicate its own row label by appending the replicate number
# to the strain name, e.g. '<strain>-<replicate>'.
counts_long['Strain'] = [
    strain + '-' + str(replicate)
    for strain, replicate in zip(counts_long['Strain'], counts_long['Replicate'])
]

# Pivot to wide form: one row per replicate-level strain ID, one column per gene.
# Repeated (Strain, Gene) pairs are averaged.
df_strain_by_gene = counts_long.pivot_table(
    index='Strain',
    columns='Gene',
    values='Count',
    aggfunc='mean',
).rename_axis(index='StrainID')
df_strain_by_gene.head()

Gene,aat,accA,accB,accC,aceE,aceF,aceK,acnA,acnB,acoB,...,xylZ,yajC,yciB,yciI,ygdP,yqgF,zipA,znuB,znuC,zwf
StrainID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
F22031-1,289.161797,5033.645712,2622.370895,7503.071524,13120.616952,6197.4622,2204.958274,12885.623206,3989.31757,165.690506,...,31.06697,1464.13053,2284.617171,1613.092668,3105.103812,466.004548,6169.581586,657.982491,881.823992,31023.95409
F22031-2,311.422038,5257.53676,2169.878848,6585.660158,13501.061298,6064.486218,2279.792508,10613.995815,4663.087047,143.803706,...,43.965464,1456.356002,2266.969248,1445.364636,3174.672894,404.84865,5901.447622,490.947684,697.035797,26298.675168
F22031-3,286.399693,5859.684186,2226.958361,7899.278262,16098.33938,7128.40806,2298.692115,8397.131934,4834.533883,134.366959,...,44.967428,1527.821914,2349.012809,1484.460465,3159.497174,418.090019,5793.303697,545.497733,757.487039,31551.074965
F30658-1,297.756237,9995.607628,3090.663575,16719.358931,24307.526597,16494.310613,2168.542516,10382.229098,5987.439369,153.494494,...,24.235973,2463.990565,2645.183314,1763.455543,3621.546789,489.335831,8340.636916,753.623344,1129.857969,41852.062696
F30658-2,395.079591,10077.42606,4729.369186,15427.800117,24041.92552,10112.183795,2229.129425,10654.404466,5694.475636,141.348124,...,24.330415,2708.786173,2551.217772,1939.481631,4131.536138,475.022383,7916.653514,566.551086,773.938906,44338.125764


In [24]:
# Dimensions of the strain-by-gene table as (number of rows, number of columns).
df_strain_by_gene.shape

(33, 6154)

# Construct gene-by-function table

In [34]:
# Build the gene-by-pathway membership table, restricted to (and ordered like)
# the gene columns of df_strain_by_gene.
df_gene_by_function = pd.read_csv('kegg_pathway_mapping_square_form_simplified.csv', index_col=0)

# Catch-all category; every gene present in the mapping file gets 0 here.
df_gene_by_function['other'] = 0

# Genes that appear in the count table but have no row in the mapping file.
missing_genes = set(df_strain_by_gene.columns) - set(df_gene_by_function.index)

# Add all missing genes at once as rows that are 0 for every pathway and 1 for 'other'.
# Building them in a single frame avoids inserting one column per gene in a loop
# (on a transposed frame), which fragments the DataFrame and makes pandas emit a
# PerformanceWarning when many genes are missing.
if missing_genes:
    df_missing = pd.DataFrame(0, index=list(missing_genes), columns=df_gene_by_function.columns)
    df_missing['other'] = 1
    df_gene_by_function = pd.concat([df_gene_by_function, df_missing])

# Keep only the genes of the count table, in the same order as its columns.
df_gene_by_function = df_gene_by_function.loc[df_strain_by_gene.columns]

# remove pathways that are not assigned to any gene
df_gene_by_function = df_gene_by_function.loc[:, (df_gene_by_function != 0).any(axis=0)]

# rename_axis returns a new object, so no Index shared with another frame is renamed in place.
df_gene_by_function = df_gene_by_function.rename_axis(index='GeneID')
df_gene_by_function.head()

Unnamed: 0_level_0,pau02020,pau00970,pau00550,pau01100,pau00785,pau01240,pau03030,pau03430,pau03440,pau00860,...,pau00471,pau01053,pau00600,pau00473,pau00332,pau00791,pau00523,pau00525,pau03020,other
GeneID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
aat,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
accA,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
accB,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
accC,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
aceE,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
# Dimensions of the gene-by-function table as (number of rows, number of columns).
df_gene_by_function.shape

(6154, 123)

# Construct strain-by-phenotype table

In [36]:
# Per-strain phenotype: only the 'strain' and 'rhamn3cats' columns of the spreadsheet are used.
# NOTE(review): 'rhamn3cats' is presumably a categorical rhamnolipid production level -- confirm.
df_strain_by_rhamnolipid_production = pd.read_excel('../data/rhamnolipids/rhamnMat.xlsx')[['strain','rhamn3cats']]

# Repeat each strain's phenotype for the replicate suffixes '-1', '-2' and '-3'
# so the row labels have the '<strain>-<replicate>' form used by df_strain_by_gene.
replicate_records = [
    [strain + suffix, category]
    for strain, category in zip(
        df_strain_by_rhamnolipid_production['strain'],
        df_strain_by_rhamnolipid_production['rhamn3cats'],
    )
    for suffix in ('-1', '-2', '-3')
]

# The PA14 rows are relabelled with the 'UCBPP-' prefix before alignment
# (presumably the name used in the count table -- confirm).
pa14_relabel = {'PA14-1':'UCBPP-PA14-1','PA14-2':'UCBPP-PA14-2','PA14-3':'UCBPP-PA14-3'}

# Index by replicate-level strain ID, then select and order rows like df_strain_by_gene.
df_strain_by_phenotype = (
    pd.DataFrame(replicate_records, columns=['strain','rhamn3cats'])
    .set_index('strain')
    .rename(pa14_relabel)
    .loc[df_strain_by_gene.index]
)
df_strain_by_phenotype.index.name = 'StrainID'
df_strain_by_phenotype.head()

Unnamed: 0_level_0,rhamn3cats
StrainID,Unnamed: 1_level_1
F22031-1,2
F22031-2,2
F22031-3,2
F30658-1,1
F30658-2,1


In [37]:
# Dimensions of the strain-by-phenotype table as (number of rows, number of columns).
df_strain_by_phenotype.shape

(33, 1)

# Make RLQ tables

In [38]:
# True if any cell of the gene-by-function table is missing (reduces over both axes at once).
df_gene_by_function.isna().any(axis=None)

False

In [39]:
# True if any cell of the strain-by-gene table is missing (reduces over both axes at once).
df_strain_by_gene.isna().any(axis=None)

False

In [40]:
# True if any cell of the strain-by-phenotype table is missing (reduces over both axes at once).
df_strain_by_phenotype.isna().any(axis=None)

False

In [41]:
# Write the three tables to CSV files in the working directory, index included.
for csv_path, table in (
    ('tab_gene_by_function.csv', df_gene_by_function),
    ('tab_strain_by_gene.csv', df_strain_by_gene),
    ('tab_strain_by_phenotype.csv', df_strain_by_phenotype),
):
    table.to_csv(csv_path)