In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

# Construct gene-by-function table

In [11]:
# Load the square-form gene-by-pathway mapping; the first CSV column holds the
# gene identifiers and becomes the row index, labelled 'GeneID'.
df_gene_by_function = (
    pd.read_csv('kegg_pathway_mapping_square_form_simplified.csv', index_col=0)
    .rename_axis('GeneID')
)

# Drop genes whose row is entirely zero, i.e. keep rows that have at least one
# non-zero entry.
has_nonzero_entry = df_gene_by_function.ne(0).any(axis=1)
df_gene_by_function = df_gene_by_function.loc[has_nonzero_entry]
df_gene_by_function.head()

Unnamed: 0_level_0,pau02020,pau00970,pau00550,pau01100,pau00785,pau01240,pau03030,pau03430,pau03440,pau00860,...,pau00622,pau00471,pau01053,pau00600,pau00332,pau00791,pau00473,pau00523,pau00525,pau03020
GeneID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
dnaA,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
glyQ,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
dacC,0,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
lipB,0,0,0,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
holA,0,0,0,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# Spot check: take the rows for lasA and lasB and display only the columns in
# which at least one of the two genes has a non-zero entry.
genes_to_inspect = ['lasA', 'lasB']
df_tmp = df_gene_by_function.loc[genes_to_inspect]
nonzero_columns = df_tmp.ne(0).any(axis=0)
df_tmp.loc[:, nonzero_columns]

Unnamed: 0_level_0,pau02024,pau01503
GeneID,Unnamed: 1_level_1,Unnamed: 2_level_1
lasA,1,0
lasB,1,1


# Construct strain-by-gene table

In [3]:
# Read the long-format count table (one row per Strain / Replicate / Gene / Count).
counts_long = pd.read_csv('processed_count_table_0915.csv', index_col=0)

# Make each replicate its own sample by appending '-<Replicate>' to the strain name.
replicate_labels = [
    strain + '-' + str(replicate)
    for strain, replicate in zip(counts_long['Strain'], counts_long['Replicate'])
]
counts_long['Strain'] = replicate_labels

# Pivot to a wide sample-by-gene matrix; duplicate (Strain, Gene) pairs are averaged.
df_strain_by_gene = counts_long.pivot_table(
    index='Strain',
    columns='Gene',
    values='Count',
    aggfunc='mean',
).rename_axis('StrainID')

# Restrict and reorder the gene columns to match the rows of df_gene_by_function
# (raises KeyError if any of those genes is absent from the count table).
df_strain_by_gene = df_strain_by_gene[df_gene_by_function.index]
df_strain_by_gene.head()

Gene,dnaA,glyQ,dacC,lipB,holA,leuS,hemL,thiE,thiD,PA14_12490,...,PA14_01100,PA14_11760,eutB,PA14_11810,PA14_01110,PA14_11860,PA14_11970,proA,nadD,pbpA
StrainID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
F22031-1,5488.498015,1303.219558,4475.236843,1792.325186,869.078568,14153.792848,2078.300627,736.844799,1430.673793,1968.371349,...,732.065265,560.002047,504.240819,3.982945,557.61228,742.420922,830.842297,2565.016489,1258.610575,1804.274021
F22031-2,5612.008316,1093.640922,4435.932148,1729.308259,1471.011156,12239.802044,2059.04924,653.98628,1486.582258,1863.036545,...,5267.612179,525.753676,536.745042,2.747842,3222.302147,642.078967,797.789986,2107.59444,1120.20339,2137.820697
F22031-3,5460.330596,1228.574384,4286.359518,1917.539627,1137.568874,14750.922506,2015.504382,714.12559,1438.422383,1735.528607,...,1217.867853,463.592774,510.701509,2.141306,920.76163,695.924488,814.231651,2668.602747,1155.769976,2409.504707
F30658-1,7013.428883,2121.224665,4990.302203,1940.031916,1152.939848,21530.776574,2203.165334,879.419583,2199.703053,2277.027347,...,219.277849,378.542813,490.489925,9.232752,138.491273,746.69878,1253.346021,3866.214704,1986.195673,2638.25875
F30658-2,7369.798479,1926.737128,6500.855096,2050.706384,1759.899998,23770.815185,3257.958391,816.80678,2097.050031,2446.944566,...,202.753456,293.123568,457.643515,4.634365,162.202765,859.674654,1311.525213,3428.271294,2262.728569,3567.302235


In [4]:
# Display the (n_rows, n_columns) dimensions of the strain-by-gene table.
df_strain_by_gene.shape

(33, 1474)

# Construct strain-by-phenotype table

In [5]:
# Read the per-strain rhamnolipid category, keeping only the two columns used below.
df_strain_by_rhamnolipid_production = pd.read_excel('../data/rhamnolipids/rhamnMat.xlsx')[['strain','rhamn3cats']]

# Repeat every strain's category once per replicate suffix so the rows can be
# matched against the '<strain>-<replicate>' labels of df_strain_by_gene.
replicate_suffixes = ('-1', '-2', '-3')
lines = [
    [strain + suffix, category]
    for strain, category in zip(
        df_strain_by_rhamnolipid_production.strain,
        df_strain_by_rhamnolipid_production.rhamn3cats,
    )
    for suffix in replicate_suffixes
]

# The phenotype sheet labels the reference strain 'PA14', while the count table
# uses 'UCBPP-PA14'; relabel those rows so the lookup below finds them.
pa14_aliases = {'PA14' + suffix: 'UCBPP-PA14' + suffix for suffix in replicate_suffixes}

# Build the table, index it by sample label, then select and order the rows to
# match df_strain_by_gene (raises KeyError if a sample has no phenotype row).
df_strain_by_phenotype = (
    pd.DataFrame(lines, columns=['strain','rhamn3cats'])
    .set_index('strain')
    .rename(pa14_aliases)
    .loc[df_strain_by_gene.index]
    .rename_axis('StrainID')
)
df_strain_by_phenotype.head()

Unnamed: 0_level_0,rhamn3cats
StrainID,Unnamed: 1_level_1
F22031-1,2
F22031-2,2
F22031-3,2
F30658-1,1
F30658-2,1


In [6]:
# Display the (n_rows, n_columns) dimensions of the strain-by-phenotype table;
# the row count should equal that of df_strain_by_gene since it was indexed by it.
df_strain_by_phenotype.shape

(33, 1)

# Make RLQ tables

In [7]:
# Sanity check: True if any cell of the gene-by-function table is missing (NaN).
df_gene_by_function.isna().any().any()

False

In [8]:
# Sanity check: True if any cell of the strain-by-gene table is missing (NaN),
# e.g. a (sample, gene) pair with no count after the pivot.
df_strain_by_gene.isna().any().any()

False

In [9]:
# Sanity check: True if any cell of the strain-by-phenotype table is missing (NaN).
df_strain_by_phenotype.isna().any().any()

False

In [10]:
# Write the three tables to CSV in the working directory, each with its index
# as the first column.
tables_to_export = (
    ('tab_gene_by_function.csv', df_gene_by_function),
    ('tab_strain_by_gene.csv', df_strain_by_gene),
    ('tab_strain_by_phenotype.csv', df_strain_by_phenotype),
)
for output_path, table in tables_to_export:
    table.to_csv(output_path)