In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline 
import seaborn as sns
from copy import deepcopy
import collections
import scipy as sp
import os
import sys
import numpy as np

# Read rhamnolipid production

In [2]:
df_rhl = pd.read_excel('../data/rhamnolipids/rhamnMat.xlsx', index_col=0)
df_rhl = df_rhl.rename(index={'PA14':'UCBPP-PA14'})
df_rhl.head()

Unnamed: 0_level_0,rhamn3cats,rhamn2cats
strain,Unnamed: 1_level_1,Unnamed: 2_level_1
F22031,2,1
F23197,2,1
F30658,1,1
F34365,2,1
F5677,0,0


# Find orthologous genes

In [3]:
# process reciprocal blast result
df_orth = pd.DataFrame()
for index,strain in enumerate(df_rhl.index):
    if strain=='UCBPP-PA14':
        continue
    infile = '../reciprocal_blast_ref_PA14/PA14_vs_'+strain+'.txt'
    df = pd.read_csv(infile, sep=',', index_col=0).reset_index()
    df = df[df.BBH == '<=>'] # select only bidirectional hit
    df = df[['gene','subject']]
    df.columns = [strain, 'PA14']
    df = df.set_index('PA14')
    if index==0:
        df_orth = df
    else:
        df_orth = pd.merge(df_orth, df, left_index=True, right_index=True, how='outer')
df_orth.index = [x.rstrip('|') for x in df_orth.index]

#  add PATRIC annotation
df_PA14_feature = pd.read_csv('../PATRIC_GMM/UCBPP-PA14/PATRIC_genome_feature.txt', sep='\t')
df_PA14_feature = df_PA14_feature[['Product','PATRIC ID',]].set_index('PATRIC ID')
df_orth = pd.merge(df_orth, df_PA14_feature, left_index=True, right_index=True, how='left')
df_orth = df_orth[['Product']+[x for x in df_orth.columns if x != 'Product']]

df_orth.head()

Unnamed: 0,Product,F22031,F23197,F30658,F34365,F5677,F63912,F9670,H27930,H47921,...,T63266,W16407,W25637,W36662,W45909,W60856,W70332,W91453,X78812,X9820
fig|287.6770.peg.1000,Membrane-bound lytic murein transglycosylase B,fig|287.6613.peg.976,fig|287.6611.peg.983,fig|287.6614.peg.1064,fig|287.6616.peg.962,fig|287.6612.peg.1000,fig|287.6618.peg.1008,fig|287.6615.peg.1001,fig|287.6617.peg.1030,fig|287.6621.peg.1073,...,fig|287.6635.peg.988,fig|287.6633.peg.1064,fig|287.6634.peg.1066,fig|287.6637.peg.1116,fig|287.6636.peg.982,fig|287.6620.peg.5475,fig|287.6630.peg.1000,fig|287.6639.peg.1067,fig|287.6638.peg.992,fig|287.6627.peg.974
fig|287.6770.peg.1001,Septum-associated rare lipoprotein A,fig|287.6613.peg.977,fig|287.6611.peg.984,fig|287.6614.peg.1065,fig|287.6616.peg.963,fig|287.6612.peg.1001,fig|287.6618.peg.1009,fig|287.6615.peg.1002,,fig|287.6621.peg.1074,...,fig|287.6635.peg.989,fig|287.6633.peg.1065,fig|287.6634.peg.1067,fig|287.6637.peg.1117,fig|287.6636.peg.983,fig|287.6620.peg.5474,fig|287.6630.peg.1001,fig|287.6639.peg.1068,fig|287.6638.peg.993,fig|287.6627.peg.975
fig|287.6770.peg.1002,D-alanyl-D-alanine carboxypeptidase (EC 3.4.16.4),fig|287.6613.peg.978,fig|287.6611.peg.985,fig|287.6614.peg.1066,fig|287.6616.peg.964,fig|287.6612.peg.1002,fig|287.6618.peg.1010,fig|287.6615.peg.1003,fig|287.6617.peg.1033,fig|287.6621.peg.1075,...,fig|287.6635.peg.990,fig|287.6633.peg.1066,fig|287.6634.peg.1068,fig|287.6637.peg.1118,fig|287.6636.peg.984,fig|287.6620.peg.5473,fig|287.6630.peg.1002,fig|287.6639.peg.1069,fig|287.6638.peg.994,fig|287.6627.peg.976
fig|287.6770.peg.1003,Proposed lipoate regulatory protein YbeD,fig|287.6613.peg.979,fig|287.6611.peg.986,fig|287.6614.peg.1067,fig|287.6616.peg.965,fig|287.6612.peg.1003,fig|287.6618.peg.1011,fig|287.6615.peg.1004,fig|287.6617.peg.1034,fig|287.6621.peg.1076,...,fig|287.6635.peg.991,fig|287.6633.peg.1067,fig|287.6634.peg.1069,fig|287.6637.peg.1119,fig|287.6636.peg.985,fig|287.6620.peg.5472,fig|287.6630.peg.1003,fig|287.6639.peg.1070,fig|287.6638.peg.995,fig|287.6627.peg.977
fig|287.6770.peg.1004,Octanoate-[acyl-carrier-protein]-protein-N-oct...,fig|287.6613.peg.980,fig|287.6611.peg.987,fig|287.6614.peg.1068,fig|287.6616.peg.966,fig|287.6612.peg.1004,fig|287.6618.peg.1012,fig|287.6615.peg.1005,fig|287.6617.peg.1035,fig|287.6621.peg.1077,...,fig|287.6635.peg.992,fig|287.6633.peg.1068,fig|287.6634.peg.1070,fig|287.6637.peg.1120,fig|287.6636.peg.986,fig|287.6620.peg.5471,fig|287.6630.peg.1004,fig|287.6639.peg.1071,fig|287.6638.peg.996,fig|287.6627.peg.978


# Read in PA14 gene annotation from Patric and Prokka

In [4]:
# Patric gene annotation
df_PA14_annot = pd.read_excel('genome_comparison_PA14_prokka.xlsx',index_col=0)
df_PA14_annot = df_PA14_annot[df_PA14_annot['comp_genome_1_hit']=='bi (<->)']
df_PA14_annot = df_PA14_annot[['ref_genome_patric_id','comp_genome_1_gene_name','comp_genome_1_locus_tag']].set_index('ref_genome_patric_id').fillna('')
df_PA14_annot.columns = ['Gene_Patric','locus']
df_PA14_annot.head()

Unnamed: 0_level_0,Gene_Patric,locus
ref_genome_patric_id,Unnamed: 1_level_1,Unnamed: 2_level_1
fig|287.6770.peg.1,dnaA,PA14_00010
fig|287.6770.peg.2,dnaN,PA14_00020
fig|287.6770.peg.3,recF,PA14_00030
fig|287.6770.peg.4,gyrB,PA14_00050
fig|287.6770.peg.6,,PA14_00060


In [5]:
# Prokka gene annotation
df_PA14_locustag = pd.read_csv('locustag_annotation_patric.csv')
df_PA14_locustag = df_PA14_locustag.rename(columns={'Gene':'Gene_Prokka'})
df_PA14_annot = df_PA14_annot.reset_index().merge(df_PA14_locustag,how='left').set_index('ref_genome_patric_id').fillna('')
df_orth_blast = pd.merge(df_PA14_annot, df_orth, left_index=True, right_index=True, how='right')
df_orth_blast.head()

Unnamed: 0,Gene_Patric,locus,Gene_Prokka,Product,F22031,F23197,F30658,F34365,F5677,F63912,...,T63266,W16407,W25637,W36662,W45909,W60856,W70332,W91453,X78812,X9820
fig|287.6770.peg.1000,sltB1,PA14_12080,mltB_1,Membrane-bound lytic murein transglycosylase B,fig|287.6613.peg.976,fig|287.6611.peg.983,fig|287.6614.peg.1064,fig|287.6616.peg.962,fig|287.6612.peg.1000,fig|287.6618.peg.1008,...,fig|287.6635.peg.988,fig|287.6633.peg.1064,fig|287.6634.peg.1066,fig|287.6637.peg.1116,fig|287.6636.peg.982,fig|287.6620.peg.5475,fig|287.6630.peg.1000,fig|287.6639.peg.1067,fig|287.6638.peg.992,fig|287.6627.peg.974
fig|287.6770.peg.1001,,PA14_12090,rlpA_1,Septum-associated rare lipoprotein A,fig|287.6613.peg.977,fig|287.6611.peg.984,fig|287.6614.peg.1065,fig|287.6616.peg.963,fig|287.6612.peg.1001,fig|287.6618.peg.1009,...,fig|287.6635.peg.989,fig|287.6633.peg.1065,fig|287.6634.peg.1067,fig|287.6637.peg.1117,fig|287.6636.peg.983,fig|287.6620.peg.5474,fig|287.6630.peg.1001,fig|287.6639.peg.1068,fig|287.6638.peg.993,fig|287.6627.peg.975
fig|287.6770.peg.1002,dacC,PA14_12100,dacC,D-alanyl-D-alanine carboxypeptidase (EC 3.4.16.4),fig|287.6613.peg.978,fig|287.6611.peg.985,fig|287.6614.peg.1066,fig|287.6616.peg.964,fig|287.6612.peg.1002,fig|287.6618.peg.1010,...,fig|287.6635.peg.990,fig|287.6633.peg.1066,fig|287.6634.peg.1068,fig|287.6637.peg.1118,fig|287.6636.peg.984,fig|287.6620.peg.5473,fig|287.6630.peg.1002,fig|287.6639.peg.1069,fig|287.6638.peg.994,fig|287.6627.peg.976
fig|287.6770.peg.1003,,PA14_12110,group_12410,Proposed lipoate regulatory protein YbeD,fig|287.6613.peg.979,fig|287.6611.peg.986,fig|287.6614.peg.1067,fig|287.6616.peg.965,fig|287.6612.peg.1003,fig|287.6618.peg.1011,...,fig|287.6635.peg.991,fig|287.6633.peg.1067,fig|287.6634.peg.1069,fig|287.6637.peg.1119,fig|287.6636.peg.985,fig|287.6620.peg.5472,fig|287.6630.peg.1003,fig|287.6639.peg.1070,fig|287.6638.peg.995,fig|287.6627.peg.977
fig|287.6770.peg.1004,lipB,PA14_12120,lipB,Octanoate-[acyl-carrier-protein]-protein-N-oct...,fig|287.6613.peg.980,fig|287.6611.peg.987,fig|287.6614.peg.1068,fig|287.6616.peg.966,fig|287.6612.peg.1004,fig|287.6618.peg.1012,...,fig|287.6635.peg.992,fig|287.6633.peg.1068,fig|287.6634.peg.1070,fig|287.6637.peg.1120,fig|287.6636.peg.986,fig|287.6620.peg.5471,fig|287.6630.peg.1004,fig|287.6639.peg.1071,fig|287.6638.peg.996,fig|287.6627.peg.978


# Find genes in all rhamnolipid producers

In [6]:
df_orth_blast_producers = df_orth_blast[['Gene_Patric','Gene_Prokka','locus','Product']+[x for x in list(df_rhl[df_rhl.rhamn2cats==1].index) if x!='UCBPP-PA14']]
df_orth_blast_producers_all_contain = df_orth_blast_producers[~df_orth_blast_producers.isnull().any(axis=1)]
df_orth_blast_producers_all_contain.head()

Unnamed: 0,Gene_Patric,Gene_Prokka,locus,Product,F22031,F23197,F30658,F34365,F9670,H47921,...,T52373,T6313,T63266,W16407,W25637,W45909,W70332,W91453,X78812,X9820
fig|287.6770.peg.1000,sltB1,mltB_1,PA14_12080,Membrane-bound lytic murein transglycosylase B,fig|287.6613.peg.976,fig|287.6611.peg.983,fig|287.6614.peg.1064,fig|287.6616.peg.962,fig|287.6615.peg.1001,fig|287.6621.peg.1073,...,fig|287.6632.peg.985,fig|287.6629.peg.1078,fig|287.6635.peg.988,fig|287.6633.peg.1064,fig|287.6634.peg.1066,fig|287.6636.peg.982,fig|287.6630.peg.1000,fig|287.6639.peg.1067,fig|287.6638.peg.992,fig|287.6627.peg.974
fig|287.6770.peg.1001,,rlpA_1,PA14_12090,Septum-associated rare lipoprotein A,fig|287.6613.peg.977,fig|287.6611.peg.984,fig|287.6614.peg.1065,fig|287.6616.peg.963,fig|287.6615.peg.1002,fig|287.6621.peg.1074,...,fig|287.6632.peg.986,fig|287.6629.peg.1079,fig|287.6635.peg.989,fig|287.6633.peg.1065,fig|287.6634.peg.1067,fig|287.6636.peg.983,fig|287.6630.peg.1001,fig|287.6639.peg.1068,fig|287.6638.peg.993,fig|287.6627.peg.975
fig|287.6770.peg.1002,dacC,dacC,PA14_12100,D-alanyl-D-alanine carboxypeptidase (EC 3.4.16.4),fig|287.6613.peg.978,fig|287.6611.peg.985,fig|287.6614.peg.1066,fig|287.6616.peg.964,fig|287.6615.peg.1003,fig|287.6621.peg.1075,...,fig|287.6632.peg.987,fig|287.6629.peg.1080,fig|287.6635.peg.990,fig|287.6633.peg.1066,fig|287.6634.peg.1068,fig|287.6636.peg.984,fig|287.6630.peg.1002,fig|287.6639.peg.1069,fig|287.6638.peg.994,fig|287.6627.peg.976
fig|287.6770.peg.1003,,group_12410,PA14_12110,Proposed lipoate regulatory protein YbeD,fig|287.6613.peg.979,fig|287.6611.peg.986,fig|287.6614.peg.1067,fig|287.6616.peg.965,fig|287.6615.peg.1004,fig|287.6621.peg.1076,...,fig|287.6632.peg.988,fig|287.6629.peg.1081,fig|287.6635.peg.991,fig|287.6633.peg.1067,fig|287.6634.peg.1069,fig|287.6636.peg.985,fig|287.6630.peg.1003,fig|287.6639.peg.1070,fig|287.6638.peg.995,fig|287.6627.peg.977
fig|287.6770.peg.1004,lipB,lipB,PA14_12120,Octanoate-[acyl-carrier-protein]-protein-N-oct...,fig|287.6613.peg.980,fig|287.6611.peg.987,fig|287.6614.peg.1068,fig|287.6616.peg.966,fig|287.6615.peg.1005,fig|287.6621.peg.1077,...,fig|287.6632.peg.989,fig|287.6629.peg.1082,fig|287.6635.peg.992,fig|287.6633.peg.1068,fig|287.6634.peg.1070,fig|287.6636.peg.986,fig|287.6630.peg.1004,fig|287.6639.peg.1071,fig|287.6638.peg.996,fig|287.6627.peg.978


# Find genes that are missing in at least one non-producer

In [7]:
df_orth_blast_nonproducers = df_orth_blast[['Gene_Patric','Gene_Prokka','locus','Product']+list(df_rhl[df_rhl.rhamn2cats==0].index)]
df_orth_blast_nonproducers_missing = df_orth_blast_nonproducers[df_orth_blast_nonproducers.isnull().any(axis=1)]
df_orth_blast_nonproducers_missing.head()

Unnamed: 0,Gene_Patric,Gene_Prokka,locus,Product,F5677,F63912,H27930,M1608,M55212,S86968,W36662,W60856
fig|287.6770.peg.1001,,rlpA_1,PA14_12090,Septum-associated rare lipoprotein A,fig|287.6612.peg.1001,fig|287.6618.peg.1009,,fig|287.6622.peg.5056,fig|287.6624.peg.1066,fig|287.6631.peg.1043,fig|287.6637.peg.1117,fig|287.6620.peg.5474
fig|287.6770.peg.1009,,,,hypothetical protein,,fig|287.6618.peg.1017,fig|287.6617.peg.1040,fig|287.6622.peg.5048,,fig|287.6631.peg.1051,fig|287.6637.peg.1125,fig|287.6620.peg.5466
fig|287.6770.peg.100,,group_5316,PA14_01130,FIG00965783: hypothetical protein,,,,fig|287.6622.peg.153,,,,
fig|287.6770.peg.1014,,,,FIG00960788: hypothetical protein,fig|287.6612.peg.1014,fig|287.6618.peg.1022,,fig|287.6622.peg.5043,,,fig|287.6637.peg.1130,
fig|287.6770.peg.1015,,group_12433,PA14_12260,hypothetical protein,fig|287.6612.peg.1015,fig|287.6618.peg.1023,,fig|287.6622.peg.5042,,,fig|287.6637.peg.1131,


# The intersection of the two sets of genes are those only missing in non-producers

In [8]:
overlaps = list(set(df_orth_blast_producers_all_contain.index).intersection(df_orth_blast_nonproducers_missing.index))
df_orth_blast_nonproducers_missing_to_focus = df_orth_blast_nonproducers_missing.loc[overlaps]
df_orth_blast_nonproducers_missing_to_focus.head()

Unnamed: 0,Gene_Patric,Gene_Prokka,locus,Product,F5677,F63912,H27930,M1608,M55212,S86968,W36662,W60856
fig|287.6770.peg.4371,,baeR_2,PA14_52250,Two-component transcriptional response regulat...,fig|287.6612.peg.4529,fig|287.6618.peg.4583,,fig|287.6622.peg.1865,fig|287.6624.peg.3662,fig|287.6631.peg.4655,fig|287.6637.peg.4655,fig|287.6620.peg.1805
fig|287.6770.peg.4682,,sctC_3,PA14_55920,Flp pilus assembly protein CpaC,,fig|287.6618.peg.4865,fig|287.6617.peg.4693,fig|287.6622.peg.1550,fig|287.6624.peg.3242,fig|287.6631.peg.5051,fig|287.6637.peg.5039,fig|287.6620.peg.1391
fig|287.6770.peg.3188,,cmoA_1,PA14_38210,SAM-dependent methyltransferase,fig|287.6612.peg.3299,fig|287.6618.peg.3332,fig|287.6617.peg.3161,,fig|287.6624.peg.4814,fig|287.6631.peg.3310,fig|287.6637.peg.2603,fig|287.6620.peg.3845
fig|287.6770.peg.3222,,atoE,PA14_38610,Short-chain fatty acids transporter,fig|287.6612.peg.3333,fig|287.6618.peg.3366,fig|287.6617.peg.3195,,fig|287.6624.peg.4779,fig|287.6631.peg.3346,fig|287.6637.peg.2569,fig|287.6620.peg.3879
fig|287.6770.peg.3235,pqqD,pqqD,PA14_38790,Coenzyme PQQ synthesis protein D,fig|287.6612.peg.3347,fig|287.6618.peg.3380,fig|287.6617.peg.3209,,fig|287.6624.peg.4765,fig|287.6631.peg.3360,fig|287.6637.peg.2554,fig|287.6620.peg.3893


In [9]:
df_orth_blast_nonproducers_missing_to_focus.to_csv('genes_missing_only_in_nonproducers.csv')