In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline 
import seaborn as sns
from copy import deepcopy
import collections
import scipy as sp
import os
import sys
import numpy as np
from Bio.Blast.Applications import NcbiblastpCommandline
from Bio.Blast import NCBIWWW, NCBIXML
from Bio import AlignIO, SeqIO
import Bio.Align.Applications
from Bio.Align.Applications import ClustalwCommandline

# Read rhamnolipid production

In [2]:
df_rhl = pd.read_excel('../data/rhamnolipids/rhamnMat.xlsx', index_col=0)
df_rhl = df_rhl.rename(index={'PA14':'UCBPP-PA14'})
df_rhl.head()

Unnamed: 0_level_0,rhamn3cats,rhamn2cats
strain,Unnamed: 1_level_1,Unnamed: 2_level_1
F22031,2,1
F23197,2,1
F30658,1,1
F34365,2,1
F5677,0,0


# Using PATRIC annotation

In [12]:
df_orth_plfam = pd.read_csv('../find_protein_orthologues_UCBPP-PA14_vs_otherPA/PLfam_dictionary.csv', index_col=0)
df_orth_plfam.head()

Unnamed: 0,PATRIC genus-specific families (PLfams),Product,F22031,F23197,F30658,F34365,F5677,F63912,F9670,H27930,...,T63266,W16407,W25637,W36662,W45909,W60856,W70332,W91453,X78812,X9820
0,PLF_286_00000001,Cytochrome c oxidase (cbb3-type) subunit CcoN ...,"fig|287.6613.peg.3420,fig|287.6613.peg.3781,fi...","fig|287.6611.peg.3353,fig|287.6611.peg.3671,fi...","fig|287.6614.peg.3879,fig|287.6614.peg.4201,fi...","fig|287.6616.peg.3358,fig|287.6616.peg.3681,fi...","fig|287.6612.peg.3450,fig|287.6612.peg.3869,fi...","fig|287.6618.peg.3519,fig|287.6618.peg.3840,fi...","fig|287.6615.peg.3548,fig|287.6615.peg.3869,fi...","fig|287.6617.peg.3353,fig|287.6617.peg.3733,fi...",...,"fig|287.6635.peg.3397,fig|287.6635.peg.3716,fi...","fig|287.6633.peg.3712,fig|287.6633.peg.4037,fi...","fig|287.6634.peg.4156,fig|287.6634.peg.4160,fi...","fig|287.6637.peg.981,fig|287.6637.peg.2087,fig...","fig|287.6636.peg.3494,fig|287.6636.peg.3814,fi...","fig|287.6620.peg.4035,fig|287.6620.peg.4352,fi...","fig|287.6630.peg.3615,fig|287.6630.peg.3946,fi...","fig|287.6639.peg.3828,fig|287.6639.peg.4151,fi...","fig|287.6638.peg.3291,fig|287.6638.peg.3608,fi...","fig|287.6627.peg.3392,fig|287.6627.peg.3709,fi..."
1,PLF_286_00000002,Serine hydroxymethyltransferase (EC 2.1.2.1),"fig|287.6613.peg.2785,fig|287.6613.peg.5159,fi...","fig|287.6611.peg.2687,fig|287.6611.peg.5087,fi...","fig|287.6614.peg.3177,fig|287.6614.peg.5788,fi...","fig|287.6616.peg.2714,fig|287.6616.peg.5023,fi...","fig|287.6612.peg.2832,fig|287.6612.peg.5251,fi...","fig|287.6618.peg.2789,fig|287.6618.peg.5290,fi...","fig|287.6615.peg.2793,fig|287.6615.peg.5418,fi...","fig|287.6617.peg.2694,fig|287.6617.peg.5110,fi...",...,"fig|287.6635.peg.2733,fig|287.6635.peg.5093,fi...","fig|287.6633.peg.2958,fig|287.6633.peg.5391,fi...","fig|287.6634.peg.5167,fig|287.6634.peg.5843,fi...","fig|287.6637.peg.3065,fig|287.6637.peg.5353,fi...","fig|287.6636.peg.2831,fig|287.6636.peg.5367,fi...","fig|287.6620.peg.1080,fig|287.6620.peg.3396,fi...","fig|287.6630.peg.2901,fig|287.6630.peg.5310,fi...","fig|287.6639.peg.3143,fig|287.6639.peg.5708,fi...","fig|287.6638.peg.2627,fig|287.6638.peg.4878,fi...","fig|287.6627.peg.2669,fig|287.6627.peg.4981,fi..."
2,PLF_286_00000003,VgrG protein,"fig|287.6613.peg.100,fig|287.6613.peg.104,fig|...","fig|287.6611.peg.102,fig|287.6611.peg.106,fig|...","fig|287.6614.peg.113,fig|287.6614.peg.117,fig|...","fig|287.6616.peg.97,fig|287.6616.peg.102,fig|2...","fig|287.6612.peg.108,fig|287.6612.peg.112,fig|...","fig|287.6618.peg.97,fig|287.6618.peg.101,fig|2...","fig|287.6615.peg.106,fig|287.6615.peg.110,fig|...","fig|287.6617.peg.98,fig|287.6617.peg.102,fig|2...",...,"fig|287.6635.peg.98,fig|287.6635.peg.102,fig|2...","fig|287.6633.peg.101,fig|287.6633.peg.105,fig|...","fig|287.6634.peg.113,fig|287.6634.peg.117,fig|...","fig|287.6637.peg.158,fig|287.6637.peg.162,fig|...","fig|287.6636.peg.102,fig|287.6636.peg.106,fig|...","fig|287.6620.peg.97,fig|287.6620.peg.101,fig|2...","fig|287.6630.peg.98,fig|287.6630.peg.102,fig|2...","fig|287.6639.peg.113,fig|287.6639.peg.117,fig|...","fig|287.6638.peg.106,fig|287.6638.peg.110,fig|...","fig|287.6627.peg.101,fig|287.6627.peg.105,fig|..."
3,PLF_286_00000004,GTP cyclohydrolase I (EC 3.5.4.16) type 1,"fig|287.6613.peg.1651,fig|287.6613.peg.3615","fig|287.6611.peg.1569,fig|287.6611.peg.3549","fig|287.6614.peg.1660,fig|287.6614.peg.4079","fig|287.6616.peg.1610,fig|287.6616.peg.3553","fig|287.6612.peg.1656,fig|287.6612.peg.3746","fig|287.6618.peg.1607,fig|287.6618.peg.3718","fig|287.6615.peg.1583,fig|287.6615.peg.3745","fig|287.6617.peg.1601,fig|287.6617.peg.3610",...,"fig|287.6635.peg.1554,fig|287.6635.peg.3595","fig|287.6633.peg.1726,fig|287.6633.peg.3912","fig|287.6634.peg.1663,fig|287.6634.peg.4283","fig|287.6637.peg.1742,fig|287.6637.peg.2213","fig|287.6636.peg.1647,fig|287.6636.peg.3691","fig|287.6620.peg.2098,fig|287.6620.peg.4230","fig|287.6630.peg.1570,fig|287.6630.peg.3824","fig|287.6639.peg.1661,fig|287.6639.peg.4028","fig|287.6638.peg.1571,fig|287.6638.peg.3486","fig|287.6627.peg.1536,fig|287.6627.peg.3588"
4,PLF_286_00000006,"Dipeptide ABC transporter, substrate-binding p...","fig|287.6613.peg.5047,fig|287.6613.peg.5051","fig|287.6611.peg.4854,fig|287.6611.peg.4858","fig|287.6614.peg.5471,fig|287.6614.peg.5476","fig|287.6616.peg.4912,fig|287.6616.peg.4916","fig|287.6612.peg.5033,fig|287.6612.peg.5038","fig|287.6618.peg.5064,fig|287.6618.peg.5068","fig|287.6615.peg.5195,fig|287.6615.peg.5199","fig|287.6617.peg.4888,fig|287.6617.peg.4892",...,fig|287.6635.peg.5006,"fig|287.6633.peg.5179,fig|287.6633.peg.5183","fig|287.6634.peg.2887,fig|287.6634.peg.2892","fig|287.6637.peg.5237,fig|287.6637.peg.5241","fig|287.6636.peg.5254,fig|287.6636.peg.5258","fig|287.6620.peg.1189,fig|287.6620.peg.1193","fig|287.6630.peg.5100,fig|287.6630.peg.5104","fig|287.6639.peg.5486,fig|287.6639.peg.5491","fig|287.6638.peg.4765,fig|287.6638.peg.4769","fig|287.6627.peg.4868,fig|287.6627.peg.4872"


## Find proteins that all RL producers contain

In [13]:
df_orth_plfam_producers = df_orth_plfam[['PATRIC genus-specific families (PLfams)','Product']+list(df_rhl[df_rhl.rhamn2cats==1].index)]
df_orth_plfam_producers_all_contain = df_orth_plfam_producers[~df_orth_plfam_producers.isnull().any(axis=1)]
df_orth_plfam_producers_all_contain.head()

Unnamed: 0,PATRIC genus-specific families (PLfams),Product,F22031,F23197,F30658,F34365,F9670,H47921,H5708,M37351,...,T52373,T6313,T63266,W16407,W25637,W45909,W70332,W91453,X78812,X9820
0,PLF_286_00000001,Cytochrome c oxidase (cbb3-type) subunit CcoN ...,"fig|287.6613.peg.3420,fig|287.6613.peg.3781,fi...","fig|287.6611.peg.3353,fig|287.6611.peg.3671,fi...","fig|287.6614.peg.3879,fig|287.6614.peg.4201,fi...","fig|287.6616.peg.3358,fig|287.6616.peg.3681,fi...","fig|287.6615.peg.3548,fig|287.6615.peg.3869,fi...","fig|287.6621.peg.3709,fig|287.6621.peg.4029,fi...","fig|287.6619.peg.3333,fig|287.6619.peg.3653,fi...","fig|287.6623.peg.2549,fig|287.6623.peg.2553,fi...",...,"fig|287.6632.peg.3297,fig|287.6632.peg.3634,fi...","fig|287.6629.peg.945,fig|287.6629.peg.3732,fig...","fig|287.6635.peg.3397,fig|287.6635.peg.3716,fi...","fig|287.6633.peg.3712,fig|287.6633.peg.4037,fi...","fig|287.6634.peg.4156,fig|287.6634.peg.4160,fi...","fig|287.6636.peg.3494,fig|287.6636.peg.3814,fi...","fig|287.6630.peg.3615,fig|287.6630.peg.3946,fi...","fig|287.6639.peg.3828,fig|287.6639.peg.4151,fi...","fig|287.6638.peg.3291,fig|287.6638.peg.3608,fi...","fig|287.6627.peg.3392,fig|287.6627.peg.3709,fi..."
1,PLF_286_00000002,Serine hydroxymethyltransferase (EC 2.1.2.1),"fig|287.6613.peg.2785,fig|287.6613.peg.5159,fi...","fig|287.6611.peg.2687,fig|287.6611.peg.5087,fi...","fig|287.6614.peg.3177,fig|287.6614.peg.5788,fi...","fig|287.6616.peg.2714,fig|287.6616.peg.5023,fi...","fig|287.6615.peg.2793,fig|287.6615.peg.5418,fi...","fig|287.6621.peg.2927,fig|287.6621.peg.5404,fi...","fig|287.6619.peg.2680,fig|287.6619.peg.4885,fi...","fig|287.6623.peg.1101,fig|287.6623.peg.3759,fi...",...,"fig|287.6632.peg.2634,fig|287.6632.peg.4889,fi...","fig|287.6629.peg.2993,fig|287.6629.peg.5525,fi...","fig|287.6635.peg.2733,fig|287.6635.peg.5093,fi...","fig|287.6633.peg.2958,fig|287.6633.peg.5391,fi...","fig|287.6634.peg.5167,fig|287.6634.peg.5843,fi...","fig|287.6636.peg.2831,fig|287.6636.peg.5367,fi...","fig|287.6630.peg.2901,fig|287.6630.peg.5310,fi...","fig|287.6639.peg.3143,fig|287.6639.peg.5708,fi...","fig|287.6638.peg.2627,fig|287.6638.peg.4878,fi...","fig|287.6627.peg.2669,fig|287.6627.peg.4981,fi..."
2,PLF_286_00000003,VgrG protein,"fig|287.6613.peg.100,fig|287.6613.peg.104,fig|...","fig|287.6611.peg.102,fig|287.6611.peg.106,fig|...","fig|287.6614.peg.113,fig|287.6614.peg.117,fig|...","fig|287.6616.peg.97,fig|287.6616.peg.102,fig|2...","fig|287.6615.peg.106,fig|287.6615.peg.110,fig|...","fig|287.6621.peg.101,fig|287.6621.peg.105,fig|...","fig|287.6619.peg.99,fig|287.6619.peg.104,fig|2...","fig|287.6623.peg.151,fig|287.6623.peg.156,fig|...",...,"fig|287.6632.peg.99,fig|287.6632.peg.103,fig|2...","fig|287.6629.peg.105,fig|287.6629.peg.109,fig|...","fig|287.6635.peg.98,fig|287.6635.peg.102,fig|2...","fig|287.6633.peg.101,fig|287.6633.peg.105,fig|...","fig|287.6634.peg.113,fig|287.6634.peg.117,fig|...","fig|287.6636.peg.102,fig|287.6636.peg.106,fig|...","fig|287.6630.peg.98,fig|287.6630.peg.102,fig|2...","fig|287.6639.peg.113,fig|287.6639.peg.117,fig|...","fig|287.6638.peg.106,fig|287.6638.peg.110,fig|...","fig|287.6627.peg.101,fig|287.6627.peg.105,fig|..."
3,PLF_286_00000004,GTP cyclohydrolase I (EC 3.5.4.16) type 1,"fig|287.6613.peg.1651,fig|287.6613.peg.3615","fig|287.6611.peg.1569,fig|287.6611.peg.3549","fig|287.6614.peg.1660,fig|287.6614.peg.4079","fig|287.6616.peg.1610,fig|287.6616.peg.3553","fig|287.6615.peg.1583,fig|287.6615.peg.3745","fig|287.6621.peg.1690,fig|287.6621.peg.3907","fig|287.6619.peg.1549,fig|287.6619.peg.3530","fig|287.6623.peg.2681,fig|287.6623.peg.4904",...,"fig|287.6632.peg.1552,fig|287.6632.peg.3495","fig|287.6629.peg.1687,fig|287.6629.peg.4019","fig|287.6635.peg.1554,fig|287.6635.peg.3595","fig|287.6633.peg.1726,fig|287.6633.peg.3912","fig|287.6634.peg.1663,fig|287.6634.peg.4283","fig|287.6636.peg.1647,fig|287.6636.peg.3691","fig|287.6630.peg.1570,fig|287.6630.peg.3824","fig|287.6639.peg.1661,fig|287.6639.peg.4028","fig|287.6638.peg.1571,fig|287.6638.peg.3486","fig|287.6627.peg.1536,fig|287.6627.peg.3588"
4,PLF_286_00000006,"Dipeptide ABC transporter, substrate-binding p...","fig|287.6613.peg.5047,fig|287.6613.peg.5051","fig|287.6611.peg.4854,fig|287.6611.peg.4858","fig|287.6614.peg.5471,fig|287.6614.peg.5476","fig|287.6616.peg.4912,fig|287.6616.peg.4916","fig|287.6615.peg.5195,fig|287.6615.peg.5199","fig|287.6621.peg.5286,fig|287.6621.peg.5290","fig|287.6619.peg.4772,fig|287.6619.peg.4776","fig|287.6623.peg.1351,fig|287.6623.peg.1355",...,"fig|287.6632.peg.4779,fig|287.6632.peg.4783","fig|287.6629.peg.5304,fig|287.6629.peg.5308",fig|287.6635.peg.5006,"fig|287.6633.peg.5179,fig|287.6633.peg.5183","fig|287.6634.peg.2887,fig|287.6634.peg.2892","fig|287.6636.peg.5254,fig|287.6636.peg.5258","fig|287.6630.peg.5100,fig|287.6630.peg.5104","fig|287.6639.peg.5486,fig|287.6639.peg.5491","fig|287.6638.peg.4765,fig|287.6638.peg.4769","fig|287.6627.peg.4868,fig|287.6627.peg.4872"


## Find proteins that are missing in at least one non-producer

In [14]:
df_orth_plfam_nonproducers = df_orth_plfam[['PATRIC genus-specific families (PLfams)','Product']+list(df_rhl[df_rhl.rhamn2cats==0].index)]
df_orth_plfam_nonproducers_missing = df_orth_plfam_nonproducers[df_orth_plfam_nonproducers.isnull().any(axis=1)]
df_orth_plfam_nonproducers_missing.head()

Unnamed: 0,PATRIC genus-specific families (PLfams),Product,F5677,F63912,H27930,M1608,M55212,S86968,W36662,W60856
7,PLF_286_00000009,Transposase InsO for insertion sequence elemen...,"fig|287.6612.peg.1065,fig|287.6612.peg.2468,fi...","fig|287.6618.peg.1221,fig|287.6618.peg.1222,fi...","fig|287.6617.peg.2374,fig|287.6617.peg.2678,fi...","fig|287.6622.peg.1918,fig|287.6622.peg.334,fig...","fig|287.6624.peg.1132,fig|287.6624.peg.2573,fi...","fig|287.6631.peg.2570,fig|287.6631.peg.3308,fi...",,"fig|287.6620.peg.3377,fig|287.6620.peg.3949,fi..."
22,PLF_286_00000028,Mobile element protein,"fig|287.6612.peg.3010,fig|287.6612.peg.5171,fi...","fig|287.6618.peg.1058,fig|287.6618.peg.1064,fi...",,fig|287.6622.peg.4906,fig|287.6624.peg.2733,,,
23,PLF_286_00000029,Uncharacterized protein YmdF,"fig|287.6612.peg.3128,fig|287.6612.peg.3176","fig|287.6618.peg.3164,fig|287.6618.peg.3213",fig|287.6617.peg.3039,,"fig|287.6624.peg.4936,fig|287.6624.peg.4985,fi...",fig|287.6631.peg.3191,"fig|287.6637.peg.2720,fig|287.6637.peg.2768",fig|287.6620.peg.3732
42,PLF_286_00000052,Arsenic resistance protein ArsH,,fig|287.6618.peg.2962,fig|287.6617.peg.2873,fig|287.6622.peg.3112,"fig|287.6624.peg.1891,fig|287.6624.peg.2782,fi...","fig|287.6631.peg.2420,fig|287.6631.peg.3059",fig|287.6637.peg.2887,"fig|287.6620.peg.3212,fig|287.6620.peg.3572"
55,PLF_286_00000070,Glutathione reductase (EC 1.8.1.7),fig|287.6612.peg.3310,fig|287.6618.peg.3342,fig|287.6617.peg.3171,,fig|287.6624.peg.4803,fig|287.6631.peg.3320,fig|287.6637.peg.2593,fig|287.6620.peg.3855


## Find overlaps

In [15]:
overlaps = list(set(df_orth_plfam_producers_all_contain.index).intersection(df_orth_plfam_nonproducers_missing.index))
df_orth_plfam_nonproducers_missing_to_focus = df_orth_plfam_nonproducers_missing.loc[overlaps]
df_orth_plfam_nonproducers_missing_to_focus.head()

Unnamed: 0,PATRIC genus-specific families (PLfams),Product,F5677,F63912,H27930,M1608,M55212,S86968,W36662,W60856
1025,PLF_286_00001117,hypothetical protein,fig|287.6612.peg.3306,fig|287.6618.peg.3338,fig|287.6617.peg.3167,,fig|287.6624.peg.4807,fig|287.6631.peg.3316,fig|287.6637.peg.2597,fig|287.6620.peg.3851
4106,PLF_286_00004901,Uncharacterized amino acid permease YtnA,fig|287.6612.peg.3253,fig|287.6618.peg.3284,fig|287.6617.peg.3115,,fig|287.6624.peg.4861,fig|287.6631.peg.3259,fig|287.6637.peg.2651,fig|287.6620.peg.3799
4109,PLF_286_00004904,Putative two-component sensor,fig|287.6612.peg.3343,fig|287.6618.peg.3376,fig|287.6617.peg.3205,,fig|287.6624.peg.4769,fig|287.6631.peg.3356,fig|287.6637.peg.2558,fig|287.6620.peg.3889
1552,PLF_286_00001796,Ku domain protein,fig|287.6612.peg.3171,fig|287.6618.peg.3208,fig|287.6617.peg.3034,,"fig|287.6624.peg.4941,fig|287.6624.peg.4942",fig|287.6631.peg.3186,fig|287.6637.peg.2725,fig|287.6620.peg.3727
4113,PLF_286_00004908,"Transcriptional regulator, LysR family",fig|287.6612.peg.3204,fig|287.6618.peg.3241,fig|287.6617.peg.3066,,fig|287.6624.peg.4908,fig|287.6631.peg.3218,fig|287.6637.peg.2693,fig|287.6620.peg.3759


In [16]:
df_orth_plfam_nonproducers_missing_to_focus.to_csv('missing_proteins_in_nonproducers_plfam.csv')

# Using Reciprocal Blast

In [106]:
df_orth_blast = pd.read_csv('../find_protein_orthologues_UCBPP-PA14_vs_otherPA/protein_orthologue_dictionary_ref_PA14_all.csv', index_col=0)
df_orth_blast.head()

Unnamed: 0,Product,F22031,F23197,F30658,F34365,F5677,F63912,F9670,H27930,H47921,...,T63266,W16407,W25637,W36662,W45909,W60856,W70332,W91453,X78812,X9820
fig|287.6770.peg.1000,Membrane-bound lytic murein transglycosylase B,fig|287.6613.peg.976,fig|287.6611.peg.983,fig|287.6614.peg.1064,fig|287.6616.peg.962,fig|287.6612.peg.1000,fig|287.6618.peg.1008,fig|287.6615.peg.1001,fig|287.6617.peg.1030,fig|287.6621.peg.1073,...,fig|287.6635.peg.988,fig|287.6633.peg.1064,fig|287.6634.peg.1066,fig|287.6637.peg.1116,fig|287.6636.peg.982,fig|287.6620.peg.5475,fig|287.6630.peg.1000,fig|287.6639.peg.1067,fig|287.6638.peg.992,fig|287.6627.peg.974
fig|287.6770.peg.1001,Septum-associated rare lipoprotein A,fig|287.6613.peg.977,fig|287.6611.peg.984,fig|287.6614.peg.1065,fig|287.6616.peg.963,fig|287.6612.peg.1001,fig|287.6618.peg.1009,fig|287.6615.peg.1002,,fig|287.6621.peg.1074,...,fig|287.6635.peg.989,fig|287.6633.peg.1065,fig|287.6634.peg.1067,fig|287.6637.peg.1117,fig|287.6636.peg.983,fig|287.6620.peg.5474,fig|287.6630.peg.1001,fig|287.6639.peg.1068,fig|287.6638.peg.993,fig|287.6627.peg.975
fig|287.6770.peg.1002,D-alanyl-D-alanine carboxypeptidase (EC 3.4.16.4),fig|287.6613.peg.978,fig|287.6611.peg.985,fig|287.6614.peg.1066,fig|287.6616.peg.964,fig|287.6612.peg.1002,fig|287.6618.peg.1010,fig|287.6615.peg.1003,fig|287.6617.peg.1033,fig|287.6621.peg.1075,...,fig|287.6635.peg.990,fig|287.6633.peg.1066,fig|287.6634.peg.1068,fig|287.6637.peg.1118,fig|287.6636.peg.984,fig|287.6620.peg.5473,fig|287.6630.peg.1002,fig|287.6639.peg.1069,fig|287.6638.peg.994,fig|287.6627.peg.976
fig|287.6770.peg.1003,Proposed lipoate regulatory protein YbeD,fig|287.6613.peg.979,fig|287.6611.peg.986,fig|287.6614.peg.1067,fig|287.6616.peg.965,fig|287.6612.peg.1003,fig|287.6618.peg.1011,fig|287.6615.peg.1004,fig|287.6617.peg.1034,fig|287.6621.peg.1076,...,fig|287.6635.peg.991,fig|287.6633.peg.1067,fig|287.6634.peg.1069,fig|287.6637.peg.1119,fig|287.6636.peg.985,fig|287.6620.peg.5472,fig|287.6630.peg.1003,fig|287.6639.peg.1070,fig|287.6638.peg.995,fig|287.6627.peg.977
fig|287.6770.peg.1004,Octanoate-[acyl-carrier-protein]-protein-N-oct...,fig|287.6613.peg.980,fig|287.6611.peg.987,fig|287.6614.peg.1068,fig|287.6616.peg.966,fig|287.6612.peg.1004,fig|287.6618.peg.1012,fig|287.6615.peg.1005,fig|287.6617.peg.1035,fig|287.6621.peg.1077,...,fig|287.6635.peg.992,fig|287.6633.peg.1068,fig|287.6634.peg.1070,fig|287.6637.peg.1120,fig|287.6636.peg.986,fig|287.6620.peg.5471,fig|287.6630.peg.1004,fig|287.6639.peg.1071,fig|287.6638.peg.996,fig|287.6627.peg.978


## Read PA14 annotation

In [107]:
df_PA14_annot = pd.read_excel('genome_comparison-PA14jbx.xlsx',index_col=0)
df_PA14_annot = df_PA14_annot[df_PA14_annot['comp_genome_1_hit']=='bi (<->)']
df_PA14_annot = df_PA14_annot[['ref_genome_patric_id','comp_genome_1_gene_name','comp_genome_1_locus_tag']].set_index('ref_genome_patric_id').fillna('')
df_PA14_annot.columns = ['Gene_Patric','locus']
df_PA14_annot.head()

Unnamed: 0_level_0,Gene_Patric,locus
ref_genome_patric_id,Unnamed: 1_level_1,Unnamed: 2_level_1
fig|287.6770.peg.1,dnaA,PA14_00010
fig|287.6770.peg.2,dnaN,PA14_00020
fig|287.6770.peg.3,recF,PA14_00030
fig|287.6770.peg.4,gyrB,PA14_00050
fig|287.6770.peg.6,,PA14_00060


In [108]:
df_PA14_locustag = pd.read_csv('PA14locs_OrthlogID.csv',index_col=0)
df_PA14_locustag = df_PA14_locustag[['locus','Gene']]
df_PA14_locustag = df_PA14_locustag.rename(columns={'Gene':'Gene_Prokka'})
df_PA14_annot = df_PA14_annot.reset_index().merge(df_PA14_locustag,how='left').set_index('ref_genome_patric_id').fillna('')
df_PA14_annot.head()

Unnamed: 0_level_0,Gene_Patric,locus,Gene_Prokka
ref_genome_patric_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
fig|287.6770.peg.1,dnaA,PA14_00010,dnaA
fig|287.6770.peg.2,dnaN,PA14_00020,dnaN
fig|287.6770.peg.3,recF,PA14_00030,recF_1
fig|287.6770.peg.4,gyrB,PA14_00050,gyrB
fig|287.6770.peg.6,,PA14_00060,group_10488


In [109]:
df_orth_blast = pd.merge(df_PA14_annot, df_orth_blast, left_index=True, right_index=True, how='right')
df_orth_blast.head()

Unnamed: 0,Gene_Patric,locus,Gene_Prokka,Product,F22031,F23197,F30658,F34365,F5677,F63912,...,T63266,W16407,W25637,W36662,W45909,W60856,W70332,W91453,X78812,X9820
fig|287.6770.peg.1000,sltB1,PA14_12080,mltB_1,Membrane-bound lytic murein transglycosylase B,fig|287.6613.peg.976,fig|287.6611.peg.983,fig|287.6614.peg.1064,fig|287.6616.peg.962,fig|287.6612.peg.1000,fig|287.6618.peg.1008,...,fig|287.6635.peg.988,fig|287.6633.peg.1064,fig|287.6634.peg.1066,fig|287.6637.peg.1116,fig|287.6636.peg.982,fig|287.6620.peg.5475,fig|287.6630.peg.1000,fig|287.6639.peg.1067,fig|287.6638.peg.992,fig|287.6627.peg.974
fig|287.6770.peg.1001,,PA14_12090,rlpA_1,Septum-associated rare lipoprotein A,fig|287.6613.peg.977,fig|287.6611.peg.984,fig|287.6614.peg.1065,fig|287.6616.peg.963,fig|287.6612.peg.1001,fig|287.6618.peg.1009,...,fig|287.6635.peg.989,fig|287.6633.peg.1065,fig|287.6634.peg.1067,fig|287.6637.peg.1117,fig|287.6636.peg.983,fig|287.6620.peg.5474,fig|287.6630.peg.1001,fig|287.6639.peg.1068,fig|287.6638.peg.993,fig|287.6627.peg.975
fig|287.6770.peg.1002,dacC,PA14_12100,dacC,D-alanyl-D-alanine carboxypeptidase (EC 3.4.16.4),fig|287.6613.peg.978,fig|287.6611.peg.985,fig|287.6614.peg.1066,fig|287.6616.peg.964,fig|287.6612.peg.1002,fig|287.6618.peg.1010,...,fig|287.6635.peg.990,fig|287.6633.peg.1066,fig|287.6634.peg.1068,fig|287.6637.peg.1118,fig|287.6636.peg.984,fig|287.6620.peg.5473,fig|287.6630.peg.1002,fig|287.6639.peg.1069,fig|287.6638.peg.994,fig|287.6627.peg.976
fig|287.6770.peg.1003,,PA14_12110,group_12410,Proposed lipoate regulatory protein YbeD,fig|287.6613.peg.979,fig|287.6611.peg.986,fig|287.6614.peg.1067,fig|287.6616.peg.965,fig|287.6612.peg.1003,fig|287.6618.peg.1011,...,fig|287.6635.peg.991,fig|287.6633.peg.1067,fig|287.6634.peg.1069,fig|287.6637.peg.1119,fig|287.6636.peg.985,fig|287.6620.peg.5472,fig|287.6630.peg.1003,fig|287.6639.peg.1070,fig|287.6638.peg.995,fig|287.6627.peg.977
fig|287.6770.peg.1004,lipB,PA14_12120,lipB,Octanoate-[acyl-carrier-protein]-protein-N-oct...,fig|287.6613.peg.980,fig|287.6611.peg.987,fig|287.6614.peg.1068,fig|287.6616.peg.966,fig|287.6612.peg.1004,fig|287.6618.peg.1012,...,fig|287.6635.peg.992,fig|287.6633.peg.1068,fig|287.6634.peg.1070,fig|287.6637.peg.1120,fig|287.6636.peg.986,fig|287.6620.peg.5471,fig|287.6630.peg.1004,fig|287.6639.peg.1071,fig|287.6638.peg.996,fig|287.6627.peg.978


## Find proteins that all RL producers contain

In [110]:
df_orth_blast_producers = df_orth_blast[['Gene_Patric','Gene_Prokka','locus','Product']+[x for x in list(df_rhl[df_rhl.rhamn2cats==1].index) if x!='UCBPP-PA14']]
df_orth_blast_producers_all_contain = df_orth_blast_producers[~df_orth_blast_producers.isnull().any(axis=1)]
df_orth_blast_producers_all_contain.head()

Unnamed: 0,Gene_Patric,Gene_Prokka,locus,Product,F22031,F23197,F30658,F34365,F9670,H47921,...,T52373,T6313,T63266,W16407,W25637,W45909,W70332,W91453,X78812,X9820
fig|287.6770.peg.1000,sltB1,mltB_1,PA14_12080,Membrane-bound lytic murein transglycosylase B,fig|287.6613.peg.976,fig|287.6611.peg.983,fig|287.6614.peg.1064,fig|287.6616.peg.962,fig|287.6615.peg.1001,fig|287.6621.peg.1073,...,fig|287.6632.peg.985,fig|287.6629.peg.1078,fig|287.6635.peg.988,fig|287.6633.peg.1064,fig|287.6634.peg.1066,fig|287.6636.peg.982,fig|287.6630.peg.1000,fig|287.6639.peg.1067,fig|287.6638.peg.992,fig|287.6627.peg.974
fig|287.6770.peg.1001,,rlpA_1,PA14_12090,Septum-associated rare lipoprotein A,fig|287.6613.peg.977,fig|287.6611.peg.984,fig|287.6614.peg.1065,fig|287.6616.peg.963,fig|287.6615.peg.1002,fig|287.6621.peg.1074,...,fig|287.6632.peg.986,fig|287.6629.peg.1079,fig|287.6635.peg.989,fig|287.6633.peg.1065,fig|287.6634.peg.1067,fig|287.6636.peg.983,fig|287.6630.peg.1001,fig|287.6639.peg.1068,fig|287.6638.peg.993,fig|287.6627.peg.975
fig|287.6770.peg.1002,dacC,dacC,PA14_12100,D-alanyl-D-alanine carboxypeptidase (EC 3.4.16.4),fig|287.6613.peg.978,fig|287.6611.peg.985,fig|287.6614.peg.1066,fig|287.6616.peg.964,fig|287.6615.peg.1003,fig|287.6621.peg.1075,...,fig|287.6632.peg.987,fig|287.6629.peg.1080,fig|287.6635.peg.990,fig|287.6633.peg.1066,fig|287.6634.peg.1068,fig|287.6636.peg.984,fig|287.6630.peg.1002,fig|287.6639.peg.1069,fig|287.6638.peg.994,fig|287.6627.peg.976
fig|287.6770.peg.1003,,group_12410,PA14_12110,Proposed lipoate regulatory protein YbeD,fig|287.6613.peg.979,fig|287.6611.peg.986,fig|287.6614.peg.1067,fig|287.6616.peg.965,fig|287.6615.peg.1004,fig|287.6621.peg.1076,...,fig|287.6632.peg.988,fig|287.6629.peg.1081,fig|287.6635.peg.991,fig|287.6633.peg.1067,fig|287.6634.peg.1069,fig|287.6636.peg.985,fig|287.6630.peg.1003,fig|287.6639.peg.1070,fig|287.6638.peg.995,fig|287.6627.peg.977
fig|287.6770.peg.1004,lipB,lipB,PA14_12120,Octanoate-[acyl-carrier-protein]-protein-N-oct...,fig|287.6613.peg.980,fig|287.6611.peg.987,fig|287.6614.peg.1068,fig|287.6616.peg.966,fig|287.6615.peg.1005,fig|287.6621.peg.1077,...,fig|287.6632.peg.989,fig|287.6629.peg.1082,fig|287.6635.peg.992,fig|287.6633.peg.1068,fig|287.6634.peg.1070,fig|287.6636.peg.986,fig|287.6630.peg.1004,fig|287.6639.peg.1071,fig|287.6638.peg.996,fig|287.6627.peg.978


## Find proteins that are missing in at least one non-producer

In [111]:
df_orth_blast_nonproducers = df_orth_blast[['Gene_Patric','Gene_Prokka','locus','Product']+list(df_rhl[df_rhl.rhamn2cats==0].index)]
df_orth_blast_nonproducers_missing = df_orth_blast_nonproducers[df_orth_blast_nonproducers.isnull().any(axis=1)]
df_orth_blast_nonproducers_missing.head()

Unnamed: 0,Gene_Patric,Gene_Prokka,locus,Product,F5677,F63912,H27930,M1608,M55212,S86968,W36662,W60856
fig|287.6770.peg.1001,,rlpA_1,PA14_12090,Septum-associated rare lipoprotein A,fig|287.6612.peg.1001,fig|287.6618.peg.1009,,fig|287.6622.peg.5056,fig|287.6624.peg.1066,fig|287.6631.peg.1043,fig|287.6637.peg.1117,fig|287.6620.peg.5474
fig|287.6770.peg.1009,,,,hypothetical protein,,fig|287.6618.peg.1017,fig|287.6617.peg.1040,fig|287.6622.peg.5048,,fig|287.6631.peg.1051,fig|287.6637.peg.1125,fig|287.6620.peg.5466
fig|287.6770.peg.100,,group_5316,PA14_01130,FIG00965783: hypothetical protein,,,,fig|287.6622.peg.153,,,,
fig|287.6770.peg.1014,,,,FIG00960788: hypothetical protein,fig|287.6612.peg.1014,fig|287.6618.peg.1022,,fig|287.6622.peg.5043,,,fig|287.6637.peg.1130,
fig|287.6770.peg.1015,,group_12433,PA14_12260,hypothetical protein,fig|287.6612.peg.1015,fig|287.6618.peg.1023,,fig|287.6622.peg.5042,,,fig|287.6637.peg.1131,


## Find overlaps

In [112]:
overlaps = list(set(df_orth_blast_producers_all_contain.index).intersection(df_orth_blast_nonproducers_missing.index))
df_orth_blast_nonproducers_missing_to_focus = df_orth_blast_nonproducers_missing.loc[overlaps]
df_orth_blast_nonproducers_missing_to_focus.head()

Unnamed: 0,Gene_Patric,Gene_Prokka,locus,Product,F5677,F63912,H27930,M1608,M55212,S86968,W36662,W60856
fig|287.6770.peg.3228,ppiC1,ppiC_2,PA14_38700,Peptidyl-prolyl cis-trans isomerase PpiC (EC 5...,fig|287.6612.peg.3339,fig|287.6618.peg.3372,fig|287.6617.peg.3201,,fig|287.6624.peg.4773,fig|287.6631.peg.3352,fig|287.6637.peg.2562,fig|287.6620.peg.3885
fig|287.6770.peg.3494,,group_2553,PA14_41910,Thiol-disulfide isomerase and thioredoxins,fig|287.6612.peg.3666,,fig|287.6617.peg.3531,fig|287.6622.peg.2758,fig|287.6624.peg.4510,fig|287.6631.peg.3626,fig|287.6637.peg.2292,fig|287.6620.peg.4151
fig|287.6770.peg.3227,,acsA_2,PA14_38690,Acetoacetyl-CoA synthetase (EC 6.2.1.16),fig|287.6612.peg.3338,fig|287.6618.peg.3371,fig|287.6617.peg.3200,,fig|287.6624.peg.4774,fig|287.6631.peg.3351,fig|287.6637.peg.2563,fig|287.6620.peg.3884
fig|287.6770.peg.3257,,group_4188,PA14_39080,hypothetical protein,fig|287.6612.peg.3369,fig|287.6618.peg.3402,fig|287.6617.peg.3232,,fig|287.6624.peg.4744,fig|287.6631.peg.3384,fig|287.6637.peg.2532,fig|287.6620.peg.3915
fig|287.6770.peg.3072,,group_2476,PA14_36850,hypothetical protein,fig|287.6612.peg.3180,fig|287.6618.peg.3217,fig|287.6617.peg.3043,,fig|287.6624.peg.4932,fig|287.6631.peg.3195,fig|287.6637.peg.2716,fig|287.6620.peg.3736


In [113]:
df_orth_blast_nonproducers_missing_to_focus.to_csv('missing_proteins_in_nonproducers_blast.csv')