In [224]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline 
import seaborn as sns
from copy import deepcopy
import os
import sys
import numpy as np
from Bio.Blast.Applications import NcbiblastpCommandline
from Bio.Blast import NCBIWWW, NCBIXML
from Bio import AlignIO, SeqIO
import Bio.Align.Applications
from Bio.Align.Applications import ClustalwCommandline
from sklearn.ensemble import RandomForestClassifier

# Read rhamnolipid production data

In [225]:
# Load the rhamnolipid table (first spreadsheet column becomes the index) and
# relabel the 'PA14' row as 'UCBPP-PA14'.
df_rhl = (
    pd.read_excel('../data/rhamnolipids/rhamnMat.xlsx', index_col=0)
    .rename(index={'PA14': 'UCBPP-PA14'})
)
df_rhl.head()

Unnamed: 0_level_0,rhamn3cats,rhamn2cats
strain,Unnamed: 1_level_1,Unnamed: 2_level_1
F22031,2,1
F23197,2,1
F30658,1,1
F34365,2,1
F5677,0,0


# Read protein orthologue dictionary

In [226]:
# Load the orthologue table (first CSV column becomes the index) and name the
# index 'UCBPP-PA14'.
df_orth = (
    pd.read_csv('../find_protein_orthologues/protein_orthologue_dictionary_ref_PA14.csv', index_col=0)
    .rename_axis('UCBPP-PA14')
)
df_orth.head()

Unnamed: 0_level_0,F22031,F23197,F30658,F34365,F5677,F63912,F9670,H27930,H47921,H5708,...,T63266,W16407,W25637,W36662,W45909,W60856,W70332,W91453,X78812,X9820
UCBPP-PA14,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
fig|287.6656.peg.1,fig|287.6613.peg.1,fig|287.6611.peg.1,fig|287.6614.peg.1,fig|287.6616.peg.1,fig|287.6612.peg.1,fig|287.6618.peg.1,fig|287.6615.peg.1,fig|287.6617.peg.1,fig|287.6621.peg.1,fig|287.6619.peg.1,...,fig|287.6635.peg.1,fig|287.6633.peg.1,fig|287.6634.peg.1,fig|287.6637.peg.1,fig|287.6636.peg.1,fig|287.6620.peg.1,fig|287.6630.peg.1,fig|287.6639.peg.1,fig|287.6638.peg.1,fig|287.6627.peg.1
fig|287.6656.peg.2,fig|287.6613.peg.2,fig|287.6611.peg.2,fig|287.6614.peg.2,fig|287.6616.peg.2,fig|287.6612.peg.2,fig|287.6618.peg.2,fig|287.6615.peg.2,fig|287.6617.peg.2,fig|287.6621.peg.2,fig|287.6619.peg.2,...,fig|287.6635.peg.2,fig|287.6633.peg.2,fig|287.6634.peg.2,fig|287.6637.peg.2,fig|287.6636.peg.2,fig|287.6620.peg.2,fig|287.6630.peg.2,fig|287.6639.peg.2,fig|287.6638.peg.2,fig|287.6627.peg.2
fig|287.6656.peg.3,fig|287.6613.peg.3,fig|287.6611.peg.3,fig|287.6614.peg.3,fig|287.6616.peg.3,fig|287.6612.peg.3,fig|287.6618.peg.3,fig|287.6615.peg.3,fig|287.6617.peg.3,fig|287.6621.peg.3,fig|287.6619.peg.3,...,fig|287.6635.peg.3,fig|287.6633.peg.3,fig|287.6634.peg.3,fig|287.6637.peg.3,fig|287.6636.peg.3,fig|287.6620.peg.3,fig|287.6630.peg.3,fig|287.6639.peg.3,fig|287.6638.peg.3,fig|287.6627.peg.3
fig|287.6656.peg.4,fig|287.6613.peg.4,fig|287.6611.peg.4,fig|287.6614.peg.4,fig|287.6616.peg.4,fig|287.6612.peg.4,fig|287.6618.peg.4,fig|287.6615.peg.4,fig|287.6617.peg.4,fig|287.6621.peg.4,fig|287.6619.peg.4,...,fig|287.6635.peg.4,fig|287.6633.peg.4,fig|287.6634.peg.4,fig|287.6637.peg.4,fig|287.6636.peg.4,fig|287.6620.peg.4,fig|287.6630.peg.4,fig|287.6639.peg.4,fig|287.6638.peg.4,fig|287.6627.peg.4
fig|287.6656.peg.6,fig|287.6613.peg.5,fig|287.6611.peg.5,fig|287.6614.peg.5,fig|287.6616.peg.5,fig|287.6612.peg.6,fig|287.6618.peg.5,fig|287.6615.peg.12,fig|287.6617.peg.5,fig|287.6621.peg.5,fig|287.6619.peg.5,...,fig|287.6635.peg.5,fig|287.6633.peg.5,fig|287.6634.peg.5,fig|287.6637.peg.5,fig|287.6636.peg.5,fig|287.6620.peg.5,fig|287.6630.peg.5,fig|287.6639.peg.5,fig|287.6638.peg.11,fig|287.6627.peg.5


In [227]:
# Number of columns in the orthologue table.
df_orth.shape[1]

30

# Functions to align sequences and classify rhamnolipid production based on accessory protein mutations

In [228]:
def seqAlignment(query_id, df_orth,
                 data_path='../find_protein_orthologues/prots/',
                 clustalw_exe='/usr/local/bin/clustalw2'):
    """Align one protein across strains and extract the variable alignment columns.

    The row ``query_id`` of ``df_orth`` gives, for every strain (column), the
    protein id to look up in that strain's ``PA_<strain>_feature_protein.faa``
    file under ``data_path``. ``query_id`` itself is used as the protein id of
    strain 'UCBPP-PA14'. The sequences found are written to a temporary FASTA
    file, aligned with ClustalW, and the alignment columns in which not all
    strains carry the same character are kept.

    Parameters
    ----------
    query_id : str
        Row label of ``df_orth`` identifying the protein.
    df_orth : pandas.DataFrame
        Table whose rows are looked up by ``query_id`` and whose columns are
        strain names.
    data_path : str, optional
        Directory (with trailing separator) holding the per-strain FASTA files;
        temporary alignment files are also created here.
    clustalw_exe : str, optional
        Path of the ClustalW executable.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame) or (None, None)
        The first frame holds the characters of the variable alignment columns
        (index: strains, columns: 1-based alignment positions); the second is
        its ``pd.get_dummies`` encoding. ``(None, None)`` is returned when no
        column varies or when fewer than two sequences were found.

    Raises
    ------
    ValueError
        If the alignment lacks a record for a strain that was written to the
        FASTA file.
    """
    # protein_id_dict has strain name as key and protein id as value
    protein_id_dict = df_orth.loc[query_id, :].to_dict()
    protein_id_dict['UCBPP-PA14'] = query_id

    # base name of the temporary files: query id with '|' and '.' replaced by '_'
    query_id_alias = query_id.replace('|', '_').replace('.', '_')
    alignment_file = data_path + query_id_alias + '_for_alignment'
    generated_files = [alignment_file + ext for ext in ('.fasta', '.aln', '.dnd')]

    try:
        # Only strains whose protein id is actually found are recorded, so that
        # every row of the matrix built below is backed by an aligned sequence.
        strains = []
        with open(alignment_file + '.fasta', 'w') as fout:
            for strain, pid in protein_id_dict.items():
                with open(data_path + 'PA_%s_feature_protein.faa' % (strain), 'r') as fin:
                    for record in SeqIO.parse(fin, 'fasta'):
                        if record.id == pid:
                            fout.write(">" + strain + "\n")
                            fout.write(str(record.seq) + "\n")
                            strains.append(strain)
                            break

        # an alignment needs at least two sequences
        if len(strains) < 2:
            return None, None

        # run multiple sequence alignment by calling clustalw
        clustalw_cline = ClustalwCommandline(clustalw_exe, infile=alignment_file + '.fasta')
        clustalw_cline()
        align = AlignIO.read(alignment_file + '.aln', "clustal")

        # Place every aligned record on the row of its strain; the record order
        # in the alignment file is not assumed to match the input order.
        row_of_strain = {strain: row for row, strain in enumerate(strains)}
        aligned_rows = [None] * len(strains)
        for aln in align:
            aligned_rows[row_of_strain[aln.id]] = list(str(aln.seq))
        missing = [s for s, chars in zip(strains, aligned_rows) if chars is None]
        if missing:
            raise ValueError('no aligned sequence for strain(s): %s' % ', '.join(missing))
        prot_seq_matrix = np.array(aligned_rows)

        # accessory columns: positions where at least one strain differs from the first row
        is_variable = (prot_seq_matrix != prot_seq_matrix[0]).any(axis=0)
        accessory_indices = np.flatnonzero(is_variable)
        if len(accessory_indices) == 0:
            return None, None

        # turn accessory matrix to pandas frame (columns are 1-based positions)
        df_acc_prot_seq = pd.DataFrame(prot_seq_matrix[:, accessory_indices],
                                       index=strains,
                                       columns=accessory_indices + 1)

        # convert string to dummy variable
        df_acc_prot_seq_dummy = pd.get_dummies(df_acc_prot_seq)

        return df_acc_prot_seq, df_acc_prot_seq_dummy
    finally:
        # delete the temporary files on every exit path, including errors
        for path in generated_files:
            if os.path.exists(path):
                os.remove(path)

In [229]:
def rfClassification(df_acc_prot_seq_dummy, category_level, df_rhl):
    """Fit a random forest on the dummy features and return its accuracy.

    The features are joined with ``df_rhl`` on the index, a
    ``RandomForestClassifier`` (500 trees, ``random_state=0``) is fitted, and
    the fraction of correctly predicted labels is returned. Note that the
    prediction is made on the same samples the classifier was fitted on, so
    the value is a training accuracy.

    Parameters
    ----------
    df_acc_prot_seq_dummy : pandas.DataFrame
        Feature matrix; its index is matched against the index of ``df_rhl``.
    category_level : int
        2 selects the 'rhamn2cats' column of ``df_rhl`` as label, 3 selects
        'rhamn3cats'.
    df_rhl : pandas.DataFrame
        Label table containing the selected label column.

    Returns
    -------
    float
        Fraction of samples whose predicted label equals the observed label.

    Raises
    ------
    ValueError
        If ``category_level`` is neither 2 nor 3.
    """
    label_columns = {2: 'rhamn2cats', 3: 'rhamn3cats'}
    if category_level not in label_columns:
        raise ValueError('category_level must be 2 or 3, got %r' % (category_level,))
    label_column = label_columns[category_level]

    # join features with labels
    df_acc_prot_seq_dummy_rhl = pd.merge(df_acc_prot_seq_dummy, df_rhl,
                                         left_index=True, right_index=True)

    # The feature columns come first in the merged frame; select them by count
    # so the split does not depend on how many columns df_rhl has.
    n_features = df_acc_prot_seq_dummy.shape[1]
    features = df_acc_prot_seq_dummy_rhl.iloc[:, :n_features].values
    obs_label = np.asarray(df_acc_prot_seq_dummy_rhl[label_column])

    # train classifier
    clf = RandomForestClassifier(n_estimators=500, random_state=0, n_jobs=-1)
    clf.fit(features, obs_label)
    pred_label = np.asarray(clf.predict(features))

    # calculate prediction accuracy
    return float(np.mean(pred_label == obs_label))

In [230]:
def alignment_and_classification(args):
    """Run seqAlignment followed by rfClassification for one protein.

    Parameters
    ----------
    args : sequence
        ``(query_id, df_orth, category_level, df_rhl)``; the first two are
        passed to ``seqAlignment``, the last two to ``rfClassification``.

    Returns
    -------
    float
        The value returned by ``rfClassification``, or NaN when
        ``seqAlignment`` yields no dummy feature matrix.
    """
    query_id, df_orth, category_level, df_rhl = args[:4]

    # sequence alignment
    _, df_acc_prot_seq_dummy = seqAlignment(query_id, df_orth)

    # classification
    if df_acc_prot_seq_dummy is None:
        # np.nan rather than the np.NaN alias, which NumPy 2.0 removed
        return np.nan
    return rfClassification(df_acc_prot_seq_dummy, category_level, df_rhl)

## 2-category RL production

In [231]:
# One accuracy value per row of df_orth, in index order.
prediction_accuracy_cat2 = []
for pid in df_orth.index:
    try:
        acc = alignment_and_classification([pid, df_orth, 2, df_rhl])
    except Exception:
        # Best effort: a protein that fails is recorded as NaN. Catching
        # Exception (not a bare except) keeps Ctrl-C / kernel interrupt working.
        acc = np.nan
    prediction_accuracy_cat2.append(acc)

In [232]:
# Collect the 2-category accuracies in a one-column frame keyed by the
# df_orth index and write it to CSV.
df_prediction_accuracy_cat2 = pd.DataFrame(
    {'accuracy': prediction_accuracy_cat2},
    index=df_orth.index,
)
df_prediction_accuracy_cat2.to_csv('df_prediction_accuracy_cat2.csv')

### fig|287.6656.peg.5757 (Glycine dehydrogenase [decarboxylating] (glycine cleavage system P protein) (EC 1.4.4.2))

In [233]:
# Variable alignment columns for this protein, joined with the rhamnolipid
# table on the index and cast to categorical.
df_acc_prot_seq, _ = seqAlignment('fig|287.6656.peg.5757', df_orth)
df_acc_prot_seq = (
    df_acc_prot_seq
    .merge(df_rhl, left_index=True, right_index=True)
    .astype('category')
)

## 3-category RL production

In [234]:
# One accuracy value per row of df_orth, in index order.
prediction_accuracy_cat3 = []
for pid in df_orth.index:
    try:
        acc = alignment_and_classification([pid, df_orth, 3, df_rhl])
    except Exception:
        # Best effort: a protein that fails is recorded as NaN. Catching
        # Exception (not a bare except) keeps Ctrl-C / kernel interrupt working.
        acc = np.nan
    prediction_accuracy_cat3.append(acc)

In [235]:
# Collect the 3-category accuracies in a one-column frame keyed by the
# df_orth index and write it to CSV.
df_prediction_accuracy_cat3 = pd.DataFrame(
    {'accuracy': prediction_accuracy_cat3},
    index=df_orth.index,
)
df_prediction_accuracy_cat3.to_csv('df_prediction_accuracy_cat3.csv')