In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline 
import seaborn as sns
from copy import deepcopy
import collections
import scipy as sp
import os
import sys
import numpy as np
from Bio.Blast.Applications import NcbiblastpCommandline
from Bio.Blast import NCBIWWW, NCBIXML
from Bio import AlignIO, SeqIO
import Bio.Align.Applications
from Bio.Align.Applications import ClustalwCommandline
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Read rhamnolipid production 

In [2]:
# Rhamnolipid production categories per strain (index = strain name).
# The reference strain is listed as 'PA14' in the spreadsheet; relabel it to
# 'UCBPP-PA14', the name used for it elsewhere in this notebook.
df_rhl = (
    pd.read_excel('../data/rhamnolipids/rhamnMat.xlsx', index_col=0)
    .rename(index={'PA14': 'UCBPP-PA14'})
)
df_rhl.head()

Unnamed: 0_level_0,rhamn3cats,rhamn2cats
strain,Unnamed: 1_level_1,Unnamed: 2_level_1
F22031,2,1
F23197,2,1
F30658,1,1
F34365,2,1
F5677,0,0


In [3]:
# Show the rhamnolipid categories recorded for strain W36662.
df_rhl.loc['W36662', :]

rhamn3cats    0
rhamn2cats    0
Name: W36662, dtype: int64

# Read Growth rate

In [4]:
# Maximum specific growth rate per strain, restricted to rows with phase == 1.
# 'PA14' is relabelled to 'UCBPP-PA14', the name used for it elsewhere in this notebook.
df_sgr = (
    pd.read_csv('../nnmf/tblgcfeatures.csv', index_col=0)
    .set_index('strain')
    .loc[lambda tbl: tbl.phase == 1, ['specific_growth_rate_max']]
    .rename(index={'PA14': 'UCBPP-PA14'})
)
df_sgr.head()

Unnamed: 0_level_0,specific_growth_rate_max
strain,Unnamed: 1_level_1
F22031,0.249327
F23197,0.18835
F30658,0.523181
F34365,0.264798
F5677,0.091215


# Read protein orthologue dictionary

In [6]:
# Orthologue table: one row per UCBPP-PA14 protein id, one column per other strain,
# each cell holding that strain's protein id for the orthologue.
df_orth = pd.read_csv(
    '../find_protein_orthologues_UCBPP-PA14_vs_otherPA/protein_orthologue_dictionary_ref_PA14_core.csv',
    index_col=0,
).rename_axis('UCBPP-PA14')
df_orth.head()

Unnamed: 0_level_0,F22031,F23197,F30658,F34365,F5677,F63912,F9670,H27930,H47921,H5708,...,T63266,W16407,W25637,W36662,W45909,W60856,W70332,W91453,X78812,X9820
UCBPP-PA14,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
fig|287.6770.peg.1,fig|287.6613.peg.1,fig|287.6611.peg.1,fig|287.6614.peg.1,fig|287.6616.peg.1,fig|287.6612.peg.1,fig|287.6618.peg.1,fig|287.6615.peg.1,fig|287.6617.peg.1,fig|287.6621.peg.1,fig|287.6619.peg.1,...,fig|287.6635.peg.1,fig|287.6633.peg.1,fig|287.6634.peg.1,fig|287.6637.peg.1,fig|287.6636.peg.1,fig|287.6620.peg.1,fig|287.6630.peg.1,fig|287.6639.peg.1,fig|287.6638.peg.1,fig|287.6627.peg.1
fig|287.6770.peg.2,fig|287.6613.peg.2,fig|287.6611.peg.2,fig|287.6614.peg.2,fig|287.6616.peg.2,fig|287.6612.peg.2,fig|287.6618.peg.2,fig|287.6615.peg.2,fig|287.6617.peg.2,fig|287.6621.peg.2,fig|287.6619.peg.2,...,fig|287.6635.peg.2,fig|287.6633.peg.2,fig|287.6634.peg.2,fig|287.6637.peg.2,fig|287.6636.peg.2,fig|287.6620.peg.2,fig|287.6630.peg.2,fig|287.6639.peg.2,fig|287.6638.peg.2,fig|287.6627.peg.2
fig|287.6770.peg.3,fig|287.6613.peg.3,fig|287.6611.peg.3,fig|287.6614.peg.3,fig|287.6616.peg.3,fig|287.6612.peg.3,fig|287.6618.peg.3,fig|287.6615.peg.3,fig|287.6617.peg.3,fig|287.6621.peg.3,fig|287.6619.peg.3,...,fig|287.6635.peg.3,fig|287.6633.peg.3,fig|287.6634.peg.3,fig|287.6637.peg.3,fig|287.6636.peg.3,fig|287.6620.peg.3,fig|287.6630.peg.3,fig|287.6639.peg.3,fig|287.6638.peg.3,fig|287.6627.peg.3
fig|287.6770.peg.4,fig|287.6613.peg.4,fig|287.6611.peg.4,fig|287.6614.peg.4,fig|287.6616.peg.4,fig|287.6612.peg.4,fig|287.6618.peg.4,fig|287.6615.peg.4,fig|287.6617.peg.4,fig|287.6621.peg.4,fig|287.6619.peg.4,...,fig|287.6635.peg.4,fig|287.6633.peg.4,fig|287.6634.peg.4,fig|287.6637.peg.4,fig|287.6636.peg.4,fig|287.6620.peg.4,fig|287.6630.peg.4,fig|287.6639.peg.4,fig|287.6638.peg.4,fig|287.6627.peg.4
fig|287.6770.peg.6,fig|287.6613.peg.5,fig|287.6611.peg.5,fig|287.6614.peg.5,fig|287.6616.peg.5,fig|287.6612.peg.6,fig|287.6618.peg.5,fig|287.6615.peg.12,fig|287.6617.peg.5,fig|287.6621.peg.5,fig|287.6619.peg.5,...,fig|287.6635.peg.5,fig|287.6633.peg.5,fig|287.6634.peg.5,fig|287.6637.peg.5,fig|287.6636.peg.5,fig|287.6620.peg.5,fig|287.6630.peg.5,fig|287.6639.peg.5,fig|287.6638.peg.11,fig|287.6627.peg.5


In [7]:
# Number of non-reference strains (columns) in the orthologue table.
df_orth.shape[1]

30

# Function to align sequences and classify rhamnolipid production based on accessory protein mutations

In [15]:
def seqAlignment(query_id, df_orth,
                 data_path='../find_protein_orthologues_UCBPP-PA14_vs_otherPA/prots/',
                 clustalw_exe="/usr/local/bin/clustalw2"):
    """Align one protein across all strains and extract its variable columns.

    The orthologue ids of ``query_id`` are looked up in ``df_orth``, the matching
    sequences are collected from the per-strain ``PA_<strain>_feature_protein.faa``
    files under ``data_path`` into a temporary FASTA file, and ClustalW is run on it.
    Alignment columns in which at least one strain differs from the first strain
    are kept.

    Parameters
    ----------
    query_id : str
        Row label of ``df_orth`` (the UCBPP-PA14 protein id).
    df_orth : pandas.DataFrame
        Orthologue table; columns are strain names, values are protein ids.
    data_path : str, optional
        Directory holding the per-strain FASTA files; the temporary alignment
        files are also written there.
    clustalw_exe : str, optional
        Path of the ClustalW executable.

    Returns
    -------
    (df_acc_prot_seq, df_acc_prot_seq_dummy) : tuple
        ``df_acc_prot_seq`` has one row per aligned strain and one column per
        variable alignment position (1-based), holding the residue character.
        ``df_acc_prot_seq_dummy`` is its ``pd.get_dummies`` encoding.
        ``(None, None)`` is returned when no alignment column varies.

    Notes
    -----
    Strains whose FASTA file contains no record with the expected id are left
    out of the alignment and of the returned frames (previously they were kept
    as rows of uninitialised memory). Temporary ``.fasta``/``.aln``/``.dnd``
    files are removed even when an error is raised.
    """
    # protein_id_dict has strain name as key and protein id as value
    protein_id_dict = df_orth.loc[query_id, :].to_dict()
    # The reference strain is searched for `query_id + '|'`; presumably its FASTA
    # record ids carry a trailing '|' -- TODO confirm against the .faa file.
    protein_id_dict['UCBPP-PA14'] = query_id + '|'

    # file-system friendly alias of the query id ('|' and '.' replaced by '_')
    query_id_alias = query_id.replace('|', '_').replace('.', '_')
    alignment_file = data_path + query_id_alias + '_for_alignment'
    fasta_file = alignment_file + '.fasta'
    aln_file = alignment_file + '.aln'
    dnd_file = alignment_file + '.dnd'

    try:
        # prepare alignment file; only strains whose sequence was found are recorded
        strains = []
        with open(fasta_file, 'w') as fout:
            for strain, pid in protein_id_dict.items():
                with open(data_path + 'PA_%s_feature_protein.faa' % (strain), 'r') as fin:
                    for record in SeqIO.parse(fin, 'fasta'):
                        if record.id == pid:
                            fout.write(">" + strain + "\n")
                            fout.write(str(record.seq) + "\n")
                            strains.append(strain)
                            break

        # run multiple sequence alignment by calling clustalw
        clustalw_cline = ClustalwCommandline(clustalw_exe, infile=fasta_file)
        clustalw_cline()
        align = AlignIO.read(aln_file, "clustal")
    finally:
        # delete temporary files whether or not the alignment succeeded
        for tmp_path in (fasta_file, aln_file, dnd_file):
            if os.path.exists(tmp_path):
                os.remove(tmp_path)

    # residue matrix: rows follow `strains`, columns follow the alignment.
    # Pre-filled with the gap character so no cell is ever uninitialised memory.
    n_cols = len(align[0].seq)
    prot_seq_matrix = np.full((len(strains), n_cols), '-', dtype='U1')
    for aln in align:
        prot_seq_matrix[strains.index(aln.id), :] = list(str(aln.seq))

    # a column is "accessory" when any strain differs from the first strain
    is_variable = (prot_seq_matrix != prot_seq_matrix[0]).any(axis=0)
    accessory_indices = np.flatnonzero(is_variable)

    if accessory_indices.size == 0:
        return None, None

    # turn accessory matrix to panda frame (column labels are 1-based positions)
    df_acc_prot_seq = pd.DataFrame(prot_seq_matrix[:, accessory_indices],
                                   index=strains,
                                   columns=accessory_indices + 1)

    # convert string to dummy variable
    df_acc_prot_seq_dummy = pd.get_dummies(df_acc_prot_seq)
    return df_acc_prot_seq, df_acc_prot_seq_dummy

In [16]:
def classification(df_acc_prot_seq_dummy, category_level, df_rhl, clf_method):
    """Fit a classifier of rhamnolipid category on protein-variant dummies.

    Parameters
    ----------
    df_acc_prot_seq_dummy : pandas.DataFrame
        Dummy-encoded variable alignment positions, indexed by strain.
    category_level : int
        2 to use the 'rhamn2cats' label column, 3 to use 'rhamn3cats'.
    df_rhl : pandas.DataFrame
        Label table indexed by strain, containing the label columns above.
    clf_method : str
        'RandomForest' or 'DecisionTree'.

    Returns
    -------
    float
        Fraction of strains whose predicted label equals the observed label.
        NOTE(review): the prediction is made on the same samples the model was
        fitted on, so this is a training (resubstitution) accuracy.

    Raises
    ------
    ValueError
        If ``category_level`` or ``clf_method`` is not one of the supported values.
    """
    label_columns = {2: 'rhamn2cats', 3: 'rhamn3cats'}
    if category_level not in label_columns:
        raise ValueError('unknown category level: %r (expected 2 or 3).' % (category_level,))
    if clf_method not in ('RandomForest', 'DecisionTree'):
        raise ValueError('unknown classification method: %r.' % (clf_method,))

    # join features with labels (inner join on the strain index)
    df_acc_prot_seq_dummy_rhl = pd.merge(
        df_acc_prot_seq_dummy, df_rhl, left_index=True, right_index=True
    ).astype('category')
    # the feature columns are exactly the leading columns contributed by the left
    # frame, however many label columns df_rhl carries
    n_features = df_acc_prot_seq_dummy.shape[1]
    features = df_acc_prot_seq_dummy_rhl.iloc[:, :n_features].values
    obs_label = df_acc_prot_seq_dummy_rhl[label_columns[category_level]].values

    # train classifier
    if clf_method == 'RandomForest':
        clf = RandomForestClassifier(n_estimators=500, random_state=0, n_jobs=-1)
    else:
        clf = DecisionTreeClassifier(random_state=0)
    clf.fit(features, obs_label)
    pred_label = clf.predict(features)

    # calculate prediction accuracy
    hit = sum(1 for x, y in zip(obs_label, pred_label) if x == y)
    return hit / len(obs_label)

In [10]:
def alignment_and_classification(args):
    """Run `seqAlignment` then `classification` for one protein.

    Parameters
    ----------
    args : sequence
        ``[query_id, df_orth, category_level, df_rhl, clf_method]``, packed into
        a single argument and read positionally.

    Returns
    -------
    float
        The value returned by `classification`, or NaN when `seqAlignment`
        reports no variable alignment positions for this protein.
    """
    query_id = args[0]
    df_orth = args[1]
    category_level = args[2]
    df_rhl = args[3]
    clf_method = args[4]

    # sequence alignment
    _, df_acc_prot_seq_dummy = seqAlignment(query_id, df_orth)

    # classification
    if df_acc_prot_seq_dummy is None:
        # `np.nan`, not `np.NaN`: the capitalised alias was removed in NumPy 2.0
        return np.nan
    return classification(df_acc_prot_seq_dummy, category_level, df_rhl, clf_method)

In [11]:
def calculate_total_entropy(df_acc):
    """Sum the Shannon entropy of every column of ``df_acc``.

    Parameters
    ----------
    df_acc : pandas.DataFrame or None
        One column per variable alignment position, one row per strain.

    Returns
    -------
    float or int
        Sum over columns of the entropy (natural log, the `scipy.stats.entropy`
        default) of the value frequencies in that column; 0 when ``df_acc`` is None.
    """
    # Import the submodule explicitly: `import scipy as sp` alone does not guarantee
    # that `sp.stats` is loaded on every SciPy version.
    from scipy.stats import entropy

    if df_acc is None:
        return 0

    total_entropy = 0
    for col in df_acc.columns:
        counter = collections.Counter(list(df_acc[col].values))
        n_obs = sum(counter.values())  # hoisted: same for every value in the column
        total_entropy += entropy([v / n_obs for v in counter.values()])
    return total_entropy

## 2-category RL production

### random forest

In [10]:
# Random-forest accuracy on the 2-category labels, one value per reference protein
# (row of df_orth). Proteins for which the pipeline raises are recorded as NaN.
prediction_accuracy_cat2_rf = []
for pid in df_orth.index:
    try:
        acc = alignment_and_classification([pid, df_orth, 2, df_rhl, 'RandomForest'])
    except Exception as err:
        # `except Exception` rather than a bare `except`, so a kernel interrupt
        # (KeyboardInterrupt) stops the loop instead of being recorded as NaN;
        # the failure is reported rather than silently dropped.
        print('%s: alignment/classification failed: %r' % (pid, err), file=sys.stderr)
        acc = np.nan
    prediction_accuracy_cat2_rf.append(acc)

In [11]:
# Tabulate the per-protein accuracies (indexed like df_orth) and save them as CSV.
df_prediction_accuracy_cat2_rf = pd.DataFrame(
    {'accuracy': prediction_accuracy_cat2_rf},
    index=df_orth.index,
)
df_prediction_accuracy_cat2_rf.to_csv('RL_prediction_accuracy_cat2_rf.csv')

### decision tree

In [12]:
# Decision-tree accuracy on the 2-category labels, one value per reference protein
# (row of df_orth). Proteins for which the pipeline raises are recorded as NaN.
prediction_accuracy_cat2_dt = []
for pid in df_orth.index:
    try:
        acc = alignment_and_classification([pid, df_orth, 2, df_rhl, 'DecisionTree'])
    except Exception as err:
        # `except Exception` rather than a bare `except`, so a kernel interrupt
        # (KeyboardInterrupt) stops the loop instead of being recorded as NaN;
        # the failure is reported rather than silently dropped.
        print('%s: alignment/classification failed: %r' % (pid, err), file=sys.stderr)
        acc = np.nan
    prediction_accuracy_cat2_dt.append(acc)

In [13]:
# Tabulate the per-protein accuracies (indexed like df_orth) and save them as CSV.
df_prediction_accuracy_cat2_dt = pd.DataFrame(
    {'accuracy': prediction_accuracy_cat2_dt},
    index=df_orth.index,
)
df_prediction_accuracy_cat2_dt.to_csv('RL_prediction_accuracy_cat2_dt.csv')

## total entropy

In [23]:
# Total column entropy of the variable alignment positions, one value per
# reference protein (row of df_orth).
total_entropy = []
for pid in df_orth.index:
    try:
        df_acc_prot_seq, _ = seqAlignment(pid, df_orth)
        entropy_value = calculate_total_entropy(df_acc_prot_seq)
    except Exception as err:
        # `except Exception` rather than a bare `except`, so a kernel interrupt
        # (KeyboardInterrupt) stops the loop instead of being recorded as a result.
        # NOTE(review): failures are stored as 0, the same value a protein with no
        # variable positions gets; check the stderr report to tell them apart.
        print('%s: alignment/entropy failed: %r' % (pid, err), file=sys.stderr)
        entropy_value = 0
    total_entropy.append(entropy_value)

In [24]:
# Tabulate the per-protein total entropy (indexed like df_orth) and save it as CSV.
df_total_entropy = pd.DataFrame(
    {'entropy': total_entropy},
    index=df_orth.index,
)
df_total_entropy.to_csv('total_entropy.csv')

## 3-category RL production

### random forest

In [None]:
# Random-forest accuracy on the 3-category labels, one value per reference protein
# (row of df_orth). Proteins for which the pipeline raises are recorded as NaN.
prediction_accuracy_cat3_rf = []
for pid in df_orth.index:
    try:
        acc = alignment_and_classification([pid, df_orth, 3, df_rhl, 'RandomForest'])
    except Exception as err:
        # `except Exception` rather than a bare `except`, so a kernel interrupt
        # (KeyboardInterrupt) stops the loop instead of being recorded as NaN;
        # the failure is reported rather than silently dropped.
        print('%s: alignment/classification failed: %r' % (pid, err), file=sys.stderr)
        acc = np.nan
    prediction_accuracy_cat3_rf.append(acc)

In [None]:
# Tabulate the per-protein accuracies (indexed like df_orth) and save them as CSV.
df_prediction_accuracy_cat3_rf = pd.DataFrame(
    {'accuracy': prediction_accuracy_cat3_rf},
    index=df_orth.index,
)
df_prediction_accuracy_cat3_rf.to_csv('RL_prediction_accuracy_cat3_rf.csv')

### decision tree

In [None]:
# Decision-tree accuracy on the 3-category labels, one value per reference protein
# (row of df_orth). Proteins for which the pipeline raises are recorded as NaN.
prediction_accuracy_cat3_dt = []
for pid in df_orth.index:
    try:
        acc = alignment_and_classification([pid, df_orth, 3, df_rhl, 'DecisionTree'])
    except Exception as err:
        # `except Exception` rather than a bare `except`, so a kernel interrupt
        # (KeyboardInterrupt) stops the loop instead of being recorded as NaN;
        # the failure is reported rather than silently dropped.
        print('%s: alignment/classification failed: %r' % (pid, err), file=sys.stderr)
        acc = np.nan
    prediction_accuracy_cat3_dt.append(acc)

In [None]:
# Tabulate the per-protein accuracies (indexed like df_orth) and save them as CSV.
df_prediction_accuracy_cat3_dt = pd.DataFrame(
    {'accuracy': prediction_accuracy_cat3_dt},
    index=df_orth.index,
)
df_prediction_accuracy_cat3_dt.to_csv('RL_prediction_accuracy_cat3_dt.csv')

# Look at some interesting proteins in more detail

## fig|287.6770.peg.5751 (Glycine dehydrogenase [decarboxylating] (glycine cleavage system P protein) (EC 1.4.4.2))

In [17]:
# Variable alignment positions of a single protein across strains.
# NOTE(review): the section heading above names peg.5751 while this call queries
# peg.1481 -- confirm which protein is intended.
(df_acc_prot_seq, _) = seqAlignment(
    query_id='fig|287.6770.peg.1481',
    df_orth=df_orth,
)
df_acc_prot_seq

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,55,56,57,58,93,94,116,123,156,270
F22031,M,T,T,A,A,P,T,P,S,L,...,V,S,M,A,I,T,I,N,V,E
F23197,M,T,T,A,A,P,T,P,S,L,...,V,S,M,A,I,T,I,N,V,E
F30658,M,T,T,A,A,P,T,P,S,L,...,V,S,M,A,I,T,I,N,V,E
F34365,M,T,T,A,A,P,T,P,S,L,...,V,S,M,A,I,T,I,N,V,E
F5677,M,T,T,A,A,P,T,P,S,L,...,V,S,M,A,I,T,I,N,V,E
F63912,M,T,T,A,A,P,T,P,S,L,...,V,S,M,A,I,T,I,N,V,E
F9670,M,T,T,A,A,P,T,P,S,L,...,V,S,M,A,I,T,I,N,V,E
H27930,M,T,T,A,A,P,T,P,S,L,...,V,S,M,A,I,T,I,N,V,E
H47921,M,T,T,A,A,P,T,P,S,L,...,V,S,M,A,I,T,I,N,V,E
H5708,M,T,T,A,A,P,T,P,S,L,...,V,S,M,A,I,T,I,N,V,E


# Regression of protein variants on growth rate

In [12]:
def regression(df_acc_prot_seq_dummy, df_sgr, clf_method):
    """Fit a regressor of maximum specific growth rate on protein-variant dummies.

    Parameters
    ----------
    df_acc_prot_seq_dummy : pandas.DataFrame
        Dummy-encoded variable alignment positions, indexed by strain.
    df_sgr : pandas.DataFrame
        Table indexed by strain containing a 'specific_growth_rate_max' column.
    clf_method : str
        'RandomForest', 'DecisionTree' or 'LinearRegression'.

    Returns
    -------
    float
        ``r2_score`` between observed and predicted growth rates.
        NOTE(review): the prediction is made on the same samples the model was
        fitted on, so this is a training-set R2.

    Raises
    ------
    ValueError
        If ``clf_method`` is not one of the supported values.
    """
    if clf_method not in ('RandomForest', 'DecisionTree', 'LinearRegression'):
        raise ValueError('unknown regression method: %r.' % (clf_method,))

    # join features with target values (inner join on the strain index)
    df_acc_prot_seq_dummy_sgr = pd.merge(
        df_acc_prot_seq_dummy, df_sgr, left_index=True, right_index=True
    )
    # the feature columns are exactly the leading columns contributed by the left
    # frame, however many columns df_sgr carries
    n_features = df_acc_prot_seq_dummy.shape[1]
    features = df_acc_prot_seq_dummy_sgr.iloc[:, :n_features].values
    obs_values = df_acc_prot_seq_dummy_sgr['specific_growth_rate_max'].values

    # train regressor
    if clf_method == 'RandomForest':
        clf = RandomForestRegressor(n_estimators=500, random_state=0, n_jobs=-1)
    elif clf_method == 'DecisionTree':
        clf = DecisionTreeRegressor(random_state=0)
    else:
        clf = LinearRegression()
    clf.fit(features, obs_values)
    pred_values = clf.predict(features)

    # calculate r2
    return r2_score(obs_values, pred_values)

In [13]:
def alignment_and_regression(args):
    """Run `seqAlignment` then `regression` for one protein.

    Parameters
    ----------
    args : sequence
        ``[query_id, df_orth, df_sgr, clf_method]``, packed into a single
        argument and read positionally.

    Returns
    -------
    float
        The value returned by `regression`, or NaN when `seqAlignment` reports
        no variable alignment positions for this protein.
    """
    query_id = args[0]
    df_orth = args[1]
    df_sgr = args[2]
    clf_method = args[3]

    # sequence alignment
    _, df_acc_prot_seq_dummy = seqAlignment(query_id, df_orth)

    # regression
    if df_acc_prot_seq_dummy is None:
        # `np.nan`, not `np.NaN`: the capitalised alias was removed in NumPy 2.0
        return np.nan
    return regression(df_acc_prot_seq_dummy, df_sgr, clf_method)

## Random forest

In [77]:
# Random-forest R2 for growth rate, one value per reference protein (row of df_orth).
# Proteins for which the pipeline raises are recorded as NaN.
SGR_r2_rf = []
for pid in df_orth.index:
    try:
        r2 = alignment_and_regression([pid, df_orth, df_sgr, 'RandomForest'])
    except Exception as err:
        # `except Exception` rather than a bare `except`, so a kernel interrupt
        # (KeyboardInterrupt) stops the loop instead of being recorded as NaN;
        # the failure is reported rather than silently dropped.
        print('%s: alignment/regression failed: %r' % (pid, err), file=sys.stderr)
        r2 = np.nan
    SGR_r2_rf.append(r2)

In [78]:
# Tabulate the per-protein R2 values (indexed like df_orth) and save them as CSV.
df_SGR_r2_rf = pd.DataFrame(
    {'R2': SGR_r2_rf},
    index=df_orth.index,
)
df_SGR_r2_rf.to_csv('SGR_prediction_accuracy_r2_rf.csv')

## Decision tree

In [79]:
# Decision-tree R2 for growth rate, one value per reference protein (row of df_orth).
# Proteins for which the pipeline raises are recorded as NaN.
SGR_r2_dt = []
for pid in df_orth.index:
    try:
        r2 = alignment_and_regression([pid, df_orth, df_sgr, 'DecisionTree'])
    except Exception as err:
        # `except Exception` rather than a bare `except`, so a kernel interrupt
        # (KeyboardInterrupt) stops the loop instead of being recorded as NaN;
        # the failure is reported rather than silently dropped.
        print('%s: alignment/regression failed: %r' % (pid, err), file=sys.stderr)
        r2 = np.nan
    SGR_r2_dt.append(r2)

In [80]:
# Tabulate the per-protein R2 values (indexed like df_orth) and save them as CSV.
df_SGR_r2_dt = pd.DataFrame(
    {'R2': SGR_r2_dt},
    index=df_orth.index,
)
df_SGR_r2_dt.to_csv('SGR_prediction_accuracy_r2_dt.csv')

## Linear model

In [84]:
# Linear-regression R2 for growth rate, one value per reference protein (row of df_orth).
# Proteins for which the pipeline raises are recorded as NaN.
SGR_r2_lin = []
for pid in df_orth.index:
    try:
        r2 = alignment_and_regression([pid, df_orth, df_sgr, 'LinearRegression'])
    except Exception as err:
        # `except Exception` rather than a bare `except`, so a kernel interrupt
        # (KeyboardInterrupt) stops the loop instead of being recorded as NaN;
        # the failure is reported rather than silently dropped.
        print('%s: alignment/regression failed: %r' % (pid, err), file=sys.stderr)
        r2 = np.nan
    SGR_r2_lin.append(r2)

In [85]:
# Tabulate the per-protein R2 values (indexed like df_orth) and save them as CSV.
df_SGR_r2_lin = pd.DataFrame(
    {'R2': SGR_r2_lin},
    index=df_orth.index,
)
df_SGR_r2_lin.to_csv('SGR_prediction_accuracy_r2_lin.csv')