# Script to generate predictions of KO strain design performance

## Detailed in "Integrated data mining, genome-scale modeling, and machine learning for predicting yeast bioproduction"

### Description. 
The script takes a target compound and a list of reactions or genes to test for knockout (KO), and generates a predicted titer for each design. 

The default conditions for prediction are in glucose, with no prior genetic engineering. The prediction is obtained from the trained machine-learning model detailed in the publication.

#### Procedure:
1. Read in data and constructs to screen from the "Supplemental Excel File 6- CSD Template.xlsx" spreadsheet.
2. Generate FBA features for the WT and each strain construct.
3. Predict the titer of each strain.
4. Output the results.

#### Inputs:
1. Supplemental Excel File 6- CSD Template.xlsx: Spreadsheet where the product, testing environmental conditions, and list of KO targets to screen are entered.
      Supplemental Excel File 6- CSD Template.xlsx
2. Data Encoding File 
      Supplemental Data File 2- DataCharateristics & Encoding.xlsx

#### Output:
1. titerPredictions.csv: Spreadsheet containing a prediction of the WT strain titer, each KO strain titer, and the FBA predicted product yield and biomass growth rate.


#### Additional required scripts:
1. FBA_function_.py:
    Performs FBA feature generation and extraction
2. encodingFunction_.py:
    Encodes the data for input to the ML model

### Libraries to import

In [1]:
import pandas as pd
import pickle
from collections import defaultdict
import warnings
import numpy as np
import os

from sklearn.preprocessing import StandardScaler, RobustScaler, QuantileTransformer,MinMaxScaler, MaxAbsScaler,Normalizer,PowerTransformer
from sklearn.pipeline import Pipeline
import warnings
from sklearn.linear_model import ElasticNet,Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from mlxtend.regressor import StackingRegressor
from sklearn.model_selection import learning_curve

#custom functions provided in the directory.
from FBA_function_ import FBA_FeatureExtraction

# from FBA_function_ import FBA_FeatureExtraction
from encodingFunction_ import encodeTransform

### Ensure the spreadsheet is in the current working directory.

In [2]:
# Name of the design template workbook; it is resolved against the current
# working directory, so the notebook must be run from the folder containing it.
template_filename = 'Supplemental Excel File 6- CSD Template.xlsx'

# Directory of the resolved template path, then the full path rebuilt from it.
dir_path = os.path.dirname(os.path.realpath(template_filename))
file_path = os.path.join(dir_path, template_filename)

In [3]:
# Load the two template sheets: 'predictions' (first spreadsheet row skipped)
# holds the constructs to score, 'targetRxns' holds the KO targets to screen.
raw_construct = pd.read_excel(file_path, sheet_name='predictions', skiprows=range(1))
optKnockRxns = pd.read_excel(file_path, sheet_name='targetRxns')

# Working aliases. These are the same objects, not copies, so the columns added
# to `data` below are also visible through `raw_construct`.
data = raw_construct
optData = optKnockRxns


def _count_ones(cell):
    """Count the '1' characters in a ';'-separated string; non-strings give 0."""
    if not isinstance(cell, str):
        return 0
    return cell.replace(';', '').count('1')


def _text_or_na(cell):
    """Return strings unchanged and substitute 'NA' for any non-string value."""
    if isinstance(cell, str):
        return cell
    return 'NA'


# Number of ';'-separated entries in the modified-gene list (0 when the cell is not text).
data['number_genes_mod'] = data['genes_modified_updated'].apply(
    lambda cell: cell.count(';') + 1 if isinstance(cell, str) else 0
)

# Per-category counts of '1' entries in the corresponding ';'-separated columns.
# (presumably each entry is a 0/1 flag per modified gene -- confirm against the template)
data['number_genes_deleted'] = data['gene_deletion'].apply(_count_ones)
data['number_total_genes_overexp'] = data['gene_overexpression'].apply(_count_ones)
data['number_genes_het'] = data['heterologous_gene'].apply(_count_ones)

# Split the heterologous and overexpression strings into one column per entry.
hettemp1 = data['heterologous_gene'].apply(_text_or_na)
hettemp2 = hettemp1.str.split(';', expand=True)

overexpressTemp1 = data['gene_overexpression'].apply(_text_or_na)
overexpressTemp2 = overexpressTemp1.str.split(';', expand=True)

# Keep overexpression entries only where the heterologous entry is '0', then
# count the surviving (non-null) entries per row.
# NOTE(review): this counts every entry whose heterologous flag is '0', whatever
# the overexpression flag itself is -- confirm this matches how the training
# features were built before changing it.
nativeGenes = overexpressTemp2[hettemp2 == '0']
data['number_native_genes_overexp'] = nativeGenes.count(axis=1)


## FBA Modeling

In [4]:
# Genome-scale model(s) to run FBA with; the default is 'iYLI647'.
FBA_models = ['iYLI647']

# Generate the FBA features for the constructs in `data`, using the KO targets
# from the 'targetRxns' sheet and the model list above.
output = FBA_FeatureExtraction(data, optKnockRxns, FBA_models)



1.1398166174414033 iYLI647
0.8714918032786889
0.17285196521900842
0 ALCD2m,PGM
0.15632333342439064
0.13623450374055796 0.15632333342439064
1 CITtcm
0.17346769120430297
0.15117567101822876 0.17346769120430297
2 TYRTAm,TYRTA
0.650302837105515
0.5667335921863327 0.650302837105515
3 THRD_L,HSERTA
0.6503028371055142
0.566733592186332 0.6503028371055142
4 HACD8p,CSm
0.6503028371055155
0.5667335921863332 0.6503028371055155
5 FA180COAabcp,CSm
0.6503028371055151
0.5667335921863328 0.6503028371055151
6 CSm,ECOAH8p
0.6503028371055154
0.5667335921863331 0.6503028371055154
7 ACOAO8p,CSm
0.6503028371055163
0.5667335921863339 0.6503028371055163
8 CSm,ACACT8p
0.6503028371055148
0.5667335921863326 0.6503028371055148
9 ACS
0.17285196521900117
0.15063907086897255 0.17285196521900117
10 PYRDC,MACACI
 not in GSM, no KO modification performed
0.17285196521899981
0.15063907086897135 0.17285196521899981
11 PYRDC,HGNTOR
 not in GSM, no KO modification performed
0.17285196521899981
0.15063907086897135 0.1728519

In [5]:
# Display the table returned by FBA_FeatureExtraction for inspection.
output

Unnamed: 0,paper_number,blank1,cs1,cs_conc1,cs1_heatCombustion(kJ/mol),cs2,cs_conc2,cs2_heatCombustion(kJ/mol),reactor_type,rxt_volume,...,EMP_iYLI647,PPP_iYLI647,TCA_iYLI647,NADPH_iYLI647,ATP_iYLI647,PrdtFlux_iYLI647,PrdtYield_iYLI647,Biomass_iYLI647,O2Uptake_iYLI647,GlcUptake_iYLI647
0,1,,1,20,2626,0,0,0,1,0.05,...,6.394429,8.462109,1.220642,18.22588,54.786144,0.172852,0.150639,0.8548625,-12.672942,-10.0
1,1,,1,20,2626,0,0,0,1,0.05,...,5.788302,13.171759,1.010255,27.42824,50.577105,0.156323,0.136235,0.7857247,-16.700664,-10.0
2,1,,1,20,2626,0,0,0,1,0.05,...,6.391207,8.520574,1.2211,18.459497,55.038511,0.173468,0.151176,0.8520938,-12.733941,-10.0
3,1,,1,20,2626,0,0,0,1,0.05,...,8.453937,7.32547,0.0,14.65094,60.350653,0.650303,0.566734,1.292892e-29,-8.951227,-10.0
4,1,,1,20,2626,0,0,0,1,0.05,...,8.453937,7.32547,0.0,14.65094,60.350653,0.650303,0.566734,7.636735999999999e-30,-8.951227,-10.0
5,1,,1,20,2626,0,0,0,1,0.05,...,8.453937,7.32547,0.0,14.65094,60.350653,0.650303,0.566734,-1.516646e-15,-8.951227,-10.0
6,1,,1,20,2626,0,0,0,1,0.05,...,8.453937,7.32547,0.0,14.65094,60.350653,0.650303,0.566734,1.324952e-15,-8.951227,-10.0
7,1,,1,20,2626,0,0,0,1,0.05,...,8.453937,7.32547,0.0,14.65094,60.350653,0.650303,0.566734,-1.516646e-15,-8.951227,-10.0
8,1,,1,20,2626,0,0,0,1,0.05,...,8.453937,7.32547,0.0,14.65094,60.350653,0.650303,0.566734,-1.516646e-15,-8.951227,-10.0
9,1,,1,20,2626,0,0,0,1,0.05,...,8.453937,7.32547,0.0,14.65094,60.350653,0.650303,0.566734,-1.516646e-15,-8.951227,-10.0


## Encode data

In [6]:
# Encode the FBA output table with encodeTransform (imported from encodingFunction_).
# The model's feature columns are later selected from the resulting frame.
encodedData = encodeTransform(output)

### Features used in the machine learning trained model.

In [7]:
# Expose 'mw' under the name 'mw_Lipids' used in the feature list below
# (the original 'mw' column is kept as well).
encodedData['mw_Lipids'] = encodedData['mw']

# Feature columns selected from the encoded data and passed to the model.
cols_train__set = [
    'mw_Lipids',
    'pH',
    'product_deltaGo',
    'foldCarbonFed',
    'product_name',
    'rxt_volume',
    'inputThermo(kJ/L)',
    'FermentationTime',
    'atp_cost',
    'precursorsRequiredEncoded',
    'nadh_nadph_cost',
    'Pathway_enzymatic_steps',
    'averageThermBarrier',
    'media',
    'number_genes_het',
    'number_native_genes_overexp',
    'ATP_iYLI647',
    'NADPH_iYLI647',
    'PPP_iYLI647',
    'TCA_iYLI647',
    'PrdtYield_iYLI647',
]

# Silence all warnings from this point on (process-wide filter).
warnings.simplefilter('ignore')

# Select the feature columns into a new frame and cast every column to float32.
useful_cols = list(cols_train__set)
data = encodedData.loc[:, useful_cols].astype(np.float32)

In [8]:
# Build the model input frame from the encoded data. This repeats the selection
# and float32 cast performed at the end of the preceding cell.
warnings.simplefilter('ignore')

useful_cols = cols_train__set.copy()
feature_frame = encodedData.loc[:, useful_cols]
data = feature_frame.astype(np.float32)


In [9]:
# Load the pickled model container and keep its first element, which the
# prediction cell indexes by target name.
# CAUTION: unpickling executes arbitrary code -- only load 'M21iYL.pickle'
# from a trusted source.
with open('M21iYL.pickle', 'rb') as model_file:
    masterGrid = pickle.load(model_file)[0]

In [10]:
# Run the trained model on the selected features.
# Work on a copy so the NaN fill below does not alter `data`.
x_testData = data.copy()
target = 'Product_titer(g/L)'

# Replace missing FBA product yields with 0 before prediction.
# The filled column is assigned back explicitly: calling fillna(inplace=True)
# on an attribute-accessed column is a chained in-place operation that does not
# update the parent frame under pandas Copy-on-Write, which would leave NaNs
# in the model input.
x_testData['PrdtYield_iYLI647'] = x_testData['PrdtYield_iYLI647'].fillna(0)

# The model output is exponentiated to obtain the titer
# (presumably the model was fitted on log-transformed titers -- confirm).
y_prediction = np.exp(masterGrid[target].predict(x_testData))
len(y_prediction)

16

In [11]:
# Display the feature frame that was passed to the model's predict call.
x_testData

Unnamed: 0,mw_Lipids,pH,product_deltaGo,foldCarbonFed,product_name,rxt_volume,inputThermo(kJ/L),FermentationTime,atp_cost,precursorsRequiredEncoded,...,Pathway_enzymatic_steps,averageThermBarrier,media,number_genes_het,number_native_genes_overexp,ATP_iYLI647,NADPH_iYLI647,PPP_iYLI647,TCA_iYLI647,PrdtYield_iYLI647
0,871.491821,0.0,-218.592346,0.0,2.0,2.0,291.525116,120.0,23.5,27.459,...,12.0,26.0,3.0,0.0,0.0,54.786144,18.22588,8.46211,1.220642,0.150639
1,871.491821,0.0,-218.592346,0.0,2.0,2.0,291.525116,120.0,23.5,27.459,...,12.0,26.0,3.0,0.0,0.0,50.577106,27.42824,13.17176,1.010255,0.136235
2,871.491821,0.0,-218.592346,0.0,2.0,2.0,291.525116,120.0,23.5,27.459,...,12.0,26.0,3.0,0.0,0.0,55.038509,18.459497,8.520575,1.2211,0.151176
3,871.491821,0.0,-218.592346,0.0,2.0,2.0,291.525116,120.0,23.5,27.459,...,12.0,26.0,3.0,0.0,0.0,60.350655,14.65094,7.32547,0.0,0.566734
4,871.491821,0.0,-218.592346,0.0,2.0,2.0,291.525116,120.0,23.5,27.459,...,12.0,26.0,3.0,0.0,0.0,60.350655,14.65094,7.32547,0.0,0.566734
5,871.491821,0.0,-218.592346,0.0,2.0,2.0,291.525116,120.0,23.5,27.459,...,12.0,26.0,3.0,0.0,0.0,60.350655,14.65094,7.32547,0.0,0.566734
6,871.491821,0.0,-218.592346,0.0,2.0,2.0,291.525116,120.0,23.5,27.459,...,12.0,26.0,3.0,0.0,0.0,60.350655,14.65094,7.32547,0.0,0.566734
7,871.491821,0.0,-218.592346,0.0,2.0,2.0,291.525116,120.0,23.5,27.459,...,12.0,26.0,3.0,0.0,0.0,60.350655,14.65094,7.32547,0.0,0.566734
8,871.491821,0.0,-218.592346,0.0,2.0,2.0,291.525116,120.0,23.5,27.459,...,12.0,26.0,3.0,0.0,0.0,60.350655,14.65094,7.32547,0.0,0.566734
9,871.491821,0.0,-218.592346,0.0,2.0,2.0,291.525116,120.0,23.5,27.459,...,12.0,26.0,3.0,0.0,0.0,60.350655,14.65094,7.32547,0.0,0.566734


In [12]:
# Assemble the results table: predicted titer plus the FBA biomass and yield.
MLOutput = pd.DataFrame()
result_columns = (
    ('TiterPrediction(g/L)', y_prediction),
    ('FBA predicted Biomass', output['Biomass_iYLI647']),
    ('FBA predicted Yield', data['PrdtYield_iYLI647']),
)
for column_label, column_values in result_columns:
    # Series values are aligned to MLOutput's index labels on assignment.
    # NOTE(review): this matches rows correctly only if `output` and `data`
    # carry the same 0..n-1 index as the prediction array -- confirm.
    MLOutput[column_label] = column_values

# Label the rows with the feature frame's index.
MLOutput.index = data.index


In [13]:
# Write the results table (including its index) to a CSV file in the
# current working directory.
output_csv = 'titerPredictions.csv'
MLOutput.to_csv(output_csv)