# Script to generate predictions of KO strain design performance

## Detailed in "Integrated knowledge mining, genome-scale modeling, and machine learning for predicting *Yarrowia lipolytica* bioproduction".

### Description. 
Script takes a target compound, a list of reactions or genes to test for KO or OE, and generates predicted titers for each design. 

By default, predictions are made for glucose conditions with no prior genetic engineering. The prediction is obtained from the machine-learning model detailed in the publication.

#### Procedure:
1. Read in data and constructs to screen from "Template_ComputationalDesign" spreadsheet.
2. Generate FBA features for the WT and each strain construct.
3. Predict the titer of each strain.
4. Output the results.

#### Inputs:
1. Supplemental Excel File 6- CSD Template.xlsx: Spreadsheet where the product, testing environmental conditions, and list of KO targets to screen are input.
      Supplemental Excel File 6- CSD Template.xlsx
2. Data Encoding File 
      Supplemental Data File 2- DataCharateristics & Encoding.xlsx

#### Output:
1. titerPredictionsKO.csv: Spreadsheet containing a prediction of the WT strain titer, each KO strain titer, and the FBA predicted product yield and biomass growth rate.


#### Additional required scripts:
1. FBA_function_.py:
    Performs FBA feature generation and extraction
2. encodingFunction_.py:
    Encodes the data for input to the ML model
3. FBA_functionOE_.py:
    Performs Gene OE feature generation and extraction

### Libraries to import

In [1]:
import pandas as pd
import pickle
from collections import defaultdict
import warnings
import numpy as np
import os

from sklearn.preprocessing import StandardScaler, RobustScaler, QuantileTransformer,MinMaxScaler, MaxAbsScaler,Normalizer,PowerTransformer
from sklearn.pipeline import Pipeline
import warnings
from sklearn.linear_model import ElasticNet,Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from mlxtend.regressor import StackingRegressor
from sklearn.model_selection import learning_curve



# from FBA_function_ import FBA_FeatureExtraction
from encodingFunction_ import encodeTransform

### Ensure the spreadsheet is within the directory.

In [2]:
# Build the absolute path of the template workbook. The workbook is looked up
# by its bare file name, so it is resolved against the current working directory.
template_name = 'Supplemental Excel File 6- CSD Template.xlsx'
dir_path = os.path.dirname(os.path.realpath(template_name))
file_path = os.path.join(dir_path, template_name)

In [3]:
# Read the 'KO_or_OE' sheet, which states whether knockouts (KO) or
# overexpressions (OE) will be performed; its 'Specify' column is checked later.
inputKOorOE = pd.read_excel(file_path,sheet_name='KO_or_OE')



In [4]:
# Import the FBA feature-extraction routine that matches the requested mode.
# Both modules are custom functions provided in the directory and expose the
# same name. Any value other than exactly 'KO' in the first 'Specify' entry
# selects the overexpression (OE) variant.
if inputKOorOE['Specify'][0] != 'KO':
    from FBA_functionOE_cs import FBA_FeatureExtraction  # OE
else:
    from FBA_function_cs import FBA_FeatureExtraction  # KO


In [5]:
# Load the strain constructs and the target reactions from the template
# workbook (the first two rows of the 'predictions' sheet are skipped).
raw_construct = pd.read_excel(file_path, sheet_name='predictions', skiprows=range(2))
optKnockRxns = pd.read_excel(file_path, sheet_name='targetRxns')

# These are aliases of the frames above, not copies: the columns added to
# `data` below are therefore also added to `raw_construct`.
optOERxns = optKnockRxns
data = raw_construct
optData = optKnockRxns


def _count_ones(cell):
    """Return how many '1' characters a flag string holds (0 for non-strings)."""
    return cell.count('1') if isinstance(cell, str) else 0


def _text_or_na(cell):
    """Return string cells unchanged and replace anything else with 'NA'."""
    return cell if isinstance(cell, str) else 'NA'


# Consolidate the meta-information into numeric features.
# Number of modified genes = number of ';'-separated entries (0 if not text).
data['number_genes_mod'] = data.genes_modified_updated.apply(
    lambda entry: entry.count(';') + 1 if isinstance(entry, str) else 0
)
data['number_genes_deleted'] = data.gene_deletion.apply(_count_ones)
data['number_total_genes_overexp'] = data.gene_overexpression.apply(_count_ones)
data['number_genes_het'] = data.heterologous_gene.apply(_count_ones)

# Split the ';'-separated heterologous and overexpression flag strings into
# one column per position; non-text cells become the single entry 'NA'.
hettemp1 = data.heterologous_gene.apply(_text_or_na)
hettemp2 = hettemp1.str.split(';', expand=True)

overexpressTemp1 = data.gene_overexpression.apply(_text_or_na)
overexpressTemp2 = overexpressTemp1.str.split(';', expand=True)

# Keep the overexpression entries only at positions whose heterologous flag is
# '0', then count the remaining non-null entries per row. The count does not
# look at the value of the overexpression entry itself.
# NOTE(review): confirm this matches how the feature was computed for training.
nativeGenes = overexpressTemp2[hettemp2 == '0']
data['number_native_genes_overexp'] = nativeGenes.count(axis=1)


In [6]:
optOERxns

Unnamed: 0,strain design #,rxns_deleted_updated_,iYLI647 Rxns,rxnID,rxn,GPR,name
0,17,GND,,,,,
1,18,GAPD,,,,,
2,19,CSm,,,,,


## FBA Modeling

In [7]:
## FBA modeling
# Genome-scale model(s) to use; the default is 'iYLI647'.
FBA_models = ['iYLI647']

# Extract the FBA features. The argument list depends on the mode given in the
# template, because the KO and OE variants of FBA_FeatureExtraction are called
# with a different number of arguments.
if inputKOorOE['Specify'][0] == 'KO':
    fba_args = (data, optKnockRxns, FBA_models)  # KO
else:
    # OE: the target-reaction table is passed twice.
    fba_args = (data, optKnockRxns, optKnockRxns, FBA_models)
output = FBA_FeatureExtraction(*fba_args)





1.1398166174414033 iYLI647
Completed  1  overexpression simulations
Completed  1  overexpression simulations
0 OE failures
0 Prod failures
0 0 0 0 0 0 failure cases 1-6


In [8]:
output

Unnamed: 0,paper_number,blank1,cs1,cs_conc1,cs1_heatCombustion(kJ/mol),cs2,cs_conc2,cs2_heatCombustion(kJ/mol),reactor_type,rxt_volume,...,EMP_iYLI647,PPP_iYLI647,TCA_iYLI647,NADPH_iYLI647,ATP_iYLI647,PrdtFlux_iYLI647,PrdtYield_iYLI647,Biomass_iYLI647,O2Uptake_iYLI647,GlcUptake_iYLI647
0,1.0,,10,20,34734.5,0,0,0,1,0.05,...,-3.51908,0.104378,25.292458,16.491953,133.083531,0.388304,0.231759,1.419512,-83.06914,0.0
1,1.0,,10,20,34734.5,0,0,0,1,0.05,...,-3.526039,0.146129,25.264624,16.550877,133.093921,0.388168,0.231678,1.419512,-83.07755,0.0
2,1.0,,10,20,34734.5,0,0,0,1,0.05,...,-4.926712,8.55017,19.66193,28.41155,138.254576,0.360865,0.215382,1.419512,-84.770372,0.0
3,1.0,,10,20,34734.5,0,0,0,1,0.05,...,-3.51908,0.104378,17.489441,16.067306,142.946268,0.329497,0.19666,1.419512,-86.715201,0.0
4,,,1,20,34734.5,0,0,0,1,0.05,...,-3.51908,0.104378,25.292458,16.491953,133.083531,0.388304,0.231759,1.419512,-83.06914,0.0
5,,,1,20,34734.5,0,0,0,1,0.05,...,3.921924,0.180279,46.453071,27.305955,158.286119,0.641218,0.382711,2.45175,-86.625813,-10.0
6,,,1,20,34734.5,0,0,0,1,0.05,...,3.909905,0.252391,46.404997,27.403319,158.013948,0.641167,0.382681,2.45175,-86.628951,-10.0
7,,,1,20,34734.5,0,0,0,1,0.05,...,3.921924,0.180279,46.453071,27.305955,158.286119,0.641218,0.382711,2.45175,-86.625813,-10.0


## Encode data

In [9]:
# Encode the FBA feature table (`output`) with the project's encodeTransform
# helper; the encoded frame supplies the feature columns selected for the
# ML model further down.
encodedData = encodeTransform(output)

### Features used in the machine learning trained model.

In [25]:
# Expose the 'mw' column under the name 'mw_Lipids' as well (the column is
# copied, 'mw' itself is kept).
# NOTE(review): presumably 'mw_Lipids' is the feature name the trained model
# expects -- confirm.
encodedData['mw_Lipids'] = encodedData['mw']

# Feature columns selected from the encoded data, in the order in which they
# are handed to the trained model.
cols_train__set = [
    # meta-information and encoded descriptors
    'mw_Lipids',
    'pH',
    'product_deltaGo',
    'foldCarbonFed',
    'product_name',
    'rxt_volume',
    'inputThermo(kJ/L)',
    'FermentationTime',
    'atp_cost',
    'precursorsRequiredEncoded',
    'nadh_nadph_cost',
    'Pathway_enzymatic_steps',
    'averageThermBarrier',
    'media',
    'number_genes_het',
    'number_native_genes_overexp',
    # FBA-derived features from the iYLI647 model
    'ATP_iYLI647',
    'NADPH_iYLI647',
    'PPP_iYLI647',
    'TCA_iYLI647',
    'PrdtYield_iYLI647',
]

# Silence all warnings from this point on.
warnings.simplefilter('ignore')



In [33]:
# Obtain the model features from the encoded data as a float32 table.
warnings.simplefilter('ignore')

useful_cols = list(cols_train__set)
data = encodedData.loc[:, useful_cols].astype(np.float32)

# A pH value of 0 is replaced by 7.
data['pH'] = data['pH'].replace(0, 7)

In [34]:
# Open the pickled ML model used for prediction; only the first element of the
# pickled object is kept. (An alternative file, 'M21iYL.pickle', was previously
# loaded the same way.)
# NOTE: unpickling can execute arbitrary code -- only load a model file that
# comes from a trusted source.
# NOTE(review): presumably the sklearn / xgboost / mlxtend imports at the top
# are needed so the pickled estimators can be reconstructed -- confirm.
with open('M21iYL_cs_GlcNormalized_noO2noEMPnoBio.pickle', 'rb') as model_file:
    masterGrid = pickle.load(model_file)[0]

In [35]:
# Perform the prediction on a copy of the feature table.
x_testData = data.copy()
target = 'Product_titer(g/L)'

# Missing FBA product yields are treated as 0. The filled column is assigned
# back explicitly: calling fillna(inplace=True) on the attribute accessor is a
# chained in-place update that, under pandas Copy-on-Write, acts on a
# temporary object and leaves the NaNs in x_testData (and the warning about it
# is hidden by the global warnings filter).
x_testData['PrdtYield_iYLI647'] = x_testData['PrdtYield_iYLI647'].fillna(0)

# Prediction: the model output for `target` is exponentiated.
# NOTE(review): presumably the model was fit on log-transformed titer -- confirm.
y_prediction = np.exp(masterGrid[target].predict(x_testData))
len(y_prediction)

8

In [36]:
x_testData

Unnamed: 0,mw_Lipids,pH,product_deltaGo,foldCarbonFed,product_name,rxt_volume,inputThermo(kJ/L),FermentationTime,atp_cost,precursorsRequiredEncoded,...,Pathway_enzymatic_steps,averageThermBarrier,media,number_genes_het,number_native_genes_overexp,ATP_iYLI647,NADPH_iYLI647,PPP_iYLI647,TCA_iYLI647,PrdtYield_iYLI647
0,596.849976,7.0,2575.899902,0.0,4.0,2.0,502.597321,120.0,24.0,24.0,...,16.0,271.0,3.0,3.0,0.0,133.083527,16.491953,0.104378,25.292458,0.231759
1,596.849976,7.0,2575.899902,0.0,4.0,2.0,502.597321,120.0,24.0,24.0,...,16.0,271.0,3.0,3.0,0.0,133.093918,16.550877,0.146129,25.264624,0.231678
2,596.849976,7.0,2575.899902,0.0,4.0,2.0,502.597321,120.0,24.0,24.0,...,16.0,271.0,3.0,3.0,0.0,138.254578,28.411551,8.55017,19.66193,0.215382
3,596.849976,7.0,2575.899902,0.0,4.0,2.0,502.597321,120.0,24.0,24.0,...,16.0,271.0,3.0,3.0,0.0,142.946274,16.067307,0.104378,17.489441,0.19666
4,596.849976,7.0,2575.899902,0.0,4.0,2.0,3856.046875,120.0,24.0,24.0,...,16.0,271.0,3.0,3.0,0.0,133.083527,16.491953,0.104378,25.292458,0.231759
5,596.849976,7.0,2575.899902,0.0,4.0,2.0,3856.046875,120.0,24.0,24.0,...,16.0,271.0,3.0,3.0,0.0,158.286118,27.305956,0.180279,46.453072,0.382711
6,596.849976,7.0,2575.899902,0.0,4.0,2.0,3856.046875,120.0,24.0,24.0,...,16.0,271.0,3.0,3.0,0.0,158.013947,27.403318,0.252391,46.404999,0.382681
7,596.849976,7.0,2575.899902,0.0,4.0,2.0,3856.046875,120.0,24.0,24.0,...,16.0,271.0,3.0,3.0,0.0,158.286118,27.305956,0.180279,46.453072,0.382711


In [37]:
# Assemble the output table: predicted titer plus the FBA-predicted biomass
# and product yield for each row.
# NOTE(review): the two Series are aligned on the frame's default 0..n-1 labels
# before the index is replaced by data.index; this assumes `output` and `data`
# both carry a default integer index -- confirm.
MLOutput = pd.DataFrame()
for label, values in (
    ('TiterPrediction(g/L)', y_prediction),
    ('FBA predicted Biomass', output['Biomass_iYLI647']),
    ('FBA predicted Yield', data['PrdtYield_iYLI647']),
):
    MLOutput[label] = values
MLOutput.index = data.index


In [38]:
#Save file (disabled: uncomment the line below to write the predictions to CSV)
# MLOutput.to_csv('Screen.csv')

In [39]:
MLOutput

Unnamed: 0,TiterPrediction(g/L),FBA predicted Biomass,FBA predicted Yield
0,0.003553,1.419512,0.231759
1,0.003384,1.419512,0.231678
2,0.022838,1.419512,0.215382
3,0.00358,1.419512,0.19666
4,0.003075,1.419512,0.231759
5,0.005549,2.45175,0.382711
6,0.005917,2.45175,0.382681
7,0.005549,2.45175,0.382711
