# ML Pipeline Code was built for the initial model training.
ML Pipeline Code was built for the initial model training detailed in "Integrated knowledge mining, genome-scale modeling, and machine learning for predicting *Yarrowia lipolytica* bioproduction".

### Part 3/4:
* Part 1: Performs data importation and initial formatting, and splits the data into 3 parts for training, validation, and testing.
* Part 2: FBA feature generation is completed; script entitled "ML_pipeline_JC_part2"
* Part 3: Feature encoding is completed; script entitled "ML_pipeline_part3"
* Part 4: Machine learning model training is completed; script entitled "ML_pipeline_part4"
    
### Inputs:
* pickle file: Train&ValidateData_part2.pickle or TESTData_part2.pickle from Part 2 are inputs to the script. 
* Data encoding file: the publication's supplemental file 'Supplemental Excel File 2- DataCharateristics & Encoding.xlsx' (sheet 'Encoding').

### Outputs:    

* A pickle datafile entitled "Train&ValidateData_part3.pickle" or "TESTData_part3.pickle" at the end of the file.
    
#### Additional Info: This file can also be used for non-*Yarrowia lipolytica* GSMs and has been validated with *Rhodosporidium toruloides* and *Cutaneotrichosporon oleaginosus*.

### Note:
This file processes the train&validate and TEST data as separate branches. It must be run twice to produce the full data set, changing the input file from Part 2 (and the output file name) for each run.


In [1]:
#This cell imports the necessary libraries.

import pandas as pd
import pickle
from collections import defaultdict
import warnings
import numpy as np
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [2]:
# Switch read by the thermodynamic-barrier cell: when set to 1, rows whose
# original product name is 'Lipids' are handled on a fatty-acid basis (ATP,
# NAD(P)H and precursor stoichiometries are divided by 3 and a single
# Acetyl-CoA precursor is used). Any other value disables that shortcut.
lipidSimplificationOption=1

### The following cell contains functions for the code.

In [3]:
#Cell takes the database input and outputs different encoding for each class

#categorically encodes the volume to 1-5 (smallest to largest) based on cultivation volume.
def rxtVolumeEncoding(x):
    """Bin a cultivation volume into an ordinal class from 1 (smallest) to 5.

    Bins: x <= 0.01 -> 1, x <= 0.075 -> 2, x <= 0.25 -> 3, x < 1 -> 4,
    everything else -> 5. The unit of ``x`` is not stated in this file
    (presumably litres -- confirm against the database).

    Values that fail every comparison (e.g. NaN) end up in class 5.
    """
    upperBounds = ((0.01, 1), (0.075, 2), (0.25, 3))
    for bound, volumeClass in upperBounds:
        if x <= bound:
            return volumeClass
    # The last boundary is exclusive: exactly 1 belongs to the largest class.
    return 4 if x < 1 else 5

#categorically encodes the reactor vessel type (micro-reactors, shaking flasks, batch fedbatch or continuous vessels).
#smallest (1) to largest (3).
def reactorTypeEncoding(x):
    """Collapse the database reactor-type code into an ordinal class 1-3.

    Database codes 2, 3 and 4 map to 3, code 1 maps to 2 and code 5 maps
    to 1. Which vessel each database code stands for is not visible in this
    file -- confirm against the database key.

    Returns ``None`` for any other value (including NaN).
    """
    if x in (2, 3, 4):
        return 3
    if x == 1:
        return 2
    if x == 5:
        return 1
    return None

#corrects the database encoding to categorically encode from lowest oxygen level (1) to highest (3).    
def oxygenEncodingFix(x):
    """Re-order the database oxygen code so that it rises with oxygen level.

    Database code -> ordered level:
        1 -> 3  (oxygen sufficient)
        2 -> 1  (oxygen insufficient)
        3 -> 2  (intermediate)

    Returns ``None`` for any other value (including NaN).
    """
    recoding = ((1, 3), (2, 1), (3, 2))
    for databaseCode, orderedLevel in recoding:
        if x == databaseCode:
            return orderedLevel
    return None

#corrects the database encoding to categorically encode from lowest nitrogen level (1) to highest (3).        
def nitrogenEncodingFix(x):
    """Re-order the database nitrogen code so that it rises with nitrogen level.

    Database code -> ordered level:
        1 -> 3  (nitrogen sufficient)
        2 -> 1  (nitrogen limited)
        3 -> 2  (intermediate)

    Returns ``None`` for any other value (including NaN).
    """
    recoding = {1: 3, 2: 1, 3: 2}
    # Compare with == rather than hashing so the match rule stays the same
    # as a plain chain of equality tests.
    return next(
        (orderedLevel for databaseCode, orderedLevel in recoding.items() if x == databaseCode),
        None,
    )

def funcReturnStr(x):
    """Return the ``str()`` form of ``x``.

    Applied element-wise to columns before ``.str.split`` so that non-string
    entries (numbers, NaN) can be split like text.
    """
    asText = str(x)
    return asText

### Ensure the DataStructure file is in the correct directory.

In [4]:
#create dicts for encoding.

# Parse the 'Encoding' sheet inside a context manager so the workbook handle
# is closed as soon as the sheet has been read.
with pd.ExcelFile('Supplemental Excel File 2- DataCharateristics & Encoding.xlsx') as encodingWorkbook:
    encoding_Data = encodingWorkbook.parse('Encoding')


#not all encoding dict features were used in the final model.
# Each dict pairs one raw-value column of the sheet with the column holding
# its encoded value (raw value -> encoding).
strainDict = dict(zip(encoding_Data['strain_background'], encoding_Data['strain_class']))
mediaDict = dict(zip(encoding_Data['media'], encoding_Data['media_class']))
productDict = dict(zip(encoding_Data['Product'], encoding_Data['prdt_class']))
carbonSourceMWDict = dict(zip(encoding_Data['carbonSource'], encoding_Data['carbonSourceMW']))
N2sourceDict = dict(zip(encoding_Data['N2Source'], encoding_Data['N2source_class']))
promoterDict = dict(zip(encoding_Data['Promoters'], encoding_Data['prom_class']))
integrationSiteDict = dict(zip(encoding_Data['integrationSite'], encoding_Data['int_class']))

### Ensure the pickle file from Part 2 is in the correct directory.

In [5]:
#open the data from part 2.

# Choose the branch to process by leaving exactly ONE of the two paths active.
# part2PicklePath = 'TESTData_part2.pickle'            # TEST data.
part2PicklePath = 'Train&ValidateData_part2.pickle'    # Train & validate data.

# NOTE: pickle.load executes whatever the file contains -- only open pickles
# produced by Part 2 of this pipeline.
with open(part2PicklePath, 'rb') as pickleFile:
    Data = pickle.load(pickleFile)

# The first element of the pickled list is the data frame; work on a copy so
# the loaded frame stays untouched.
FBATrainData = Data[0]
workingData = FBATrainData.copy()

In [6]:
# Keep the original product name: the thermodynamic-barrier cell uses it to
# recognise 'Lipids' rows after 'product_name' has been replaced by its class.
workingData['product_name2'] = workingData['product_name']

# Replace each raw value by its class; values missing from the encoding
# table are left unchanged.
for columnName, classLookup in (
    ('product_name', productDict),
    ('strain_background', strainDict),
    ('media', mediaDict),
):
    workingData[columnName] = workingData[columnName].map(classLookup).fillna(workingData[columnName])

In [7]:
#Create encoded feature for carbon source quality and quantity, computed from the
#heat of combustion and the concentration of each carbon source added to the culture.

# Molecular weight of each carbon source; a name missing from the encoding
# table is left in place (the division below will then fail on it).
workingData['carbonSourceOneMolecularWeight'] = workingData['cs1'].map(carbonSourceMWDict).fillna(workingData['cs1'])
workingData['carbonSourceTwoMolecularWeight'] = workingData['cs2'].map(carbonSourceMWDict).fillna(workingData['cs2'])

# concentration / molecular weight * heat of combustion, per carbon source.
# Units are presumably g/L, g/mol and kJ/mol, giving kJ/L -- confirm against the database.
energySourceOne = (
    workingData['cs_conc1']
    / workingData['carbonSourceOneMolecularWeight']
    * workingData['cs1_heatCombustion(kJ/mol)']
)
energySourceTwo = (
    workingData['cs_conc2']
    / workingData['carbonSourceTwoMolecularWeight']
    * workingData['cs2_heatCombustion(kJ/mol)']
).fillna(0)  # a row without a usable second carbon source contributes nothing

# Adding the zero-filled second term is the same as adding it only when it is
# non-zero, so no row-by-row loop is needed. astype(float) keeps the column
# numeric even if the molecular-weight lookup produced an object-dtype column.
workingData['inputThermo(kJ/L)'] = (energySourceOne + energySourceTwo).astype(float)

In [8]:
#precursors required
# 'precursor_required' holds ';'-separated stoichiometries. Spread them over
# one column per precursor, turn the pieces into numbers (padding from the
# split becomes 0) and add them up row by row.
precursorPieces = workingData['precursor_required'].apply(str).str.split(';', expand=True)
precursorCounts = precursorPieces.fillna(0).apply(pd.to_numeric)
workingData['precursorsRequiredEncoded'] = precursorCounts.sum(axis=1)

In [9]:
#create thermodynamic barrier features: the total barrier from precursor(s) to product,
#and the average barrier per enzymatic step.
totalTherm = {}
averageTherm = {}

for dataPoint in workingData.index:
    # Look the row up once instead of on every field access.
    row = workingData.loc[dataPoint]

    stoichNADPH = row['nadh_nadph_cost']
    stoichATP = row['atp_cost']

    #Thermodynamics used the fatty acid based information instead of the TAG molecules.
    if row['product_name2'] == 'Lipids' and lipidSimplificationOption == 1:
        stoichATP = stoichATP / 3
        stoichNADPH = stoichNADPH / 3
        prec = ['Acetyl-CoA']
        stoichprecursor = [float(row['precursor_required'].strip().split(';')[0]) / 3]
        precursorDeltaGo = [-3341.2]  #deltaGo

    # non-lipid compounds
    else:
        prec = row['central_carbon_precursor'].strip().split(';')

        if isinstance(row['precursor_required'], str):
            # several precursors: ';'-separated stoichiometries and deltaGo values
            stoichprecursor = row['precursor_required'].strip().split(';')
            precursorDeltaGo = row['ccm_precursor_deltaGo'].strip().split(';')
        else:
            # a single numeric entry
            stoichprecursor = [row['precursor_required']]
            precursorDeltaGo = [row['ccm_precursor_deltaGo']]

    thermoTemp = 0
    # Accumulate the Acetyl-CoA stoichiometry over ALL precursors. The value
    # used to be overwritten on every iteration, so the CoA term was dropped
    # whenever Acetyl-CoA was followed by another precursor. Rows where
    # Acetyl-CoA is the only or the last-listed precursor are unaffected.
    stoichCoA = 0
    for i, precursorName in enumerate(prec):
        # Indexing (rather than zip) keeps the IndexError when the lists disagree in length.
        thermoTemp += float(stoichprecursor[i]) * float(precursorDeltaGo[i])
        if precursorName == 'Acetyl-CoA':
            stoichCoA += float(stoichprecursor[i])

    ATP_nadph_tempThermo = stoichATP*-31.8 + -28.8*stoichNADPH  #deltaGo

    # -3202.2 is applied per Acetyl-CoA consumed (presumably the deltaGo of the released CoA -- confirm).
    totalTherm[dataPoint] = round(ATP_nadph_tempThermo + row['product_deltaGo'] - thermoTemp + stoichCoA*-3202.2)

    enzymaticSteps = row['Pathway_enzymatic_steps']
    if totalTherm[dataPoint] == 0 and enzymaticSteps == 0:
        averageTherm[dataPoint] = 0
    else:
        averageTherm[dataPoint] = round(totalTherm[dataPoint] / enzymaticSteps)

# Wrap the dicts in Series so they are aligned on the index labels, as the
# other cells of this file do before assigning a column.
workingData['totalThermBarrier'] = pd.Series(totalTherm)
workingData['averageThermBarrier'] = pd.Series(averageTherm)
# Assign the filled column back: an in-place fillna on a selected column is a
# chained assignment and does not update the frame under pandas Copy-on-Write.
workingData['averageThermBarrier'] = workingData['averageThermBarrier'].fillna(0)

In [10]:
#N2 Source Encoding
# Split the ';'-separated nitrogen sources into one column per source and
# replace every piece by its class from the encoding table.
n2SourceCodes = workingData['N2Source'].apply(str).str.split(';', expand=True).fillna('NaN')
for position in n2SourceCodes.columns:
    n2SourceCodes[position] = n2SourceCodes[position].map(N2sourceDict)

# Rows in which no source could be encoded fall back to 0.5 (mean) and 1 (max).
workingData['N2SourceEncoded(mean)'] = n2SourceCodes.mean(axis=1).fillna(0.5)
workingData['N2SourceEncoded(max)'] = n2SourceCodes.max(axis=1).fillna(1)


# Options: High N2 and organic source; high N2 and inorganic source; low N2 and organic source; low N2 and inorganic source.
# A max source code of 1 is treated as organic and 0 as inorganic. The
# N2_content tests are evaluated in the order "> 1" then "< 2" for organic
# rows and "< 2" then "> 1" for inorganic rows. Rows matching no rule get no
# entry and therefore end up as NaN in the new column.
n2ContentClass = {}
rowsToClassify = zip(
    workingData.index,
    workingData['N2SourceEncoded(max)'],
    workingData['N2_content'],
)
for dataPoint, sourceCode, n2Content in rowsToClassify:
    if sourceCode == 1:
        if n2Content > 1:
            n2ContentClass[dataPoint] = 4  # high N2, organic
        elif n2Content < 2:
            n2ContentClass[dataPoint] = 2  # low N2, organic
    elif sourceCode == 0:
        if n2Content < 2:
            n2ContentClass[dataPoint] = 1  # low N2, inorganic
        elif n2Content > 1:
            n2ContentClass[dataPoint] = 3  # high N2, inorganic

workingData['N2_contentEncoded'] = pd.Series(n2ContentClass)

In [11]:
# Integration site & Promoter strength Encoding
## Not utilized in final model.

def _encodeDelimitedColumn(column, lookup):
    """Split the ';'-separated entries of ``column`` and map each piece through ``lookup``.

    Returns a frame with one column per position in the split; pieces that
    are not keys of ``lookup`` (including the 'NaN' padding) become NaN.
    """
    pieces = column.apply(str).str.split(';', expand=True).fillna('NaN')
    for position in pieces:
        pieces[position] = pieces[position].map(lookup)
    return pieces


integrationSiteCodes = _encodeDelimitedColumn(workingData['integration_site_Filled'], integrationSiteDict)
workingData['integrationSiteEncoded(Sum)'] = integrationSiteCodes.sum(axis=1).fillna(0)
workingData['integrationSiteEncoded(Mean)'] = integrationSiteCodes.mean(axis=1).fillna(0)

promoterCodes = _encodeDelimitedColumn(workingData['gene_promoter'], promoterDict)
workingData['promoterEncoded(Sum)'] = promoterCodes.sum(axis=1).fillna(0)
workingData['promoterEncoded(Mean)'] = promoterCodes.mean(axis=1).fillna(0)

In [12]:
#data encoding.
# Replace the raw volume / reactor type / oxygen entries by their ordinal classes.
workingData['rxt_volume'] = workingData['rxt_volume'].apply(rxtVolumeEncoding)
workingData['reactor_type'] = workingData['reactor_type'].apply(reactorTypeEncoding)
workingData['oxygen'] = workingData['oxygen'].apply(oxygenEncodingFix)

# Missing pH values default to 6.8.
workingData['pH'] = workingData['pH'].fillna(6.8)

# NOTE(review): the total is NaN whenever either concentration is missing -- confirm that is intended.
workingData['csConcTotal'] = workingData['cs_conc1'] + workingData['cs_conc2']

# Assign the filled column back: an in-place fillna on a selected column is a
# chained assignment and does not update the frame under pandas Copy-on-Write.
workingData['dir_evo'] = workingData['dir_evo'].fillna(0)

In [13]:
# FBA features encoding.

# O2-uptake columns to convert to absolute values. Un-comment the entries
# for the genome-scale models whose columns are present in the data.
o2UptakeColumns = [
    # 'O2Uptake_iYL_2.0',
    # 'O2Uptake_iNL895',
    'O2Uptake_iYLI647',
    # 'O2Uptake_iMK735',
    # 'O2Uptake_iYali4',
    # 'O2Uptake_Coleaginosus',
    # 'O2Uptake_iRhtoC',
]
for o2Column in o2UptakeColumns:
    workingData[o2Column] = workingData[o2Column].abs()


In [14]:
#save file
# The encoded frame is pickled inside a one-element list. Leave exactly ONE
# of the two paths active, matching the branch that was loaded from Part 2.
part3PicklePath = 'Train&ValidateData_part3.pickle'    # Train & validate data.
# part3PicklePath = 'TESTData_part3.pickle'            # TEST data.

with open(part3PicklePath, 'wb') as pickleFile:
    pickle.dump([workingData], pickleFile)