# ML Pipeline Code was built for the initial model training.
ML Pipeline Code was built for the initial model training detailed in "Integrated knowledge mining, genome-scale modeling, and machine learning for predicting *Yarrowia lipolytica* bioproduction".

### Part 1/4:
* Part 1: Performs data importation and initial formatting, and splits the data into 3 parts for training, validation, and testing.
* Part 2: FBA feature generation is completed; script entitled "ML_pipeline_part2"
* Part 3: Feature encoding is completed; script entitled "ML_pipeline_part3"
* Part 4: Machine learning model training is completed; script entitled "ML_pipeline_part4"
    
### Inputs:
* Database file: Machine learning. Publication entitled file: 'Supplemental Excel File 1- Database.xlsx'
* Data encoding file: Publication entitled file: 'Supplemental Excel File 2- DataCharateristics & Encoding.xlsx'

### Outputs:    
* Two pickle data files, "Train&ValidateData_part1.pickle" and "TESTData_part1.pickle", are created at the end of the notebook.
    

In [1]:
#cell initializes the packages needed for the data importing.

import pandas as pd
import pickle
import warnings
import numpy as np
from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import os

### Ensure the working directory is the folder that contains the database and encoding files

In [2]:
# Import the data from the database file.

# Resolve the workbook location relative to the current working directory.
database_filename = 'Supplemental Excel File 1- Database.xlsx'
dir_path = os.path.dirname(os.path.relpath(database_filename))
file_path = os.path.join(dir_path, database_filename)

# Read the 'data' sheet, skipping the first spreadsheet row so that the row
# after it supplies the column headers.
raw_data = pd.read_excel(file_path, sheet_name='data', skiprows=[0])
raw_data_col_headers = raw_data.columns

# Option flag: 1 means the zero product production instances will be dropped.
drop0option = 1

#### Not all the information in the database was used for training. The unnecessary columns are removed from the datalist.

In [3]:
# Several columns in the original database were not used for training due to
# incomplete information. This cell removes them from the working dataset.
so_far_not_filled_in_cols = ['Laststep_energy_barrier_step','Relative_promoter_strength','sensor_regulator','Mod_path_opt','engineering_approaches']

# Listed for reference only: these columns are NOT excluded below, so they
# remain in `data`.
alternatives_toTest = ['Pathway_enzymatic_steps','atp_cost','nadh_nadph_cost']

so_far_useless = ['TF','integration_site_plasmids','blank1','blank2','blank3','blank4','blank5','blank6','blank7','temp_holding']

columnsToDelete = ['cs1_equilibrator_deltaG0','cs1_deltaG\'0','cs2_equilibrator_deltaG0','cs2_deltaG\'0','Product_yield(g/DCW)','Biomass_titer(g/L)','Biomass_growth_rate']

# Make a list of the columns that are used: every header that is not in one of
# the exclusion lists. Filtering the header index (instead of converting a set
# difference to a list) keeps the spreadsheet's column order, so the layout of
# `data` is the same on every run; the iteration order of a set of strings
# changes between interpreter sessions.
excluded_cols = set(so_far_not_filled_in_cols) | set(so_far_useless) | set(columnsToDelete)
useful_cols = [col for col in raw_data_col_headers if col not in excluded_cols]

# Take an explicit copy so that columns added to `data` afterwards are written
# to an independent frame rather than to a selection of raw_data.
data = raw_data.loc[:, useful_cols].copy()
print(len(data))

3070


In [4]:
# Create engineered-gene count features from the input data.

def _count_listed_genes(entry):
    """Return the number of ';'-separated items in entry, or 0 for non-strings."""
    return entry.count(';') + 1 if isinstance(entry, str) else 0


def _count_ones(entry):
    """Return how many '1' characters entry contains, or 0 for non-strings."""
    if not isinstance(entry, str):
        return 0
    return entry.count('1')


# Number of modified genes: one more than the number of ';' separators.
data['number_genes_mod'] = data['genes_modified_updated'].apply(_count_listed_genes)

# Per-category gene counts: number of '1' characters in each flag string.
flag_count_columns = (
    ('gene_deletion', 'number_genes_deleted'),
    ('gene_overexpression', 'number_total_genes_overexp'),
    ('heterologous_gene', 'number_genes_het'),
)
for source_col, count_col in flag_count_columns:
    data[count_col] = data[source_col].apply(_count_ones)

# Number of native (non-heterologous) gene positions.
# Split the ';'-separated flag strings into one column per position; missing
# overexpression entries are replaced by the placeholder '2' before splitting.
het_flags = data['heterologous_gene'].str.split(';', expand=True)
overexp_flags = data['gene_overexpression'].fillna('2').str.split(';', expand=True)

# Keep overexpression entries only at positions whose heterologous flag is '0';
# all other positions become missing.
nativeGenes = overexp_flags.where(het_flags == '0')

# Count the retained (non-missing) positions per row.
# NOTE(review): this counts every retained position whatever its overexpression
# flag value is - confirm that this is the intended definition.
data['number_native_genes_overexp'] = nativeGenes.count(axis=1)


#### Ensure that the DataStructure file is within the directory.

In [5]:
# Load the data encoding information from the 'Encoding' sheet.
# pd.read_excel opens and closes the workbook itself, so no file handle is left open.
encoding_Data = pd.read_excel('Supplemental Excel File 2- DataCharateristics & Encoding.xlsx', sheet_name='Encoding')

# Product super classes (1-9 general classes, such as lipids, small terpenes,
# flavonoids, etc.). Each product name is mapped to its class; products that are
# missing from the encoding table keep their original name in product_name2.
productDict = dict(zip(encoding_Data.Product, encoding_Data.prdt_class))
data['product_name2'] = data.product_name.map(productDict).fillna(data.product_name)

# Drop the zero product instances and the instances with a missing titer.
if drop0option == 1:
    titer_col = 'Product_titer(g/L)'
    # dropna has to be applied to the frame: calling it in place on the
    # extracted column leaves the rows of `data` untouched. The copy makes
    # `data` an independent frame for the cells that follow.
    data = data.loc[data[titer_col] != 0].dropna(subset=[titer_col]).copy()
print(len(data))

2915


In [6]:
# Create the train/test split (80/20), stratified by product class.
Train, Test = train_test_split(data, test_size=0.2, random_state=56, stratify=data.product_name2)

# Products held out entirely: every instance of these products is added to the
# test set and removed from the training set.
productsToTestCases = ['a-Ionone', 'Mevalonate', 'Campersterol', 'a-Farnesene', '1-decanol', 'Arachidonic acid']

# Collect the training rows of each held-out product, in list order, so the
# moved rows are appended to Test product by product. Empty selections are
# skipped so that no empty frame is passed to concat.
held_out_parts = []
for p in productsToTestCases:
    part = Train[Train.product_name == p]
    if not part.empty:
        held_out_parts.append(part)

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
Test = pd.concat([Test] + held_out_parts)

# Filter instead of dropping in place: Train comes out of train_test_split as a
# selection of `data`, and an in-place drop on it raises SettingWithCopyWarning.
Train = Train[~Train.product_name.isin(productsToTestCases)].copy()

print('Training & validating instances:',len(Train))
print('Testing instances:',len(Test),'\n')

# Report how many instances of each product class (1-9) ended up in each set.
for i in range(1,10):
    print('Training instances of product class',i,':',(Train.product_name2==i).sum())

print('\n')
for i in range(1,10):
    print('Testing instances of product class',i,':',(Test.product_name2==i).sum())

    



Training & validating instances: 2257
Testing instances: 658 

Training instances of product class 1 : 673
Training instances of product class 2 : 370
Training instances of product class 3 : 157
Training instances of product class 4 : 651
Training instances of product class 5 : 47
Training instances of product class 6 : 86
Training instances of product class 7 : 147
Training instances of product class 8 : 18
Training instances of product class 9 : 108


Testing instances of product class 1 : 192
Testing instances of product class 2 : 102
Testing instances of product class 3 : 77
Testing instances of product class 4 : 175
Testing instances of product class 5 : 12
Testing instances of product class 6 : 32
Testing instances of product class 7 : 37
Testing instances of product class 8 : 4
Testing instances of product class 9 : 27


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [7]:
# Write the test set and the training/validation set to pickle files.
# Each frame is stored wrapped in a one-element list; the test file is written first.
pickle_targets = (
    ('TESTData_part1.pickle', Test),
    ('Train&ValidateData_part1.pickle', Train),
)
for pickle_name, frame in pickle_targets:
    with open(pickle_name, 'wb') as handle:
        pickle.dump([frame], handle)