In [79]:
import pandas as pd
import numpy as np

import random

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from rgf.sklearn import RGFClassifier
from sklearn.svm import SVC 
from sklearn.linear_model import LassoCV, Lasso

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import mean_absolute_error, r2_score, median_absolute_error, root_mean_squared_error
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, auc

from tensorflow import keras

import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats

import joblib

from sklearn.inspection import permutation_importance
import shap

from tensorflow.keras import backend as F
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.layers import Dense, Dropout, Activation, Concatenate, Input
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.constraints import max_norm
from tensorflow.keras.utils import plot_model
from tensorflow.keras.losses import BinaryCrossentropy
from sklearn.neural_network import MLPClassifier
from scikeras.wrappers import KerasClassifier

In [80]:
random.seed(10)

Function to load the datasets, preprocess and format the data


Input: None


Output: Return the preprocessed dataframes

In [81]:
def load_dataset():

    #load T and P
    t = pd.read_csv('./transcriptome/compare_transcriptome.csv')
    p = pd.read_csv('./proteome/compare_proteome.csv')

    #remove the NaN values from P
    p.dropna(inplace=True)

    #set the knockout name as the index
    t.index = t['Unnamed: 0']
    p.index = p['Unnamed: 0']

    #remove the knockout name column from the dataframe
    t = t.iloc[:, 1:]
    p = p.iloc[:, 2:]

    #load the fluxomics datasets and set the index same as T or P
    fluxes_p = pd.read_csv('./fluxes/Fluxes_matlab_seed1.csv', header=None)
    fluxes_p = fluxes_p.T
    fluxes_p.index = p.index 

    fluxes_t = pd.read_csv('./fluxes/Fluxes_t_matlab.csv', header=None)
    fluxes_t  = fluxes_t.T
    fluxes_t.index = t.index

    #min-max normalization of fluxomics dataframe
    X_train_liq_pf = (fluxes_p - fluxes_p.min()) / fluxes_p.max() - fluxes_p.min()
    fluxes_p = X_train_liq_pf

    X_valid_liq_pf = (fluxes_t - fluxes_t.min()) / fluxes_t.max() - fluxes_t.min()
    fluxes_t = X_valid_liq_pf

    #remove the NaN columns from the normalized fluxomics dataframe
    fluxes_p.dropna(axis=1, inplace=True)
    fluxes_t.dropna(axis=1, inplace=True)

    #min-max normalization of T and P dataframe
    p = (p - p.min())/(p.max() - p.min())
    t = (t - t.min())/(t.max() - t.min())

    #load the growth rates and set the index same as T or P
    gr_measured_ML = pd.read_csv('./fluxes/compare_transcriptome_measured_gr.csv')
    gr_measured_ML.index = t.index

    y = pd.read_csv('./fluxes/compare_measured_gr.csv')
    y.index = y['ORF PROT']

    return [t, fluxes_t, p, fluxes_p, gr_measured_ML, y]

Function to filter knockouts as low or high growth based on a given percentile


Input: Liquid GR, Solid GR, Percentile for splitting


Output: Modified GR dataframe with a class column [0 as low and 2 as high]

In [82]:
def filter_ko(gr_measured_ML, y, a_per, b_per):
    
    #find the percentile for liquid GR
    a = np.percentile(gr_measured_ML['0'], a_per)
    b = np.percentile(gr_measured_ML['0'], b_per)

    y_t = []
    for i in gr_measured_ML['0']:
        if i < a:
            y_t.append(0)
        elif i>a and i<b:
            y_t.append(1)
        else:
            y_t.append(2)

    #append class column based on the calculated percentile
    gr_measured_ML['class'] = y_t


    #find the percentile for solid GR
    a = np.percentile(y['SM'], a_per)
    b = np.percentile(y['SM'], b_per)

    y_p = []
    for i in y['SM']:
        if i < a:
            y_p.append(0)
        elif i>a and i<b:
            y_p.append(1)
        else:
            y_p.append(2)

    #append class column based on the calculated percentile
    y['class'] = y_p

    #filter medium gr KOs
    gr_measured_ML = gr_measured_ML[gr_measured_ML['class'] != 1]
    y = y[y['class'] != 1]

    return [gr_measured_ML, y]

Function to combine the transcriptomics, proteomics and fluxomics dataset into a single dataframe


Input: Transcriptomics, Proteomics, Fluxomics derived from transcriptomics, Fluxomics derived from proteomics


Output: Dataframe 

In [83]:
def generate_combined_dataset(t, p, fluxes_t, fluxes_p):

    X = pd.concat([t, fluxes_t, p, fluxes_p], axis=1)
    X.columns = X.columns.astype(str)

    return X

Function to split the dataset into train, test and validation 

Input: y dataframes

Output: Train, test and validation dataframes

In [84]:
def split_data(y):

    # est: 30% of the total data
    test_size = int(0.3*y.shape[0])

    #train: 80% of the remaining data
    train_size = int(0.8*(y.shape[0] - test_size))

    valid_size = int(0.2*(y.shape[0] - test_size))

    #randomly sample train_size number of rows, ensuring equal split of each class
    y_train = y.groupby('class', group_keys=False).apply(lambda x: x.sample(n=train_size//2, random_state=42))

    #filter the remaining indices
    train_valid = y.loc[list(set(y.index).difference(set(y_train.index))), :]

    #randomly sample valid_size number of rows from the remaining rows
    y_valid = train_valid.groupby('class', group_keys=False).apply(lambda x: x.sample(n=valid_size//2, random_state=42))

    y_test = y.loc[list(set(y.index).difference(set(y_valid.index).union(set(y_train.index)))), :]

    return [y_train, y_test, y_valid]

Function to find performance of given model


Input: Model, X train values, y train values, X test values, y test values, title for the plots


Output: Array of statistics

In [85]:
def performance(model, X_train, y_train, X_test, y_test, condition, ann = False):

    #predict y values with the test split
    y_pred = model.predict(X_test)

    if ann == True:
        pred = [2 if i > np.mean(y_pred) else 0 for i in y_pred ]
        y_pred = pred
        print(y_pred)

    #print statistics
    print('R2 score: ', r2_score(y_test, y_pred))
    print('MAE: ', mean_absolute_error(y_test, y_pred))
    print('RMSE: ', root_mean_squared_error(y_test, y_pred))
    print('MDAE: ', median_absolute_error(y_test, y_pred))
    print("Spearman Correlation: ", stats.spearmanr(y_test, y_pred))

    statistics = [r2_score(y_test, y_pred), mean_absolute_error(y_test, y_pred), root_mean_squared_error(y_test, y_pred), median_absolute_error(y_test, y_pred), stats.spearmanr(y_test, y_pred)]

    #print accuracy
    if ann == True:
        
        print('Accuracy: ',accuracy_score(y_pred, y_test))
        pred = model.predict(X_train)
        train_acc = accuracy_score([2 if i > np.mean(pred) else 0 for i in pred ], y_train)
        print("Train accuracy: ", train_acc)
    else:
        print('Accuracy: ',accuracy_score(y_pred, y_test))
        train_acc = accuracy_score(model.predict(X_train), y_train)
        print("Train accuracy: ", train_acc)

    statistics.append(accuracy_score(y_pred, y_test))
    statistics.append(train_acc)

    #print classification report [precision    recall  f1-score   support]
    print(classification_report(y_test, y_pred))

    statistics.append(classification_report(y_test, y_pred, output_dict=True)['weighted avg'])

    #find the classification from prediction of both train and validation

    df_aux = pd.concat([X_test, y_test], axis=1)
    df_aux['class'] = [1 if y == 0 else 0 for y in y_test.values]
    pred = model.predict(X_test)
    if ann:
        pred = [2 if i > np.mean(pred) else 0 for i in pred]
    df_aux['prob'] = [1 if y == 0 else 0 for y in pred]
    df_aux = df_aux.reset_index(drop = True)

    df_aux_train = pd.concat([X_train, y_train], axis=1)
    df_aux_train['class'] = [1 if y == 0 else 0 for y in y_train.values]
    pred = model.predict(X_train)
    if ann:
        pred = [2 if i > np.mean(pred) else 0 for i in pred]
    df_aux_train['prob'] = [1 if y == 0 else 0 for y in pred]
    df_aux_train = df_aux_train.reset_index(drop = True)

    #find the false positive rate
    fpr, tpr, thresholds = roc_curve(df_aux['class'], df_aux['prob']) 
    roc_auc = auc(fpr, tpr)

    fpr_train, tpr_train, thresholds_train = roc_curve(df_aux_train['class'], df_aux_train['prob']) 
    roc_auc_train = auc(fpr_train, tpr_train)


    #plot the ROC curve for both validation and training
    plt.figure()  
    plt.plot(fpr, tpr, label='Validation ROC curve (area = %0.2f)' % roc_auc)
    plt.plot(fpr_train, tpr_train, label='Train ROC curve (area = %0.2f)' % roc_auc_train)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(condition)
    plt.legend()
    plt.show()


    return statistics

Function to train and test the performance of asn SVM classifier


Input: X train values, X validation values, y train values, y validation values, name to save the model, dataframe to store the statistics

Output: Returns the statistics dataframe

In [86]:
def SVM_model(x_train, x_valid, y_train, y_valid, name, temp):

    #initialize a SVM classifier
    svm = SVC(random_state = 11850)

    #set the hyperparameters
    param_dist = {'C': [0.1, 0.5, 1, 5, 10, 15, 20, 25, 30, 35],
                    'gamma': [0.001, 0.0025, 0.005, 0.0075, 0.01, 0.025, 0.05, 0.075, 0.1],
                    'kernel': ['linear', 'rbf', 'poly']
                }
    
    #perform gridsearch to find the optimal hyperparameters
    svm = GridSearchCV(svm, param_dist, cv=10, return_train_score=True)

    #fit the best classifier and save the model
    svm.fit(x_train, y_train)
    joblib.dump(svm, name+".joblib")

    #test the performance of the model
    statistics = performance(svm, x_train, y_train, x_valid, y_valid, name)

    #find the index of the best estimator
    best_estimator = {
            'C': svm.best_estimator_.C,
        }

    index = 0
    for i in range(len(svm.cv_results_['params'])):
            if svm.cv_results_['params'][i] == best_estimator:
                index = i
                break

    #filter the training score across 10 folds of the best estimator
    scores = [svm.cv_results_['split'+str(i)+'_train_score'][index] for i in range(10)]
    print(scores)

    #plot the score across the 10 folds
    plt.plot(scores)
    plt.title(name + "k-fold")

    temp.loc[name, temp.columns[:4]] = statistics[:4]
    temp.loc[name, 'Spearman Correlation'] = statistics[4].statistic
    temp.loc[name, temp.columns[5:7]] = statistics[5:7]
    temp.loc[name, 'Precision'] = statistics[7]['precision']
    temp.loc[name, 'Recall'] = statistics[7]['recall']
    temp.loc[name, 'F1-Score'] = statistics[7]['f1-score']

    return temp

In [87]:
def LR_Lasso_model(x_train, x_valid, y_train, y_valid, name, temp):

    #initialize a Logistic Regressor classifier with LASSO regularization
    logreg = LogisticRegression(penalty='l2')

    #set the hyperparameters
    grid={"C": [0.0, 0.25, 0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0, 2.25, 2.5]}
    
    #perform gridsearch to find the optimal hyperparameters
    logreg_cv = GridSearchCV(logreg,grid,cv=10)

    #fit the best classifier and save the model
    logreg_cv.fit(x_train, y_train)
    joblib.dump(logreg_cv, name+".joblib")

    #test the performance of the model
    statistics = performance(logreg_cv, x_train, y_train, x_valid, y_valid, name)

    #find the index of the best estimator
    best_estimator = {
            'C': logreg_cv.best_estimator_.C,
        }

    index = 0
    for i in range(len(logreg_cv.cv_results_['params'])):
            if logreg_cv.cv_results_['params'][i] == best_estimator:
                index = i
                break

    #filter the training score across 10 folds of the best estimator
    scores = [logreg_cv.cv_results_['split'+str(i)+'_test_score'][index] for i in range(10)]
    print(scores)

    #plot the score across the 10 folds
    plt.plot(scores)
    plt.title(name + "k-fold")

    temp.loc[name, temp.columns[:4]] = statistics[:4]
    temp.loc[name, 'Spearman Correlation'] = statistics[4].statistic
    temp.loc[name, temp.columns[5:7]] = statistics[5:7]
    temp.loc[name, 'Precision'] = statistics[7]['precision']
    temp.loc[name, 'Recall'] = statistics[7]['recall']
    temp.loc[name, 'F1-Score'] = statistics[7]['f1-score']

    return temp

def LR_elasticnet_model(x_train, x_valid, y_train, y_valid, name, temp):

    #initialize a Logistic Regressor classifier with LASSO regularization
    logreg = LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.5)

    #set the hyperparameters
    grid={"C":np.logspace(-5,5,50)}
    
    #perform gridsearch to find the optimal hyperparameters
    logreg_cv = GridSearchCV(logreg,grid,cv=10)

    #fit the best classifier and save the model
    logreg_cv.fit(x_train, y_train)
    joblib.dump(logreg_cv, name+".joblib")

    #test the performance of the model
    statistics = performance(logreg_cv, x_train, y_train, x_valid, y_valid, name)

    #find the index of the best estimator
    best_estimator = {
            'C': logreg_cv.best_estimator_.C,
        }

    index = 0
    for i in range(len(logreg_cv.cv_results_['params'])):
            if logreg_cv.cv_results_['params'][i] == best_estimator:
                index = i
                break

    #filter the training score across 10 folds of the best estimator
    scores = [logreg_cv.cv_results_['split'+str(i)+'_test_score'][index] for i in range(10)]
    print(scores)

    #plot the score across the 10 folds
    plt.plot(scores)
    plt.title(name + "k-fold")

    temp.loc[name, temp.columns[:4]] = statistics[:4]
    temp.loc[name, 'Spearman Correlation'] = statistics[4].statistic
    temp.loc[name, temp.columns[5:7]] = statistics[5:7]
    temp.loc[name, 'Precision'] = statistics[7]['precision']
    temp.loc[name, 'Recall'] = statistics[7]['recall']
    temp.loc[name, 'F1-Score'] = statistics[7]['f1-score']

    return temp

In [88]:
def RandomForest_model(x_train, x_valid, y_train, y_valid, name, temp):

    #initialize a Random Forest classifier
    rf = RandomForestClassifier(random_state = 11850)

    #set the hyperparameters
    param_dist = {'n_estimators': list(range(200, 500, 50)),
              'max_depth': list(range(1,15,2)),
              'min_samples_leaf': [1, 2, 4],
              'max_features': list(range(120, 250, 10))
            }
    
    #perform gridsearch to find the optimal hyperparameters
    rf = GridSearchCV(rf, param_dist, cv=10, return_train_score=True)

    #fit the best classifier and save the model
    rf.fit(x_train, y_train)
    joblib.dump(rf, name+".joblib")

    #test the performance of the model
    statistics = performance(rf, x_train, y_train, x_valid, y_valid, name)

    #find the index of the best estimator
    best_estimator = {
            'n_estimators': rf.best_estimator_.n_estimators,
            'max_depth': rf.best_estimator_.max_depth,
            'min_samples_leaf': rf.best_estimator_.min_samples_leaf,
            'max_features': rf.best_estimator_.max_features,
        }

    index = 0
    for i in range(len(rf.cv_results_['params'])):
            if rf.cv_results_['params'][i] == best_estimator:
                index = i
                break

    #filter the training score across 10 folds of the best estimator
    scores = [rf.cv_results_['split'+str(i)+'_train_score'][index] for i in range(10)]
    print(scores)

    #plot the score across the 10 folds
    plt.plot(scores)
    plt.title(name + "k-fold")

    temp.loc[name, temp.columns[:4]] = statistics[:4]
    temp.loc[name, 'Spearman Correlation'] = statistics[4].statistic
    temp.loc[name, temp.columns[5:7]] = statistics[5:7]
    temp.loc[name, 'Precision'] = statistics[7]['precision']
    temp.loc[name, 'Recall'] = statistics[7]['recall']
    temp.loc[name, 'F1-Score'] = statistics[7]['f1-score']

    return temp

In [89]:
def kNN_model(x_train, x_valid, y_train, y_valid, name, temp):
        
        #initialize a kNN classifier
        knn = KNeighborsClassifier(n_neighbors=1)

        #set the hyperparameters
        parameters_KNN = {
            'n_neighbors': (1,50, 5),
            'leaf_size': (20,100,10),
            'p': (1,2),
            'weights': ('uniform', 'distance'),
            'metric': ('minkowski', 'chebyshev')}
        
        #perform gridsearch to find the optimal hyperparameters
        knn = GridSearchCV(
                estimator=knn,
                param_grid=parameters_KNN,
                scoring = 'accuracy',
                n_jobs = -1,
                cv = 5
            )

        #fit the best classifier and save the model
        knn.fit(x_train, y_train)
        joblib.dump(knn, name+".joblib")

        #test the performance of the model
        statistics = performance(knn, x_train, y_train, x_valid, y_valid, name)

        #find the index of the best estimator
        best_estimator = {
                'n_neighbors': knn.best_estimator_.n_neighbors,
                'leaf_size': knn.best_estimator_.leaf_size,
                'p': knn.best_estimator_.p,
                'weights': knn.best_estimator_.weights,
                'metric': knn.best_estimator_.metric,
            }

        index = 0
        for i in range(len(knn.cv_results_['params'])):
                if knn.cv_results_['params'][i] == best_estimator:
                    index = i
                    break

        #filter the training score across 10 folds of the best estimator
        scores = [knn.cv_results_['split'+str(i)+'_train_score'][index] for i in range(10)]
        print(scores)

        #plot the score across the 10 folds
        plt.plot(scores)
        plt.title(name + "k-fold")

        temp.loc[name, temp.columns[:4]] = statistics[:4]
        temp.loc[name, 'Spearman Correlation'] = statistics[4].statistic
        temp.loc[name, temp.columns[5:7]] = statistics[5:7]
        temp.loc[name, 'Precision'] = statistics[7]['precision']
        temp.loc[name, 'Recall'] = statistics[7]['recall']
        temp.loc[name, 'F1-Score'] = statistics[7]['f1-score']

        return temp

In [90]:
def regression_LASSO_model(x_train, x_valid, y_train, y_valid, name, temp):
        
        #initialize a Linear Regressor with LASSO regularization
        reg = Lasso()

        #set the hyperparameters
        param_space = {'alpha': [0.0001, 0.001, 0.01, 0.1, 1.0]}
        
        #perform gridsearch to find the optimal hyperparameters
        reg = GridSearchCV(reg, param_space, cv=10)

        #fit the best classifier and save the model
        reg.fit(x_train, y_train)
        joblib.dump(reg, name+".joblib")

        #test the performance of the model
        statistics = performance(reg, x_train, y_train, x_valid, y_valid, name)

        #find the index of the best estimator
        best_estimator = {
                'alpha': reg.best_estimator_.alpha,
            }

        index = 0
        for i in range(len(reg.cv_results_['params'])):
                if reg.cv_results_['params'][i] == best_estimator:
                    index = i
                    break

        #filter the training score across 10 folds of the best estimator
        scores = [reg.cv_results_['split'+str(i)+'_test_score'][index] for i in range(10)]
        print(scores)

        #plot the score across the 10 folds
        plt.plot(scores)
        plt.title(name + "k-fold")

        temp.loc[name, temp.columns[:4]] = statistics[:4]
        temp.loc[name, 'Spearman Correlation'] = statistics[4].statistic
        temp.loc[name, temp.columns[5:7]] = statistics[5:7]
        temp.loc[name, 'Precision'] = statistics[7]['precision']
        temp.loc[name, 'Recall'] = statistics[7]['recall']
        temp.loc[name, 'F1-Score'] = statistics[7]['f1-score']

        return temp

In [91]:
def regression_RandomForest_model(x_train, x_valid, y_train, y_valid, name, temp):
        
        #initialize a Linear Regressor with LASSO regularization
        reg = RandomForestRegressor()

        #set the hyperparameters
        param_dist = {'n_estimators': list(range(200, 500, 50)),
              'max_depth': list(range(1,15,2)),
              'min_samples_leaf': [1, 2, 4],
              'max_features': list(range(120, 250, 10))
            }
        
        #perform gridsearch to find the optimal hyperparameters
        reg = GridSearchCV(reg, param_dist, cv=10)

        #fit the best classifier and save the model
        reg.fit(x_train, y_train)
        joblib.dump(reg, name+".joblib")

        #test the performance of the model
        statistics = performance(reg, x_train, y_train, x_valid, y_valid, name)

        #find the index of the best estimator
        best_estimator = {
            'n_estimators': reg.best_estimator_.n_estimators,
            'max_depth': reg.best_estimator_.max_depth,
            'min_samples_leaf': reg.best_estimator_.min_samples_leaf,
            'max_features': reg.best_estimator_.max_features,
        }

        index = 0
        for i in range(len(reg.cv_results_['params'])):
                if reg.cv_results_['params'][i] == best_estimator:
                    index = i
                    break

        #filter the training score across 10 folds of the best estimator
        scores = [reg.cv_results_['split'+str(i)+'_train_score'][index] for i in range(10)]
        print(scores)

        #plot the score across the 10 folds
        plt.plot(scores)
        plt.title(name + "k-fold")

        temp.loc[name, temp.columns[:4]] = statistics[:4]
        temp.loc[name, 'Spearman Correlation'] = statistics[4].statistic
        temp.loc[name, temp.columns[5:7]] = statistics[5:7]
        temp.loc[name, 'Precision'] = statistics[7]['precision']
        temp.loc[name, 'Recall'] = statistics[7]['recall']
        temp.loc[name, 'F1-Score'] = statistics[7]['f1-score']

        return temp

In [92]:
def regression_SVM_model(x_train, x_valid, y_train, y_valid, name, temp):
        
        #initialize a SVM Regressor
        reg = SVR()

        #set the hyperparameters
        param_dist = {'C': [0.1, 0.5, 1, 5, 10, 15, 20, 25, 30, 35],
                    'gamma': [0.001, 0.0025, 0.005, 0.0075, 0.01, 0.025, 0.05, 0.075, 0.1],
                    'kernel': ['linear', 'rbf', 'poly']
                }
        
        #perform gridsearch to find the optimal hyperparameters
        reg = GridSearchCV(reg, param_dist, cv=10)

        #fit the best classifier and save the model
        reg.fit(x_train, y_train)
        joblib.dump(reg, name+".joblib")

        #test the performance of the model
        statistics = performance(reg, x_train, y_train, x_valid, y_valid, name)

        #find the index of the best estimator
        best_estimator = {
            'C': svm.best_estimator_.C,
        }

        index = 0
        for i in range(len(reg.cv_results_['params'])):
                if reg.cv_results_['params'][i] == best_estimator:
                    index = i
                    break

        #filter the training score across 10 folds of the best estimator
        scores = [reg.cv_results_['split'+str(i)+'_train_score'][index] for i in range(10)]
        print(scores)

        #plot the score across the 10 folds
        plt.plot(scores)
        plt.title(name + "k-fold")

        temp.loc[name, temp.columns[:4]] = statistics[:4]
        temp.loc[name, 'Spearman Correlation'] = statistics[4].statistic
        temp.loc[name, temp.columns[5:7]] = statistics[5:7]
        temp.loc[name, 'Precision'] = statistics[7]['precision']
        temp.loc[name, 'Recall'] = statistics[7]['recall']
        temp.loc[name, 'F1-Score'] = statistics[7]['f1-score']

        return temp

In [93]:
#load the external double KO dataset [required for the features list]

double_KO = pd.read_csv("../Krithikaa/yeast_datasets/yeast_gstf_dataset.csv")

In [94]:
#load and preprocess data
[t, fluxes_t, p, fluxes_p, gr_measured_ML, y] = load_dataset()

#filter based on percentile
[gr_measured_ML, y] = filter_ko(gr_measured_ML, y, 30, 70)

#concatenate the dataframes
X = generate_combined_dataset(t, p, fluxes_t, fluxes_p)

#split transcriptomics dataset into train, test and validation
[y_train_liq, y_test_liq, y_valid_liq] = split_data(gr_measured_ML)
X_train_liq = X.loc[y_train_liq.index, :]
X_test_liq = X.loc[y_test_liq.index, :]
X_valid_liq = X.loc[y_valid_liq.index, :]

y_valid_liq = y_valid_liq['class']
y_train_liq = y_train_liq['class']
y_test_liq = y_test_liq['class']

#split proteomics dataset into train, test and validation
[y_train_solid, y_test_solid, y_valid_solid] = split_data(y)
X_train_solid = X.loc[y_train_solid.index, :]
X_test_solid = X.loc[y_test_solid.index, :]
X_valid_solid = X.loc[y_valid_solid.index, :]

y_train_solid = y_train_solid['class']
y_test_solid = y_test_solid['class']
y_valid_solid = y_valid_solid['class']

#initialize the statistics dataframe
temp = pd.DataFrame(columns=['R2 Score', 'MAE', 'RMSE', 'MDAE', 'Spearman Correlation', 'Accuracy', 'Train Accuracy', 'Precision', 'Recall', 'F1-Score'])

#subset train and validation for transcriptomics
x_train = X_train_liq.iloc[:, 0:1826].loc[:, double_KO.columns[1:]]
x_valid = X_valid_liq.iloc[:, 0:1826].loc[:, double_KO.columns[1:]]


temp = SVM_model(x_train, x_valid, y_train_liq, y_valid_liq, 'svm_t_30', temp)
temp = LR_Lasso_model(x_train, x_valid, y_train_liq, y_valid_liq, 'lasso_log_t_30', temp)
temp = LR_elasticnet_model(x_train, x_valid, y_train_liq, y_valid_liq, 'elasticnet_log_t_30', temp)
temp = kNN_model(x_train, x_valid, y_train_liq, y_valid_liq, 'knn_t_30', temp)
temp = regression_LASSO_model(x_train, x_valid, y_train_liq, y_valid_liq, 'lasso_t_30', temp)
temp = regression_SVM_model(x_train, x_valid, y_train_liq, y_valid_liq, 'svr_t_30', temp)


  y_train = y.groupby('class', group_keys=False).apply(lambda x: x.sample(n=train_size//2, random_state=42))
  y_valid = train_valid.groupby('class', group_keys=False).apply(lambda x: x.sample(n=valid_size//2, random_state=42))
  y_train = y.groupby('class', group_keys=False).apply(lambda x: x.sample(n=train_size//2, random_state=42))
  y_valid = train_valid.groupby('class', group_keys=False).apply(lambda x: x.sample(n=valid_size//2, random_state=42))


KeyboardInterrupt: 

In [72]:
transcriptome = X_train_liq.iloc[:, 0:1826].loc[:, double_KO.columns[1:]]
flux = X_train_liq.iloc[:, 1826: 2222]
x_train = pd.concat([transcriptome, flux], axis=1)
x_valid = X_valid_liq.iloc[:, 0:2222].loc[:, x_train.columns]

temp = SVM_model(x_train, x_valid, y_train_liq, y_valid_liq, 'svm_t_f_30', temp)
temp = LR_Lasso_model(x_train, x_valid, y_train_liq, y_valid_liq, 'lasso_log_t_f_30', temp)
temp = LR_elasticnet_model(x_train, x_valid, y_train_liq, y_valid_liq, 'elasticnet_log_t_f_30', temp)
temp = kNN_model(x_train, x_valid, y_train_liq, y_valid_liq, 'knn_t_f_30', temp)
temp = regression_LASSO_model(x_train, x_valid, y_train_liq, y_valid_liq, 'lasso_t_f_30', temp)
temp = regression_SVM_model(x_train, x_valid, y_train_liq, y_valid_liq, 'svr_t_f_30', temp)

((991, 1826), (991, 396))

In [None]:
x_train = X_train_liq.iloc[:, 2222:4048].loc[:, double_KO.columns[1:]]
x_valid = X_valid_liq.iloc[:, 2222:4048].loc[:, double_KO.columns[1:]]

temp = SVM_model(x_train, x_valid, y_train_liq, y_valid_liq, 'svm_p_30', temp)
temp = LR_Lasso_model(x_train, x_valid, y_train_liq, y_valid_liq, 'lasso_log_p_30', temp)
temp = LR_elasticnet_model(x_train, x_valid, y_train_liq, y_valid_liq, 'elasticnet_log_p_30', temp)
temp = kNN_model(x_train, x_valid, y_train_liq, y_valid_liq, 'knn_p_30', temp)
temp = regression_LASSO_model(x_train, x_valid, y_train_liq, y_valid_liq, 'lasso_p_30', temp)
temp = regression_SVM_model(x_train, x_valid, y_train_liq, y_valid_liq, 'svr_p_30', temp)

In [None]:
proteome = X_train_liq.iloc[:, 2222:4048].loc[:, double_KO.columns[1:]]
flux = X_train_liq.iloc[:, 4048: ]
x_train = pd.concat([transcriptome, flux], axis=1)
x_valid = X_valid_liq.iloc[:, 2222:].loc[:, x_train.columns]

temp = SVM_model(x_train, x_valid, y_train_liq, y_valid_liq, 'svm_p_f_30', temp)
temp = LR_Lasso_model(x_train, x_valid, y_train_liq, y_valid_liq, 'lasso_log_p_f_30', temp)
temp = LR_elasticnet_model(x_train, x_valid, y_train_liq, y_valid_liq, 'elasticnet_log_p_f_30', temp)
temp = kNN_model(x_train, x_valid, y_train_liq, y_valid_liq, 'knn_p_f_30', temp)
temp = regression_LASSO_model(x_train, x_valid, y_train_liq, y_valid_liq, 'lasso_p_f_30', temp)
temp = regression_SVM_model(x_train, x_valid, y_train_liq, y_valid_liq, 'svr_p_f_30', temp)