# Imports

This cell imports all the packages used in the project.

In [1]:
# connections and OS
import pandas as pd
#import seaborn as sns
import os
import sqlite3
#import csv

#utils (Pandas,numpy,tqdm)
import numpy as np
import pandas as pd
from tqdm import tqdm

#visualize 
#import seaborn as sns

#preprocessing, metrices and splits 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,mean_absolute_error
from sklearn.preprocessing import StandardScaler,MinMaxScaler

#ML models:
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import LinearRegression, Ridge


#tensorflow layer, callbacks and layers
import tensorflow as tf
import tensorflow.keras
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Concatenate, Input
from tensorflow.keras.callbacks import ModelCheckpoint, Callback

# Constants and params

Define the constant variables and change the working directory to the project directory within the department cluster (SLURM).


In [2]:
# Root folder of the project. It can be overridden with the PROJECT_DIRECTORY
# environment variable (e.g. on the department SLURM cluster); when the variable
# is not set, the original local default is used.
PROJECT_DIRECTORY = os.environ.get('PROJECT_DIRECTORY', r'C:\Users\Niko\Desktop\plates')
# Channel names; they are matched as substrings of the feature column names.
CHANNELS = ["AGP","DNA","ER","Mito","RNA"]
# Column whose value ('mock' / 'treated') is used to split the wells.
LABEL_FIELD = 'Metadata_ASSAY_WELL_ROLE'

In [3]:
# Make the project directory the working directory so that relative paths
# used later in the notebook are resolved against it.
os.chdir(PROJECT_DIRECTORY)

# Preprocessing functions

In [4]:
def scale_data(df):
    """
    Scale a dataframe with two independent scalers, each fitted on `df` itself.

    df: dataFrame whose columns should be scaled
    return: tuple (std_df, minmax_df)
            std_df: `df` transformed by a freshly fitted StandardScaler
            minmax_df: `df` transformed by a freshly fitted MinMaxScaler
            Both keep the column names of `df`, get a new default index, and
            have any remaining NaN replaced by 0.
    """
    scaled_frames = []
    for scaler in (StandardScaler(), MinMaxScaler()):
        transformed = scaler.fit_transform(df)
        frame = pd.DataFrame(transformed, columns=df.columns)
        # Any NaN left after scaling is replaced by 0
        scaled_frames.append(frame.fillna(0))

    std_df, minmax_df = scaled_frames
    return std_df, minmax_df

In [5]:
def split_channels_x_and_y(filename, task_channel):
    """
    Read the cell table of one plate and separate it into input features and
    prediction targets for a given channel.

    filename: file path to the cell table (csv) of a single plate
    task_channel: the channel we aim to predict (one of CHANNELS)

    Notably: to avoid leakage, all 'Correlation' features are left out of both outputs.

    return: (x_features_df, y_df)
            x_features_df: the label columns, the columns that mention no channel,
                           and the non-correlation columns of every channel other
                           than 'task_channel'
            y_df: the non-correlation columns of 'task_channel' only
    """
    # Data preparation: index rows by (image, object), remove the table id, drop incomplete rows
    table = pd.read_csv(filename).set_index(['ImageNumber', 'ObjectNumber'])
    table = table.drop(['TableNumber'], axis=1)
    table = table.dropna()

    label_cols = [LABEL_FIELD, 'Image_Metadata_Well', 'Metadata_broad_sample']

    # Columns that are neither labels nor tied to any channel
    general_cols = [
        name for name in table.columns
        if name not in label_cols and not any(ch in name for ch in CHANNELS)
    ]

    # Per-channel feature columns, correlation features excluded
    cols_by_channel = {
        ch: [name for name in table.columns if ch in name and 'Correlation' not in name]
        for ch in CHANNELS
    }

    # Feature columns of every channel except the one being predicted
    other_channel_cols = []
    for ch in CHANNELS:
        if ch != task_channel:
            other_channel_cols.extend(cols_by_channel[ch])

    x_features_df = table[label_cols + general_cols + other_channel_cols]
    y_df = table[cols_by_channel[task_channel]]

    return x_features_df, y_df

# Create Models

The following three cells define functions that create and train three ML models: linear regression, ridge regression, and a deep neural network (DNN).

In [6]:
def create_LR(df_train_X, df_train_Y):
    """
    Build a linear regression model and fit it on the training data.

    df_train_X: training features (everything except the features of the 'task_channel' we aim to predict)
    df_train_Y: training targets (the features of the 'task_channel' only)

    return: trained linear regression model
    """
    features = df_train_X.values
    targets = df_train_Y.values
    # fit() returns the fitted estimator itself
    return LinearRegression().fit(features, targets)
    
    

In [7]:
def create_Ridge(df_train_X, df_train_Y):
    """
    Build a ridge regression model (default settings) and fit it on the training data.

    df_train_X: training features (everything except the features of the 'task_channel' we aim to predict)
    df_train_Y: training targets (the features of the 'task_channel' only)

    return: trained ridge regression model
    """
    regressor = Ridge()
    train_inputs, train_targets = df_train_X.values, df_train_Y.values
    regressor.fit(X=train_inputs, y=train_targets)
    return regressor

In [8]:
def create_model_dnn(task_channel,df_train_X, df_train_Y,test_plate):
    """
    Create and train a multi layer perceptron (we refer to it as deep neural network, DNN) model.

    The model is checkpointed after every epoch to
    '<PROJECT_DIRECTORY>/Models/<plate>_<task_channel>.h5'; the 'Models' folder
    is created if it does not exist yet.

    task_channel: the current channel that we aim to predict (used in the checkpoint file name)
    df_train_X: training features (everything except the features of 'task_channel')
    df_train_Y: training targets (the features of 'task_channel' only)
    test_plate: the ID of the held-out plate; its first five characters are used in the checkpoint file name

    return: trained dnn model
    """
    # Structure of the network: a stack of fully connected ReLU layers
    inputs = Input(shape=(df_train_X.shape[1],))
    hidden = inputs
    for units in (512, 256, 128, 100, 50, 25, 10):
        hidden = Dense(units, activation='relu')(hidden)
    # NOTE(review): a sigmoid output is bounded to (0, 1), while targets scaled with
    # StandardScaler can lie outside that range - confirm this activation is intended.
    predictions = Dense(df_train_Y.shape[1],activation='sigmoid')(hidden)

    # Model compilation
    model = Model(inputs=inputs,outputs = predictions)
    model.compile(optimizer='adam',loss='mse')

    # Checkpoint location; make sure the folder exists before Keras writes to it
    test_plate_number = test_plate[:5]
    folder = os.path.join(PROJECT_DIRECTORY, 'Models')
    os.makedirs(folder, exist_ok=True)
    filepath = os.path.join(folder, f'{test_plate_number}_{task_channel}.h5')

    # The deprecated `period=1` argument is omitted: the default save frequency
    # ('epoch') already saves once per epoch.
    my_callbacks = [ModelCheckpoint(filepath, monitor='val_loss', verbose=0, save_best_only=False,
                                    save_weights_only=False, mode='auto')]

    # Model training
    model.fit(df_train_X,df_train_Y,epochs = 5,batch_size=1024*8,verbose=0,shuffle=True,
              validation_split=0.2,callbacks=my_callbacks)
    return model
    
    

In [9]:
#    print_results(test_plate_number, task_channel, "Overall", "DNN", "None", "MSE", str(mean_squared_error(model_pred,channel_task_y)))
def print_results(plate_number, channel, family, model, _type, metric, value):
    """
    Append one result row to '<PROJECT_DIRECTORY>/Results/results.csv'.

    The 'Results' folder is created when missing, and the header row is written
    only when the csv file does not exist yet.

    plate_number: ID of plate
    channel: The channel we aim to predict
    family: features united by their characteristics (e.g., Granularity, Texture)
    model: the model name
    _type: value written to the 'Type' column
    metric: MSE/MAE
    value: value of the metric error (converted with str() before writing)
    """
    results_path = os.path.join(PROJECT_DIRECTORY, 'Results')
    # Make sure the folder exists so the first write on a fresh project directory succeeds
    os.makedirs(results_path, exist_ok=True)
    file_path = os.path.join(results_path, 'results.csv')
    write_header = not os.path.isfile(file_path)

    # str() lets callers pass numbers directly; strings are written unchanged
    row = ",".join(str(field) for field in (plate_number, channel, family, model, _type, metric, value))

    # The context manager closes the file even if a write fails
    with open(file_path, "a+") as results_file:
        if write_header:
            # Header text (including the space after 'Value') is kept as-is so that
            # new files have the same header as files produced by earlier runs
            results_file.write("Plate,Channel,Family,Model,Type,Metric,Value \n")
        results_file.write(row + "\n")

In [10]:
def get_family_MSE(test_plate_number, task_channel, model, _type, df, channel_task_y):
    """
    Compute the MSE per feature family and append each value to the results csv (via print_results).

    A column belongs to the first family, in the order Granularity, Intensity,
    Location, RadialDistribution, Texture, whose name prefixed by '_' appears in
    the column name. Families without columns are reported with a printed
    message and skipped. Any other failure for a family is printed with its
    error and does not stop the remaining families.

    test_plate_number: ID of the examined plate
    task_channel: Channel we aim to predict
    model: model name
    _type: value written to the 'Type' column of the results csv
    df: predictions of a model for channel_task_y (same column names)
    channel_task_y: features corresponding to the 'task channel' (channel we aim to predict)
    """
    family_names = ('Granularity', 'Intensity', 'Location', 'RadialDistribution', 'Texture')
    Families = {family: [] for family in family_names}

    for name in channel_task_y.columns:
        for family in family_names:
            if '_' + family in name:
                Families[family].append(name)
                break  # first matching family wins

    for key, columns in Families.items():
        if not columns:
            # Nothing to score for this family
            print('empty family {}'.format(key))
            continue
        try:
            mse = mean_squared_error(df[columns], channel_task_y[columns])
            print_results(test_plate_number, task_channel, key, model, _type, "MSE", str(mse))
        except Exception as error:
            # Best effort: report the failure (with its cause) and continue with the next family
            print('problem in mse key {}: {}'.format(key, error))
            



In [11]:
def get_family_MAE(test_plate_number, task_channel, model, _type, df, channel_task_y):
    """
    Compute the MAE per feature family and append each value to the results csv (via print_results).

    A column belongs to the first family, in the order Granularity, Intensity,
    Location, RadialDistribution, Texture, whose name prefixed by '_' appears in
    the column name. Families without columns are reported with a printed
    message and skipped. Any other failure for a family is printed with its
    error and does not stop the remaining families.

    test_plate_number: ID of the examined plate
    task_channel: Channel we aim to predict
    model: model name
    _type: value written to the 'Type' column of the results csv
    df: predictions of a model for channel_task_y (same column names)
    channel_task_y: features corresponding to the 'task channel' (channel we aim to predict)
    """
    family_names = ('Granularity', 'Intensity', 'Location', 'RadialDistribution', 'Texture')
    Families = {family: [] for family in family_names}

    for name in channel_task_y.columns:
        for family in family_names:
            if '_' + family in name:
                Families[family].append(name)
                break  # first matching family wins

    for key, columns in Families.items():
        if not columns:
            # Nothing to score for this family
            print('empty family {}'.format(key))
            continue
        try:
            mae = mean_absolute_error(df[columns], channel_task_y[columns])
            print_results(test_plate_number, task_channel, key, model, _type, "MAE", str(mae))
        except Exception as error:
            # Best effort: report the failure (with its cause) and continue with the next family
            print('problem in mae key {}: {}'.format(key, error))
        

In [12]:
def _scale_with_method(df, scale_method):
    """Scale `df` with scale_data and return the frame matching `scale_method` ('Std' or 'MinMax')."""
    std_df, min_max_df = scale_data(df)
    return std_df if scale_method == 'Std' else min_max_df


def _rows_with_role(x_df, y_df, role, label_cols):
    """Return (X, Y) restricted to the rows whose LABEL_FIELD equals `role`; the label columns are removed from X."""
    x_role = x_df[x_df[LABEL_FIELD] == role]
    y_role = y_df.loc[x_role.index]
    # drop() returns a new frame; the in-place drop on a filtered slice is what
    # triggered pandas' SettingWithCopyWarning
    return x_role.drop(columns=label_cols), y_role


def _report_profile(model, model_name, profile, test_x, test_y, test_plate, task_channel,
                    mse_label, mae_label):
    """Predict `test_y` from `test_x`, print the overall MSE/MAE and store overall and per-family errors."""
    predictions = pd.DataFrame(model.predict(test_x.values), columns=test_y.columns)
    mse = mean_squared_error(test_y.values, predictions.values)
    mae = mean_absolute_error(test_y.values, predictions.values)

    print('{} MSE: {}'.format(mse_label, mse))
    print('{} MAE: {}'.format(mae_label, mae))

    print_results(test_plate, task_channel, 'Overall', model_name, profile, 'MSE', str(mse))
    print_results(test_plate, task_channel, 'Overall', model_name, profile, 'MAE', str(mae))

    get_family_MSE(test_plate, task_channel, model_name, profile, predictions, test_y)
    get_family_MAE(test_plate, task_channel, model_name, profile, predictions, test_y)


def main(path,scale_method):
    """
    Leave-one-plate-out evaluation of the regression models.

    For every channel in CHANNELS and every csv file (plate) found in `path`:
    1) the plate is held out as the test plate and split into 'treated' and 'mock' rows
    2) the 'mock' rows of all other plates are concatenated into the train set
    3) train and test frames (x and y separately) are scaled with `scale_method`
    4) linear regression and ridge regression are trained, and their MSE/MAE
       (overall and per feature family) are printed and appended to the results csv

    path: folder that contains one csv cell table per plate
    scale_method: 'Std' (StandardScaler) or 'MinMax' (MinMaxScaler)

    return: None (results are printed and written through print_results)
    raises: ValueError for an unknown scale_method or when only one plate is available
    """
    if scale_method not in ('Std', 'MinMax'):
        # An unknown method used to fall through and train on unscaled data
        raise ValueError("scale_method must be 'Std' or 'MinMax', got {!r}".format(scale_method))

    label_cols = [LABEL_FIELD, 'Image_Metadata_Well', 'Metadata_broad_sample']
    # (value of LABEL_FIELD, name written to the results csv)
    profiles = (('treated', 'Treated'), ('mock', 'Mock'))

    csv_files = sorted(name for name in os.listdir(path) if name.endswith(".csv"))
    if not csv_files:
        print('no csv files found in {}'.format(path))
        return
    if len(csv_files) < 2:
        raise ValueError('at least two plates are needed: one for testing and one or more for training')

    for task_channel in tqdm(CHANNELS):
        # This is the current file that we will predict
        for test_plate in csv_files:
            print(test_plate)

            channel_task_x, channel_task_y = split_channels_x_and_y(os.path.join(path, test_plate), task_channel)
            print(channel_task_x[LABEL_FIELD].unique())

            # Test sets, one per profile, scaled with the requested method.
            # NOTE(review): every test profile is scaled with scalers fitted on that
            # profile itself (not with the training scalers), as before - confirm intended.
            test_sets = {}
            for role, _ in profiles:
                role_x, role_y = _rows_with_role(channel_task_x, channel_task_y, role, label_cols)
                test_sets[role] = (_scale_with_method(role_x, scale_method),
                                   _scale_with_method(role_y, scale_method))

            # Train set: the 'mock' rows of all other plates
            list_x_df = []
            list_y_df = []
            for train_plate in tqdm(csv_files):
                if train_plate == test_plate:
                    continue
                curr_x, curr_y = split_channels_x_and_y(os.path.join(path, train_plate), task_channel)
                curr_x, curr_y = _rows_with_role(curr_x, curr_y, 'mock', label_cols)
                list_x_df.append(curr_x)
                list_y_df.append(curr_y)

            df_train_X = _scale_with_method(pd.concat(list_x_df), scale_method)
            df_train_Y = _scale_with_method(pd.concat(list_y_df), scale_method)

            # Model creation
            print(test_plate)
            print(task_channel+":")
            LR_model = create_LR(df_train_X, df_train_Y)
            Ridge_model = create_Ridge(df_train_X, df_train_Y)
            # (console header, name in results csv, trained model, console metric label)
            model_specs = [
                ('LR', 'Linear Regression', LR_model, 'Linear Reg'),
                ('Ridge', 'Ridge', Ridge_model, 'Ridge'),
                # The DNN is disabled; to evaluate it as well, add:
                # ('DNN', 'DNN', create_model_dnn(task_channel, df_train_X, df_train_Y, test_plate), 'DNN'),
            ]
            # Console wording is kept identical to earlier runs, including the
            # "Ridge Reg MAE" label that was used for the mock profile only
            mae_label_overrides = {('Ridge', 'Mock'): 'Ridge Reg'}

            for header, model_name, model, label in model_specs:
                print('**************')
                print(header)
                for role, profile in profiles:
                    print('profile_{}:'.format(role))
                    test_x, test_y = test_sets[role]
                    _report_profile(model, model_name, profile, test_x, test_y, test_plate, task_channel,
                                    label, mae_label_overrides.get((model_name, profile), label))
                print('**************')

# Main

In [13]:
# Run the evaluation on the csv files inside 'csvs/' (a path relative to the
# current working directory), using the StandardScaler-based ('Std') scaling.
main('csvs/','Std')

  0%|                                                                                            | 0/5 [00:00<?, ?it/s]

26569.csv
['mock' 'treated']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,

  0%|                                                                                            | 0/3 [00:00<?, ?it/s][A
 67%|████████████████████████████████████████████████████████                            | 2/3 [00:02<00:01,  1.36s/it][A
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:07<00:00,  2.46s/it][A


26569.csv
AGP:
**************
LR
profile_treated:
Linear Reg MSE: 457.5545027140529
Linear Reg MAE: 14.305113258584605
profile_mock:
Linear Reg MSE: 2476.9573170659146
Linear Reg MAE: 30.213916336917787
**************
**************
Ridge
profile_treated:
Ridge MSE: 0.45317444366924337
Ridge MAE: 0.44713301101426867
profile_mock:
Ridge MSE: 0.437617530234788
Ridge Reg MAE: 0.44325231013595967
**************
26572.csv
['treated' 'mock']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,

  0%|                                                                                            | 0/3 [00:00<?, ?it/s][A
 33%|████████████████████████████                                                        | 1/3 [00:00<00:01,  1.21it/s][A
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:05<00:00,  1.80s/it][A


26572.csv
AGP:
**************
LR
profile_treated:
Linear Reg MSE: 2367.7853895548697
Linear Reg MAE: 32.10843856464617
profile_mock:
Linear Reg MSE: 5200.131489226318
Linear Reg MAE: 41.47267429681776
**************
**************
Ridge
profile_treated:
Ridge MSE: 0.46677998910827534
Ridge MAE: 0.45219613802786995
profile_mock:
Ridge MSE: 0.4354205897632473
Ridge Reg MAE: 0.42750194377512973
**************
26574.csv
['treated' 'mock']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,

  0%|                                                                                            | 0/3 [00:00<?, ?it/s][A
 33%|████████████████████████████                                                        | 1/3 [00:00<00:01,  1.21it/s][A
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:03<00:00,  1.19s/it][A


26574.csv
AGP:
**************
LR
profile_treated:
Linear Reg MSE: 18921.28803602417
Linear Reg MAE: 78.50983396062955
profile_mock:
Linear Reg MSE: 8662.016610810257
Linear Reg MAE: 50.441703791957416
**************
**************
Ridge
profile_treated:
Ridge MSE: 0.5841386613913744
Ridge MAE: 0.5013085442182337
profile_mock:
Ridge MSE: 0.5566457016401685
Ridge Reg MAE: 0.49231320561794667
**************

 20%|████████████████▊                                                                   | 1/5 [00:32<02:11, 32.91s/it]


26569.csv
['mock' 'treated']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,

  0%|                                                                                            | 0/3 [00:00<?, ?it/s][A
 67%|████████████████████████████████████████████████████████                            | 2/3 [00:02<00:01,  1.42s/it][A
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:07<00:00,  2.58s/it][A


26569.csv
DNA:
**************
LR
profile_treated:
Linear Reg MSE: 181.68354577062047
Linear Reg MAE: 8.110421757825195
empty family Granularity
empty family Granularity
profile_mock:
Linear Reg MSE: 88.03292134917537
Linear Reg MAE: 5.58906772737639
empty family Granularity
empty family Granularity
**************
**************
Ridge
profile_treated:
Ridge MSE: 0.31360388297648745
Ridge MAE: 0.38064718215351956
empty family Granularity
empty family Granularity
profile_mock:
Ridge MSE: 0.3262464109494893
Ridge Reg MAE: 0.39254011838828745
empty family Granularity
empty family Granularity
**************
26572.csv
['treated' 'mock']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,

  0%|                                                                                            | 0/3 [00:00<?, ?it/s][A
 33%|████████████████████████████                                                        | 1/3 [00:00<00:01,  1.09it/s][A
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:05<00:00,  1.88s/it][A


26572.csv
DNA:
**************
LR
profile_treated:
Linear Reg MSE: 423.9469312889442
Linear Reg MAE: 11.763544570186749
empty family Granularity
empty family Granularity
profile_mock:
Linear Reg MSE: 917.7178755848632
Linear Reg MAE: 19.02286552684089
empty family Granularity
empty family Granularity
**************
**************
Ridge
profile_treated:
Ridge MSE: 0.3222962152773411
Ridge MAE: 0.3688397218491284
empty family Granularity
empty family Granularity
profile_mock:
Ridge MSE: 0.2852987010909896
Ridge Reg MAE: 0.3639604339685609
empty family Granularity
empty family Granularity
**************
26574.csv
['treated' 'mock']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,

  0%|                                                                                            | 0/3 [00:00<?, ?it/s][A
 33%|████████████████████████████                                                        | 1/3 [00:00<00:01,  1.22it/s][A
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:03<00:00,  1.18s/it][A


26574.csv
DNA:
**************
LR
profile_treated:
Linear Reg MSE: 1401.7725395114442
Linear Reg MAE: 22.707052294214503
empty family Granularity
empty family Granularity
profile_mock:
Linear Reg MSE: 431.4080353280184
Linear Reg MAE: 13.18756491042259
empty family Granularity
empty family Granularity
**************
**************
Ridge
profile_treated:
Ridge MSE: 0.371253446884824
Ridge MAE: 0.37948317851805363
empty family Granularity
empty family Granularity
profile_mock:
Ridge MSE: 0.39050671260780184
Ridge Reg MAE: 0.3838474738944274
empty family Granularity
empty family Granularity

 40%|█████████████████████████████████▌                                                  | 2/5 [01:06<01:38, 33.00s/it]


**************
26569.csv
['mock' 'treated']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,

  0%|                                                                                            | 0/3 [00:00<?, ?it/s][A
 67%|████████████████████████████████████████████████████████                            | 2/3 [00:02<00:01,  1.34s/it][A
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:07<00:00,  2.44s/it][A


26569.csv
ER:
**************
LR
profile_treated:
Linear Reg MSE: 113.01675227643328
Linear Reg MAE: 6.177664970734134
profile_mock:
Linear Reg MSE: 874.0856041620967
Linear Reg MAE: 15.754624472865824
**************
**************
Ridge
profile_treated:
Ridge MSE: 0.3248285450143897
Ridge MAE: 0.3873913147964881
profile_mock:
Ridge MSE: 0.335083365155424
Ridge Reg MAE: 0.3946509474999663
**************
26572.csv
['treated' 'mock']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,

  0%|                                                                                            | 0/3 [00:00<?, ?it/s][A
 33%|████████████████████████████                                                        | 1/3 [00:00<00:01,  1.26it/s][A
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:05<00:00,  1.78s/it][A


26572.csv
ER:
**************
LR
profile_treated:
Linear Reg MSE: 1025.4060406878457
Linear Reg MAE: 18.93645525402495
profile_mock:
Linear Reg MSE: 2984.474618570664
Linear Reg MAE: 30.639388730139075
**************
**************
Ridge
profile_treated:
Ridge MSE: 0.33638049309548074
Ridge MAE: 0.3920001947279246
profile_mock:
Ridge MSE: 0.31680624247619626
Ridge Reg MAE: 0.37961615175975577
**************
26574.csv
['treated' 'mock']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,

  0%|                                                                                            | 0/3 [00:00<?, ?it/s][A
 33%|████████████████████████████                                                        | 1/3 [00:00<00:01,  1.26it/s][A
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:03<00:00,  1.12s/it][A


26574.csv
ER:
**************
LR
profile_treated:
Linear Reg MSE: 9953.64347649267
Linear Reg MAE: 53.374198146126325
profile_mock:
Linear Reg MSE: 4072.8541101691244
Linear Reg MAE: 33.34543894496587
**************
**************
Ridge
profile_treated:
Ridge MSE: 0.40336376407394986
Ridge MAE: 0.4251076518427175


 60%|██████████████████████████████████████████████████▍                                 | 3/5 [01:38<01:05, 32.76s/it]

profile_mock:
Ridge MSE: 0.400516393992223
Ridge Reg MAE: 0.4279299027466207
**************
26569.csv
['mock' 'treated']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,

  0%|                                                                                            | 0/3 [00:00<?, ?it/s][A
 67%|████████████████████████████████████████████████████████                            | 2/3 [00:02<00:01,  1.28s/it][A
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:07<00:00,  2.39s/it][A


26569.csv
Mito:
**************
LR
profile_treated:
Linear Reg MSE: 94.29936054916266
Linear Reg MAE: 5.62415765988197
profile_mock:
Linear Reg MSE: 811.9607224659585
Linear Reg MAE: 15.475516160639541
**************
**************
Ridge
profile_treated:
Ridge MSE: 0.27489191306663385
Ridge MAE: 0.3263977857087278
profile_mock:
Ridge MSE: 0.27993478894772467
Ridge Reg MAE: 0.33144702065878345
**************
26572.csv
['treated' 'mock']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,

  0%|                                                                                            | 0/3 [00:00<?, ?it/s][A
 33%|████████████████████████████                                                        | 1/3 [00:00<00:01,  1.21it/s][A
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:05<00:00,  1.81s/it][A


26572.csv
Mito:
**************
LR
profile_treated:
Linear Reg MSE: 522.5938350543943
Linear Reg MAE: 13.426932859834373
profile_mock:
Linear Reg MSE: 1150.492301594082
Linear Reg MAE: 19.09876757654242
**************
**************
Ridge
profile_treated:
Ridge MSE: 0.2943901192139606
Ridge MAE: 0.3322987154077803
profile_mock:
Ridge MSE: 0.2353363744180137
Ridge Reg MAE: 0.3024385921614865
**************
26574.csv
['treated' 'mock']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,

  0%|                                                                                            | 0/3 [00:00<?, ?it/s][A
 33%|████████████████████████████                                                        | 1/3 [00:00<00:01,  1.24it/s][A
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:03<00:00,  1.12s/it][A


26574.csv
Mito:
**************
LR
profile_treated:
Linear Reg MSE: 10026.807270675898
Linear Reg MAE: 58.77708575895765
profile_mock:
Linear Reg MSE: 4276.684172263204
Linear Reg MAE: 36.68070335761251
**************
**************
Ridge
profile_treated:
Ridge MSE: 0.352312215522312
Ridge MAE: 0.37280099848087234


 80%|███████████████████████████████████████████████████████████████████▏                | 4/5 [02:10<00:32, 32.59s/it]

profile_mock:
Ridge MSE: 0.30153692456854525
Ridge Reg MAE: 0.3470106033481534
**************
26569.csv
['mock' 'treated']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,

  0%|                                                                                            | 0/3 [00:00<?, ?it/s][A
 67%|████████████████████████████████████████████████████████                            | 2/3 [00:02<00:01,  1.27s/it][A
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:07<00:00,  2.34s/it][A


26569.csv
RNA:
**************
LR
profile_treated:
Linear Reg MSE: 132.27407379708075
Linear Reg MAE: 7.471686339262549
profile_mock:
Linear Reg MSE: 1019.7313889863213
Linear Reg MAE: 18.80291032655262
**************
**************
Ridge
profile_treated:
Ridge MSE: 0.23259205759323603
Ridge MAE: 0.31197049242191943
profile_mock:
Ridge MSE: 0.2245144411535015
Ridge Reg MAE: 0.3059507236553596
**************
26572.csv
['treated' 'mock']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,

  0%|                                                                                            | 0/3 [00:00<?, ?it/s][A
 33%|████████████████████████████                                                        | 1/3 [00:00<00:01,  1.23it/s][A
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:05<00:00,  1.76s/it][A


26572.csv
RNA:
**************
LR
profile_treated:
Linear Reg MSE: 620.5627012166158
Linear Reg MAE: 15.145425152425867
profile_mock:
Linear Reg MSE: 1355.089948432224
Linear Reg MAE: 20.603812015889044
**************
**************
Ridge
profile_treated:
Ridge MSE: 0.2580117126984367
Ridge MAE: 0.32010712412886155
profile_mock:
Ridge MSE: 0.21119657534068637
Ridge Reg MAE: 0.2940328080726246
**************
26574.csv
['treated' 'mock']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,

  0%|                                                                                            | 0/3 [00:00<?, ?it/s][A
 33%|████████████████████████████                                                        | 1/3 [00:00<00:01,  1.23it/s][A
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:03<00:00,  1.12s/it][A


26574.csv
RNA:
**************
LR
profile_treated:
Linear Reg MSE: 10649.19998112121
Linear Reg MAE: 60.61676151120197
profile_mock:
Linear Reg MSE: 4427.175341625324
Linear Reg MAE: 37.07029807244756
**************
**************
Ridge
profile_treated:
Ridge MSE: 0.2937601310867454
Ridge MAE: 0.34450925950979927


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [02:42<00:00, 32.43s/it]

profile_mock:
Ridge MSE: 0.2703761291688972
Ridge Reg MAE: 0.33450063681822556
**************



