# Imports

In this cell we are importing all relevant packages for our project

In [1]:
# connections and OS
import pandas as pd
import seaborn as sns
import os
import sqlite3
import csv

#utils (Pandas,numpy,tqdm)
import numpy as np
import pandas as pd
from tqdm import tqdm

#visualize 
import seaborn as sns

#preprocessing, metrices and splits 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,mean_absolute_error
from sklearn.preprocessing import StandardScaler,MinMaxScaler

#ML models:
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import LinearRegression, Ridge


#tensorflow layer, callbacks and layers
import tensorflow.keras
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Concatenate, Input
from tensorflow.keras.callbacks import ModelCheckpoint, Callback

# Constants and params

Change directory to project directory within the department cluster (SLURM) and define to constant variables


In [2]:
#change directory to project directory within the department cluster (SLURM)
os.chdir('../../storage/users/AmitAdirProject/Data/sqlite/')
CHANNELS = ["AGP","DNA","ER","Mito","RNA"]

# Preprocessing functions

In [3]:
def scale_data(df):
    """
    This function is scaling the data using two methods: STD and MinMax
    df: dataFrame     
    return: two scaled dataframes, the first one according to StandardScaler and the second according to MinMaxScaler
    """
    std_scalar = StandardScaler()
    minMax_scalar = MinMaxScaler()
    
    std_df = pd.DataFrame(std_scalar.fit_transform(df),columns=df.columns)
    minmax_df = pd.DataFrame(minMax_scalar.fit_transform(df),columns=df.columns)    
    std_df.fillna(0,inplace=True)
    minmax_df.fillna(0,inplace=True)
    
    return std_df,minmax_df

In [17]:
def append_cells_labels(filename):
    """    
    This function is responsible for creating label (Mock/Tretament) for cell resultion and create a corresponding csv file
    filename: the full file path    
    """
    cnx2 = sqlite3.connect(filename)
    df_image = pd.read_sql_query("SELECT ImageNumber,Image_Metadata_Well FROM Image", cnx2)
    table_name = filename[44:-7]
    df_well = pd.read_csv("//storage//users//AmitAdirProject//Data//mean_well_profiles//"+table_name+".csv")
    df_well = df_well[["Metadata_Well","Metadata_ASSAY_WELL_ROLE"]]
    print('Done reading Image sql')
    
    dict_treatment = {} 
    for i, row in df_image.iterrows():
        label = df_well.loc[df_well['Metadata_Well'] == row['Image_Metadata_Well'], 'Metadata_ASSAY_WELL_ROLE'].iloc[0]
        dict_treatment[row['ImageNumber']] = label
    
    
    dict_channel = {}
    # Create your connection.
    cnx = sqlite3.connect(filename)
    #df is the cell dataframe
    df = pd.read_sql_query("SELECT * FROM Cells", cnx)
    df['label'] = df['ImageNumber'].map(dict_treatment)
    df.to_csv("/storage/users/AmitAdirProject/Data/sqlite/"+table_name+".csv")
    print("Done with " + table_name)

Now, we will generate a csv for each plate with the corresponding mock/treated data

In [18]:
path_plates = "/storage/users/AmitAdirProject/Data/sqlite/"
for plate in tqdm(os.listdir(path_plates)):
    if plate.endswith(".sqlite"):
        append_cells_labels(path_plates+"/"+plate)

  0%|          | 0/3 [00:00<?, ?it/s]

Done reading Image sql
Done with 25740


 33%|███▎      | 1/3 [02:51<05:43, 171.75s/it]

Done reading Image sql


 67%|██████▋   | 2/3 [03:22<02:09, 129.41s/it]

Done with 26572
Done reading Image sql
Done with 25741


100%|██████████| 3/3 [06:08<00:00, 122.85s/it]


In [19]:
def split_channels_x_and_y(filename, task_channel):
    """
    This function is responsible for splitting five channels into four channels as train and the remaining channel to test
    filename: file path to the cell table from a single plate
    task_channel: the current channel that we aim to predict
    
    Notably: In order to avoid leakage we drop all 'correlation features
    return: seperated dataframes x_features and y_df. 
            x_features: contains all available features excluding the features related to 'task_channel' we aim to predict
            y_df: contains all available features related to 'task_channel' only
    """
    dict_channel = {}
    
    #df = pd.read_csv(filename+".csv")
    df = pd.read_csv(filename)
    df = df.set_index(['ImageNumber', 'ObjectNumber'])
    df.drop(['TableNumber'],inplace=True,axis=1)
    df.dropna(inplace=True)
    
    # Data Preperation
    general_featuers = df.iloc[:, 0:52]
    general_featuers['label'] = df['label']
    df = df.iloc[:, 52:]
    for channel in CHANNELS:
        dict_channel[channel] = df[[col for col in df.columns if channel in col]]

    ready_channel_features = []
    for feature_name in dict_channel:
        if feature_name != task_channel:
            curr_channel_features = dict_channel[feature_name]
            curr_channel_features = curr_channel_features[[col for col in curr_channel_features.columns if task_channel not in col]]
            ready_channel_features.append(curr_channel_features)
    x_features_df = ready_channel_features[0]    
    for i in range(1, len(ready_channel_features)):
        x_features_df = x_features_df.join(ready_channel_features[i], how='outer', lsuffix='_left',
                                           rsuffix='_right')
    x_features_df = general_featuers.join(x_features_df, how='outer', lsuffix='_left', rsuffix='_right')
    y_df = dict_channel[task_channel]
    corr_cols = [c for c in y_df.columns if 'correlation' not in c.lower()]
    y_df = y_df[corr_cols]
    return x_features_df, y_df

# Create Models

In the following three cells we are creating three ML models

In [28]:
def create_LR(df_train_X, df_train_Y):  
    """
    In this cell we are creating and training a linear regression model        
    df_train_X: contains all available features excluding the features related to 'task_channel' we aim to predict (train)
    df_train_Y: contains all available features related to 'task_channel' only for the train
    
    
    return: trained linear regression model
    """
    LR_model = LinearRegression()    
    LR_model.fit(df_train_X.values,df_train_Y.values)
    return LR_model
    
    

In [29]:
def create_Ridge(df_train_X, df_train_Y):
    """    
    In this cell we are creating and training a ridge regression model    
    
    
    df_train_X: contains all available features excluding the features related to 'task_channel' we aim to predict (train)
    df_train_Y: contains all available features related to 'task_channel' only for the train    
    
    return: trained linear regression model
    """
    Ridge_model = Ridge()    
    Ridge_model.fit(X=df_train_X.values,y=df_train_Y.values)    
    return Ridge_model

In [30]:
def create_model_dnn(task_channel,df_train_X, df_train_Y,test_plate):
    """    
    In this cell we are creating and training a multi layer perceptron (we refer to it as deep neural network, DNN) model
    
    task_channel: the current channel that we aim to predict
    df_train_X: contains all available features excluding the features related to 'task_channel' we aim to predict (train)
    df_train_Y: contains all available features related to 'task_channel' only for the train
    channel_task_x: contains all available features excluding the features related to 'task_channel' we aim to predict (test)
    channel_task_y: contains all available features related to 'task_channel' only for the test
    test_plate: the ID of a given plate. This information assist us while printing the results.
    
    return: trained linear regression model
    """
    # Stracture of the network#
    inputs = Input(shape=(df_train_X.shape[1],))
    dense1 = Dense(512,activation = 'relu')(inputs)
    dense2 = Dense(256,activation = 'relu')(dense1)
    dense3 = Dense(128,activation = 'relu')(dense2)    
    dense4 = Dense(100,activation = 'relu')(dense3)
    dense5 = Dense(50,activation = 'relu')(dense4)
    dense6 = Dense(25,activation = 'relu')(dense5)
    dense7 = Dense(10,activation = 'relu')(dense6)
    predictions = Dense(df_train_Y.shape[1],activation='sigmoid')(dense7)
    
    #model compiliation
    model = Model(inputs=inputs,outputs = predictions)
    model.compile(optimizer='adam',loss='mse')
    
    #model training    
    test_plate_number = test_plate[:5]
    filepath = os.path.join('../../Models/',f'{test_plate_number}_{task_channel}.h5')
    my_callbacks = [ModelCheckpoint(filepath, monitor='val_loss', verbose=0, save_best_only=False, save_weights_only=False, mode='auto', period=1)]
    model.fit(df_train_X,df_train_Y,epochs = 5,batch_size=1024*8,verbose=0,shuffle=True,validation_split=0.2,callbacks=my_callbacks)
    return model
    
    

In [31]:
#    print_results(test_plate_number, task_channel, "Overall", "DNN", "None", "MSE", str(mean_squared_error(model_pred,channel_task_y)))
def print_results(plate_number, channel, family, model, _type, metric, value):
    """
    This function is creating a csv named: 'results' that contains all of the models’ performance (e.g. MSE) for each plate and each family of attributes
    plate_number: ID of palte
    channel: The channel we aim to predict
    family: features united by their charactheristics (e.g., Granularity, Texture)
    model: the model name
    _type: scaling method (e.g., MinMax Scaler or StandardScaler)
    metric: MSE/MAE
    value: value of the metric error    
    """
    results_path = "/storage/users/AmitAdirProject/Data/Results/"
    files_list = os.listdir(results_path)
    if 'results.csv' not in files_list:
        file1 = open("/storage/users/AmitAdirProject/Data/Results/results.csv","a+")     
        file1.write("Plate,Channel,Family,Model,Type,Metric,Value \n")
        file1.write(plate_number+","+channel+","+family+","+model+","+_type+","+metric+","+value+"\n")
        file1.close()
    else:
        file1 = open("/storage/users/AmitAdirProject/Data/Results/results.csv","a+")
        file1.write(plate_number+","+channel+","+family+","+model+","+_type+","+metric+","+value+"\n")
        file1.close()

In [32]:
def get_family_MSE(test_plate_number, task_channel, model, _type, df, channel_task_y):    
    """
    This function is calculating the MSE/MAE measures for plates based on different models
    test_plate_number: ID of the examine plate
    task_channel: Channel we aim to predict
    model: model name
    _type: scaling method (e.g., MinMax Scaler or StandardScaler)
    df: prediction of any given ML model which aim to predict the channel_task_y
    channel_task_y: features corresponding to the 'task channel' (channel we aim to predict)    
    """
    Families = {'Granularity':[],
               'Intensity':[],
               'Location':[],
               'RadialDistribution':[],
               'Texture':[]}

    for name in (channel_task_y.columns):
        if '_Granularity' in name:
            Families['Granularity'].append(name)
        elif '_Intensity' in name:
            Families['Intensity'].append(name)
        elif '_Location' in name:
            Families['Location'].append(name)        
        elif '_RadialDistribution' in name:
            Families['RadialDistribution'].append(name)
        elif '_Texture' in name:
            Families['Texture'].append(name)
            
    for key in Families.keys():
        try:            
            print_results(test_plate_number, task_channel, key, model, _type, "MSE", str(mean_squared_error(df[Families[key]],channel_task_y[Families[key]])))
        except:
            if len(Families[key]) == 0:
                print('empty family {}'.format(key))
            else:
                print('problem in mse key')
            



In [33]:
def get_family_MAE(test_plate_number, task_channel, model, _type, df, channel_task_y):
    
    """
    This function is calculating the MSE/MAE measures for plates based on different models
    test_plate_number: ID of the examine plate
    task_channel: Channel we aim to predict
    model: model name
    _type: scaling method (e.g., MinMax Scaler or StandardScaler)
    df: prediction of any given ML model which aim to predict the channel_task_y
    channel_task_y: features corresponding to the 'task channel' (channel we aim to predict)    
    """
    
    Families = {'Granularity':[],
               'Intensity':[],
               'Location':[],
               'RadialDistribution':[],
               'Texture':[]}

    for name in (channel_task_y.columns):
        if '_Granularity' in name:
            Families['Granularity'].append(name)
        elif '_Intensity' in name:
            Families['Intensity'].append(name)
        elif '_Location' in name:
            Families['Location'].append(name)        
        elif '_RadialDistribution' in name:
            Families['RadialDistribution'].append(name)
        elif '_Texture' in name:
            Families['Texture'].append(name)
            
    for key in Families.keys():
        try:            
            print_results(test_plate_number, task_channel, key, model, _type, "MAE", str(mean_absolute_error(df[Families[key]],channel_task_y[Families[key]])))
        except:
            if len(Families[key]) == 0:
                print('empty family {}'.format(key))
            else:
                print('problem in mae key')
        

In [34]:
def main(path,scale_method):
    """
    This is the main function of the preprocessing steps.
    This function will iterate all over the sqlite files and do the following:
    1) prepate train + test files
    2) scale train + test files (x + y values separately)
    3) return: 
        task_channel -> string, reflect the relevant channel for test. For example, 'AGP'
        df_train_X -> DataFrame, (instances,features) for the train set
        df_train_Y -> DataFrame, (instances,labels) for the train set
        channel_task_x -> DataFrame, (instances,features) for the test set
        channel_task_y -> DataFrame, (instances,labels) for the test set
    """
    path_profiles = '/storage/users/AmitAdirProject/Data/mean_well_profiles/'
    csv_files= [_ for _ in os.listdir('/storage/users/AmitAdirProject/Data/sqlite/') if _.endswith(".csv")]    
    for task_channel in tqdm(CHANNELS):        
        # This is the current file that we will predict        
        for test_plate in csv_files:
            print(test_plate)
            if test_plate.endswith(".csv"):                
                channel_task_x, channel_task_y = split_channels_x_and_y(path + test_plate, task_channel)
                print(channel_task_x['label'].unique())
                
                channel_task_x_mock = channel_task_x[channel_task_x['label']=='mock']
                channel_task_x_treated = channel_task_x[channel_task_x['label']=='treated']
                
                channel_task_y_mock = channel_task_y.loc[channel_task_x_mock.index]
                channel_task_y_treated = channel_task_y.loc[channel_task_x_treated.index]
                
                channel_task_x_mock.drop(['label'],inplace=True,axis=1)
                channel_task_x_treated.drop(['label'],inplace=True,axis=1)
                
                
                std_df_treated_x ,min_max_df_treated_x = scale_data(channel_task_x_treated)
                std_df_treated_y ,min_max_df_treated_y = scale_data(channel_task_y_treated)
                std_df_mock_x ,min_max_df_mock_x = scale_data(channel_task_x_mock)
                std_df_mock_y ,min_max_df_mock_y = scale_data(channel_task_y_mock)

                
        # This is all other files X input
            list_x_df = []
            list_y_df = []
            
            
            for train_plate in tqdm(csv_files):
                if train_plate!=test_plate:
                    if train_plate.endswith(".csv"):
                        curr_x, curr_y = split_channels_x_and_y(path + train_plate, task_channel)
                        curr_x = curr_x[curr_x['label']=='mock']
                        curr_y = curr_y.loc[curr_x.index]                                              
                        curr_x.drop(['label'],inplace=True,axis=1)
                        
                        list_x_df.append(curr_x)                        
                        list_y_df.append(curr_y)
            
            df_train_X = pd.concat(list_x_df)
            df_train_Y = pd.concat(list_y_df)   
            
             # Scale for training set#
            std_df ,min_max_df = scale_data(df_train_X)            
            std_df_y ,min_max_df_y = scale_data(df_train_Y)
            
            #Scale for testing set - treated#
            std_df_channel_task_treated ,min_max_df_channel_task_treated = scale_data(channel_task_x_treated)
            std_df_y_test_treated ,min_max_df_y_test_treated = scale_data(channel_task_y_treated)
            
            #Scale for testing set - mock#
            std_df_channel_task_mock ,min_max_df_channel_task_mock = scale_data(channel_task_x_mock)
            std_df_y_test_mock ,min_max_df_y_test_mock = scale_data(channel_task_y_mock)   
            
            if scale_method == 'MinMax':
                #train set#
                df_train_X = min_max_df
                df_train_Y = min_max_df_y
                
                #treated #
                df_test_X_treated = min_max_df_channel_task_treated
                df_test_Y_treated = min_max_df_y_test_treated
                
                #mock#                
                df_test_X_mock = min_max_df_channel_task_mock
                df_test_Y_mock = min_max_df_y_test_mock
                
                
            elif scale_method == 'Std':
                #train set#
                df_train_X = std_df
                df_train_Y = std_df_y
                
                #treated #
                df_test_X_treated = std_df_channel_task_treated
                df_test_Y_treated = std_df_y_test_treated
                
                #mock#                
                df_test_X_mock = std_df_channel_task_mock
                df_test_Y_mock = std_df_y_test_mock
                
                
        # Model Creation - AVG MSE for each model:
            print(test_plate)
            print(task_channel+":")
            LR_model = create_LR(df_train_X, df_train_Y)
            Ridge_model = create_Ridge(df_train_X, df_train_Y)
            DNN_model = create_model_dnn(task_channel,df_train_X, df_train_Y,test_plate)
#             svr_model = create_SVR(task_channel,df_train_X, df_train_Y, channel_task_x, channel_task_y)
            #return task_channel,df_train_X, df_train_Y, channel_task_x, channel_task_y
    
            print('**************')
            print('LR')
            print('profile_treated:') 
            yhat_lr = pd.DataFrame(LR_model.predict(std_df_treated_x.values),columns=std_df_treated_y.columns)                           
            print('Linear Reg MSE: {}'.format(mean_squared_error(yhat_lr,std_df_treated_y.values)))  
            print('Linear Reg MAE: {}'.format(mean_absolute_error(yhat_lr,std_df_treated_y.values)))
            
            print_results(test_plate, task_channel, 'Overall', 'Linear Regression', 'Treated', 'MSE', str(mean_squared_error(yhat_lr,std_df_treated_y.values)))
            print_results(test_plate, task_channel, 'Overall', 'Linear Regression', 'Treated', 'MAE', str(mean_absolute_error(yhat_lr,std_df_treated_y.values)))
            
            get_family_MSE(test_plate, task_channel, "Linear Regression", "Treated", yhat_lr,std_df_treated_y)
            get_family_MAE(test_plate, task_channel, "Linear Regression", "Treated", yhat_lr,std_df_treated_y)
            
            #get_family_MSE(yhat_lr,std_df_treated_y)
            #get_family_MAE(yhat_lr,std_df_treated_y)
            
            print('profile_mock:')            
            yhat_lr = pd.DataFrame(LR_model.predict(std_df_mock_x.values),columns=std_df_mock_y.columns)   
            print('Linear Reg MSE: {}'.format(mean_squared_error(yhat_lr,std_df_mock_y.values)))  
            print('Linear Reg MAE: {}'.format(mean_absolute_error(yhat_lr,std_df_mock_y.values)))  
            
            print_results(test_plate, task_channel, 'Overall', 'Linear Regression', 'Mock', 'MSE', str(mean_squared_error(yhat_lr,std_df_mock_y.values)))
            print_results(test_plate, task_channel, 'Overall', 'Linear Regression', 'Mock', 'MAE', str(mean_absolute_error(yhat_lr,std_df_mock_y.values)))            
            #get_family_MSE(yhat_lr,std_df_mock_y)
            #get_family_MAE(yhat_lr,std_df_mock_y)
            get_family_MSE(test_plate, task_channel, "Linear Regression", "Mock", yhat_lr,std_df_mock_y)
            get_family_MAE(test_plate, task_channel, "Linear Regression", "Mock", yhat_lr,std_df_mock_y)
                          
            print('**************')
            
            print('**************')
            print('Ridge')
            print('profile_treated:') 
            yhat_ridge = pd.DataFrame(Ridge_model.predict(std_df_treated_x.values),columns=std_df_treated_y.columns)                           
            print('Ridge MSE: {}'.format(mean_squared_error(yhat_ridge,std_df_treated_y.values)))  
            print('Ridge MAE: {}'.format(mean_absolute_error(yhat_ridge,std_df_treated_y.values)))  
                          
            print_results(test_plate, task_channel, 'Overall', 'Ridge', 'Treated', 'MSE', str(mean_squared_error(yhat_ridge,std_df_treated_y.values)))
            print_results(test_plate, task_channel, 'Overall', 'Ridge', 'Treated', 'MAE', str(mean_absolute_error(yhat_ridge,std_df_treated_y.values)))
                    
                          
            get_family_MSE(test_plate, task_channel, "Ridge", "Treated", yhat_ridge,std_df_treated_y)
            get_family_MAE(test_plate, task_channel, "Ridge", "Treated", yhat_ridge,std_df_treated_y)
                          
            #get_family_MSE(yhat_lr,std_df_treated_y)
            #get_family_MAE(yhat_lr,std_df_treated_y)
            
            print('profile_mock:')            
            yhat_ridge = pd.DataFrame(Ridge_model.predict(std_df_mock_x.values),columns=std_df_mock_y.columns)   
            print('Ridge MSE: {}'.format(mean_squared_error(yhat_ridge,std_df_mock_y.values)))  
            print('Ridge Reg MAE: {}'.format(mean_absolute_error(yhat_ridge,std_df_mock_y.values)))  
            #get_family_MSE(yhat_ridge,std_df_mock_y)
            #get_family_MAE(yhat_ridge,std_df_mock_y)
            print_results(test_plate, task_channel, 'Overall', 'Ridge', 'Mock', 'MSE', str(mean_squared_error(yhat_ridge,std_df_mock_y.values)))
            print_results(test_plate, task_channel, 'Overall', 'Ridge', 'Mock', 'MAE', str(mean_absolute_error(yhat_ridge,std_df_mock_y.values)))
                    
                          
            get_family_MSE(test_plate, task_channel, "Ridge", "Mock", yhat_ridge,std_df_mock_y)
            get_family_MAE(test_plate, task_channel, "Ridge", "Mock", yhat_ridge,std_df_mock_y)
            print('**************')
            
            print('**************')
            print('DNN')
            print('profile_treated:') 
            yhat_DNN = pd.DataFrame(DNN_model.predict(std_df_treated_x.values),columns=std_df_treated_y.columns)                           
            print('DNN MSE: {}'.format(mean_squared_error(yhat_DNN,std_df_treated_y.values)))  
            print('DNN MAE: {}'.format(mean_absolute_error(yhat_DNN,std_df_treated_y.values)))  
            #get_family_MSE(yhat_DNN,std_df_treated_y)
            #get_family_MAE(yhat_DNN,std_df_treated_y)
            print_results(test_plate, task_channel, 'Overall', 'DNN', 'Treated', 'MSE', str(mean_squared_error(yhat_DNN,std_df_treated_y.values)))
            print_results(test_plate, task_channel, 'Overall', 'DNN', 'Treated', 'MAE', str(mean_absolute_error(yhat_DNN,std_df_treated_y.values)))
                    
                          
            get_family_MSE(test_plate, task_channel, "DNN", "Treated", yhat_DNN,std_df_treated_y)
            get_family_MAE(test_plate, task_channel, "DNN", "Treated", yhat_DNN,std_df_treated_y)
            
            print('profile_mock:')            
            yhat_DNN = pd.DataFrame(DNN_model.predict(std_df_mock_x.values),columns=std_df_mock_y.columns)   
            print('DNN MSE: {}'.format(mean_squared_error(yhat_DNN,std_df_mock_y.values)))  
            print('DNN MAE: {}'.format(mean_absolute_error(yhat_DNN,std_df_mock_y.values)))  
                          
                          
            print_results(test_plate, task_channel, 'Overall', 'DNN', 'Mock', 'MSE', str(mean_squared_error(yhat_DNN,std_df_mock_y.values)))
            print_results(test_plate, task_channel, 'Overall', 'DNN', 'Mock', 'MAE', str(mean_absolute_error(yhat_DNN,std_df_mock_y.values)))
                    
                          
            get_family_MSE(test_plate, task_channel, "DNN", "Mock", yhat_DNN,std_df_mock_y)
            get_family_MAE(test_plate, task_channel, "DNN", "Mock", yhat_DNN,std_df_mock_y)
            #get_family_MSE(yhat_DNN,std_df_mock_y)
            #get_family_MAE(yhat_DNN,std_df_mock_y)
            print('**************')

# Main

In [35]:
main('/storage/users/AmitAdirProject/Data/sqlite/','Std')


  0%|          | 0/5 [00:00<?, ?it/s][A

25740.csv
['treated' 'mock']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


  0%|          | 0/3 [00:00<?, ?it/s][A[A

 67%|██████▋   | 2/3 [00:02<00:01,  1.17s/it][A[A

100%|██████████| 3/3 [00:16<00:00,  5.55s/it][A[A


25740.csv
AGP:




**************
LR
profile_treated:
Linear Reg MSE: 283.274144796004
Linear Reg MAE: 11.452978850198235
profile_mock:
Linear Reg MSE: 12.02328905035015
Linear Reg MAE: 2.197586320243805
**************
**************
Ridge
profile_treated:
Ridge MSE: 0.48315951965045323
Ridge MAE: 0.45486520799422314
profile_mock:
Ridge MSE: 0.4616774645935883
Ridge Reg MAE: 0.450773522509407
**************
**************
DNN
profile_treated:
DNN MSE: 1.004427068168524
DNN MAE: 0.7163405812367527
profile_mock:
DNN MSE: 1.0047932857993072
DNN MAE: 0.7209210074244604
**************
26572.csv
['treated' 'mock']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


  0%|          | 0/3 [00:00<?, ?it/s][A[A

 33%|███▎      | 1/3 [00:14<00:28, 14.18s/it][A[A

100%|██████████| 3/3 [00:28<00:00,  9.40s/it][A[A


26572.csv
AGP:




**************
LR
profile_treated:
Linear Reg MSE: 1547.326967492584
Linear Reg MAE: 24.517394255027796
profile_mock:
Linear Reg MSE: 457.43191644666166
Linear Reg MAE: 13.382657295285147
**************
**************
Ridge
profile_treated:
Ridge MSE: 0.535214351261209
Ridge MAE: 0.4976070264395371
profile_mock:
Ridge MSE: 0.5149038091207776
Ridge Reg MAE: 0.4837232356920421
**************
**************
DNN
profile_treated:
DNN MSE: 0.9364946959423597
DNN MAE: 0.7015475285412931
profile_mock:
DNN MSE: 0.9359305790041638
DNN MAE: 0.7000625463701944
**************
25741.csv
['treated' 'mock']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


  0%|          | 0/3 [00:00<?, ?it/s][A[A

 33%|███▎      | 1/3 [00:14<00:29, 14.57s/it][A[A

100%|██████████| 3/3 [00:16<00:00,  5.64s/it][A[A


25741.csv
AGP:




**************
LR
profile_treated:
Linear Reg MSE: 361.31810856149946
Linear Reg MAE: 12.9372315468781
profile_mock:
Linear Reg MSE: 16.367040114455538
Linear Reg MAE: 2.6312581240805315
**************
**************
Ridge
profile_treated:
Ridge MSE: 0.5040439216167713
Ridge MAE: 0.4658486872565654
profile_mock:
Ridge MSE: 0.4616335757934757
Ridge Reg MAE: 0.4535976656595404
**************
**************
DNN
profile_treated:
DNN MSE: 0.9965343389048466
DNN MAE: 0.7097046600202886
profile_mock:
DNN MSE: 0.9952605874585776
DNN MAE: 0.7208458047577893



 20%|██        | 1/5 [02:58<11:54, 178.67s/it][A

**************
25740.csv
['treated' 'mock']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


  0%|          | 0/3 [00:00<?, ?it/s][A[A

 67%|██████▋   | 2/3 [00:02<00:01,  1.20s/it][A[A

100%|██████████| 3/3 [00:16<00:00,  5.66s/it][A[A


25740.csv
DNA:




**************
LR
profile_treated:
Linear Reg MSE: 52.65315516027113
Linear Reg MAE: 4.601240314873923
empty family Granularity
empty family Granularity
profile_mock:
Linear Reg MSE: 1.2707497964927836
Linear Reg MAE: 0.7443159953992123
empty family Granularity
empty family Granularity
**************
**************
Ridge
profile_treated:
Ridge MSE: 0.23861077873316688
Ridge MAE: 0.323464225176421
empty family Granularity
empty family Granularity
profile_mock:
Ridge MSE: 0.23505312950252297
Ridge Reg MAE: 0.31968266469931633
empty family Granularity
empty family Granularity
**************
**************
DNN
profile_treated:
DNN MSE: 0.9934409881064639
DNN MAE: 0.7682330228470875
empty family Granularity
empty family Granularity
profile_mock:
DNN MSE: 0.9977909988132532
DNN MAE: 0.7621417845414805
empty family Granularity
empty family Granularity
**************
26572.csv
['treated' 'mock']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


  0%|          | 0/3 [00:00<?, ?it/s][A[A

 33%|███▎      | 1/3 [00:15<00:30, 15.38s/it][A[A

100%|██████████| 3/3 [00:30<00:00, 10.08s/it][A[A


26572.csv
DNA:




**************
LR
profile_treated:
Linear Reg MSE: 289.2547192791121
Linear Reg MAE: 10.253658624373008
empty family Granularity
empty family Granularity
profile_mock:
Linear Reg MSE: 282.51313861732683
Linear Reg MAE: 10.37225695095983
empty family Granularity
empty family Granularity
**************
**************
Ridge
profile_treated:
Ridge MSE: 0.29906580926271
Ridge MAE: 0.3757093158387757
empty family Granularity
empty family Granularity
profile_mock:
Ridge MSE: 0.2889094118840918
Ridge Reg MAE: 0.3795598290820163
empty family Granularity
empty family Granularity
**************
**************
DNN
profile_treated:
DNN MSE: 0.8792424236746608
DNN MAE: 0.7118154671145503
empty family Granularity
empty family Granularity
profile_mock:
DNN MSE: 0.8794653117105659
DNN MAE: 0.7267295472441303
empty family Granularity
empty family Granularity
**************
25741.csv
['treated' 'mock']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


  0%|          | 0/3 [00:00<?, ?it/s][A[A

 33%|███▎      | 1/3 [00:14<00:28, 14.44s/it][A[A

100%|██████████| 3/3 [00:16<00:00,  5.62s/it][A[A


25741.csv
DNA:




**************
LR
profile_treated:
Linear Reg MSE: 63.13580097549375
Linear Reg MAE: 4.797561314528918
empty family Granularity
empty family Granularity
profile_mock:
Linear Reg MSE: 0.8046527686282413
Linear Reg MAE: 0.5837240576433379
empty family Granularity
empty family Granularity
**************
**************
Ridge
profile_treated:
Ridge MSE: 0.2924719943389991
Ridge MAE: 0.34782888921293237
empty family Granularity
empty family Granularity
profile_mock:
Ridge MSE: 0.24450468901813924
Ridge Reg MAE: 0.3281250775682175
empty family Granularity
empty family Granularity
**************
**************
DNN
profile_treated:
DNN MSE: 0.9869389143921282
DNN MAE: 0.7465565919109667
empty family Granularity
empty family Granularity
profile_mock:
DNN MSE: 0.9846496712658237
DNN MAE: 0.7620099138216492
empty family Granularity



 40%|████      | 2/5 [06:00<08:59, 179.72s/it][A

empty family Granularity
**************
25740.csv
['treated' 'mock']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


  0%|          | 0/3 [00:00<?, ?it/s][A[A

 67%|██████▋   | 2/3 [00:02<00:01,  1.16s/it][A[A

100%|██████████| 3/3 [00:16<00:00,  5.38s/it][A[A


25740.csv
ER:




**************
LR
profile_treated:
Linear Reg MSE: 107.47655348073008
Linear Reg MAE: 7.0172465856518835
profile_mock:
Linear Reg MSE: 17.46692237544921
Linear Reg MAE: 2.747418510334488
**************
**************
Ridge
profile_treated:
Ridge MSE: 0.2918910849655994
Ridge MAE: 0.3668786312365679
profile_mock:
Ridge MSE: 0.27870819927034945
Ridge Reg MAE: 0.35742704792772495
**************
**************
DNN
profile_treated:
DNN MSE: 1.0771211854252463
DNN MAE: 0.8258799753810963
profile_mock:
DNN MSE: 1.0761409473854264
DNN MAE: 0.8240053900797416
**************
26572.csv
['treated' 'mock']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


  0%|          | 0/3 [00:00<?, ?it/s][A[A

 33%|███▎      | 1/3 [00:14<00:28, 14.44s/it][A[A

100%|██████████| 3/3 [00:27<00:00,  9.33s/it][A[A


26572.csv
ER:




**************
LR
profile_treated:
Linear Reg MSE: 590.1942411585285
Linear Reg MAE: 14.514324071956871
profile_mock:
Linear Reg MSE: 247.44894526286373
Linear Reg MAE: 10.16459606669864
**************
**************
Ridge
profile_treated:
Ridge MSE: 0.32100648394251796
Ridge MAE: 0.38633932362018886
profile_mock:
Ridge MSE: 0.3236718398610685
Ridge Reg MAE: 0.3931691055472515
**************
**************
DNN
profile_treated:
DNN MSE: 0.8847509001105045
DNN MAE: 0.713235120197706
profile_mock:
DNN MSE: 0.8865388255506852
DNN MAE: 0.7270093333701177
**************
25741.csv
['treated' 'mock']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


  0%|          | 0/3 [00:00<?, ?it/s][A[A

 33%|███▎      | 1/3 [00:15<00:30, 15.21s/it][A[A

100%|██████████| 3/3 [00:17<00:00,  5.88s/it][A[A


25741.csv
ER:




**************
LR
profile_treated:
Linear Reg MSE: 138.48916318146595
Linear Reg MAE: 7.331486451940916
profile_mock:
Linear Reg MSE: 6.463804292357856
Linear Reg MAE: 1.464634234073988
**************
**************
Ridge
profile_treated:
Ridge MSE: 0.31185280735538506
Ridge MAE: 0.3775758886591511
profile_mock:
Ridge MSE: 0.2893928219545636
Ridge Reg MAE: 0.36618483127908413
**************
**************
DNN
profile_treated:
DNN MSE: 0.9960547213500528
DNN MAE: 0.7631128673207475
profile_mock:
DNN MSE: 0.9948321061487935
DNN MAE: 0.7700374994879197



 60%|██████    | 3/5 [09:03<06:01, 180.54s/it][A

**************
25740.csv
['treated' 'mock']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


  0%|          | 0/3 [00:00<?, ?it/s][A[A

 67%|██████▋   | 2/3 [00:02<00:01,  1.29s/it][A[A

100%|██████████| 3/3 [00:17<00:00,  5.73s/it][A[A


25740.csv
Mito:




**************
LR
profile_treated:
Linear Reg MSE: 247.15724096025235
Linear Reg MAE: 9.894752122516032
profile_mock:
Linear Reg MSE: 19.529694542122538
Linear Reg MAE: 2.8231880997996
**************
**************
Ridge
profile_treated:
Ridge MSE: 0.33662594116189715
Ridge MAE: 0.368056065204034
profile_mock:
Ridge MSE: 0.29397608453436247
Ridge Reg MAE: 0.3508415836269077
**************
**************
DNN
profile_treated:
DNN MSE: 0.9747360367622327
DNN MAE: 0.730093285765381
profile_mock:
DNN MSE: 0.9692222698370913
DNN MAE: 0.739422986466338
**************
26572.csv
['treated' 'mock']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


  0%|          | 0/3 [00:00<?, ?it/s][A[A

 33%|███▎      | 1/3 [00:14<00:28, 14.37s/it][A[A

100%|██████████| 3/3 [00:28<00:00,  9.44s/it][A[A


26572.csv
Mito:




**************
LR
profile_treated:
Linear Reg MSE: 1191.93274717944
Linear Reg MAE: 20.45706158698644
profile_mock:
Linear Reg MSE: 338.1206473634178
Linear Reg MAE: 11.848729465155147
**************
**************
Ridge
profile_treated:
Ridge MSE: 0.32780218098449304
Ridge MAE: 0.3626724851081243
profile_mock:
Ridge MSE: 0.2765365406014666
Ridge Reg MAE: 0.3377087993311659
**************
**************
DNN
profile_treated:
DNN MSE: 0.881364299696471
DNN MAE: 0.693294974297482
profile_mock:
DNN MSE: 0.8800320926500054
DNN MAE: 0.7025089564230831
**************
25741.csv
['treated' 'mock']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


  0%|          | 0/3 [00:00<?, ?it/s][A[A

 33%|███▎      | 1/3 [00:14<00:28, 14.31s/it][A[A

100%|██████████| 3/3 [00:16<00:00,  5.54s/it][A[A


25741.csv
Mito:




**************
LR
profile_treated:
Linear Reg MSE: 271.39152179023637
Linear Reg MAE: 11.3198949987863
profile_mock:
Linear Reg MSE: 16.072093227349324
Linear Reg MAE: 2.5954218261585797
**************
**************
Ridge
profile_treated:
Ridge MSE: 0.35031599828269216
Ridge MAE: 0.3731726535776036
profile_mock:
Ridge MSE: 0.3046198414921208
Ridge Reg MAE: 0.35739446189752727
**************
**************
DNN
profile_treated:
DNN MSE: 0.9484238048533322
DNN MAE: 0.7050218905564528
profile_mock:
DNN MSE: 0.9429893710432691
DNN MAE: 0.7270641658281628



 80%|████████  | 4/5 [12:09<03:02, 182.27s/it][A

**************
25740.csv
['treated' 'mock']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


  0%|          | 0/3 [00:00<?, ?it/s][A[A

 67%|██████▋   | 2/3 [00:02<00:01,  1.25s/it][A[A

100%|██████████| 3/3 [00:16<00:00,  5.65s/it][A[A


25740.csv
RNA:




**************
LR
profile_treated:
Linear Reg MSE: 160.0585927081043
Linear Reg MAE: 8.04305248386527
profile_mock:
Linear Reg MSE: 8.56180735600173
Linear Reg MAE: 1.8137212336455777
**************
**************
Ridge
profile_treated:
Ridge MSE: 0.25902687669206464
Ridge MAE: 0.33880490905994687
profile_mock:
Ridge MSE: 0.2670769310740674
Ridge Reg MAE: 0.3435825028908292
**************
**************
DNN
profile_treated:
DNN MSE: 0.961150421242561
DNN MAE: 0.7321216444499474
profile_mock:
DNN MSE: 0.9615631924584417
DNN MAE: 0.735081649047502
**************
26572.csv
['treated' 'mock']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


  0%|          | 0/3 [00:00<?, ?it/s][A[A

 33%|███▎      | 1/3 [00:15<00:30, 15.29s/it][A[A

100%|██████████| 3/3 [00:29<00:00,  9.95s/it][A[A


26572.csv
RNA:




**************
LR
profile_treated:
Linear Reg MSE: 573.5152743283792
Linear Reg MAE: 14.574814060520902
profile_mock:
Linear Reg MSE: 208.8341445563234
Linear Reg MAE: 8.626324133747312
**************
**************
Ridge
profile_treated:
Ridge MSE: 0.2621294733580515
Ridge MAE: 0.33329498697319127
profile_mock:
Ridge MSE: 0.23206181060190983
Ridge Reg MAE: 0.31884284038661703
**************
**************
DNN
profile_treated:
DNN MSE: 0.9100011442756106
DNN MAE: 0.7042098974685067
profile_mock:
DNN MSE: 0.9089159891199629
DNN MAE: 0.7105496149108913
**************
25741.csv
['treated' 'mock']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


  0%|          | 0/3 [00:00<?, ?it/s][A[A

 33%|███▎      | 1/3 [00:14<00:28, 14.23s/it][A[A

100%|██████████| 3/3 [00:16<00:00,  5.51s/it][A[A


25741.csv
RNA:




**************
LR
profile_treated:
Linear Reg MSE: 156.80927280628677
Linear Reg MAE: 7.720386587653934
profile_mock:
Linear Reg MSE: 5.925402175618691
Linear Reg MAE: 1.4475490259556165
**************
**************
Ridge
profile_treated:
Ridge MSE: 0.2745524546147444
Ridge MAE: 0.34528826990837796
profile_mock:
Ridge MSE: 0.26555754238899676
Ridge Reg MAE: 0.34375824189135074
**************
**************
DNN
profile_treated:
DNN MSE: 0.9716367433805546
DNN MAE: 0.7319892744089642
profile_mock:
DNN MSE: 0.9703365905963541
DNN MAE: 0.7437493222206973



100%|██████████| 5/5 [15:18<00:00, 183.68s/it][A

**************



