# Running MoCHI on DTS datasets - *all variants* where dead have been subsampled to 9% in each of the DTS datasets aside from DTS10 and DTS11 (which are all alive) 

# This version of analysis is **without** DTS02 library

1. Allowing for 1st order terms or 1st and 2nd order terms
2. Fitting the following functions: Sigmoid, ELU and Linear
3. `l2_regularization_factor` = 10^(-5)
4. Running a joint model (1) and separate per dataset models (8)

Using **all live variants + subsampled dead (9%)** from the files generated in notebook M0: 

`mochi_live_and_subsampled_dead_variants_DTS01_20240308.tsv` and so on for all the DTS datasets except DTS02 (01, 05, 10, 11, 13, 14 and 15)

12.04.2024

In [1]:
import pymochi
from pymochi.data import MochiData
from pymochi.models import MochiTask
from pymochi.project import MochiProject
from pymochi.report import MochiReport
import pandas as pd
import numpy as np
import pickle

In [2]:
%%bash

pip freeze

asttokens @ file:///home/conda/feedstock_root/build_artifacts/asttokens_1698341106958/work
certifi @ file:///home/conda/feedstock_root/build_artifacts/certifi_1707022139797/work/certifi
comm @ file:///home/conda/feedstock_root/build_artifacts/comm_1710320294760/work
cycler @ file:///home/conda/feedstock_root/build_artifacts/cycler_1696677705766/work
debugpy @ file:///home/conda/feedstock_root/build_artifacts/debugpy_1707444393922/work
decorator @ file:///home/conda/feedstock_root/build_artifacts/decorator_1641555617451/work
exceptiongroup @ file:///home/conda/feedstock_root/build_artifacts/exceptiongroup_1704921103267/work
executing @ file:///home/conda/feedstock_root/build_artifacts/executing_1698579936712/work
fonttools @ file:///home/conda/feedstock_root/build_artifacts/fonttools_1710865504921/work
importlib_metadata @ file:///home/conda/feedstock_root/build_artifacts/importlib-metadata_1709821103657/work
ipykernel @ file:///home/conda/feedstock_root/build_artifacts/ipykernel_170899

In [3]:
%%bash

#!/usr/bin/env

pwd


/lustre/scratch126/gengen/projects/amyloid_beta_epistasis/DTS_joint_analysis/selected_DTS_datasets_analysis


In [4]:
filedir = '/lustre/scratch126/gengen/projects/amyloid_beta_epistasis/DTS_joint_analysis/files/'

datasets = ['DTS01',
            #'DTS02',
            'DTS05','DTS10','DTS11','DTS13','DTS14','DTS15']

filenames = ['mochi_live_and_subsampled_dead_variants_' + dataset + '_20240308.tsv' for dataset in datasets]

output_dir = '/lustre/scratch126/gengen/projects/amyloid_beta_epistasis/DTS_joint_analysis/mochi_results/'

In [5]:
# For joint modelling making sure that the aa_seq is comlemented the full Ab sequence
# this was done in notebook M0

all_vars = {}

total_n_vars = 0

for i in range(len(datasets)):
    dataset = datasets[i]
    print(dataset)
    all_vars[dataset] = pd.read_csv(filedir + filenames[i], sep='\t')
    # check length of aa_seq, should be already complemented to full Ab sequence (42 aa)
    print(np.unique([len(elem) for elem in all_vars[dataset]['aa_seq']], return_counts=True), '\n')
    
    total_n_vars = total_n_vars + len(all_vars[dataset])

DTS01
(array([42]), array([9195])) 

DTS05
(array([42]), array([3058])) 

DTS10
(array([42]), array([47402])) 

DTS11
(array([42]), array([72768])) 

DTS13
(array([42]), array([11349])) 

DTS14
(array([42]), array([2105])) 

DTS15
(array([42]), array([36222])) 



In [6]:
# 213771 variants in total, 182099 without DTS02
total_n_vars

182099

In [7]:
# actual WT of Ab, artificially added to these tables in notebook M0
wt_vars = {}

for dataset in datasets:
    print(dataset)
    print(all_vars[dataset][all_vars[dataset]['WT'] == 1])
    wt_vars[dataset] = list(all_vars[dataset][all_vars[dataset]['WT'] == 1]['aa_seq'])[0]
    print('\n')

DTS01
                                          aa_seq  Nham_aa   WT  fitness  sigma
9194  DAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA        0  1.0      0.0  100.0


DTS05
                                          aa_seq  Nham_aa   WT  fitness  sigma
3057  DAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA        0  1.0      0.0  100.0


DTS10
                                           aa_seq  Nham_aa   WT  fitness  \
47401  DAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA        0  1.0      0.0   

       sigma  
47401  100.0  


DTS11
                                           aa_seq  Nham_aa   WT  fitness  \
72767  DAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA        0  1.0      0.0   

       sigma  
72767  100.0  


DTS13
                                           aa_seq  Nham_aa   WT  fitness  \
11348  DAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA        0  1.0      0.0   

       sigma  
11348  100.0  


DTS14
                                          aa_seq  Nham_aa   WT  fitness  sigma
2104  DAEF

In [8]:
wt_vars

{'DTS01': 'DAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA',
 'DTS05': 'DAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA',
 'DTS10': 'DAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA',
 'DTS11': 'DAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA',
 'DTS13': 'DAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA',
 'DTS14': 'DAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA',
 'DTS15': 'DAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA'}

In [9]:
# setting l2_regularization_factor to 10^(-5)
l2_regularization_factor_value = 0.00001

# Fitting a model with all DTS except DTS02

In [None]:
# started at 17:05 (12.04.2024)
# fold1 of model 1 began at X
# overall 2 models to fit (1 function fit x 2 N_max_interaction_order)
# with 64 Gb of RAM and 8 CPU cores
# using 80 Gb GPU RAM

k_folds = 10

transformations = ['Sigmoid', #'ELU', 'Linear'
                  ] 

N_max_interaction_order = [1, 2]

for n_max_interaction_order in N_max_interaction_order:
    print('N_max_interaction_order', n_max_interaction_order)
    
    for transformation in transformations:
        print('Transformation:', transformation)
        
        my_model_design = pd.DataFrame({
                   'phenotype': ['Nucleation_' + dataset for dataset in datasets],

                    # transformation can be one of: Linear, ReLU, SiLU, Sigmoid, SumOfSigmoids, 
                    #                               TwoStateFractionFolded, ThreeStateFractionFolded, FourStateFractionFolded

                   'transformation': [transformation for i in range(len(datasets))], 
                   'trait': ['Nucleation' for i in range(len(datasets))],
                   'file': [filedir + filenames[i] for i in range(len(datasets))]})

        mochi_project = MochiTask(
                   directory = output_dir + '20240412_max_terms_order_' + str(n_max_interaction_order) + '_' + transformation + '_l2_regularization_factor_' + str(l2_regularization_factor_value) + '_all_DTS_datasets_except_DTS02_all_live_variants_and_subsampled_dead',
                   data = MochiData(
                      model_design = my_model_design,
                      max_interaction_order = n_max_interaction_order,
                      k_folds = k_folds),
                l2_regularization_factor = l2_regularization_factor_value,

                # this was 200 before by default - now might be a better fit - can try different values in the future
            sos_architecture = [5])

        # started at X
        # takes about 6-7?? minutes
        mochi_project.grid_search() 

        for i in range(k_folds):
            mochi_project.fit_best(fold = i+1)

        temperature = 30

        mochi_report = MochiReport(
                   task = mochi_project, # changed from project = mochi_project, there must have been a change in agrument name
                   RT = (273+temperature)*0.001987)

        energies = mochi_project.get_additive_trait_weights(
                   RT = (273+temperature)*0.001987)

        mochi_project.save()
            
        #print('Done with l2_regularization_factor_value', l2_regularization_factor_value, '\n', '################')

        print('Done with transformation', transformation, '\n', '################')
        
    print('Done with n_max_interaction_order', n_max_interaction_order, '\n', '################')

N_max_interaction_order 1
Transformation: Sigmoid
Loading fitness data
One-hot encoding sequence features
One-hot encoding interaction features
Defining cross-validation groups
Defining coefficient groups
Done!
Performing grid search...
Fitting model:
{'fold': 1, 'seed': 1, 'grid_search': True, 'batch_size': 512, 'learn_rate': 0.05, 'num_epochs': 1000, 'num_epochs_grid': 100, 'l1_regularization_factor': 0.0, 'l2_regularization_factor': 1e-05, 'training_resample': True, 'early_stopping': True, 'scheduler_gamma': 0.98, 'scheduler_epochs': 10, 'loss_function_name': 'WeightedL1', 'sos_architecture': [5], 'sos_outputlinear': False}
Epoch 1; Avg_val_loss: 1.4102; WTcoef_1: 0.2014; WTres_1: 2.3650; WTres_2: -4.9663; WTres_3: 1.5430; WTres_4: -1.6084; WTres_5: 0.5444; WTres_6: -3.7805; WTres_7: -1.2248; 
Epoch 11; Avg_val_loss: 1.4394; WTcoef_1: 0.6947; WTres_1: 2.2444; WTres_2: -5.0312; WTres_3: 1.3471; WTres_4: -1.8414; WTres_5: 0.0964; WTres_6: -4.7414; WTres_7: -1.5298; 
Epoch 21; Avg_val_

# Model with only fully conservative libraries - DTS01, DTS05, DTS14

### This was done wrong because of how I iterated over filenames[i], I used ['DTS01','DTS05','DTS10'] instead of ['DTS01','DTS05','DTS14']

# Results from below are no longer used since I am rerunning mochi on all variants for these datasets (no subsampling)

In [10]:
datasets_fully_conservative = ['DTS01','DTS05','DTS14']

In [None]:
# started at X
# overall 2 models to fit (1 function fit x 2 N_max_interaction_order)
# with 64 Gb of RAM and 8 CPU cores
# using 80 Gb GPU RAM

k_folds = 10

transformations = ['Sigmoid', #'ELU', 'Linear'
                  ] 

N_max_interaction_order = [1, 2]

for n_max_interaction_order in N_max_interaction_order:
    print('N_max_interaction_order', n_max_interaction_order)
    
    for transformation in transformations:
        print('Transformation:', transformation)
        
        my_model_design = pd.DataFrame({
                   'phenotype': ['Nucleation_' + dataset for dataset in datasets_fully_conservative],

                    # transformation can be one of: Linear, ReLU, SiLU, Sigmoid, SumOfSigmoids, 
                    #                               TwoStateFractionFolded, ThreeStateFractionFolded, FourStateFractionFolded

                   'transformation': [transformation for i in range(len(datasets_fully_conservative))], 
                   'trait': ['Nucleation' for i in range(len(datasets_fully_conservative))],
            
            ######## the line below caused the bug, it iterated over i, not datasets --> selected DTS01, DTS05 and DTS10 [instead of DTS01, DTS05 and DTS14]
                   'file': [filedir + filenames[i] for i in range(len(datasets_fully_conservative))]}) 

        mochi_project = MochiTask(
                   directory = output_dir + '20240412_max_terms_order_' + str(n_max_interaction_order) + '_' + transformation + '_l2_regularization_factor_' + str(l2_regularization_factor_value) + '_DTS01_DTS05_DTS14_all_live_variants_and_subsampled_dead',
                   data = MochiData(
                      model_design = my_model_design,
                      max_interaction_order = n_max_interaction_order,
                      k_folds = k_folds),
                l2_regularization_factor = l2_regularization_factor_value,

                # this was 200 before by default - now might be a better fit - can try different values in the future
            sos_architecture = [5])

        # started at X
        # takes about 6-7?? minutes
        mochi_project.grid_search() 

        for i in range(k_folds):
            mochi_project.fit_best(fold = i+1)

        temperature = 30

        mochi_report = MochiReport(
                   task = mochi_project, # changed from project = mochi_project, there must have been a change in agrument name
                   RT = (273+temperature)*0.001987)

        energies = mochi_project.get_additive_trait_weights(
                   RT = (273+temperature)*0.001987)

        mochi_project.save()
            
        #print('Done with l2_regularization_factor_value', l2_regularization_factor_value, '\n', '################')

        print('Done with transformation', transformation, '\n', '################')
        
    print('Done with n_max_interaction_order', n_max_interaction_order, '\n', '################')

In [18]:
my_model_design = pd.DataFrame({
                   'phenotype': ['Nucleation_' + dataset for dataset in datasets_fully_conservative],

                    # transformation can be one of: Linear, ReLU, SiLU, Sigmoid, SumOfSigmoids, 
                    #                               TwoStateFractionFolded, ThreeStateFractionFolded, FourStateFractionFolded

                   'transformation': [transformation for i in range(len(datasets_fully_conservative))], 
                   'trait': ['Nucleation' for i in range(len(datasets_fully_conservative))],
                   'file': [filedir + filenames[i] for i in range(len(datasets_fully_conservative))]})


In [19]:
my_model_design

Unnamed: 0,phenotype,transformation,trait,file
0,Nucleation_DTS01,Sigmoid,Nucleation,/lustre/scratch126/gengen/projects/amyloid_bet...
1,Nucleation_DTS05,Sigmoid,Nucleation,/lustre/scratch126/gengen/projects/amyloid_bet...
2,Nucleation_DTS14,Sigmoid,Nucleation,/lustre/scratch126/gengen/projects/amyloid_bet...


In [20]:
my_model_design = pd.DataFrame({
                   'phenotype': ['Nucleation_' + dataset for dataset in datasets_fully_conservative],

                    # transformation can be one of: Linear, ReLU, SiLU, Sigmoid, SumOfSigmoids, 
                    #                               TwoStateFractionFolded, ThreeStateFractionFolded, FourStateFractionFolded

                   'transformation': [transformation for i in range(len(datasets_fully_conservative))], 
                   'trait': ['Nucleation' for i in range(len(datasets_fully_conservative))],
                   'file': [filedir + filenames[i] for i in range(len(datasets_fully_conservative))]})


In [25]:
my_model_design['file'][2]

'/lustre/scratch126/gengen/projects/amyloid_beta_epistasis/DTS_joint_analysis/files/mochi_live_and_subsampled_dead_variants_DTS10_20240308.tsv'

In [11]:
# just checking smth
# overall 2 models to fit (1 function fit x 2 N_max_interaction_order)
# with 64 Gb of RAM and 8 CPU cores
# using 80 Gb GPU RAM

k_folds = 10

transformations = ['Sigmoid', #'ELU', 'Linear'
                  ] 

N_max_interaction_order = [2, 1]

for n_max_interaction_order in N_max_interaction_order:
    print('N_max_interaction_order', n_max_interaction_order)
    
    for transformation in transformations:
        print('Transformation:', transformation)
        
        my_model_design = pd.DataFrame({
                   'phenotype': ['Nucleation_' + dataset for dataset in datasets_fully_conservative],

                    # transformation can be one of: Linear, ReLU, SiLU, Sigmoid, SumOfSigmoids, 
                    #                               TwoStateFractionFolded, ThreeStateFractionFolded, FourStateFractionFolded

                   'transformation': [transformation for i in range(len(datasets_fully_conservative))], 
                   'trait': ['Nucleation' for i in range(len(datasets_fully_conservative))],
                   'file': [filedir + filenames[i] for i in range(len(datasets_fully_conservative))]})

        mochi_project = MochiTask(
                   directory = output_dir + '20240412_copy_max_terms_order_' + str(n_max_interaction_order) + '_' + transformation + '_l2_regularization_factor_' + str(l2_regularization_factor_value) + '_DTS01_DTS05_DTS14_all_live_variants_and_subsampled_dead',
                   data = MochiData(
                      model_design = my_model_design,
                      max_interaction_order = n_max_interaction_order,
                      k_folds = k_folds),
                l2_regularization_factor = l2_regularization_factor_value,

                # this was 200 before by default - now might be a better fit - can try different values in the future
            sos_architecture = [5])

        # started at X
        # takes about 6-7?? minutes
        mochi_project.grid_search() 

        for i in range(k_folds):
            mochi_project.fit_best(fold = i+1)

        temperature = 30

        mochi_report = MochiReport(
                   task = mochi_project, # changed from project = mochi_project, there must have been a change in agrument name
                   RT = (273+temperature)*0.001987)

        energies = mochi_project.get_additive_trait_weights(
                   RT = (273+temperature)*0.001987)

        mochi_project.save()
            
        #print('Done with l2_regularization_factor_value', l2_regularization_factor_value, '\n', '################')

        print('Done with transformation', transformation, '\n', '################')
        
    print('Done with n_max_interaction_order', n_max_interaction_order, '\n', '################')

N_max_interaction_order 2
Transformation: Sigmoid
Loading fitness data
One-hot encoding sequence features
One-hot encoding interaction features
... Total theoretical features (order:count): 2:1000
... Total retained features (order:count): 2:1000 (100.0%)
Defining cross-validation groups
Defining coefficient groups


KeyboardInterrupt: 

# Lower priority: running MoCHI separately for each DTS dataset

This is good to have but not super urgent right now

In [11]:
filenames_full = [filedir + filename for filename in filenames]

In [12]:
filenames_full

['/lustre/scratch126/gengen/projects/amyloid_beta_epistasis/DTS_joint_analysis/files/mochi_live_and_subsampled_dead_variants_DTS01_20240308.tsv',
 '/lustre/scratch126/gengen/projects/amyloid_beta_epistasis/DTS_joint_analysis/files/mochi_live_and_subsampled_dead_variants_DTS05_20240308.tsv',
 '/lustre/scratch126/gengen/projects/amyloid_beta_epistasis/DTS_joint_analysis/files/mochi_live_and_subsampled_dead_variants_DTS10_20240308.tsv',
 '/lustre/scratch126/gengen/projects/amyloid_beta_epistasis/DTS_joint_analysis/files/mochi_live_and_subsampled_dead_variants_DTS11_20240308.tsv',
 '/lustre/scratch126/gengen/projects/amyloid_beta_epistasis/DTS_joint_analysis/files/mochi_live_and_subsampled_dead_variants_DTS13_20240308.tsv',
 '/lustre/scratch126/gengen/projects/amyloid_beta_epistasis/DTS_joint_analysis/files/mochi_live_and_subsampled_dead_variants_DTS14_20240308.tsv',
 '/lustre/scratch126/gengen/projects/amyloid_beta_epistasis/DTS_joint_analysis/files/mochi_live_and_subsampled_dead_variants

In [13]:
datasets # i need indices 0, 1 and 5

['DTS01', 'DTS05', 'DTS10', 'DTS11', 'DTS13', 'DTS14', 'DTS15']

In [14]:
filenames_full[0].split('/')[-1].split('.')[0]

'mochi_live_and_subsampled_dead_variants_DTS01_20240308'

In [None]:
# started at 14:01 on 26.02.2024
# overall 48 models to fit (3 function fits x 2 N_max_interaction_order x 8 datasets)
# with 32 Gb of RAM and 8 CPU cores
# using 80 Gb GPU RAM

k_folds = 10

transformations = ['Sigmoid', 'ELU', 'Linear'] 

N_max_interaction_order = [1, 2]


for n_max_interaction_order in N_max_interaction_order:
    print('N_max_interaction_order', n_max_interaction_order)
    
    for transformation in transformations:
        print('Transformation:', transformation)
        
        for i in range(len(datasets)):
            curr_file_path = filenames_full[i]
            curr_file_name = curr_file_path.split('/')[-1].split('.')[0]
            curr_dataset = datasets[i]
            
            print('File', curr_file_path)
        
            my_model_design = pd.DataFrame({
                       'phenotype': ['Nucleation_' + curr_dataset],

                        # transformation can be one of: Linear, ReLU, SiLU, Sigmoid, SumOfSigmoids, 
                        #                               TwoStateFractionFolded, ThreeStateFractionFolded, FourStateFractionFolded

                       'transformation': [transformation], 
                       'trait': ['Nucleation'],
                       'file': [curr_file_path]})

            mochi_project = MochiTask(
                       directory = output_dir + '20240226_max_terms_order_' + str(n_max_interaction_order) + '_' + transformation + '_l2_regularization_factor_' + str(l2_regularization_factor_value) + '_' + curr_dataset + '_all_variants',
                       data = MochiData(
                          model_design = my_model_design,
                          max_interaction_order = n_max_interaction_order,
                          k_folds = k_folds),
                    l2_regularization_factor = l2_regularization_factor_value,

                    # this was 200 before by default - now might be a better fit - can try different values in the future
                sos_architecture = [5])

            # started at X
            # takes about 6-7?? minutes
            mochi_project.grid_search() 

            for i in range(k_folds):
                mochi_project.fit_best(fold = i+1)

            temperature = 30

            mochi_report = MochiReport(
                       task = mochi_project, # changed from project = mochi_project, there must have been a change in agrument name
                       RT = (273+temperature)*0.001987)

            energies = mochi_project.get_additive_trait_weights(
                       RT = (273+temperature)*0.001987)

            mochi_project.save()

            print('Done with file', l2_regularization_factor_value, '\n', '################')

        print('Done with transformation', transformation, '\n', '################')
        
    print('Done with n_max_interaction_order', n_max_interaction_order, '\n', '################')

N_max_interaction_order 1
Transformation: Sigmoid
File /lustre/scratch126/gengen/projects/amyloid_beta_epistasis/DTS_joint_analysis/files/mochi_all_variants_DTS01_20240220.tsv
Loading fitness data
One-hot encoding sequence features
One-hot encoding interaction features
Defining cross-validation groups
Defining coefficient groups
Done!
Performing grid search...
Fitting model:
{'fold': 1, 'seed': 1, 'grid_search': True, 'batch_size': 512, 'learn_rate': 0.05, 'num_epochs': 1000, 'num_epochs_grid': 100, 'l1_regularization_factor': 0.0, 'l2_regularization_factor': 1e-05, 'training_resample': True, 'early_stopping': True, 'scheduler_gamma': 0.98, 'scheduler_epochs': 10, 'loss_function_name': 'WeightedL1', 'sos_architecture': [5], 'sos_outputlinear': False}
Epoch 1; Avg_val_loss: 1.2313; WTcoef_1: 0.1330; WTres_1: 4.7096; 
Epoch 11; Avg_val_loss: 1.1108; WTcoef_1: 1.3171; WTres_1: 1.5341; 
Epoch 21; Avg_val_loss: 1.1155; WTcoef_1: 1.7369; WTres_1: 1.3428; 
Epoch 31; Avg_val_loss: 1.1019; WTco

In [15]:
# just checking smth
# overall 48 models to fit (3 function fits x 2 N_max_interaction_order x 8 datasets)
# with 32 Gb of RAM and 8 CPU cores
# using 80 Gb GPU RAM

k_folds = 10

transformations = ['Sigmoid', 'ELU', 'Linear'] 

N_max_interaction_order = [2, 1]


for n_max_interaction_order in N_max_interaction_order:
    print('N_max_interaction_order', n_max_interaction_order)
    
    for transformation in transformations:
        print('Transformation:', transformation)
        
        for i in [0]:
            curr_file_path = filenames_full[i]
            curr_file_name = curr_file_path.split('/')[-1].split('.')[0]
            curr_dataset = datasets[i]
            
            print('File', curr_file_path)
        
            my_model_design = pd.DataFrame({
                       'phenotype': ['Nucleation_' + curr_dataset],

                        # transformation can be one of: Linear, ReLU, SiLU, Sigmoid, SumOfSigmoids, 
                        #                               TwoStateFractionFolded, ThreeStateFractionFolded, FourStateFractionFolded

                       'transformation': [transformation], 
                       'trait': ['Nucleation'],
                       'file': [curr_file_path]})

            mochi_project = MochiTask(
                       directory = output_dir + '20240226_copy_max_terms_order_' + str(n_max_interaction_order) + '_' + transformation + '_l2_regularization_factor_' + str(l2_regularization_factor_value) + '_' + curr_dataset + '_all_variants',
                       data = MochiData(
                          model_design = my_model_design,
                          max_interaction_order = n_max_interaction_order,
                          k_folds = k_folds),
                    l2_regularization_factor = l2_regularization_factor_value,

                    # this was 200 before by default - now might be a better fit - can try different values in the future
                sos_architecture = [5])

            # started at X
            # takes about 6-7?? minutes
            mochi_project.grid_search() 

            for i in range(k_folds):
                mochi_project.fit_best(fold = i+1)

            temperature = 30

            mochi_report = MochiReport(
                       task = mochi_project, # changed from project = mochi_project, there must have been a change in agrument name
                       RT = (273+temperature)*0.001987)

            energies = mochi_project.get_additive_trait_weights(
                       RT = (273+temperature)*0.001987)

            mochi_project.save()

            print('Done with file', l2_regularization_factor_value, '\n', '################')

        print('Done with transformation', transformation, '\n', '################')
        
    print('Done with n_max_interaction_order', n_max_interaction_order, '\n', '################')

N_max_interaction_order 2
Transformation: Sigmoid
File /lustre/scratch126/gengen/projects/amyloid_beta_epistasis/DTS_joint_analysis/files/mochi_live_and_subsampled_dead_variants_DTS01_20240308.tsv
Loading fitness data
One-hot encoding sequence features
One-hot encoding interaction features
... Total theoretical features (order:count): 2:448
... Total retained features (order:count): 2:448 (100.0%)
Defining cross-validation groups
Defining coefficient groups
Done!
Performing grid search...


KeyboardInterrupt: 

In [16]:
# just checking smth
# overall 48 models to fit (3 function fits x 2 N_max_interaction_order x 8 datasets)
# with 32 Gb of RAM and 8 CPU cores
# using 80 Gb GPU RAM

k_folds = 10

transformations = ['Sigmoid', 'ELU', 'Linear'] 

N_max_interaction_order = [2, 1]


for n_max_interaction_order in N_max_interaction_order:
    print('N_max_interaction_order', n_max_interaction_order)
    
    for transformation in transformations:
        print('Transformation:', transformation)
        
        for i in [1]:
            curr_file_path = filenames_full[i]
            curr_file_name = curr_file_path.split('/')[-1].split('.')[0]
            curr_dataset = datasets[i]
            
            print('File', curr_file_path)
        
            my_model_design = pd.DataFrame({
                       'phenotype': ['Nucleation_' + curr_dataset],

                        # transformation can be one of: Linear, ReLU, SiLU, Sigmoid, SumOfSigmoids, 
                        #                               TwoStateFractionFolded, ThreeStateFractionFolded, FourStateFractionFolded

                       'transformation': [transformation], 
                       'trait': ['Nucleation'],
                       'file': [curr_file_path]})

            mochi_project = MochiTask(
                       directory = output_dir + '20240226_copy_max_terms_order_' + str(n_max_interaction_order) + '_' + transformation + '_l2_regularization_factor_' + str(l2_regularization_factor_value) + '_' + curr_dataset + '_all_variants',
                       data = MochiData(
                          model_design = my_model_design,
                          max_interaction_order = n_max_interaction_order,
                          k_folds = k_folds),
                    l2_regularization_factor = l2_regularization_factor_value,

                    # this was 200 before by default - now might be a better fit - can try different values in the future
                sos_architecture = [5])

            # started at X
            # takes about 6-7?? minutes
            mochi_project.grid_search() 

            for i in range(k_folds):
                mochi_project.fit_best(fold = i+1)

            temperature = 30

            mochi_report = MochiReport(
                       task = mochi_project, # changed from project = mochi_project, there must have been a change in agrument name
                       RT = (273+temperature)*0.001987)

            energies = mochi_project.get_additive_trait_weights(
                       RT = (273+temperature)*0.001987)

            mochi_project.save()

            print('Done with file', l2_regularization_factor_value, '\n', '################')

        print('Done with transformation', transformation, '\n', '################')
        
    print('Done with n_max_interaction_order', n_max_interaction_order, '\n', '################')

N_max_interaction_order 2
Transformation: Sigmoid
File /lustre/scratch126/gengen/projects/amyloid_beta_epistasis/DTS_joint_analysis/files/mochi_live_and_subsampled_dead_variants_DTS05_20240308.tsv
Loading fitness data
One-hot encoding sequence features
One-hot encoding interaction features
... Total theoretical features (order:count): 2:240
... Total retained features (order:count): 2:240 (100.0%)
Defining cross-validation groups
Defining coefficient groups
Done!
Performing grid search...
Fitting model:
{'fold': 1, 'seed': 1, 'grid_search': True, 'batch_size': 512, 'learn_rate': 0.05, 'num_epochs': 1000, 'num_epochs_grid': 100, 'l1_regularization_factor': 0.0, 'l2_regularization_factor': 1e-05, 'training_resample': True, 'early_stopping': True, 'scheduler_gamma': 0.98, 'scheduler_epochs': 10, 'loss_function_name': 'WeightedL1', 'sos_architecture': [5], 'sos_outputlinear': False}
Epoch 1; Avg_val_loss: 2.5757; WTcoef_1: 0.0727; WTres_1: -4.4592; 
Epoch 11; Avg_val_loss: 1.4733; WTcoef_1

KeyboardInterrupt: 

In [17]:
# just checking smth
# overall 48 models to fit (3 function fits x 2 N_max_interaction_order x 8 datasets)
# with 32 Gb of RAM and 8 CPU cores
# using 80 Gb GPU RAM

k_folds = 10

transformations = ['Sigmoid', 'ELU', 'Linear'] 

N_max_interaction_order = [2, 1]


for n_max_interaction_order in N_max_interaction_order:
    print('N_max_interaction_order', n_max_interaction_order)
    
    for transformation in transformations:
        print('Transformation:', transformation)
        
        for i in [5]:
            curr_file_path = filenames_full[i]
            curr_file_name = curr_file_path.split('/')[-1].split('.')[0]
            curr_dataset = datasets[i]
            
            print('File', curr_file_path)
        
            my_model_design = pd.DataFrame({
                       'phenotype': ['Nucleation_' + curr_dataset],

                        # transformation can be one of: Linear, ReLU, SiLU, Sigmoid, SumOfSigmoids, 
                        #                               TwoStateFractionFolded, ThreeStateFractionFolded, FourStateFractionFolded

                       'transformation': [transformation], 
                       'trait': ['Nucleation'],
                       'file': [curr_file_path]})

            mochi_project = MochiTask(
                       directory = output_dir + '20240226_copy_max_terms_order_' + str(n_max_interaction_order) + '_' + transformation + '_l2_regularization_factor_' + str(l2_regularization_factor_value) + '_' + curr_dataset + '_all_variants',
                       data = MochiData(
                          model_design = my_model_design,
                          max_interaction_order = n_max_interaction_order,
                          k_folds = k_folds),
                    l2_regularization_factor = l2_regularization_factor_value,

                    # this was 200 before by default - now might be a better fit - can try different values in the future
                sos_architecture = [5])

            # started at X
            # takes about 6-7?? minutes
            mochi_project.grid_search() 

            for i in range(k_folds):
                mochi_project.fit_best(fold = i+1)

            temperature = 30

            mochi_report = MochiReport(
                       task = mochi_project, # changed from project = mochi_project, there must have been a change in agrument name
                       RT = (273+temperature)*0.001987)

            energies = mochi_project.get_additive_trait_weights(
                       RT = (273+temperature)*0.001987)

            mochi_project.save()

            print('Done with file', l2_regularization_factor_value, '\n', '################')

        print('Done with transformation', transformation, '\n', '################')
        
    print('Done with n_max_interaction_order', n_max_interaction_order, '\n', '################')

N_max_interaction_order 2
Transformation: Sigmoid
File /lustre/scratch126/gengen/projects/amyloid_beta_epistasis/DTS_joint_analysis/files/mochi_live_and_subsampled_dead_variants_DTS14_20240308.tsv
Loading fitness data
One-hot encoding sequence features
One-hot encoding interaction features
... Total theoretical features (order:count): 2:176
... Total retained features (order:count): 2:176 (100.0%)
Defining cross-validation groups
Defining coefficient groups
Done!
Performing grid search...
Fitting model:
{'fold': 1, 'seed': 1, 'grid_search': True, 'batch_size': 512, 'learn_rate': 0.05, 'num_epochs': 1000, 'num_epochs_grid': 100, 'l1_regularization_factor': 0.0, 'l2_regularization_factor': 1e-05, 'training_resample': True, 'early_stopping': True, 'scheduler_gamma': 0.98, 'scheduler_epochs': 10, 'loss_function_name': 'WeightedL1', 'sos_architecture': [5], 'sos_outputlinear': False}
Epoch 1; Avg_val_loss: 3.5256; WTcoef_1: 0.0341; WTres_1: -3.5643; 
Epoch 11; Avg_val_loss: 2.0552; WTcoef_1

KeyboardInterrupt: 