###### This notebook contains code for generating scripts for training the Epistatic Transformer model and hyperparameter search using Optuna

In [1]:
device = "cuda:0"

import pandas as pd
import os
import sys
sys.path.append('../model')
from utils import amino_acid_to_number, tokenize, Tee
from functions import get_A2N_list, tokenize, make_train_val_test_lists_rand, prepare_data
import pickle
from scipy.stats import pearsonr
import numpy as np
import matplotlib.pyplot as plt
source_path = "../"
sys.path.append(source_path + 'model')
from utils import amino_acid_to_number, tokenize, Tee
from functions import get_A2N_list, tokenize, make_train_val_test_lists_rand, prepare_data
from models import make_predictions, ProtDataset, Transformer_2k, LinearModel

import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import random

import seaborn as sns

In [2]:
def write_line(line, filename):
    if os.path.exists(filename):
        with open(filename, 'a') as file:
            file.write(line + '\n')
    else: 
        with open(filename, 'w') as file:
            file.write(line + '\n')      

In [3]:
def make_command(filename, data_name, prefix, train_percent, seed, 
                 train_list=None, val_list=None, test_list=None,
                iter2=200, iter4=200, iter8=200):
    write_line("python3 run_script-CLI.py\\", filename)
    write_line("--device cuda:0 \\", filename)
    write_line(f"--data_name {data_name} \\", filename)
    write_line(f"--prefix {prefix} \\", filename)
    write_line(f"--train_percent {train_percent}\\", filename)
    write_line("--fit_linear\\", filename)
    write_line(f"--seed {seed}\\", filename)
    if train_list is not None:
        write_line("--specify_train\\", filename)
        write_line(f"--train_list {train_list}\\", filename)
    if val_list is not None:
        write_line("--specify_val\\", filename)
        write_line(f"--val_list {val_list}\\", filename)
    if test_list is not None:
        write_line("--specify_test\\", filename)
        write_line(f"--test_list {test_list}\\", filename)
        
    write_line(f"--iter2 {iter2}\\", filename)
    write_line(f"--iter4 {iter4}\\", filename)
    write_line(f"--iter8 {iter8}", filename)    
        
    # with open(filename, 'r') as file:
    #     content = file.read()
    # with open(filename, 'w') as file:
    #     file.write(content[:-2])
    
    write_line("\n", filename)

In [4]:
# data_path = "../Data/Data_prepared/"
data_path = "/blue/juannanzhou/ProteinLLE/Data/Data_prepared/"

In [5]:
!rm ../Data/Data_prepared/train_lists/*
!rm ../run_scripts/command*

rm: cannot remove '../Data/Data_prepared/train_lists/*': No such file or directory
rm: cannot remove '../run_scripts/command*': No such file or directory


In [6]:
filename = "../run_scripts/" + "commands.txt"

### Random All proteins

In [42]:
filename = "../run_scripts/" + "random_train_commands.txt"

In [43]:
!rm {filename}

rm: cannot remove '../run_scripts/random_train_commands.txt': No such file or directory


In [44]:
with open("../Data/Data_prepared/Protein_set.txt", 'r') as file:
    content = file.read()
    
protein_list = content.split('\n')[: -1]  

In [45]:
prefix = "Random"
seed = 0
for data_name in protein_list:
    protein_name = data_name.split(".")[0]
    for train_percent in [20, 50, 80]:
        print(protein_name)
        make_command(filename, protein_name, prefix, train_percent, seed, train_list=None, val_list=None, 
                    iter2=200, iter4=200, iter8=200)

Faure2023_1_lenient
Faure2023_1_lenient
Faure2023_1_lenient
Faure2023_3_binding
Faure2023_3_binding
Faure2023_3_binding
Faure2023_3_abundance
Faure2023_3_abundance
Faure2023_3_abundance
Sinai2021
Sinai2021
Sinai2021
Chen2023
Chen2023
Chen2023
Somermeyer2022_cgreGFP
Somermeyer2022_cgreGFP
Somermeyer2022_cgreGFP
Somermeyer2022_ppluGFP
Somermeyer2022_ppluGFP
Somermeyer2022_ppluGFP
Somermeyer2022_amacGFP
Somermeyer2022_amacGFP
Somermeyer2022_amacGFP
Pokusaeva_2019_S2
Pokusaeva_2019_S2
Pokusaeva_2019_S2
Pokusaeva_2019_S5
Pokusaeva_2019_S5
Pokusaeva_2019_S5
Pokusaeva_2019_S12
Pokusaeva_2019_S12
Pokusaeva_2019_S12


### GFP

In [11]:
data_name = "Somermeyer2022_4GFP"
datafile = pd.read_csv("../Data/Data_prepared/Somermeyer2022_4GFP.csv")

#### Focal gene

In [15]:
data_name = "Somermeyer2022_4GFP"
datafile = pd.read_csv("../Data/Data_prepared/Somermeyer2022_4GFP.csv")

# datafile = datafile[datafile.hd < 20]

phenotypes, seqs, seqs1h = prepare_data(datafile)
n, L, AA_size = seqs1h.shape

seqs1hf = seqs1h.reshape(-1, AA_size*L)
seqs1hf = seqs1hf.to(device).float()

seqs_ex = seqs + AA_size*torch.tensor(range(L))
X = seqs_ex.to(device)
y = phenotypes.to(device)

In [16]:
# Train list for each gene, with small set of data from other genes
# 50% of all data for focal gene
nrep = 3
train_file_names = []
val_file_names = []
gene_names = []
for gene in datafile.gene.unique():
    print(gene)
    experiment = f"focal_{gene}"
    sub = np.where(datafile.gene == gene)[0]
    sub = list(sub)
    comp_list = list(set(range(len(datafile))).difference(sub))
    for i in range(nrep):
        # train_list_focal = random.sample(sub, int(.5*len(sub)))
        train_list_focal = random.sample(sub, int(.5*len(sub)))        
        train_list_supp = random.sample(comp_list, int(.1 * len(train_list_focal)))    
        train_list = train_list_supp + train_list_focal
        comp_list_focal = set(sub).difference(train_list)
        comp_list_focal = list(comp_list_focal)

        diff = L - seqs1hf[comp_list_focal].matmul(seqs1hf[train_list_focal].T)
        meandiff = diff.mean(1)
        meandiff = meandiff.cpu().detach().numpy()

        val_list = np.array(comp_list_focal)[meandiff > np.quantile(meandiff, .7)]
        val_list = list(val_list)
        print(len(val_list))
        
        train_file_name = data_path + "train_lists/" + f"{data_name}_" + experiment + "_train_list_rep_" + str(i) + ".pkl"
        val_file_name = data_path + "train_lists/" + f"{data_name}_" + experiment + "_val_list_rep_" + str(i) + ".pkl"

        train_file_names.append(train_file_name)
        val_file_names.append(val_file_name)
        gene_names.append(gene)

        with open(train_file_name, 'wb') as file:
            pickle.dump(train_list, file)
        with open(val_file_name, 'wb') as file:
            pickle.dump(val_list, file)

amacGFP
5325
5325
cgreGFP
3925
3925
ppluGFP
4835
4835
avGFP
8104
8103


In [18]:
for i in range(len(train_file_names)):
    prefix = gene_names[i] + "_focal"
    train_percent = 10000
    seed = i
    train_list = train_file_names[i]
    val_list = val_file_names[i]
    make_command(filename, data_name, prefix, train_percent, seed, train_list=train_list, val_list=val_list, 
                iter2=200, iter4=200, iter8=200)

In [19]:
train_file_name = "/blue/juannanzhou/ProteinLLE/Data/Data_prepared/train_lists/Somermeyer2022_4GFP_focal_ppluGFP_train_list_rep_0.pkl"
with open(train_file_name, 'rb') as file:
    train_list = pickle.load(file)

val_file_name = "/blue/juannanzhou/ProteinLLE/Data/Data_prepared/train_lists/Somermeyer2022_4GFP_focal_ppluGFP_val_list_rep_0.pkl"
with open(val_file_name, 'rb') as file:
    val_list = pickle.load(file)

datafile.iloc[train_list].gene.value_counts()

datafile.iloc[val_list].gene.value_counts()

ppluGFP    4835
Name: gene, dtype: int64

#### HD 2

In [12]:
# HD random training list
train_file_names = []
val_file_names = []
experiment = "HD2"

for i in range(2):
    train_list = random.sample(list(np.where(datafile.hd <= 2)[0]), 40000)
    val_list = random.sample(list(np.where(datafile.hd > 2)[0]), 10000)
    
    train_file_name = data_path + "train_lists/" + f"{data_name}_" + experiment + "_train_list_rep_" + str(i) + ".pkl"
    val_file_name = data_path + "train_lists/" + f"{data_name}_" + experiment + "_val_list_rep_" + str(i) + ".pkl"
    
    train_file_names.append(train_file_name)
    val_file_names.append(val_file_name)
    
    with open(train_file_name, 'wb') as file:
        pickle.dump(train_list, file)
    with open(val_file_name, 'wb') as file:
        pickle.dump(val_list, file)

In [1]:
train_percent = 10000
seed = 0

for i in range(len(train_file_names)):
    prefix = experiment_names[i]
    train_list = train_file_names[i]
    val_list = val_file_names[i]
    make_command(filename, data_name, prefix, train_percent, seed, train_list=train_list, val_list=val_list, 
                iter2=200, iter4=200, iter8=200)