# Generation of Synthetic Promoter Library

# Introduction

This notebook generates a synthetic promoter library. The synthetic promoters are generated only within the sequence distance to the reference that has been experimentally sampled.

## System initiation

Loading all necessary libraries.

In [None]:
import os
import pandas as pd
import joblib
import pickle
import itertools
import random
import numpy as np
import time
import matplotlib.pyplot as plt

from ExpressionExpert_Functions import init_Exp2, Data_Src_Load, Sequence_Conserved_Adjusted, Insert_row_, ExpressionStrength_HeatMap, SequenceRandomizer_Parallel, Sequence_Ref_DiffSum, list_onehot, list_integer
%matplotlib inline

### Variable setting

We load the naming conventions from the configuration file (here 'config_EcolPtai.txt').

In [None]:
# Read the naming conventions and run parameters from the configuration file.
Name_Dict = init_Exp2('config_EcolPtai.txt')

# Plain text settings: figure format and the data-file stem that names the data folder.
Fig_Type = Name_Dict['Figure_Type']
File_Base = Name_Dict['Data_File'].partition('.')[0]
Data_Folder = 'data-{}'.format(File_Base)

# Numeric settings are stored as text in the configuration and converted here.
Synth_Seq_MaxNumber = int(Name_Dict['Synth_Seq_MaxNumber'])
Sequence_Distance_cutoff = float(Name_Dict['Sequence_Distance_cutoff'])

# Names of the measurement columns (indexed like a sequence further below).
# NOTE(review): eval() executes whatever the configuration entry contains;
# only use configuration files from a trusted source.
Y_Col_Name = eval(Name_Dict['Y_Col_Name'])


## Data loading

General information on the data source CSV file is stored in the 'config.txt' file generated in the '0-Workflow' notebook. The sequence and expression data are stored in a CSV file with an identifier in column 'ID' (not used for anything), the DNA sequence in column 'Sequence', and the expression strength in column 'promoter activity'. While loading, the sequence is converted to a label-encoded sequence (['A','C','G','T'] replaced by [0,1,2,3]) and to a one-hot encoding.

In [None]:
# Load the data set described by the configuration; SeqDat is reused by the later cells.
SeqDat = Data_Src_Load(Name_Dict)
# Display the first three rows as a quick check of what was loaded.
SeqDat.head(3)

## Setting of exploration boundaries

### Extraction of experimentally tested sequence positions

In [None]:
# Drop the non-informative positions (no base diversity); the entropy
# threshold is taken from the configuration file.
SeqDat_Hadj, Positions_removed, PSEntropy = Sequence_Conserved_Adjusted(
    SeqDat,
    Name_Dict,
    Entropy_cutoff=float(Name_Dict['Entropy_cutoff']),
)

# Only the first measurement column is used to find the sampled nucleotides.
idx = 0
Measure_Name = Y_Col_Name[idx]

# Expression table for the retained positions
# (presumably one row per position and one column per base -- verify in the helper).
Expr_avg = ExpressionStrength_HeatMap(SeqDat_Hadj, Measure_Name)

# One all-NaN row with four columns is handed to Insert_row_ for every removed position.
Removed_Fill = np.full([len(Positions_removed), 4], np.nan)
Expr_avg = Insert_row_(Positions_removed, Expr_avg, Removed_Fill)

# 1 where a value exists, 0 where the entry is NaN.
Seq_Pos_Sampled = Expr_avg.notnull().astype('int')


In [None]:
# First the positions for nucleotide exchanges are determined.
# Notes from the original analysis of the example data set:
# - The maximum sequence distance of experimentally tested sequences to the reference was four
#   nucleotides (cf. histogram of sequence distance in the statistical analysis notebook).
# - A minimum positional entropy of 0.15 bit is required to assume sufficient sampling for
#   reasonable prediction accuracy (cf. position entropy in the statistical analysis notebook).
# - The random forest learner assigned reasonable feature importance only to regions
#   [-35 - -30] and [-12 - -7], hence mutations are only assigned to positions
#   [-35, -34, -30, -10, -9, -8].
# - Number of nucleotides tested per position: [-35:4, -34:4, -30:3, -10:4, -9:3, -8:4].
# - In total 2304 combinations are possible; these are further constrained by the maximum
#   nucleotide distance to the reference sequence.

# Reference promoter sequence
# (value comparison: an identity test against a string literal is unreliable)
if Name_Dict['RefSeq'] != '':
    RefSeq = Name_Dict['RefSeq']
    # define the per-position list here too, so both branches provide the same names
    RefSeq_list = list(RefSeq)
    print('use reference sequence')
else:
    # summing the one-hot encodings over all sequences and taking the arg-max per
    # position yields the most common nucleotide on each position
    # (assumes SeqDat['Sequence'] holds position x base one-hot matrices -- TODO confirm)
    Alphabet = ['A','C','G','T']
    Pos_Nucl_Sum = np.sum(np.dstack(SeqDat['Sequence'].values), axis=2)
    RefSeq_list = [Alphabet[Pos_idx] for Pos_idx in np.argmax(Pos_Nucl_Sum, axis=1)]
    RefSeq = ''.join(RefSeq_list)
print('Reference sequence:', RefSeq)

# Positions that can be randomized because their entropy exceeds the threshold.
# They are expressed as negative indices, i.e. counted from the end of the reference.
Pos_random = -1*(len(RefSeq)-np.arange(PSEntropy.shape[0])[PSEntropy>float(Name_Dict['Entropy_cutoff'])])
Base_SequencePosition = Seq_Pos_Sampled.iloc[Pos_random]
print('Relevant positions and tested nucleotides')
print(Base_SequencePosition)

# Number of different sequences: product over all randomizable positions of the
# number of nucleotides tested there. Python integers are used on purpose because
# a fixed-width NumPy product silently overflows for many positions.
Comb_Numb = 1
for Base_Count in np.sum(Base_SequencePosition.values, axis=1):
    Comb_Numb *= int(Base_Count)
print('Overall number of possible sequences:', Comb_Numb)

# Store the reference sequence and the table of tested nucleotides in the data folder.
RefSeq_File = os.path.join(Data_Folder, 'RefSeq.txt')
NWT_File = os.path.join(Data_Folder, 'NucleotideWeightTable.pkl')
with open(RefSeq_File, 'w') as f:
    f.write(RefSeq)
Base_SequencePosition.to_pickle(NWT_File)

In [None]:
# If the space of possible sequences is smaller than the configured maximum, every
# sequence is generated. Otherwise random sequences within the exploratory space
# are drawn by SequenceRandomizer_Parallel.

if Comb_Numb < Synth_Seq_MaxNumber:
    Alphabet = ['A','C','G','T']
    # Allowed nucleotides per randomizable position: keep a base only where the sampling
    # table is non-zero. Building the lists in alphabet order (instead of via a set)
    # makes the order of the generated library reproducible between sessions.
    # (assumes the columns of Seq_Pos_Sampled are ordered A, C, G, T -- TODO confirm)
    Seq_Base = [
        [Nucleotide for Nucleotide, Sampled in zip(Alphabet, Position_Row) if Sampled != 0]
        for Position_Row in Seq_Pos_Sampled.iloc[Pos_random].values
    ]
    # Template of the reference with a '{}' placeholder on every randomizable position.
    # Pos_random holds negative indices; adding the reference length (rather than a
    # fixed 40) converts them for references of any length. A copy of the reference is
    # used so that re-running the cell does not see an already modified list.
    RefSeq_template = list(RefSeq)
    for index in Pos_random + len(RefSeq):
        RefSeq_template[index] = '{}'
    RefSeq_base = ''.join(RefSeq_template)
    # setting up the final promoter list: one sequence per combination of allowed bases
    Seq_Base_comb = [RefSeq_base.format(*Nucleotide_replace) for Nucleotide_replace in itertools.product(*Seq_Base)]

    print('generated sequences: ',len(Seq_Base_comb))
else:
    Seq_Base_comb = SequenceRandomizer_Parallel(RefSeq, Base_SequencePosition, n=Synth_Seq_MaxNumber)
    
print('Preliminary generation of {} sequences.'.format(len(Seq_Base_comb)))

### Measuring synthetic sequence distance to reference

#### Generating archetype reference
Unless an external reference sequence is configured, the reference is built from the most frequently tested nucleotide at each position.

In [None]:
# Reference for the distance calculation: either the externally configured sequence or
# the most common nucleotide on each position of the measured data.
# (value comparison: an identity test against a string literal is unreliable)
if Name_Dict['RefSeq'] != '':
    RefSeq = Name_Dict['RefSeq']
    print('use reference sequence')
else:
    # summing the one-hot encodings over all sequences and taking the arg-max per
    # position yields the most common nucleotide on each position
    # (assumes SeqDat['Sequence'] holds position x base one-hot matrices -- TODO confirm)
    Alphabet = ['A','C','G','T']
    Pos_Nucl_Sum = np.sum(np.dstack(SeqDat['Sequence'].values), axis=2)
    RefSeq_list = [Alphabet[Pos_idx] for Pos_idx in np.argmax(Pos_Nucl_Sum, axis=1)]
    RefSeq = ''.join(RefSeq_list)

print('Reference sequence:', RefSeq)

# The generated sequences are already complete promoters.
SynMatrix_full = Seq_Base_comb

# Nucleotide differences of the synthetic promoters relative to the reference promoter;
# the reference is placed first in the list handed to Sequence_Ref_DiffSum.
SeqDat_wRef = SynMatrix_full.copy()
SeqDat_wRef.insert(0, RefSeq)
# converted once to an array so that the boolean mask below can index it
# NOTE(review): the mask is applied to SynMatrix_full as well, so the helper is expected
# to return one distance per synthetic sequence -- confirm against Sequence_Ref_DiffSum
RefSeq_Dist = np.asarray(Sequence_Ref_DiffSum(SeqDat_wRef))
NearDist_Bool = RefSeq_Dist<Sequence_Distance_cutoff

# keep only sequences closer to the reference than the configured cutoff
RefSeq_NearDist = RefSeq_Dist[NearDist_Bool]
SynSeq_NearDist = np.array(SynMatrix_full)[NearDist_Bool]
SynSeq_ND_numb = SynSeq_NearDist.shape[0]
print('Number of sequences with less than {:.0f}% nucleotide changes to the reference: {}'.format(Sequence_Distance_cutoff*100, SynSeq_ND_numb))

# histogram of the retained distances, shown in percent
plt.hist(RefSeq_NearDist*100)
plt.xlabel('Sequence distance in %')
plt.ylabel('Occurence')
plt.tight_layout()
Fig_ID = Name_Dict['SampSeqDist_File']
SampSeqDist_File = os.path.join(Data_Folder, '{}_{}_{}.{}'.format(time.strftime('%Y%m%d'), File_Base, Fig_ID, Fig_Type))
# saving is disabled; enable the next line to write the figure to SampSeqDist_File
# plt.savefig(SampSeqDist_File, bbox_inches='tight', format=Fig_Type)
plt.show()

### Removing measured sequences from library

In [None]:
# Distinct promoter sequences that were measured experimentally.
Seq_Measured = np.unique(SeqDat['Sequence_letter-encrypted'].values)
print('Number of different promoter sequences measured: ', Seq_Measured.shape[0])

# Indices of library sequences that were already measured. The measured sequences are
# upper-cased before the comparison, as in the original lookup. A single membership test
# returns sorted integer indices; it also works when nothing matches (empty integer
# index array) and when the library contains a sequence more than once.
# Note from the original analysis: some sequences are measured repeatedly in the experiments.
Seq_Measured_Upper = [SeqMeas.upper() for SeqMeas in Seq_Measured]
MeasSeq_Idx = np.flatnonzero(np.isin(SynSeq_NearDist, Seq_Measured_Upper))
print('Library sequences already measured: ', MeasSeq_Idx)

# deletion of measured sequences
SynSeq_NearDist = np.delete(SynSeq_NearDist, MeasSeq_Idx)
print('Number of new sequences in exploratory region: ', SynSeq_NearDist.shape[0])

## Synthetic library expression strength

In [None]:
# Settings for the stored regressors and for the layout of the output table.
ML_Type = Name_Dict['ML_Regressor']
ML_Date = Name_Dict['ML_Date']
Sequence_column = Name_Dict['Sequence_column']
Measure_Numb = int(Name_Dict['Library_Expression'])
# NOTE(review): eval() executes whatever the configuration entries contain;
# only use configuration files from a trusted source.
Data_Standard = eval(Name_Dict['Data_Standard'])
AddFeat = eval(Name_Dict['Add_Feat'])[0]

# Table of the new synthetic sequences; the row index doubles as sequence identifier.
SynSeq_df = pd.DataFrame({Sequence_column: SynSeq_NearDist})
SynSeq_df[Name_Dict['ID_Col_Name']] = SynSeq_df.index
# GC content calculation: fraction of 'G' and 'C' characters per sequence
GCcont = [(Seq.count('G')+Seq.count('C'))/len(Seq) for Seq in SynSeq_df[Sequence_column]]
SynSeq_df[AddFeat] = GCcont

# number of sequences to predict (identical for every measurement)
n = len(SynSeq_NearDist)

for Meas_Idx in range(Measure_Numb):
    Target_Name = Y_Col_Name[Meas_Idx]
    Target_Tag = Target_Name.replace(' ','-')
    print('Prediction of', Target_Name)

    # loading the trained regressor
    # NOTE(review): joblib/pickle files can execute code on load; only load files you created.
    Regressor_File = os.path.join(Data_Folder, '{}_{}_{}_{}-Regressor.pkl'.format(ML_Date, File_Base, Target_Tag, ML_Type))
    ML_Best = joblib.load(Regressor_File)

    # loading data preparation parameters; the context manager closes the file handle
    Parameter_File = os.path.join(Data_Folder, '{}_{}_{}_{}-Params.pkl'.format(ML_Date, File_Base, Target_Tag, ML_Type))
    with open(Parameter_File, 'rb') as Parameter_Handle:
        Data_Prep_Params = pickle.load(Parameter_Handle)

    Positions_removed = Data_Prep_Params['Positions_removed']
    # if the data was standardized we load the corresponding scaler
    if Data_Standard:
        Scaler_DictName = '{}_Scaler'.format(Target_Name)
        Expr_Scaler = Data_Prep_Params[Scaler_DictName]

    # prediction of expression strength
    # one-hot encoded input with the stored non-informative positions removed, one row per sequence
    X_Test = np.array(list_onehot(np.delete(list_integer(SynSeq_NearDist),Positions_removed, axis=1))).reshape(n,-1)
    # adding the additional feature, here GC-content, as last column
    X_Test = np.append(X_Test,SynSeq_df[AddFeat].values.reshape(-1,1), axis=1)
    Y_Test = ML_Best.predict(X_Test)
    # if the data was standardized we inverse transform to get original activity
    # NOTE(review): scikit-learn scalers expect 2-D input -- confirm Expr_Scaler accepts
    # the prediction array as returned by the regressor
    if Data_Standard:
        Y_Test = Expr_Scaler.inverse_transform(Y_Test)
    SynSeq_df[Target_Name] = Y_Test

# Sort by the first measurement and export.
SynSeq_df = SynSeq_df.sort_values(by=Y_Col_Name[0])
Csv_ID = Name_Dict['Csv_ID']
# NOTE(review): the file is written to the working directory, not to Data_Folder -- confirm this is intended
SynCsv_File = '{}_{}_{}.csv'.format(time.strftime('%Y%m%d'), File_Base, Csv_ID)
SynSeq_df.to_csv(SynCsv_File, index=False)


### Generating new config file for analysis of synthetic library

In [None]:
# Write 'config_synth.txt': a copy of the current settings in which the data file
# points to the CSV of the synthetic library generated above.
Name_Dict['Data_File'] = SynCsv_File

# Header comment first, then one "key: value" line per setting.
Config_Lines = ['# This file contains the naming conventions for all output files. It is automatically generated when going through step "0-Workflow".']
Config_Lines.extend('{}: {}'.format(key, value) for key, value in Name_Dict.items())

with open('config_synth.txt', 'w') as f:
    # every line is terminated by a newline, exactly as print() would emit it
    f.write('\n'.join(Config_Lines) + '\n')