The intention of this notebook is to ***sample/create the sequence data***  for the  T1626 dataset provided by Merck&Co. 

##Notebook Setup##

In [0]:
#Imports:
import os
import numpy as np
import pandas as pd
import time
import requests
import json
from collections import defaultdict

In [0]:
#Connect to google drive:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
#Set up file paths:
data_folder_loc = "gdrive/My Drive/iGEM/Databases/Merck&Co/Data/Source/"
merck_and_co_excel_loc = data_folder_loc + "merck_and_co.xls"
target_data_folder_loc = "gdrive/My Drive/iGEM/Databases/Merck&Co/Data/T1626/sequence_sampler_T1626/"
sequence_dict_loc = target_data_folder_loc+"sequence_dict_T1626.json"
sequence_dict_ext_loc = target_data_folder_loc+"sequence_dict_ext_T1626.json"
print(os.path.isdir(data_folder_loc))
print(os.path.isdir(target_data_folder_loc))
print(os.path.isfile(merck_and_co_excel_loc))

True
True
True


## Sample Base Data ##

In [0]:
#Read excel table T1626:
T1626_table = pd.read_excel(io=merck_and_co_excel_loc, sheet_name=0, header=0)

In [0]:
#remove characters from column heads that could couse problems
T1626_table.columns = map(lambda x: x.replace("[", "").replace("]", "").replace(">", "_greater_").replace("<", "_smaller_").replace(" ", "_").replace(".", "_").replace("Δ", "delta_").replace("Å", "A").replace("(", "_").replace(")", "_"), T1626_table.columns)

### Define semantically grouped lists of features  ###


In [0]:
########################################
target_label = 'dtm'

sequence_identity_related_columns = ['Mutation', 'dtm', 'wt', 'resi', 'mut', 'pdbId', 'protein']

features_type_A = ['beta_first', 'beta_second', 'beta_diff', 'alpha_first', 'alpha_second', 'alpha_diff', 'coil_first', 'coil_second', 'coil_diff', 'mutability_first', 'mutability_second', 'mutability_diff', 'flexibility_first', 'flexibility_second', 'flexibility_diff', 'aromatic_first', 'aromatic_second', 'aromatic_diff', 'polarity_first', 'polarity_second', 'polarity_diff', 'pi_first', 'pi_second', 'pi_diff', 'consensus_hydrophobicity_first', 'consensus_hydrophobicity_second', 'consensus_hydrophobicity_diff', 'hydrophobicity_first', 'hydrophobicity_second', 'hydrophobicity_diff', 'bulkiness_first', 'bulkiness_second', 'bulkiness_diff', 'mol_wt_first', 'mol_wt_second', 'mol_wt_diff', 'vdw_vol_first', 'vdw_vol_second', 'vdw_vol_diff', 'branched_first', 'branched_second', 'branched_diff', 'logd_first', 'logd_second', 'logd_diff', 'charge_first', 'charge_second', 'charge_diff', 'polarity2_first', 'polarity2_second', 'polarity2_diff', 'dg_o_w_first', 'dg_o_w_second', 'dg_o_w_diff']
features_type_L = ['wtss', 'totalarea', 'fractionburied', 'hydrophobicratio', 'hydrophobicarea']
# version where wtss is excluded, because its not available in all sub datasets
features_type_L_available = ['totalarea', 'fractionburied', 'hydrophobicratio', 'hydrophobicarea'] 
features_type_G = ['dmobility', 'ddipole_moment', 'dasa_vdw', 'deccen', 'dasa_hph', 'dhelicity', 'dzdipole', 'dr_solv', 'dapp_charge', 'dsed_const', 'dpi_3d', 'dzquadrupole', 'dasa_hyd', 'dzeta', 'dhenry', 'dpi_seq', 'dnet_charge', 'dvolume', 'dcoeff_fric', 'dr_gyr', 'ddebye', 'dhyd_moment', 'dcoeff_280']

feature_variant_polarity = 'variant_polarity'
features_energy_BL = ['r_bioluminate_delta_Stability_Solv_SA', 'r_bioluminate_delta_Stability_SelfCont', 'r_bioluminate_delta_Stability_Packing', 'r_bioluminate_delta_Stability_Covalent', 'r_bioluminate_delta_Stability_Reference', 'r_bioluminate_delta_Stability_vdW', 'r_bioluminate_delta_Stability_Hbond', 'r_bioluminate_delta_Stability_Coulomb', 'r_bioluminate_delta_Stability_Solv_GB', 'r_bioluminate_delta_Stability_Lipo', 'r_bioluminate_delta_Stability']
feature_energy_ddG_BL = 'r_bioluminate_delta_Stability' #PRObABLY?
features_energy_CART = ['cart_ddg', 'cart_fa_dun_dev', 'cart_cart_bonded', 'cart_fa_intra_sol_xover4', 'cart_dslf_fa13', 'cart_fa_intra_elec', 'cart_fa_dun_semi', 'cart_hbond_sr_bb', 'cart_fa_sol', 'cart_lk_ball_bridge', 'cart_rama_prepro', 'cart_hbond_sc', 'cart_hbond_lr_bb', 'cart_lk_ball_bridge_uncpl', 'cart_hbond_bb_sc', 'cart_lk_ball_iso', 'cart_fa_intra_rep_xover4', 'cart_fa_atr', 'cart_fa_rep', 'cart_lk_ball', 'cart_fa_dun_rot', 'cart_omega', 'cart_ref', 'cart_hxl_tors', 'cart_fa_intra_atr_xover4', 'cart_p_aa_pp', 'cart_fa_elec']
feature_energy_ddG_CART = 'cart_ddg'
features_energy_DS = ['ds_vdw', 'ds_elec', 'ds_entropy', 'ds_ddg', 'dS']
feature_energy_ddG_DS = 'ds_ddg'
features_energy_MOE = ['dasa', 'dEff_vdw', 'dEff_elc', 'dEsg', 'dEself', 'dh_ema', 'dh_emd', 'da_acc', 'da_don', 'da_asa_p', 'db_rotN', 'da_nH', 'dgly', 'da_heavy']
#feature_energy_ddG_MOE = #DONT KNOW REALLY?
#PROBLEM WITH MONO: DOCUMENTATION AND DATASET DONT MATCH, HOWEVER, DATASET CONSISTENT IN IT SELF
features_energy_MONO = ['mono_ddg', 'mono_fa_atr', 'mono_fa_rep', 'mono_fa_sol', 'mono_fa_intra_rep', 'mono_pro_close', 'mono_fa_pair', 'mono_hbond_sr_bb', 'mono_hbond_lr_bb', 'mono_hbond_bb_sc', 'mono_hbond_sc', 'mono_dslf_ss_dst', 'mono_dslf_cs_ang', 'mono_dslf_ss_dih', 'mono_dslf_ca_dih', 'mono_rama', 'mono_omega', 'mono_fa_dun', 'mono_p_aa_pp', 'mono_ref']
feature_energy_ddG_MONO = 'mono_ddg'

featrures_ALG_CART_tiE = features_type_A + features_type_L + features_type_G +features_energy_CART
features_all_used_by_paper = features_type_A + features_type_L + features_type_G + features_energy_BL + features_energy_CART + features_energy_DS + features_energy_MOE + features_energy_MONO

HM_features = ['proteinAbbreviation', 'proteinCommon', 'Organism', 'source', 'ddG', 'delta_Tmexp', 'Tmexp_wt', 'delta_delta_Hmexp', 'delta_Hmexp_wt', 'delta_delta_CPexp', 'delta_CPexp_wt', 'delta_delta_Gexp_T_', 'T', 'Nres', 'R__A_', 'Ref_', 'pH', 'Exp_Tech_', 'wtPolarity', 'mutPolarity']
HM_feature_sequemce_Tm = 'Tmexp_wt'
HM_feature_organism_growth_T = 'T'
HM_feature_organism_pH = 'pH'
HM_feature_sequence_len = 'Nres'
HM_features_promising = [HM_feature_sequemce_Tm,HM_feature_organism_growth_T,HM_feature_organism_pH,HM_feature_sequence_len]

#features that dont make sense to use beacuase they would probably not be available beforehand:
HM_unrealistic_potentially_useful_features = ['ddG','delta_delta_Hmexp', 'delta_delta_CPexp']
#features that I dont really know what they mean
HM_potentially_useful_features = ['delta_Hmexp_wt', 'delta_CPexp_wt']

features_all_available_that_make_sense = features_all_used_by_paper + HM_features_promising

## Sample sequences

### Retrieve IDs and raw sequences from PDB

In [0]:
sequence_ids = T1626_table["Mutation"].apply(lambda x: x.split("@")[0]).unique()

In [0]:
pdb_ids = pd.Series(list(filter(lambda x: not "_" in x, sequence_ids))).unique()

In [0]:
pdb_ids



```
array(['1AKY', '1AM7', '1ANK', '1AQH', '1AVR', '1AYF', '1AZP', '1BNI',
       '1BRF', '1BVC', '1C52', '1C8C', '1C9O', '1CHK', '1CSP', '1CYO',
       '1E0W', '1E21', '1E65', '1ESF', '1EY0', '1EZM', '1F6R', '1FHL',
       '1FNA', '1FVK', '1G5A', '1GV5', '1H7M', '1H8V', '1HFY', '1I4N',
       '1IHB', '1IO2', '1IRO', '1JIW', '1JNX', '1JU3', '1KE4', '1KF2',
       '1KF3', '1KF5', '1KFW', '1L63', '1LHM', '1LNI', '1LZ1', '1MJ5',
       '1MJC', '1OLR', '1ONC', '1PGA', '1POH', '1QLP', '1RBP', '1RN1',
       '1RRO', '1RTP', '1SHF', '1SHG', '1SUP', '1T69', '1TCA', '1TPK',
       '1TTQ', '1WQ5', '1YCC', '1YEA', '1YNR', '1YU5', '1ZDR', '1ZYM',
       '2A01', '2CI2', '2CNC', '2CPP', '2CTH', '2HBB', '2HIP', '2LZM',
       '2OV0', '2RN2', '2TRX', '3D2C', '3KS3', '3MBP', '3SIL', '3UUE',
       '451C', '4BLM', '4LYZ', '4U2B', '5DFR', '5PTI'], dtype=object)
```



In [0]:
def fasta_to_dict(fasta_string):
  result_dict = {}
  fasta_split = fasta_string.replace("\n","").replace(" ","").split(">")[1:]
  for split in fasta_split:
    temp = split.split("|")
    result_dict[temp[0].split(":")[1]] = temp[-1][8:]
  return result_dict

In [0]:
def request_pdb_id_sequence(pdb_id):
  url = 'https://www.rcsb.org/pdb/download/downloadFastaFiles.do?'
  params = {
    'structureIdList':pdb_id,
    'compressionType':'uncompressed'
  }
  r = requests.post(url, data = params, timeout=60)
  temp_result = fasta_to_dict(r.text)
  return {pdb_id:temp_result}

In [0]:
def create_pdb_sequence_dict_requester(list_of_ids, destination_save):
  list_of_ids_filtered = list(filter(lambda x: str(x)!='nan',list_of_ids))
  result = {}
  counter = 0
  fails = []
  for x in list_of_ids_filtered:
    if counter%10 == 0:
      print("Currently at ",counter)
    try:
      result.update(request_pdb_id_sequence(x))
    except:
      print("Failed: ",x)
      fails.append(x)
    time.sleep(1)
    counter += 1
  with open(destination_save, 'w') as fout:
    json.dump(result, fout)
  print("Success")
  return fails

In [0]:
failed_requests = create_pdb_sequence_dict_requester(pdb_ids,sequence_dict_loc)



```
Currently at  0
Currently at  10
Currently at  20
Currently at  30
Currently at  40
Currently at  50
Currently at  60
Currently at  70
Failed:  2A01
Currently at  80
Currently at  90
Success
```



In [0]:
#2A01 manually checked from pdb file:
"""
SEQRES   1 A  243  ASP GLU PRO PRO GLN SER PRO TRP ASP ARG VAL LYS ASP          
SEQRES   2 A  243  LEU ALA THR VAL TYR VAL ASP VAL LEU LYS ASP SER GLY          
SEQRES   3 A  243  ARG ASP TYR VAL SER GLN PHE GLU GLY SER ALA LEU GLY          
SEQRES   4 A  243  LYS GLN LEU ASN LEU LYS LEU LEU ASP ASN TRP ASP SER          
SEQRES   5 A  243  VAL THR SER THR PHE SER LYS LEU ARG GLU GLN LEU GLY          
SEQRES   6 A  243  PRO VAL THR GLN GLU PHE TRP ASP ASN LEU GLU LYS GLU          
SEQRES   7 A  243  THR GLU GLY LEU ARG GLN GLU MET SER LYS ASP LEU GLU          
SEQRES   8 A  243  GLU VAL LYS ALA LYS VAL GLN PRO TYR LEU ASP ASP PHE          
SEQRES   9 A  243  GLN LYS LYS TRP GLN GLU GLU MET GLU LEU TYR ARG GLN          
SEQRES  10 A  243  LYS VAL GLU PRO LEU ARG ALA GLU LEU GLN GLU GLY ALA          
SEQRES  11 A  243  ARG GLN LYS LEU HIS GLU LEU GLN GLU LYS LEU SER PRO          
SEQRES  12 A  243  LEU GLY GLU GLU MET ARG ASP ARG ALA ARG ALA HIS VAL          
SEQRES  13 A  243  ASP ALA LEU ARG THR HIS LEU ALA PRO TYR SER ASP GLU          
SEQRES  14 A  243  LEU ARG GLN ARG LEU ALA ALA ARG LEU GLU ALA LEU LYS          
SEQRES  15 A  243  GLU ASN GLY GLY ALA ARG LEU ALA GLU TYR HIS ALA LYS          
SEQRES  16 A  243  ALA THR GLU HIS LEU SER THR LEU SER GLU LYS ALA LYS          
SEQRES  17 A  243  PRO ALA LEU GLU ASP LEU ARG GLN GLY LEU LEU PRO VAL          
SEQRES  18 A  243  LEU GLU SER PHE LYS VAL SER PHE LEU SER ALA LEU GLU          
SEQRES  19 A  243  GLU TYR THR LYS LYS LEU ASN THR GLN """
#later turned out 1URP was missing too?
dict_additional = {"2A01":{"A":"DEPPQSPWDRVKDLATVYVDVLKDSGRDYVSQFEGSALGKQLNLKLLDNWDSVTSTFSKLREQLGPVTQEFWDNLEKETEGLRQEMSKDLEEVKAKVQPYLDDFQKKWQEEMELYRQKVEPLRAELQEGARQKLHELQEKLSPLGEEMRDRARAHVDALRTHLAPYSDELRQRLAARLEALKENGGARLAEYHAKATEHLSTLSEKAKPALEDLRQGLLPVLESFKVSFLSALEEYTKKLNTQ"},
                   "1URP":{"A":"KDTIALVVSTLNNPFFVSLKDGAQKEADKLGYNLVVLDSQNNPAKELANVQDLTVRGTKILLINPTDSDAVGNAVKMANQANIPVITLDRQATKGEVVSHIASDNVLGGKIAGDYIAKKAGEGAKVIELQGIAGTSAARERGEGFQQAVAAHKFNVLASQPADFDRIKGLNVMQNLLTAHPDVQAVFAQNDEMALGALRALQTAGKSDVMVVGFDGTPDGEKAVNDGKLAATIAQLPDQIGAKGVETADKVLKGEKVQAKYPVDLKLVVKQ"}}

In [0]:
#create updated sequence dictionary
pdb_sequence_dict = None
with open(sequence_dict_loc, 'r') as f:
        pdb_sequence_dict = json.load(f)
pdb_sequence_dict.update(dict_additional) 
with open(sequence_dict_ext_loc, 'w') as fout:
    json.dump(pdb_sequence_dict, fout)

### Create mutations

In [0]:
pdb_special_ids = pd.Series(list(filter(lambda x: "_" in x, sequence_ids))).unique()

In [0]:
pdb_special_ids



```
array(['1BNI_H102A', '1TPK_R', '1URP_L265C', '1YCC_C102A', '1YU5_D1',
       '1YU5_D2', '5PTI_M52L'], dtype=object)
```



"The experimental techniques used for measuring the protein melting temperatures andother thermodynamic quantities are indicated in the dataset. These are differential scanningcalorimetry (DSC), circular dichroism (CD), absorbance (Abs), and fluorescence.The Protein DataBank (PDB) code [9] of the best resolved 3D X-ray structure of eachwild type protein is specified in the dataset. For a few entries, the PDB code is labeled with asubscript. This means that the wild type structure of the protein whose∆Tmwas measuredwas unavailable, and that the structure of an almost identical protein was used instead,under the assumption that the impact of the modification on the structure is negligible. Inparticular, the 1bnih102acode means that the structure is obtained from the PDB structure1bni with the His residue at position 102 manually substituted into an Ala. The sameprocedure is used for the PDB structures 1yccc102a, 1urpl265cand 5ptim52l. The other PDBcodes with subscripts,i.e1tpkr, 1yu5d1and 1yu5d2, refer to experimentally characterizedproteins whose sequences have been manually truncated by a few residues compared to theoriginal PDB structure. Note that we checked that the mutations or truncated residues inthese pseudo-wild type proteins are all distant from the mutations whose∆Tmwas measured,so that they may be assumed as not interfering." Pucci et al. 2016

In [0]:
mutations = T1626_table[["Mutation"]].copy()

In [0]:
mutations.head()

Unnamed: 0,Mutation
0,1AKY@A@I213F
1,1AKY@A@N169D
2,1AKY@A@Q48E
3,1AKY@A@T110H
4,1AKY@A@T77H


In [0]:
#retrieve sequence dict
pdb_sequence_dict = None
with open(sequence_dict_ext_loc, 'r') as f:
        pdb_sequence_dict = json.load(f)

**Manually looked up offsets for sequences:** (in breaket: the number of affected mutations)
* case 0: all mutation fit => no offset needed
* case 1: mutations fit when applying offset found in PDB: 1H7M, 1SHF, 1YCC, 1FNA, 1GV5, 1I4N, 1JNX, 3UUE, 3SIL, 1MJC, 1KE4, 1KFW, 1YU5, 4BLM, 1YCC, 1FNA
* case 2: mutations fit when applying uniquely identifiable offset or offset-1: 1BNI (34) (mistakenly in this category, all mutations seem fit with offset 0), 1TPK (6+15), 4BLM (4), 1YEA (1) (made mistake I think, did +1 instead of -1), 3KS3 (2), 1AYF (18) (mistakenly in this category, turns out every offset with specified PDB offset 3)

In [0]:
start_offset_dict = {"1H7M":[-2],"1SHF":[83],"1YCC":[-5],"1FNA":[5],"1GV5":[89],"1I4N":[1],"1JNX":[1645],"3UUE":[25],"3SIL":[3],"1MJC":[1],"1KE4":[3],"1KFW":[9], "1YU5":[9], "4BLM":[25], "1YCC":[-5], "1FNA":[5], "1YEA":[-11], "3KS3":[-1], "1BNI":[0,1], "1TPK":[-5,-6],"4BLM":[28,29],"1AYF":[3,-3]}

In [0]:
def find_corresponding_sequence(protein_id,seq_dict):
  assert len(protein_id.split("_"))==1, "Invalid pdb id."
  if "A" in seq_dict[protein_id]:
    return seq_dict[protein_id]["A"] #checked preemtively, all mutations on A variant
  elif "1" in seq_dict[protein_id]: #assumed that 1 is the numeric equivalent of A
    return seq_dict[protein_id]["1"]
  elif len(seq_dict[protein_id])==1: #only one sequence => must be the one
    return seq_dict[protein_id][list(seq_dict[protein_id].keys())[0]]
  else:
    print("WARNIN: ",protein_id, " (Took first sequence due to ambiguity!)") #I know it happens only one time, and first seqence matches mutations...
    return seq_dict[protein_id][list(seq_dict[protein_id].keys())[0]]

In [0]:
#create the appropriate mutant base on the wild-type and mutation definition (and additional data like offset etc.)
def mutate_seq(seq_id, seq, mutation, offset_dict):
  assert len(seq_id.split("_"))==1, "Invalid pdb id."
  mut_idx = int(mutation[1:-1])-1 #strip mutation definition dow to position and convert position to index
  wt = mutation[0]
  mut = mutation[-1]
  
  #handle offset lists
  if seq_id in offset_dict:
    for offset in offset_dict[seq_id]:
      if mut_idx-offset<len(seq) and mut_idx-offset>=0:
        if seq[mut_idx-offset]==wt:
          mut_idx -= offset
          return seq[:mut_idx]+mut.strip()+seq[mut_idx+1:], offset #return the mutated sequences and the offset that was used
  
  if mut_idx >=len(seq) or mut_idx<0:
    print("index error: ", seq_id, mutation)
    
  if seq[mut_idx] != wt:
    print("Mutation mismatch! ID: ", seq_id, " Mutation: ", mutation, " Index: ", mut_idx, " Found: ",seq[mut_idx])
  return seq[:mut_idx]+mut.strip()+seq[mut_idx+1:], 0 

In [0]:
#find the appropriate wild-type sequence and premute it if necessary 
def find_base_sequence(mutation_encoding, seq_dict, start_offset_dict):
  protein_id = mutation_encoding.split("@")[0]
  temp_split = protein_id.split("_") #just take the sequence as is from the dictionary
  if len(temp_split)==1:
    return find_corresponding_sequence(temp_split[0], seq_dict)
  elif len(temp_split[1]) <= 2: #truncated ones: dont really know how they were truncated, just take the default sequence
    return find_corresponding_sequence(temp_split[0], seq_dict)
    print("Truncated: ", protein_id)
  else: #premutate sequences appropriately
    temp_seq, temp_offset = mutate_seq(temp_split[0], find_corresponding_sequence(temp_split[0], seq_dict), temp_split[1], start_offset_dict)
    return temp_seq

In [0]:
mutations["base_sequence"] = mutations["Mutation"].apply(lambda x: find_base_sequence(x, pdb_sequence_dict, start_offset_dict)) 



```
WARNIN:  1JIW  (Took first sequence due to ambiguity!)
WARNIN:  1JIW  (Took first sequence due to ambiguity!)
WARNIN:  1JIW  (Took first sequence due to ambiguity!)
WARNIN:  1JIW  (Took first sequence due to ambiguity!)
```



In [0]:
def create_mutations_offset_columns(mutations_df, start_offset_dict):
  temp_list = []
  temp_list2 = []
  for i in range(len(mutations_df)):
    temp_split = mutations_df.iloc[i]["Mutation"].split("@")
    tmp1, tmp2 = mutate_seq(seq_id=temp_split[0].split("_")[0], seq=mutations_df.iloc[i]["base_sequence"], mutation=temp_split[2], offset_dict=start_offset_dict)
    temp_list.append(tmp1)
    temp_list2.append(tmp2)
  return temp_list, temp_list2

In [0]:
mutated_sequence, offsets = create_mutations_offset_columns(mutations,start_offset_dict)

In [0]:
mutations["mutated_sequence"] = mutated_sequence
mutations["mutation_offsets"] = offsets

In [0]:
mutations.iloc[1500:1510]

Unnamed: 0,Mutation,base_sequence,mutated_sequence,mutation_offsets
1500,2TRX@A@T77V,SDKIIHLTDDSFDTDVLKADGAILVDFWAEWCGPCKMIAPILDEIA...,SDKIIHLTDDSFDTDVLKADGAILVDFWAEWCGPCKMIAPILDEIA...,0
1501,3D2C@A@M134E,AEHNPVVMVHGIGGSSSNFEGIKSYLVSQGWSRDKLYAVDFWDKTG...,AEHNPVVMVHGIGGSSSNFEGIKSYLVSQGWSRDKLYAVDFWDKTG...,0
1502,3D2C@A@M137P,AEHNPVVMVHGIGGSSSNFEGIKSYLVSQGWSRDKLYAVDFWDKTG...,AEHNPVVMVHGIGGSSSNFEGIKSYLVSQGWSRDKLYAVDFWDKTG...,0
1503,3D2C@A@S163P,AEHNPVVMVHGIGGSSSNFEGIKSYLVSQGWSRDKLYAVDFWDKTG...,AEHNPVVMVHGIGGSSSNFEGIKSYLVSQGWSRDKLYAVDFWDKTG...,0
1504,3KS3@A@E234P,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...,-1
1505,3KS3@A@K170P,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...,-1
1506,3MBP@A@P133A,KIEEGKLVIWINGDKGYNGLAEVGKKFEKDTGIKVTVEHPDKLEEK...,KIEEGKLVIWINGDKGYNGLAEVGKKFEKDTGIKVTVEHPDKLEEK...,0
1507,3MBP@A@P133S,KIEEGKLVIWINGDKGYNGLAEVGKKFEKDTGIKVTVEHPDKLEEK...,KIEEGKLVIWINGDKGYNGLAEVGKKFEKDTGIKVTVEHPDKLEEK...,0
1508,3MBP@A@P159A,KIEEGKLVIWINGDKGYNGLAEVGKKFEKDTGIKVTVEHPDKLEEK...,KIEEGKLVIWINGDKGYNGLAEVGKKFEKDTGIKVTVEHPDKLEEK...,0
1509,3MBP@A@P159S,KIEEGKLVIWINGDKGYNGLAEVGKKFEKDTGIKVTVEHPDKLEEK...,KIEEGKLVIWINGDKGYNGLAEVGKKFEKDTGIKVTVEHPDKLEEK...,0


In [0]:
#check mutations:
for i in range(len(mutations)):
  mut = mutations.iloc[i]["Mutation"].split("@")[2]
  index = int(mut[1:-1])-1-mutations.iloc[i]["mutation_offsets"]
  if mutations.iloc[i]["base_sequence"][index] != mut[0] or mutations.iloc[i]["mutated_sequence"][index] != mut[-1]:
    print("Upsi!")
print("Done!")



```
Done!
```



## Merge dataframes and export to CSV

In [0]:
T1626_table["base_sequence"] = mutations.base_sequence
T1626_table["mutated_sequence"] = mutations.mutated_sequence
T1626_table["mutation_offsets"] = mutations.mutation_offsets

In [0]:
T1626_table.to_csv(target_data_folder_loc+"T1626_with_sequences.csv")

In [0]:
mutations.to_csv(target_data_folder_loc+"T1626_sequences_only.csv")

## Transform sequence data to numpy arrays

In [0]:
# check for unusable sequences:
print((T1626_table["base_sequence"].apply(lambda x: len(x))>650).sum())
print((T1626_table["base_sequence"].apply(lambda x: len(x))<50).sum())



```
0
0
```



In [0]:
IUPAC_Extended_Dic_Transf = {"A":1,"C":2,"D":3,"E":4,"F":5,"G":6,"H":7,"I":8,"K":9,"L":10,"M":11,"N":12,"P":13,"Q":14,"R":15,"S":16,"T":17,"V":18,"W":19,"Y":20,"B":21,"X":21,"Z":21,"J":21,"U":21,"O":21}

In [0]:
def seq_transf_pad(seq, target_len):
    return np.pad(np.array(list(map(lambda x: IUPAC_Extended_Dic_Transf[x], seq)),dtype="int8"),(0, target_len - len(seq)), 'constant')

In [0]:
# only mesophilic xor thermophilic
def get_X_padded_tranf(df_u,column_name,seq_len=650):
    df_u = df_u[[column_name]]
    X_all = np.zeros((len(df_u),seq_len),dtype="int8")

    for i in range(len(df_u)):
        X_all[i] = seq_transf_pad(df_u.iloc[i][column_name],seq_len)

    return X_all

In [0]:
X_wild = get_X_padded_tranf(T1626_table, column_name="base_sequence", seq_len=650)

In [0]:
X_mut = get_X_padded_tranf(T1626_table, column_name="mutated_sequence", seq_len=650)

In [0]:
# check for unwanted amino acids
print(np.array(list(map(lambda x: 21 in x, X_wild))).sum())
print(np.array(list(map(lambda x: 21 in x, X_mut))).sum())



```
0
0
```



In [0]:
np.save(target_data_folder_loc+"T1626_X_wild", X_wild)

In [0]:
np.save(target_data_folder_loc+"T1626_X_mut", X_mut)

## Create ambiguous mutation filter (_f)

In [0]:
ambiguous_mut_seq = ["1YEA", "3KS3", "1BNI", "1TPK", "4BLM", "1AYF"]

In [0]:
unambiguous_mut_seq_filter = list(map(lambda x: not(x.split("@")[0].split("_")[0] in ambiguous_mut_seq), T1626_table[["Mutation"]].values.reshape((len(T1626_table[["Mutation"]],)))))

In [0]:
unambiguous_mut_df = pd.DataFrame()
unambiguous_mut_df["Unambiguous"] = unambiguous_mut_seq_filter

In [0]:
unambiguous_mut_df.to_csv(target_data_folder_loc+"unambiguous_mut_filter_T1626.csv",index=False)

In [0]:
print("Number of ambiguous mutations: ",len(unambiguous_mut_df)-unambiguous_mut_df["Unambiguous"].sum())

Number of ambiguous mutations:  80
