The intention of this notebook is to ***sample/create the sequence data*** from the T96 dataset provided by Merck&Co.

## Notebook Setup ##

In [0]:
#Imports:
import os
import numpy as np
import pandas as pd
import time
import requests
import json
from collections import defaultdict

In [0]:
#Connect to google drive:
# NOTE(review): google.colab is presumably only importable inside a Colab runtime,
# which would make this cell Colab-specific — confirm before running elsewhere.
from google.colab import drive
# Mount the Drive at /content/gdrive; the data paths used by this notebook live below that mount.
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
#Set up file paths:
# Absolute paths rooted at the Drive mount point ('/content/gdrive'), so the
# notebook no longer depends on the kernel's working directory being /content.
data_folder_loc = "/content/gdrive/My Drive/iGEM/Databases/Merck&Co/Data/Source/"
merck_and_co_excel_loc = data_folder_loc + "merck_and_co.xls"
target_data_folder_loc = "/content/gdrive/My Drive/iGEM/Databases/Merck&Co/Data/T96/sequence_sampler_T96/"
# Sanity check: all three lines must print True before continuing
# (source folder, output folder and the Excel workbook exist).
print(os.path.isdir(data_folder_loc))
print(os.path.isdir(target_data_folder_loc))
print(os.path.isfile(merck_and_co_excel_loc))

True
True
True


## Sample Base Data ##

In [0]:
#Read excel table T96:
# sheet_name=2 selects the third worksheet (integer sheet positions are 0-based).
# NOTE(review): presumably that worksheet is the T96 table — confirm against the workbook.
# header=0 takes the column names from the first row of the sheet.
T96_table = pd.read_excel(io=merck_and_co_excel_loc, sheet_name=2, header=0)

In [0]:
# Preview the first rows of the loaded table.
T96_table.head()

Unnamed: 0,Mutation,dtm,wt,mut,Variant,Pucci MutCount,Gkase MutCount,fractionburied,hydrophobicarea,hydrophobicratio,...,BL_tiE_pred,BL_tiE_SLG_pred,Cart_tiE_pred,Cart_tiE_SLG_pred,DS_tiE_pred,DS_tiE_SLG_pred,MOE_tiE_pred,MOE_tiE_SLG_pred,Mono_tiE_pred,Mono_tiE_SLG_pred
0,GKR3BB@A@A158W,-8.647296,A,W,A->W,1,1,0.964901,99.031,0.418066,...,-9.429717,-10.196714,-6.089868,-1.264069,-7.659227,-9.317076,-1.366947,-5.875091,-6.252649,-2.853494
1,GKR3BB@A@D107K,-9.025969,D,K,D->K,10,1,0.813487,59.081,0.210992,...,1.129832,1.521148,-3.468148,0.818945,-3.992939,0.596031,-2.747829,0.992325,-1.797963,1.319017
2,GKR3BB@A@D107L,-6.39141,D,L,D->L,4,2,0.813487,59.081,0.210992,...,3.598429,4.331543,1.597182,1.051449,0.149311,2.264583,-2.602893,0.080114,-1.047028,1.3193
3,GKR3BB@A@D107R,-10.608879,D,R,D->R,3,5,0.813487,59.081,0.210992,...,2.562117,6.335236,1.233499,3.727824,-0.168061,4.649483,-2.620306,4.535145,-0.996889,5.133992
4,GKR3BB@A@D134S,-7.898822,D,S,D->S,5,1,0.561373,44.733,0.218313,...,-1.361471,1.863765,-2.536633,1.621084,-1.568452,0.926985,-2.252484,0.71095,-1.377593,0.624818


## Sample sequences

### Retrieve IDs

In [0]:
# Each "Mutation" label is "@"-separated (e.g. "GKR3BB@A@A158W"); the first field
# is the protein id. Collect the distinct ids that occur in the table.
pdb_ids = T96_table["Mutation"].map(lambda label: label.split("@", 1)[0]).unique()

In [0]:
# Show the distinct protein ids found in the "Mutation" column.
pdb_ids

array(['GKR3BB'], dtype=object)

#### Manually retrieve the sequence

Mutant selection and structure generation. We chose Guanylate Kinase (GK) from Branchiostoma floridae (UNIPROT: C3YEM4_BRAFL) as a model system for prospective prediction because it is monomeric, highly expressed, easily purified, and displays two-state folding. More importantly, GK is not in the original set and has only ~20% sequence identity and ~70% homology to the closest protein in the dataset (Adenylate Kinase, pdbId 1AKY). The template sequence used in this research also incorporated seven mutations R11T, S24T, L54E, Y59F, I86L, S99A, and K138E which were identified during an evolution campaign to alter guanylate kinase function. Having a sequence identity of ~56% to mouse GK, the pdbId 1LVG was used as the structure template to generate the homology model using MOE with standard simulation parameters. The crystallographic ligands ADP and GMP were also modeled to generate a similar induced fit.

```
>tr|C3YEM4|C3YEM4_BRAFL Uncharacterized protein OS=Branchiostoma floridae OX=7739 GN=BRAFLDRAFT_123769 PE=3 SV=1
MALPRPVVICGPSGSGKSTLYNKLLKEFPGVFQLSVSHTTRQPRPGELNGREYHFINRDQ
FQENIKQGDFLEWAEFSGNIYGTSKKALEEVQSNNVIPILDIDTQGVRNVKKASLEAVYI
FIKPPSIDVLEKRLRSRKTETEEALQKRLSAARNELEYGLKPGNFQHIITNDDLDVAYEK
LKGILIKSQMPLAMATGSSSSVVNSFLDKPAASATTVNSSSQD
```



In [0]:
# Amino-acid sequence of UniProt entry C3YEM4_BRAFL: the FASTA record quoted in the
# markdown above, with its lines joined into a single string.
protein_seq = """MALPRPVVICGPSGSGKSTLYNKLLKEFPGVFQLSVSHTTRQPRPGELNGREYHFINRDQFQENIKQGDFLEWAEFSGNIYGTSKKALEEVQSNNVIPILDIDTQGVRNVKKASLEAVYIFIKPPSIDVLEKRLRSRKTETEEALQKRLSAARNELEYGLKPGNFQHIITNDDLDVAYEKLKGILIKSQMPLAMATGSSSSVVNSFLDKPAASATTVNSSSQD"""

In [0]:
# The seven template substitutions named in the paper excerpt above.
# Each entry reads <wild-type residue><position><new residue>, which is the
# format mutate_seq parses (first character, middle digits, last character).
pre_mutation_list = ["R11T", "S24T", "L54E", "Y59F", "I86L", "S99A", "K138E"]

### Create mutations

In [0]:
# Work on an independent single-column copy so that adding sequence columns
# below does not touch T96_table until the explicit merge step.
mutations = T96_table[["Mutation"]].copy()

In [0]:
# Preview the working copy of the mutation labels.
mutations.head()

Unnamed: 0,Mutation
0,GKR3BB@A@A158W
1,GKR3BB@A@D107K
2,GKR3BB@A@D107L
3,GKR3BB@A@D107R
4,GKR3BB@A@D134S


In [0]:
def mutate_seq(seq_id, seq, mutation, offset_dict):
  """Apply a single point substitution to ``seq``.

  Args:
    seq_id: Identifier of the sequence; must not contain "_". It is used to
      look up candidate offsets in ``offset_dict`` and in diagnostics.
    seq: Sequence string to mutate.
    mutation: Substitution written as
      <wild-type residue><1-based position><new residue>, e.g. "A158W".
    offset_dict: Maps sequence ids to an iterable of candidate offsets between
      the numbering used in ``mutation`` and the indexing of ``seq``.

  Returns:
    Tuple ``(mutated_seq, offset)``. ``offset`` is the first candidate offset
    for which ``seq`` holds the wild-type residue at the shifted position, or
    0 when no candidate matched and the unshifted position was used.

  Raises:
    IndexError: if no candidate offset matched and the unshifted position
      lies outside ``seq``.
  """
  assert len(seq_id.split("_"))==1, "Invalid pdb id."
  wt = mutation[0]
  mut = mutation[-1].strip()
  mut_idx = int(mutation[1:-1])-1

  # Try the known offsets first; accept the first one that lands on the
  # expected wild-type residue.
  for offset in offset_dict.get(seq_id, ()):
    shifted_idx = mut_idx-offset
    if 0 <= shifted_idx < len(seq) and seq[shifted_idx]==wt:
      return seq[:shifted_idx]+mut+seq[shifted_idx+1:], offset

  # Without this guard a negative index would wrap around and the slices below
  # would silently build a corrupted, longer sequence.
  if not 0 <= mut_idx < len(seq):
    raise IndexError("index error: %s %s (sequence length %d)" % (seq_id, mutation, len(seq)))

  # A wild-type mismatch is reported but the substitution is still applied
  # (best effort, as before).
  if seq[mut_idx] != wt:
    print("Mutation mismatch! ID: ", seq_id, " Mutation: ", mutation, " Index: ", mut_idx, " Found: ",seq[mut_idx])
  return seq[:mut_idx]+mut+seq[mut_idx+1:], 0

In [0]:
def apply_premutations(seq_id, seq, mutation_list, offset_dict):
  """Apply every substitution in ``mutation_list`` to ``seq``, in order.

  Each substitution is applied with ``mutate_seq`` to the result of the
  previous one; the offsets reported by ``mutate_seq`` are discarded.

  Returns:
    The sequence after all substitutions have been applied.
  """
  current_seq = seq[:]
  for substitution in mutation_list:
    current_seq, _ = mutate_seq(seq_id, current_seq, substitution, offset_dict)
  return current_seq

In [0]:
#manually looked up offsets for sequences:
# Maps a sequence id to a list of candidate offsets. mutate_seq subtracts an
# offset from the 0-based mutation index and uses the first candidate for which
# the wild-type residue is found at the shifted position.
start_offset_dict = {"GKR3BB":[6]}

In [0]:
#apply mutations to sequence to get the wt form used in paper
# The seven entries of pre_mutation_list are applied one after another to
# protein_seq, using the offsets registered for "GKR3BB".
wt_sequence = apply_premutations("GKR3BB", protein_seq, pre_mutation_list, start_offset_dict)

In [0]:
# Show the resulting template ("wild-type") sequence for visual inspection.
wt_sequence

'MALPTPVVICGPSGSGKTTLYNKLLKEFPGVFQLSVSHTTRQPRPGEENGREFHFINRDQFQENIKQGDFLEWAEFSGNLYGTSKKALEEVQANNVIPILDIDTQGVRNVKKASLEAVYIFIKPPSIDVLEERLRSRKTETEEALQKRLSAARNELEYGLKPGNFQHIITNDDLDVAYEKLKGILIKSQMPLAMATGSSSSVVNSFLDKPAASATTVNSSSQD'

In [0]:
# Only one protein occurs in this table, so every row gets the same wild-type
# sequence; assigning the scalar broadcasts it to all rows.
mutations["base_sequence"] = wt_sequence

In [0]:
# Preview the table after adding the base_sequence column.
mutations.head()

Unnamed: 0,Mutation,base_sequence
0,GKR3BB@A@A158W,MALPTPVVICGPSGSGKTTLYNKLLKEFPGVFQLSVSHTTRQPRPG...
1,GKR3BB@A@D107K,MALPTPVVICGPSGSGKTTLYNKLLKEFPGVFQLSVSHTTRQPRPG...
2,GKR3BB@A@D107L,MALPTPVVICGPSGSGKTTLYNKLLKEFPGVFQLSVSHTTRQPRPG...
3,GKR3BB@A@D107R,MALPTPVVICGPSGSGKTTLYNKLLKEFPGVFQLSVSHTTRQPRPG...
4,GKR3BB@A@D134S,MALPTPVVICGPSGSGKTTLYNKLLKEFPGVFQLSVSHTTRQPRPG...


In [0]:
def create_mutations_offset_columns(mutations_df, start_offset_dict):
  """Build the mutated sequence and the applied offset for every row.

  For each row, the "Mutation" label is split on "@": the first field (up to
  its first "_") is the sequence id and the third field is the substitution,
  which ``mutate_seq`` applies to the row's "base_sequence".

  Returns:
    Tuple ``(mutated_sequences, offsets)`` of two lists in row order.
  """
  mutated_sequences = []
  applied_offsets = []
  for _, row in mutations_df.iterrows():
    label_fields = row["Mutation"].split("@")
    protein_id = label_fields[0].split("_")[0]
    new_seq, used_offset = mutate_seq(
      seq_id=protein_id,
      seq=row["base_sequence"],
      mutation=label_fields[2],
      offset_dict=start_offset_dict,
    )
    mutated_sequences.append(new_seq)
    applied_offsets.append(used_offset)
  return mutated_sequences, applied_offsets

In [0]:
#create mutated sequences whilst keeping track of offset
# Yields two lists in the row order of `mutations`: the mutated sequence and
# the offset that was used for each row.
mutated_sequence, offsets = create_mutations_offset_columns(mutations,start_offset_dict)

In [0]:
# Attach both lists as new columns (they are in the same row order as `mutations`).
mutations["mutated_sequence"] = mutated_sequence
mutations["mutation_offsets"] = offsets

In [0]:
# Preview the table with base sequence, mutated sequence and offset columns.
mutations.head()

Unnamed: 0,Mutation,base_sequence,mutated_sequence,mutation_offsets
0,GKR3BB@A@A158W,MALPTPVVICGPSGSGKTTLYNKLLKEFPGVFQLSVSHTTRQPRPG...,MALPTPVVICGPSGSGKTTLYNKLLKEFPGVFQLSVSHTTRQPRPG...,6
1,GKR3BB@A@D107K,MALPTPVVICGPSGSGKTTLYNKLLKEFPGVFQLSVSHTTRQPRPG...,MALPTPVVICGPSGSGKTTLYNKLLKEFPGVFQLSVSHTTRQPRPG...,6
2,GKR3BB@A@D107L,MALPTPVVICGPSGSGKTTLYNKLLKEFPGVFQLSVSHTTRQPRPG...,MALPTPVVICGPSGSGKTTLYNKLLKEFPGVFQLSVSHTTRQPRPG...,6
3,GKR3BB@A@D107R,MALPTPVVICGPSGSGKTTLYNKLLKEFPGVFQLSVSHTTRQPRPG...,MALPTPVVICGPSGSGKTTLYNKLLKEFPGVFQLSVSHTTRQPRPG...,6
4,GKR3BB@A@D134S,MALPTPVVICGPSGSGKTTLYNKLLKEFPGVFQLSVSHTTRQPRPG...,MALPTPVVICGPSGSGKTTLYNKLLKEFPGVFQLSVSHTTRQPRPG...,6


In [0]:
#check mutations:
# For every row, recompute the 0-based position (label position - 1 - offset) and
# verify that the base sequence carries the wild-type residue there and the
# mutated sequence carries the new residue. Any failing row prints "Upsi!".
for row_label, row in mutations.iterrows():
  substitution = row["Mutation"].split("@")[2]
  position = int(substitution[1:-1]) - 1 - row["mutation_offsets"]
  wt_matches = row["base_sequence"][position] == substitution[0]
  if not (wt_matches and row["mutated_sequence"][position] == substitution[-1]):
    print("Upsi!")
print("Done!")

Done!


## Merge dataframes and export to CSV

In [0]:
# Copy the three derived columns from `mutations` back onto the full table
# (pandas aligns the assigned Series on the row index).
T96_table["base_sequence"] = mutations["base_sequence"]
T96_table["mutated_sequence"] = mutations["mutated_sequence"]
T96_table["mutation_offsets"] = mutations["mutation_offsets"]

In [0]:
# Export the full T96 table, including the sequence columns, to the target folder.
# The index is not disabled, so pandas also writes the row index as the first CSV column.
T96_table.to_csv(target_data_folder_loc+"T96_with_sequences.csv")

In [0]:
# Export only the mutation labels with their sequences and offsets.
# The index is not disabled, so pandas also writes the row index as the first CSV column.
mutations.to_csv(target_data_folder_loc+"T96_sequences_only.csv")

## Transform sequence data to numpy arrays

In [0]:
# check for unusable sequences:
# Count base sequences longer than 650 residues (the padded length used for the
# arrays below) and shorter than 50 residues; both counts are printed.
base_sequence_lengths = T96_table["base_sequence"].map(len)
print((base_sequence_lengths > 650).sum())
print((base_sequence_lengths < 50).sum())

0
0


In [0]:
# Residue letter -> integer code. The 20 standard amino acids get 1..20 in
# alphabetical order of their one-letter codes; the remaining letters
# (B, X, Z, J, U, O) all share code 21. Code 0 is left free for padding.
IUPAC_Extended_Dic_Transf = {
    **{residue: code for code, residue in enumerate("ACDEFGHIKLMNPQRSTVWY", start=1)},
    **dict.fromkeys("BXZJUO", 21),
}

In [0]:
def seq_transf_pad(seq, target_len):
    """Encode ``seq`` as int8 residue codes, right-padded with zeros to ``target_len``.

    Every character is looked up in ``IUPAC_Extended_Dic_Transf`` (a character
    missing from that dict raises ``KeyError``). The padding width is
    ``target_len - len(seq)``, so ``seq`` must not be longer than ``target_len``.
    """
    residue_codes = np.array(
        [IUPAC_Extended_Dic_Transf[residue] for residue in seq],
        dtype="int8",
    )
    trailing_zeros = target_len - len(seq)
    return np.pad(residue_codes, (0, trailing_zeros), 'constant')

In [0]:
def get_X_padded_tranf(df_u,column_name,seq_len=650):
    """Encode one sequence column of ``df_u`` as a zero-padded int8 matrix.

    Args:
        df_u: DataFrame holding the sequences.
        column_name: Name of the column with the sequence strings.
        seq_len: Number of columns of the result; each sequence is padded to it.

    Returns:
        Array of shape ``(len(df_u), seq_len)`` and dtype int8, one encoded
        sequence per row (encoding done by ``seq_transf_pad``).
    """
    sequences = df_u[column_name]
    encoded = np.zeros((len(sequences), seq_len), dtype="int8")

    for row_idx, sequence in enumerate(sequences):
        encoded[row_idx] = seq_transf_pad(sequence, seq_len)

    return encoded

In [0]:
# Encode the wild-type sequences: one int8 row per table entry, zero-padded to 650 columns.
X_wild = get_X_padded_tranf(T96_table, column_name="base_sequence", seq_len=650)

In [0]:
# Encode the mutated sequences the same way, so X_mut has the same shape as X_wild.
X_mut = get_X_padded_tranf(T96_table, column_name="mutated_sequence", seq_len=650)

In [0]:
# check for unwanted amino acids
# Code 21 is shared by all non-standard residue letters in IUPAC_Extended_Dic_Transf;
# print how many encoded rows contain it (wild type first, then mutants).
print(np.array([21 in encoded_row for encoded_row in X_wild]).sum())
print(np.array([21 in encoded_row for encoded_row in X_mut]).sum())

0
0


In [0]:
# Persist the encoded wild-type matrix. np.save appends ".npy" when the file
# name has no such extension, so this writes T96_X_wild.npy in the target folder.
np.save(target_data_folder_loc+"T96_X_wild", X_wild)

In [0]:
# Persist the encoded mutant matrix. np.save appends ".npy" when the file
# name has no such extension, so this writes T96_X_mut.npy in the target folder.
np.save(target_data_folder_loc+"T96_X_mut", X_mut)