# Mapping training sequences to ints

*Note: this notebook contains a lot of repetitive code; the plan is to go back and write functions to cut it down.*

In [None]:
# Install Hugging Face transformers from source: clone the repository,
# change into the checkout, and install it in editable mode.
# NOTE(review): this installs whatever the default branch contains at clone
# time, so the library version is not pinned — consider checking out a
# specific tag or commit to keep the notebook reproducible.
!git clone https://github.com/huggingface/transformers.git
%cd transformers
!pip install -e .

In [None]:
from google.colab import drive

# Mount point for Google Drive inside the Colab runtime, plus an optional
# sub-folder (empty by default) appended to the batch directory below.
drive_name = '/content/drive'
drive_folder = ''

# Mount the drive, forcing a remount if it is already mounted.
drive.mount(drive_name, force_remount=True)

# Change this to where your files are located
drive_location = f"{drive_name}/My Drive/Protein Batches/Current Batches{drive_folder}"
data_location = f"{drive_location}/"

In [None]:
# creates a new directory, datafps, that contains two files
%cd /content/
!mkdir datafps
%cd datafps/

!cp /content/drive/'My Drive/Protein Batches/Current Batches'/sample_batch_train.csv .
!cp /content/drive/'My Drive/Protein Batches/Current Batches'/sample_batch_val.csv .

In [None]:
# load training data

import pandas as pd

# Read the training CSV into a DataFrame, keeping only the chainA and chainB columns.
df = pd.read_csv("sample_batch_train.csv", usecols=['chainA', 'chainB'])
print("Number of data points: ", df.shape[0])

# Display a random sample of up to 10 rows. DataFrame.sample raises a
# ValueError when asked for more rows than the frame holds, so the request is
# capped at the number of rows available.
sampled = df.sample(min(10, len(df)))
print(sampled)

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

In [None]:
# Get the raw sequences from the DataFrame columns and store them as ndarrays.
my_inputs = df.chainA.values
my_labels = df.chainB.values

# Strip surrounding whitespace, drop embedded spaces, then add a "@" and "!"
# character to the beginning and end of every sequence for both chainA and
# chainB. The edited sequences are stored in two lists, one for inputs
# (chainA) and one for labels (chainB).
list_my_inputs = ['@' + seqA.strip().replace(' ', '') + '!' for seqA in my_inputs]
list_my_labels = ['@' + seqB.strip().replace(' ', '') + '!' for seqB in my_labels]

# Row order is deliberately left untouched: list_my_inputs[i] and
# list_my_labels[i] come from the same CSV row and are later fed to the model
# as an (input, label) pair. Sorting each list by length on its own would
# pair a chainA with an unrelated chainB.

# Length of the longest sequence in each list (0 when a list is empty).
print(max(map(len, list_my_inputs), default=0))
print(max(map(len, list_my_labels), default=0))

In [None]:
# Left-pad every chainA and chainB sequence with "#" to one common length so
# that inputs and labels form rectangular, equally sized arrays.
# The width is taken from the data (longest sequence across both lists)
# rather than a hard-coded number, so a longer sequence in a new batch cannot
# leave the lists ragged.
pad_length = max(map(len, list_my_inputs + list_my_labels), default=0)

# A comprehension rebuilds each list in a single pass; it never deletes
# entries by value and does not shadow the built-in input().
list_my_inputs = [seq.rjust(pad_length, '#') for seq in list_my_inputs]
list_my_labels = [seq.rjust(pad_length, '#') for seq in list_my_labels]

# Preview only: printing every padded sequence floods the notebook output.
print(list_my_inputs[:3])
print(list_my_labels[:3])

In [None]:
# Token alphabet: the letters A-Z followed by the start ("@"), end ("!") and
# padding ("#") markers. Ids are handed out in this order, starting at 1.
_ALPHABET = "ABCDEFGHIJKLMNOPQRSTUVWXYZ@!#"

# a dict that maps protein sequence characters to ints
aa_to_idx = {symbol: index for index, symbol in enumerate(_ALPHABET, start=1)}
# a dict that maps the ints back to protein sequence characters
idx_to_aa = {index: symbol for symbol, index in aa_to_idx.items()}

In [None]:
# Break each sequence into a list of single characters, keeping only the
# characters that have an entry in aa_to_idx.
list_of_aa_chainA = [
    [residue for residue in chain if residue in aa_to_idx]
    for chain in list_my_inputs
]
list_of_aa_chainB = [
    [residue for residue in chain if residue in aa_to_idx]
    for chain in list_my_labels
]

# Translate every character to its integer id via aa_to_idx.
mapped_chainA_to_tokens = [
    list(map(aa_to_idx.__getitem__, residues)) for residues in list_of_aa_chainA
]
mapped_chainB_to_tokens = [
    list(map(aa_to_idx.__getitem__, residues)) for residues in list_of_aa_chainB
]

print(mapped_chainA_to_tokens)
print(mapped_chainB_to_tokens)

# Mapping validation sequences to ints

In [None]:
# load validation data

import pandas as pd

# Read the validation CSV into a DataFrame, restricted to the chainA and
# chainB columns, and print it.
validation_columns = ['chainA', 'chainB']
df2 = pd.read_csv("sample_batch_val.csv", usecols=validation_columns)
print(df2)

                                               chainA                                             chainB
0    NTGIVSSFFTYTGPAHGTQWDEIDIEFLGKDTTKVQFNYYTNGVG...   NTGIVSSFFTYTGPAHGTQWDEIDIEFLGKDTTKVQFNYYTNGVG...
1    APPSVFAEVPQAQPVLVFKLIADFREDPDPRKVNLGVGAYRTDDC...   APPSVFAEVPQAQPVLVFKLIADFREDPDPRKVNLGVGAYRTDDC...
2    APPSVFAEVPQAQPVLVFKLIADFREDPDPRKVNLGVGAYRTDDC...   APPSVFAEVPQAQPVLVFKLIADFREDPDPRKVNLGVGAYRTDDC...
3    PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPK...   PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPK...
4    PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPK...   PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPK...
5    MRIILLGAPGAGKGTQAQFIMEKYGIPQISTGDMLRAAVKSGSEL...   MRIILLGAPGAGKGTQAQFIMEKYGIPQISTGDMLRAAVKSGSEL...
6    IVGGYTCAANSIPYQVSLNSGSHFCGGSLINSQWVVSAAHCYKSR...   SSGSSYPSLLQCLKAPVLSNSSCKSSYPGQITGNMICVGFLQGGK...
7                                            GAAVWWWW                                           GAAVWWWW
8    TPEMPVLENRAAQGNITAPGGARRLTGDQTAALRNSLSDKPAKNI...  

In [None]:
# Get the raw validation sequences from the DataFrame columns and store them
# as ndarrays.
my_inputs_2 = df2.chainA.values
my_labels_2 = df2.chainB.values

# Strip surrounding whitespace, drop embedded spaces, then add a "@" and "!"
# character to the beginning and end of every sequence for both chainA and
# chainB. The edited sequences are stored in two lists, one for inputs
# (chainA) and one for labels (chainB).
list_my_inputs_2 = ['@' + seqA.strip().replace(' ', '') + '!' for seqA in my_inputs_2]
list_my_labels_2 = ['@' + seqB.strip().replace(' ', '') + '!' for seqB in my_labels_2]

# Row order is deliberately left untouched: list_my_inputs_2[i] and
# list_my_labels_2[i] come from the same CSV row and are later fed to the
# model as an (input, label) pair. Sorting each list by length on its own
# would pair a chainA with an unrelated chainB.

# Length of the longest sequence in each list (0 when a list is empty).
print(max(map(len, list_my_inputs_2), default=0))
print(max(map(len, list_my_labels_2), default=0))

534
544


In [None]:
# Left-pad every validation chainA and chainB sequence with "#" to one common
# length so that inputs and labels form rectangular, equally sized arrays.
# The width is taken from the data (longest sequence across both lists)
# rather than a hard-coded number, so a longer sequence in a new batch cannot
# leave the lists ragged.
pad_length_2 = max(map(len, list_my_inputs_2 + list_my_labels_2), default=0)

# A comprehension rebuilds each list in a single pass; it never deletes
# entries by value and does not shadow the built-in input().
list_my_inputs_2 = [seq.rjust(pad_length_2, '#') for seq in list_my_inputs_2]
list_my_labels_2 = [seq.rjust(pad_length_2, '#') for seq in list_my_labels_2]

# Preview only: printing every padded sequence floods the notebook output.
print(list_my_inputs_2[:3])
print(list_my_labels_2[:3])

['###########################################################################################################################################################################################################################################################################################################################################################################################################################################################################################################################################################@XXX!', '##################################################################################################################################################################################################################################################################################################################################################################################################################################################################

In [None]:
# Break each validation sequence into a list of single characters, keeping
# only the characters that have an entry in aa_to_idx.
list_of_aa_chainA_2 = [
    [residue for residue in chain if residue in aa_to_idx]
    for chain in list_my_inputs_2
]
list_of_aa_chainB_2 = [
    [residue for residue in chain if residue in aa_to_idx]
    for chain in list_my_labels_2
]

# Translate every character to its integer id via aa_to_idx.
mapped_chainA_to_tokens_2 = [
    list(map(aa_to_idx.__getitem__, residues)) for residues in list_of_aa_chainA_2
]
mapped_chainB_to_tokens_2 = [
    list(map(aa_to_idx.__getitem__, residues)) for residues in list_of_aa_chainB_2
]

print(mapped_chainA_to_tokens_2)
print(mapped_chainB_to_tokens_2)

[[29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29

# Training

In [None]:
# set up training
from transformers import GPT2Config, GPT2Model, GPT2PreTrainedModel
import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import matplotlib.pyplot as plt
import torch.nn as nn
import time
from torch.nn import CrossEntropyLoss

In [None]:
# Convert the token id lists into NumPy arrays: training pair first, then
# the validation pair.
train_array_A, train_array_B = (
    np.array(token_rows)
    for token_rows in (mapped_chainA_to_tokens, mapped_chainB_to_tokens)
)

val_array_A, val_array_B = (
    np.array(token_rows)
    for token_rows in (mapped_chainA_to_tokens_2, mapped_chainB_to_tokens_2)
)

In [None]:
# Wrap the training arrays as tensors, pair them in a TensorDataset and expose
# them through a DataLoader created with its default settings.
train_chainA_tensor, train_chainB_tensor = map(
    torch.from_numpy, (train_array_A, train_array_B)
)
train_dataset = TensorDataset(train_chainA_tensor, train_chainB_tensor)
train_dataloader = DataLoader(dataset=train_dataset)

# Same steps for the validation arrays.
val_chainA_tensor, val_chainB_tensor = map(
    torch.from_numpy, (val_array_A, val_array_B)
)
val_dataset = TensorDataset(val_chainA_tensor, val_chainB_tensor)
val_dataloader = DataLoader(dataset=val_dataset)

print(val_chainA_tensor.size())

In [None]:
class MyDataParallel(nn.DataParallel):
    """
    nn.DataParallel subclass that forwards attribute lookups it cannot
    resolve itself to the wrapped model (self.module).
    """
    def __getattr__(self, name):
        # Regular lookup first; fall through to the wrapped module only when
        # that lookup raises AttributeError.
        try:
            found = super().__getattr__(name)
        except AttributeError:
            found = getattr(self.module, name)
        return found

class GPT_Protein(GPT2LMHeadModel):
    """GPT-2 language-model subclass with a custom forward pass.

    The forward pass scores the logits at every position against the label at
    the same position (the usual next-token shift is commented out) and
    returns a plain tuple.
    """

    def __init__(self, config):
        """Build the transformer body and the output projection from `config`."""
        super().__init__(config)
        # NOTE(review): the parent constructor has already run at this point;
        # these two assignments replace whatever it set under the same names.
        self.transformer = GPT2Model(config)
        # Bias-free projection from the hidden size (n_embd) to vocab_size logits.
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        self.init_weights()

    def forward(
        self,
        input_ids,
        labels,
        past=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
    ):
        """Run the transformer and, when labels are given, compute the loss.

        `labels` has no default, so callers must always pass it (None is
        accepted and skips the loss). Every other argument is forwarded
        unchanged to the underlying GPT2Model.

        Returns:
            A tuple: the loss (only when `labels` is not None), then the
            logits, then the remaining transformer outputs.
        """
        transformer_outputs = self.transformer(
            input_ids,
            past=past,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )
        # First element of the transformer output is fed to the LM head.
        hidden_states = transformer_outputs[0]

        lm_logits = self.lm_head(hidden_states)

        outputs = (lm_logits,) + transformer_outputs[1:]
        if labels is not None:
            # The usual next-token shift is disabled (kept below for reference):
            # logits at position t are compared with labels at position t.
            # NOTE(review): presumably intentional because labels hold a
            # different sequence than input_ids — confirm.
            # Shift so that tokens < n predict n
            # shift_logits = lm_logits[..., :-1, :].contiguous()
            # shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            # No ignore_index is passed, so every label position contributes
            # to the loss.
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1))
            outputs = (loss,) + outputs

        return outputs  # (loss), lm_logits, presents, (all hidden_states), (attentions)

In [None]:
# Model hyper-parameters for the small GPT-2 style network.
# vocab_size=30 covers the ids 1..29 assigned in aa_to_idx plus the unused id 0.
# The begin/end-of-sequence ids are read from aa_to_idx so they always match
# the "@" (start) and "!" (end) markers added to every sequence; the previous
# literals 21 and 22 are the ids of the letters "U" and "V" in this vocabulary.
gpt_config = GPT2Config(vocab_size=30, n_positions=784, n_ctx=784, n_embd=64, n_layer=3,
                        n_head=2, activation_function='gelu_new', resid_pdrop=0.1, embd_pdrop=0.1,
                        attn_pdrop=0.1, layer_norm_epsilon=1e-05, initializer_range=0.02,
                        summary_type='cls_index', summary_use_proj=True, summary_activation=None,
                        summary_proj_to_labels=True, summary_first_dropout=0.1,
                        bos_token_id=aa_to_idx["@"], eos_token_id=aa_to_idx["!"])

# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
import datetime
def format_time(elapsed):
    """Return `elapsed` seconds, rounded to the nearest second, as a
    datetime.timedelta string (for example '0:01:05')."""
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [None]:
def train_model(num_epochs=1000, learning_rate=1e-3):
    """Train a fresh GPT_Protein model and validate it after every epoch.

    Reads the notebook-level gpt_config, device, train_dataloader and
    val_dataloader.

    Args:
        num_epochs: Number of passes over train_dataloader (default 1000).
        learning_rate: Step size for the Adam optimizer. The default is a
            starting point only and should be tuned.

    Returns:
        (train_loss, val_loss): two lists with one entry per epoch. Training
        entries are the mean of per-batch loss / sequence length; validation
        entries are computed the same way and then multiplied by 1e7.

    Side effects:
        Writes "gpt_shift_std_train.pt" whenever a training batch reaches a
        new lowest loss and "gpt_shift_std_val.pt" whenever an epoch reaches
        a new lowest summed validation loss.
    """
    model = GPT_Protein(gpt_config)
    model.to(device)
    # The optimizer must exist before the first optimizer.step() call.
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Number of batches per epoch; used for averaging and progress messages.
    total_steps = len(train_dataloader)
    train_loss = []
    val_loss = []
    # Separate trackers so the training and validation checkpoints do not
    # overwrite each other's best value, and neither is reset between epochs.
    best_train_loss = float("inf")
    best_val_loss = float("inf")

    for epoch_i in range(num_epochs):
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, num_epochs), flush=True)
        t0 = time.time()

        total_loss = 0
        model.train()

        for step, batch in enumerate(train_dataloader):
            # Batches must live on the same device as the model.
            chainA = batch[0].to(device)
            chainB = batch[1].to(device)
            seq_len = chainA.size()[1]

            outputs = model(input_ids=chainA, head_mask=None, inputs_embeds=None, labels=chainB)

            loss = outputs[0]
            # NOTE(review): CrossEntropyLoss already averages over positions,
            # so dividing by seq_len normalizes a second time — kept as-is to
            # preserve the reported numbers; confirm this is intended.
            curr_loss = loss.item() / seq_len
            total_loss += curr_loss
            if curr_loss < best_train_loss:
                best_train_loss = curr_loss
                torch.save(model.state_dict(), "gpt_shift_std_train.pt")

            model.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()

            if (step + 1) % 100 == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(
                    epoch_i + 1,
                    num_epochs,
                    step + 1,
                    total_steps,
                    loss.item()), flush=True
                )

        # max(..., 1) avoids a ZeroDivisionError on an empty dataloader.
        avg_train_loss = total_loss / max(total_steps, 1)
        train_loss.append(avg_train_loss)

        print("  Training epoch took: {:}".format(format_time(time.time() - t0)), flush=True)
        print("  Training loss: {0:.2f}".format(avg_train_loss), flush=True)

        print("Running Validation...", flush=True)

        model.eval()

        eval_loss = 0
        nb_eval_steps = 0

        for batch in val_dataloader:
            chainA = batch[0].to(device)
            chainB = batch[1].to(device)
            seq_len = chainA.size()[1]
            with torch.no_grad():
                outputs = model(input_ids=chainA, head_mask=None, inputs_embeds=None, labels=chainB)
            curr_val_loss = outputs[0]

            eval_loss += curr_val_loss.item() / seq_len
            nb_eval_steps += 1

        if eval_loss < best_val_loss:
            best_val_loss = eval_loss
            torch.save(model.state_dict(), "gpt_shift_std_val.pt")
        # NOTE(review): the 1e7 factor puts validation values on a different
        # scale from the training values — kept as-is; confirm it is wanted.
        avg_val_loss = eval_loss / max(nb_eval_steps, 1) * 1e7  # for scaling
        val_loss.append(avg_val_loss)
        print("  Validation loss: {0:.2f}".format(avg_val_loss), flush=True)
    print("Training complete.", flush=True)
    return train_loss, val_loss


def plot_loss(train_loss, val_loss=None):
    """Plot per-epoch loss curves and save them to "gpt_shift_std.png".

    Args:
        train_loss: Sequence of training losses, drawn in blue.
        val_loss: Optional sequence of validation losses, drawn in red.
            It is passed in explicitly so the function does not depend on a
            notebook-level variable of the same name.
    """
    # A fresh figure per call keeps repeated calls from drawing on top of
    # each other.
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(train_loss, 'b-o', label="Training")
    if val_loss is not None:
        ax.plot(val_loss, 'r-o', label="Validation")
    ax.set_title("Training and Validation Loss")
    ax.set_xlabel("Epoch")
    ax.set_ylabel("Loss")
    ax.legend()
    fig.savefig("gpt_shift_std.png", dpi=200)


train_loss, val_loss = train_model()
# One call draws both curves; a second call with val_loss alone would plot the
# validation curve twice and overwrite the saved image.
plot_loss(train_loss, val_loss)