## Variant effect prediction: Three models for classifying pathogenic mutations in human genes

To run this demo tutorial, please make sure that you've downloaded the GitHub repo <> and that you've changed the 'main_path' variable below to point to where your installation lives on your local system. If you are not using Google Colab to run, please also set the 'using_google_colab' variable to False.

In [12]:
# Change this to whatever your directory you're working from is - this should be
# the head level with Code and Data directories.
main_path = '/content/drive/MyDrive/DeepLearning_Summer2022/Final_Project/'

# Input the uniprot ID that you would like to make predictions for here.
# For this demonstration, I will use the KCNQ1 gene 
# (https://www.uniprot.org/uniprotkb/P51787/entry)
uniprot_id = 'P51787'

# Set to False if you're running locally
using_google_colab = True

In [50]:
import matplotlib.pyplot as plt
import pandas as pd
import keras
import numpy as np
from keras import models
from keras import layers
from keras.layers import GRU, LSTM
from keras.layers import Dropout
from keras.utils.np_utils import to_categorical
import copy
import random as python_random
import tensorflow as tf
from tensorflow.keras import regularizers
from keras.callbacks import ModelCheckpoint
from google.colab import files
import sklearn.preprocessing
import scipy

import sys
import importlib

# set seeds
np.random.seed(768)
python_random.seed(869)
tf.random.set_seed(1234)

if using_google_colab:
  from google.colab import drive
  drive.mount('/content/drive')

sys.path.append(main_path+"Code")

# This is our library with our class definitions
import vep
importlib.reload(vep)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


<module 'vep' from '/content/drive/MyDrive/DeepLearning_Summer2022/Final_Project/Code/vep.py'>

## Model 1 - predicting pathogenicity based just on the original ("wildtype") and the mutant amino acids

In [25]:
import requests

url = requests.get('https://rest.uniprot.org/uniprotkb/P51787.fasta')
if url:
  fa = url.content.decode('utf-8')

len(fa)

809

In [32]:
kcnq1 = vep.Protein(uniprot_id)
kcnq1.sequence

'MAAASSPPRAERKRWGWGRLPGARRGSAGLAKKCPFSLELAEGGPAGGALYAPIAPGAPGPAPPASPAAPAAPPVASDLGPRPPVSLDPRVSIYSTRRPVLARTHVQGRVYNFLERPTGWKCFVYHFAVFLIVLVCLIFSVLSTIEQYAALATGTLFWMEIVLVVFFGTEYVVRLWSAGCRSKYVGLWGRLRFARKPISIIDLIVVVASMVVLCVGSKGQVFATSAIRGIRFLQILRMLHVDRQGGTWRLLGSVVFIHRQELITTLYIGFLGLIFSSYFVYLAEKDAVNESGRVEFGSYADALWWGVVTVTTIGYGDKVPQTWVGKTIASCFSVFAISFFALPAGILGSGFALKVQQKQRQKHFNRQIPAAASLIQTAWRCYAAENPDSSTWKIYIRKAPRSHTLLSPSPKPKKSVVVKKKKFKLDKDNGVTPGEKMLTVPHITCDPPEERRLDHFSVDGYDSSVRKSPTLLEVSMPHFMRTNSFAEDLDLEGETLLTPITHISQLREHHRATIKVIRRMQYFVAKKKFQQARKPYDVRDVIEQYSQGHLNLMVRIKELQRRLDQSIGKPSLFISVSEKSKDRGSNTIGARLNRVEDKVTQLDQRLALITDMLHQLLSLHGGSTPGSGGPPREGGAHITQPCGSGGSVDPELFLPSNTLPTYEQLTVPRRGPDEGS'

In [27]:
''.join(fa.split('\n')[1:])

'MAAASSPPRAERKRWGWGRLPGARRGSAGLAKKCPFSLELAEGGPAGGALYAPIAPGAPGPAPPASPAAPAAPPVASDLGPRPPVSLDPRVSIYSTRRPVLARTHVQGRVYNFLERPTGWKCFVYHFAVFLIVLVCLIFSVLSTIEQYAALATGTLFWMEIVLVVFFGTEYVVRLWSAGCRSKYVGLWGRLRFARKPISIIDLIVVVASMVVLCVGSKGQVFATSAIRGIRFLQILRMLHVDRQGGTWRLLGSVVFIHRQELITTLYIGFLGLIFSSYFVYLAEKDAVNESGRVEFGSYADALWWGVVTVTTIGYGDKVPQTWVGKTIASCFSVFAISFFALPAGILGSGFALKVQQKQRQKHFNRQIPAAASLIQTAWRCYAAENPDSSTWKIYIRKAPRSHTLLSPSPKPKKSVVVKKKKFKLDKDNGVTPGEKMLTVPHITCDPPEERRLDHFSVDGYDSSVRKSPTLLEVSMPHFMRTNSFAEDLDLEGETLLTPITHISQLREHHRATIKVIRRMQYFVAKKKFQQARKPYDVRDVIEQYSQGHLNLMVRIKELQRRLDQSIGKPSLFISVSEKSKDRGSNTIGARLNRVEDKVTQLDQRLALITDMLHQLLSLHGGSTPGSGGPPREGGAHITQPCGSGGSVDPELFLPSNTLPTYEQLTVPRRGPDEGS'

In [35]:
np.array([1,1,1,2,2,2,3,4,5,6,7,8]).reshape(4,3)

array([[1, 1, 1],
       [2, 2, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [36]:
# Call and fit sklearn's one hot encoder for our possible labels
possible_labels = np.array(['pathogenic','benign']).reshape(-1,1)
ohe_label_fx = sklearn.preprocessing.OneHotEncoder()
ohe_label_fx = ohe_label_fx.fit(possible_labels)

pathogenic_index = ohe_label_fx.categories_

In [42]:
np.where(pathogenic_index == 'pathogenic')

(array([], dtype=int64),)

In [49]:
list(pathogenic_index[0]).index('pathogenic')

1

In [48]:
np.where(pathogenic_index[0]=='pathogenic')

(array([1]),)