# Make a Prediction

In [1]:
# PDB chain: 5NHN.A (beta)
# One-letter protein sequence (230 residues), kept as three chunks for readability
# and joined into a single string.
sequence = "".join([
    "SKGEELFTGVVPILVELDGDVNGHKFSVRGEGEGDATNGKLTLKFICTTGKLPVPWPTLVTTLTYGVQCFSRYPDHMKRH",
    "DFFKSAMPEGYVQERTISFKDDGTYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNFNSFNVYITADKQKNGI",
    "KANFKIRHNVEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSVLSKDPNEKRDHMVLLEFVTAAGITH",
])
print(sequence)

SKGEELFTGVVPILVELDGDVNGHKFSVRGEGEGDATNGKLTLKFICTTGKLPVPWPTLVTTLTYGVQCFSRYPDHMKRHDFFKSAMPEGYVQERTISFKDDGTYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNFNSFNVYITADKQKNGIKANFKIRHNVEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSVLSKDPNEKRDHMVLLEFVTAAGITH


In [2]:
from gensim.models import Word2Vec
from sklearn import svm

try:
    # scikit-learn >= 0.23 no longer ships sklearn.externals.joblib;
    # the standalone joblib package is the supported replacement.
    import joblib
except ImportError:
    # Fallback for old environments that only have the copy vendored in scikit-learn.
    from sklearn.externals import joblib

import word2vecutils

In [3]:
# parameters
# n_gram is passed to word2vecutils.ngrammer below to set the length of each n-gram.
# NOTE(review): presumably this must match the n-gram size the Word2Vec model was
# trained with — confirm against the training notebook.
n_gram = 3 # size of n-gram

## Load Word2vec model

In [4]:
# Load the previously saved gensim Word2Vec model from the working directory.
# Its word vectors (model.wv) are used further down to embed the n-grams.
model = Word2Vec.load("./word2vecmodel")

## Load Classifier model

In [5]:
# Load the previously saved classifier from the working directory.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code —
# only load a classifier file that comes from a trusted source.
classifier = joblib.load("./classifier")

In [6]:
# Display the loaded estimator's repr to check its type and hyperparameters.
classifier

SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=13, shrinking=True,
  tol=0.001, verbose=False)

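## Input a Protein Sequence

The input is the `sequence` string defined in the first code cell of this notebook; edit it there to classify a different protein chain.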

## Calculate n-grams

In [7]:
# Split the sequence into n-grams of length n_gram using the project helper.
# The printed result shows overlapping n-grams that advance one residue at a time
# ('SKG', 'KGE', 'GEE', ...).
ngrams = word2vecutils.ngrammer(sequence, n_gram)
print(ngrams)

['SKG', 'KGE', 'GEE', 'EEL', 'ELF', 'LFT', 'FTG', 'TGV', 'GVV', 'VVP', 'VPI', 'PIL', 'ILV', 'LVE', 'VEL', 'ELD', 'LDG', 'DGD', 'GDV', 'DVN', 'VNG', 'NGH', 'GHK', 'HKF', 'KFS', 'FSV', 'SVR', 'VRG', 'RGE', 'GEG', 'EGE', 'GEG', 'EGD', 'GDA', 'DAT', 'ATN', 'TNG', 'NGK', 'GKL', 'KLT', 'LTL', 'TLK', 'LKF', 'KFI', 'FIC', 'ICT', 'CTT', 'TTG', 'TGK', 'GKL', 'KLP', 'LPV', 'PVP', 'VPW', 'PWP', 'WPT', 'PTL', 'TLV', 'LVT', 'VTT', 'TTL', 'TLT', 'LTY', 'TYG', 'YGV', 'GVQ', 'VQC', 'QCF', 'CFS', 'FSR', 'SRY', 'RYP', 'YPD', 'PDH', 'DHM', 'HMK', 'MKR', 'KRH', 'RHD', 'HDF', 'DFF', 'FFK', 'FKS', 'KSA', 'SAM', 'AMP', 'MPE', 'PEG', 'EGY', 'GYV', 'YVQ', 'VQE', 'QER', 'ERT', 'RTI', 'TIS', 'ISF', 'SFK', 'FKD', 'KDD', 'DDG', 'DGT', 'GTY', 'TYK', 'YKT', 'KTR', 'TRA', 'RAE', 'AEV', 'EVK', 'VKF', 'KFE', 'FEG', 'EGD', 'GDT', 'DTL', 'TLV', 'LVN', 'VNR', 'NRI', 'RIE', 'IEL', 'ELK', 'LKG', 'KGI', 'GID', 'IDF', 'DFK', 'FKE', 'KED', 'EDG', 'DGN', 'GNI', 'NIL', 'ILG', 'LGH', 'GHK', 'HKL', 'KLE', 'LEY', 'EYN', 'YNF', 'NFN'

## Calculate Feature Vector using Word2vec Model

In [8]:
# Collapse the n-grams into one numeric feature vector using the model's word vectors.
# NOTE(review): the helper name suggests it averages the per-n-gram vectors and then
# scales the result — confirm in word2vecutils.
featureVector = word2vecutils.average_word_vec_scaled(ngrams, model.wv)
print(featureVector)

[ 1.28778052  0.44674076  1.12697301 -0.58095954 -0.17015619 -1.39887458
 -0.49693654  1.49372242 -0.01038107 -0.86541491 -1.79464511 -0.80845894
 -0.76976113 -1.47495654 -0.42284002 -1.61085131 -1.12840114 -1.76804052
 -0.44402376 -0.54647153 -0.81864762  0.07283831 -1.18833596 -0.21399596
  1.87218445 -0.22772358  0.93551314 -0.20331395  1.92181731  0.19185781
  0.91030794 -0.21221369 -0.6987725   0.64262204  0.40463771 -0.02731242
  1.16311523 -0.70965216  0.84768798  2.4154429  -0.5087721   0.90970767
 -1.29587341  1.17157026  0.04594684 -0.06091798  0.6589035   1.08781872
  0.10689725  0.74261839]


## Predict Fold Class

In [9]:
# The feature vector is wrapped in a list so the classifier receives a batch
# containing a single sample; index [0] below picks that sample's result.
# NOTE: for an SVC trained with probability=True, scikit-learn documents that
# predict_proba may not always agree with predict.
probabilities = classifier.predict_proba([featureVector])
predictions = classifier.predict([featureVector])

# Header, class labels and per-class probabilities, one item per line
# (probabilities are in the same order as classifier.classes_).
print("Probabilities:", classifier.classes_, probabilities[0], sep="\n")
print("\nPrediction:", predictions[0])

Probabilities:
['alpha' 'alpha+beta' 'beta']
[0.0044486  0.13276207 0.86278933]

Prediction: beta
