# Calculate Features
This notebook takes a dataset of protein sequences with fold type classifications and calculates a feature vector for each protein sequence.

In [6]:
# parameters
n_gram = 2 # size of n-gram (number of residues per n-gram; 2 = bi-grams)
feature_col = "features" # name of the column that will hold the feature vector
value_col = "foldType" # name of the column holding the fold type classification

In [7]:
from gensim.models import Word2Vec                                                                                                                                                      
import pandas as pd
import numpy as np     

In [9]:
# load the dataset of protein sequences and fold types from JSON
df = pd.read_json("./secondaryStructure.json")

### Split sequence into n-grams (bi-grams when `n_gram = 2`)

In [10]:
def ngrammer(s, n):
    '''Split a sequence into overlapping n-grams.

    A window of width ``n`` slides over ``s`` one position at a time, so a
    sequence of length ``L`` yields ``L - n + 1`` n-grams (none when
    ``L < n``).

    Args:
       s (string): sequence of amino acids
       n (int): size of each n-gram; must be at least 1

    Returns:
       list: the n-grams, ordered by their start position in ``s``

    Raises:
       ValueError: if ``n`` is smaller than 1
    '''
    if n < 1:
        # A non-positive width would silently yield empty or mis-sliced strings.
        raise ValueError("n must be a positive integer, got %r" % (n,))
    # range() is empty when the sequence is shorter than n (including s == "").
    return [s[i:i + n] for i in range(len(s) - n + 1)]

In [11]:
# tokenize every sequence into its list of overlapping n-grams
df["ngram"] = df["sequence"].apply(ngrammer, n=n_gram)
df.head(3)

Unnamed: 0,Exptl.,FreeRvalue,R-factor,alpha,beta,coil,foldType,length,pdbChainId,resolution,secondary_structure,sequence,ngram
0,XRAY,0.29,0.16,0.345455,0.206061,0.448485,alpha+beta,330,12AS.A,2.2,CCCCHHHHHHHHHHHHHHHHHHHHHHHCEEECCCCSEEETTSSCSC...,MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQD...,"[MK, KT, TA, AY, YI, IA, AK, KQ, QR, RQ, QI, I..."
1,XRAY,0.26,0.19,0.469945,0.046448,0.483607,alpha,366,16VP.A,2.1,CCSCCCCCCCCHHHHHHHHHHHHTCTTHHHHHHHHHHCCCCCSTTS...,SRMPSPPMPVPPAALFNRLLDDLGFSAGPALCTMLDTWNEDLFSAL...,"[SR, RM, MP, PS, SP, PP, PM, MP, PV, VP, PP, P..."
10,XRAY,0.28,0.21,0.393103,0.186207,0.42069,alpha+beta,290,1A7J.A,2.5,CCTTSCEEEEESCCCCCCCTHHHHHHHHHHHHTCCEEEEEGGGGBS...,MSKKHPIISVTGSSGAGTSTVKHTFDQIFRREGVKAVSIEGDAFHR...,"[MS, SK, KK, KH, HP, PI, II, IS, SV, VT, TG, G..."


In [12]:
def ave_prot_vec(ngrams, word_vec):
    '''Average the word vectors of a protein's n-grams into one feature vector.

    N-grams for which the lookup raises ``KeyError`` (i.e. that are not in
    the vocabulary) are left out of the average. Any other error raised by
    the lookup is propagated instead of being silently ignored.

    Args:
       ngrams (iterable of string): n-grams of one protein sequence
       word_vec: object exposing ``vector_size`` and a ``word_vec(key)``
          lookup, e.g. ``model.wv`` of a trained Word2Vec model

    Returns:
       numpy.ndarray: mean of the vectors of all n-grams found in the
       vocabulary; an all-NaN vector of length ``vector_size`` when none of
       the n-grams is in the vocabulary
    '''
    total = np.zeros(word_vec.vector_size)
    count = 0
    for ng in ngrams:
        try:
            vec = word_vec.word_vec(ng)
        except KeyError:
            # n-gram is not in the vocabulary: leave it out of the average
            continue
        total = total + vec
        count += 1

    if count == 0:
        # Same all-NaN result the former 0/0 division produced, but explicit
        # and without triggering a numpy RuntimeWarning.
        return np.full(word_vec.vector_size, np.nan)

    return total / count

### Convert sequence to word2vec

In [15]:
# get ngram column as a list
ngrams = list(df.ngram)

# create and train the word2vec model
# Passing the corpus to the constructor already builds the vocabulary and runs
# all training epochs, so no separate model.train() call is made here: it would
# train a second time on the same data.
#   size=50      -> dimensionality of each n-gram vector
#   window=13    -> context window (in n-grams) around the current n-gram
#   min_count=5  -> ignore n-grams occurring fewer than 5 times in the corpus
# NOTE(review): `size` is the pre-4.0 gensim keyword (later renamed to
# `vector_size`) - confirm against the installed gensim version.
model = Word2Vec(ngrams, size=50, window=13, min_count=5)

# create a feature vector by averaging the word vector for the n-grams of each protein chain
df[feature_col] = df.ngram.apply(lambda ng: ave_prot_vec(ng, model.wv))

df.head(3)

Unnamed: 0,Exptl.,FreeRvalue,R-factor,alpha,beta,coil,foldType,length,pdbChainId,resolution,secondary_structure,sequence,ngram,features
0,XRAY,0.29,0.16,0.345455,0.206061,0.448485,alpha+beta,330,12AS.A,2.2,CCCCHHHHHHHHHHHHHHHHHHHHHHHCEEECCCCSEEETTSSCSC...,MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQD...,"[MK, KT, TA, AY, YI, IA, AK, KQ, QR, RQ, QI, I...","[-0.0050395432786059235, 0.6616196698143988, -..."
1,XRAY,0.26,0.19,0.469945,0.046448,0.483607,alpha,366,16VP.A,2.1,CCSCCCCCCCCHHHHHHHHHHHHTCTTHHHHHHHHHHCCCCCSTTS...,SRMPSPPMPVPPAALFNRLLDDLGFSAGPALCTMLDTWNEDLFSAL...,"[SR, RM, MP, PS, SP, PP, PM, MP, PV, VP, PP, P...","[-0.18981191015815083, 0.6604804486010785, -0...."
10,XRAY,0.28,0.21,0.393103,0.186207,0.42069,alpha+beta,290,1A7J.A,2.5,CCTTSCEEEEESCCCCCCCTHHHHHHHHHHHHTCCEEEEEGGGGBS...,MSKKHPIISVTGSSGAGTSTVKHTFDQIFRREGVKAVSIEGDAFHR...,"[MS, SK, KK, KH, HP, PI, II, IS, SV, VT, TG, G...","[-0.03272161577992579, -0.0687293606523463, -0..."


### Save DataFrame

In [16]:
# write the DataFrame, including the added n-gram and feature columns, to JSON
df.to_json("./features.json")