# Calculate Features
This notebooks takes a dataset with protein sequence and fold type classification and calculates a feature vector for each protein sequence.

In [1]:
# parameters
n_gram = 2 # size of n-gram
feature_col = "features" # feature vector
value_col = "foldType" # the 

In [2]:
from gensim.models import Word2Vec                                                                                                                                                      
import pandas as pd
import numpy as np     

In [3]:
df = pd.read_json("./secondaryStructure.json")

### Split sequence into n-grams

In [4]:
def ngrammer(s,n):
    '''helper funtion to split sequence into n-grams
    
    Attributes:
       s (string): sequence of amino acid
       n (int): n-gram
    '''
    ngram = []                                                                                
    i = 0                                                                                     
    if len(s) < 1:                                                                            
        return []                                                                             
    while i < len(s) - n + 1:                                                                 
        ngram.append(s[i: i + n])                                                             
        i += 1                                                                                
    return ngram

In [5]:
df['ngram'] = df.sequence.apply(ngrammer, n=n_gram)
df.head(3)

Unnamed: 0,Exptl.,FreeRvalue,R-factor,alpha,beta,coil,foldType,length,pdbChainId,resolution,secondary_structure,sequence,ngram
1,XRAY,0.26,0.19,0.469945,0.046448,0.483607,alpha,366,16VP.A,2.1,CCSCCCCCCCCHHHHHHHHHHHHTCTTHHHHHHHHHHCCCCCSTTS...,SRMPSPPMPVPPAALFNRLLDDLGFSAGPALCTMLDTWNEDLFSAL...,"[SR, RM, MP, PS, SP, PP, PM, MP, PV, VP, PP, P..."
1000,XRAY,0.23,0.18,0.50463,0.00463,0.490741,alpha,216,1PBW.B,2.0,CCCCCCCCCCCCCCHHHHCCTTSCSCHHHHHHHHHHHHHHTTCTTT...,MEADVEQQALTLPDLAEQFAPPDIAPPLLIKLVEAIEKKGLECSTL...,"[ME, EA, AD, DV, VE, EQ, QQ, QA, AL, LT, TL, L..."
10002,XRAY,0.26,0.22,0.716172,0.006601,0.277228,alpha,303,4TQ3.A,2.408,CCCCCCCCCCCCCCCHHHHHHCGGGGHHHHHHHHHHHHHHCCTTSC...,MDSSLANINQIDVPSKYLRLLRPVAWLCFLLPYAVGFGFGITPNAS...,"[MD, DS, SS, SL, LA, AN, NI, IN, NQ, QI, ID, D..."


In [6]:
def ave_prot_vec(ngrams, word_vec):
    apv = np.zeros(word_vec.vector_size)
    count = 0
    for ng in ngrams:
        try:
            apv = apv + word_vec.word_vec(ng)
            count = count + 1
        except:
            pass
            

    apv = apv/count
    
    return apv

### Convert sequence to word2vec

In [7]:
# get ngram column as a list
ngrams = list(df.ngram)

# create word2vec model
model = Word2Vec(ngrams, size=50, window=13, min_count=3)

# train word2vec model
model.train(ngrams, total_examples=model.corpus_count, epochs=model.epochs)

# create a feature vector by averaging the word vector for the n-grams of each protein chain
df[feature_col] = df.ngram.apply(lambda ng: ave_prot_vec(ng, model.wv))

df.head(3)

Unnamed: 0,Exptl.,FreeRvalue,R-factor,alpha,beta,coil,foldType,length,pdbChainId,resolution,secondary_structure,sequence,ngram,features
1,XRAY,0.26,0.19,0.469945,0.046448,0.483607,alpha,366,16VP.A,2.1,CCSCCCCCCCCHHHHHHHHHHHHTCTTHHHHHHHHHHCCCCCSTTS...,SRMPSPPMPVPPAALFNRLLDDLGFSAGPALCTMLDTWNEDLFSAL...,"[SR, RM, MP, PS, SP, PP, PM, MP, PV, VP, PP, P...","[0.36903541487935065, 0.3921923931700828, -0.3..."
1000,XRAY,0.23,0.18,0.50463,0.00463,0.490741,alpha,216,1PBW.B,2.0,CCCCCCCCCCCCCCHHHHCCTTSCSCHHHHHHHHHHHHHHTTCTTT...,MEADVEQQALTLPDLAEQFAPPDIAPPLLIKLVEAIEKKGLECSTL...,"[ME, EA, AD, DV, VE, EQ, QQ, QA, AL, LT, TL, L...","[0.4600808152235871, 0.11398545059397124, -0.2..."
10002,XRAY,0.26,0.22,0.716172,0.006601,0.277228,alpha,303,4TQ3.A,2.408,CCCCCCCCCCCCCCCHHHHHHCGGGGHHHHHHHHHHHHHHCCTTSC...,MDSSLANINQIDVPSKYLRLLRPVAWLCFLLPYAVGFGFGITPNAS...,"[MD, DS, SS, SL, LA, AN, NI, IN, NQ, QI, ID, D...","[0.3141786257014764, 0.27331427708946615, -0.2..."


### Save DataFrame

In [8]:
df.to_json("./features.json")

## Next step
After you saved the dataset here, go back to the [0-Workflow.ipynb](./0-Workflow.ipynb) to run the next step of the analysis.