In [1]:
# Imports

import pandas as pd

In [2]:
# Load the dataset from the CSV file in the working directory into `df`,
# then preview its first five rows.
df = pd.read_csv(filepath_or_buffer='2018-06-06-pdb-intersect-pisces.csv')
df.head(5)

Unnamed: 0,pdb_id,chain_code,seq,sst8,sst3,len,has_nonstd_aa,Exptl.,resolution,R-factor,FreeRvalue
0,1FV1,F,NPVVHFFKNIVTPRTPPPSQ,CCCCCBCCCCCCCCCCCCCC,CCCCCECCCCCCCCCCCCCC,20,False,XRAY,1.9,0.23,0.27
1,1LM8,H,DLDLEMLAPYIPMDDDFQLR,CCCCCCCCCBCCSCCCEECC,CCCCCCCCCECCCCCCEECC,20,False,XRAY,1.85,0.2,0.24
2,1O06,A,EEDPDLKAAIQESLREAEEA,CCCHHHHHHHHHHHHHHHTC,CCCHHHHHHHHHHHHHHHCC,20,False,XRAY,1.45,0.19,0.22
3,1QOW,D,CTFTLPGGGGVCTLTSECI*,CCTTSCTTCSSTTSSTTCCC,CCCCCCCCCCCCCCCCCCCC,20,True,XRAY,1.06,0.14,1.0
4,1RDQ,I,TTYADFIASGRTGRRNAIHD,CHHHHHHTSSCSSCCCCEEC,CHHHHHHCCCCCCCCCCEEC,20,False,XRAY,1.26,0.13,0.16


In [3]:
# drop_duplicates() and dropna() return NEW DataFrames; they do not modify
# `df` in place. The result therefore has to be assigned back, otherwise the
# cleaning is silently discarded and later cells keep using the raw frame.
df = (
    df.drop_duplicates()        # remove exact duplicate rows, if any
      .dropna()                 # drop rows containing NaN values, if any
      .reset_index(drop=True)   # keep a contiguous 0..n-1 index after removals
)
df.info()  # summarise the columns, non-null counts and dtypes


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9078 entries, 0 to 9077
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   pdb_id         9078 non-null   object 
 1   chain_code     9078 non-null   object 
 2   seq            9078 non-null   object 
 3   sst8           9078 non-null   object 
 4   sst3           9078 non-null   object 
 5   len            9078 non-null   int64  
 6   has_nonstd_aa  9078 non-null   bool   
 7   Exptl.         9078 non-null   object 
 8   resolution     9078 non-null   float64
 9   R-factor       9078 non-null   float64
 10  FreeRvalue     9078 non-null   float64
dtypes: bool(1), float64(3), int64(1), object(6)
memory usage: 718.2+ KB


Everything looks fine: all 11 columns have 9078 non-null entries, so there are no missing values.

In [4]:
# Only `sst3` is used as the target further down, so the `sst8` column is dropped.
# errors='ignore' keeps this cell re-runnable: without it, executing the cell a
# second time raises KeyError because 'sst8' has already been removed from `df`.
df = df.drop(columns='sst8', errors='ignore')
df.head()

Unnamed: 0,pdb_id,chain_code,seq,sst3,len,has_nonstd_aa,Exptl.,resolution,R-factor,FreeRvalue
0,1FV1,F,NPVVHFFKNIVTPRTPPPSQ,CCCCCECCCCCCCCCCCCCC,20,False,XRAY,1.9,0.23,0.27
1,1LM8,H,DLDLEMLAPYIPMDDDFQLR,CCCCCCCCCECCCCCCEECC,20,False,XRAY,1.85,0.2,0.24
2,1O06,A,EEDPDLKAAIQESLREAEEA,CCCHHHHHHHHHHHHHHHCC,20,False,XRAY,1.45,0.19,0.22
3,1QOW,D,CTFTLPGGGGVCTLTSECI*,CCCCCCCCCCCCCCCCCCCC,20,True,XRAY,1.06,0.14,1.0
4,1RDQ,I,TTYADFIASGRTGRRNAIHD,CHHHHHHCCCCCCCCCCEEC,20,False,XRAY,1.26,0.13,0.16


In [5]:
# NLP: Tokenization, Bag-of-Words, Word2Vec, Embedding
# Tokenization: split every sequence string in `seq` into a list of its
# individual characters and store that list in a new `tokens` column.
df['tokens'] = df['seq'].map(list)
df.head(5)

Unnamed: 0,pdb_id,chain_code,seq,sst3,len,has_nonstd_aa,Exptl.,resolution,R-factor,FreeRvalue,tokens
0,1FV1,F,NPVVHFFKNIVTPRTPPPSQ,CCCCCECCCCCCCCCCCCCC,20,False,XRAY,1.9,0.23,0.27,"[N, P, V, V, H, F, F, K, N, I, V, T, P, R, T, ..."
1,1LM8,H,DLDLEMLAPYIPMDDDFQLR,CCCCCCCCCECCCCCCEECC,20,False,XRAY,1.85,0.2,0.24,"[D, L, D, L, E, M, L, A, P, Y, I, P, M, D, D, ..."
2,1O06,A,EEDPDLKAAIQESLREAEEA,CCCHHHHHHHHHHHHHHHCC,20,False,XRAY,1.45,0.19,0.22,"[E, E, D, P, D, L, K, A, A, I, Q, E, S, L, R, ..."
3,1QOW,D,CTFTLPGGGGVCTLTSECI*,CCCCCCCCCCCCCCCCCCCC,20,True,XRAY,1.06,0.14,1.0,"[C, T, F, T, L, P, G, G, G, G, V, C, T, L, T, ..."
4,1RDQ,I,TTYADFIASGRTGRRNAIHD,CHHHHHHCCCCCCCCCCEEC,20,False,XRAY,1.26,0.13,0.16,"[T, T, Y, A, D, F, I, A, S, G, R, T, G, R, R, ..."


In [6]:
from gensim.models import TfidfModel
from gensim.corpora import Dictionary


# Build the token <-> integer-id vocabulary from all tokenized sequences.
# The variable is called `vocab` rather than `dict` so that Python's built-in
# `dict` type is not shadowed for every cell executed after this one.
vocab = Dictionary(df['tokens'])

# Bag-of-words: each sequence becomes a list of (token_id, count) pairs.
corpus = [vocab.doc2bow(tokens) for tokens in df['tokens']]

# Fit the TF-IDF weighting on the whole bag-of-words corpus.
model = TfidfModel(corpus)

# Sanity check: print the TF-IDF weight of each distinct token in the first
# sequence. `token_id` is used instead of `id` to avoid shadowing the built-in.
vector = model[corpus[0]]
for token_id, score in vector:
    token = vocab[token_id]
    print(token, score)




F 0.31836795451322486
H 0.34542743198858766
I 0.08838619296927532
K 0.08460335914808222
N 0.2778554324211898
P 0.7712446779234371
Q 0.11658060339151352
R 0.08568386580884284
S 0.04640150869292991
T 0.15840891451449032
V 0.2101205524863875


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from gensim.matutils import corpus2dense

# Encode the target: every distinct value in df['sst3'] becomes one integer id.
# NOTE(review): each row of `sst3` is a whole structure string, so this creates
# one class per distinct string rather than the three classes the Dense(3)
# softmax layer in the model cell expects -- confirm the intended target.
le = LabelEncoder()
y = le.fit_transform(df['sst3'])

# model[corpus] is a lazy gensim corpus of sparse (token_id, weight) lists, not
# an array. Splitting it directly yields plain Python lists, and the model cell
# then fails on `X_train.shape[1]`. Convert it to a dense 2-D matrix first.
# The vocabulary size is derived from `corpus` itself (largest token id + 1).
num_terms = 1 + max((token_id for doc in corpus for token_id, _ in doc), default=-1)
# corpus2dense returns (num_terms, num_docs); transpose to (num_docs, num_terms).
X = corpus2dense(model[corpus], num_terms=num_terms, num_docs=len(corpus)).T

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [8]:
from keras.models import Sequential
from keras.layers import Dense, SimpleRNN

# Recurrent classifier: one SimpleRNN layer (50 units) followed by a 3-unit softmax.
trainer = Sequential()
# `return_sequences` is an argument of the SimpleRNN constructor. The original
# passed it to Sequential.add(), which does not accept it and raises TypeError.
trainer.add(SimpleRNN(50, input_shape=(X_train.shape[1], 1), return_sequences=True))
# With return_sequences=True the RNN emits one vector per timestep, so this Dense
# layer produces a 3-way softmax for every timestep, not one per sample.
trainer.add(Dense(3, activation='softmax'))
# NOTE(review): mean_squared_error is an unusual loss for a softmax classifier;
# a cross-entropy loss is the conventional choice. Left unchanged because the
# right variant depends on how the targets end up being encoded -- confirm.
trainer.compile(loss='mean_squared_error', optimizer='adam')

ModuleNotFoundError: No module named 'tensorflow'

In [None]:
# Train the network on the training split for 10 epochs in mini-batches of 64.
# NOTE(review): the model is declared with input_shape=(X_train.shape[1], 1),
# i.e. it expects 3-D input; presumably X_train needs a trailing axis of size 1
# before being passed here -- confirm.
trainer.fit(
    x=X_train,
    y=y_train,
    epochs=10,
    batch_size=64,
)