In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

df_seq = pd.read_csv('data/pdb_data_seq.csv')
df_features = pd.read_csv('data/pdb_data_no_dups.csv')

In [2]:
protein_char = df_features[df_features.macromoleculeType == 'Protein']
protein_seq = df_seq[df_seq.macromoleculeType == 'Protein']

protein_char = protein_char[['structureId','classification']]
protein_seq = protein_seq[['structureId','sequence']]
protein_seq.head()

Unnamed: 0,structureId,sequence
4,101M,MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...
7,102L,MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAAKSE...
8,102M,MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...
11,103L,MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNSLDAAK...
12,103M,MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...


In [3]:
protein_char.head()

Unnamed: 0,structureId,classification
2,101M,OXYGEN TRANSPORT
4,102L,HYDROLASE(O-GLYCOSYL)
5,102M,OXYGEN TRANSPORT
7,103L,HYDROLASE(O-GLYCOSYL)
8,103M,OXYGEN TRANSPORT


In [4]:
model_f = protein_char.set_index('structureId').join(protein_seq.set_index('structureId'))
model_f.head()

Unnamed: 0_level_0,classification,sequence
structureId,Unnamed: 1_level_1,Unnamed: 2_level_1
101M,OXYGEN TRANSPORT,MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...
102L,HYDROLASE(O-GLYCOSYL),MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAAKSE...
102M,OXYGEN TRANSPORT,MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...
103L,HYDROLASE(O-GLYCOSYL),MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNSLDAAK...
103M,OXYGEN TRANSPORT,MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...


In [5]:
model_f = model_f.dropna()

In [10]:
counts = model_f.classification.value_counts()
types = np.asarray(counts[(counts > 1000)].index)
data = model_f[model_f.classification.isin(types)]
data.drop_duplicates()

Unnamed: 0_level_0,classification,sequence
structureId,Unnamed: 1_level_1,Unnamed: 2_level_1
10GS,TRANSFERASE/TRANSFERASE INHIBITOR,PPYTVVYFPVRGRCAALRMLLADQGQSWKEEVVTVETWQEGSLKAS...
117E,HYDROLASE,TYTTRQIGAKNTLEYKVYIEKDGKPVSAFHDIPLYADKENNIFNMV...
11AS,LIGASE,MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQD...
11BA,HYDROLASE,KESAAAKFERQHMDSGNSPSSSSNYCNLMMCCRKMTQGKCKPVNTF...
11GS,TRANSFERASE,MPPYTVVYFPVRGRCAALRMLLADQGQSWKEEVVTVETWQEGSLKA...
12GS,TRANSFERASE/TRANSFERASE INHIBITOR,MPPYTVVYFPVRGRCAALRMLLADQGQSWKEEVVTVETWQEGSLKA...
155C,ELECTRON TRANSPORT,XNEGDAAKGEKEFNKCKACHMIQAPDGTDIKGGKTGPNLYGVVGRK...
177L,HYDROLASE,MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSEL...
17GS,TRANSFERASE,MPPYTVVYFPVRGRCAALRMLLADQGQSWKEEVVTVETWQEGSLKA...
19GS,TRANSFERASE,PPYTVVYFPVRGRCAALRMLLADQGQSWKEEVVTVETWQEGSLKAS...


In [11]:
X_train, X_test,y_train,y_test = train_test_split(data['sequence'], data['classification'], 
                                                  test_size = 0.2, random_state = 1)

vect = CountVectorizer(analyzer = 'char_wb', ngram_range = (4,4))

vect.fit(X_train)
X_train_df = vect.transform(X_train)
X_test_df = vect.transform(X_test)

print(vect.get_feature_names()[-20:])

KeyboardInterrupt: 

In [None]:
prediction = dict()

from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(X_train_df, y_train)
NB_pred = model.predict(X_test_df)
prediction["MultinomialNB"] = accuracy_score(NB_pred, y_test)
print( prediction['MultinomialNB'])

In [None]:
from sklearn.preprocessing import LabelEncoder

def tokenize(seq):
    label_encoder = LabelEncoder()
    integer_encoded = label_encoder.fit_transform(list(seq))
    return integer_encoded

max_len = data.sequence.str.len().max()

X = list(map(lambda x: tokenize(x), data.sequence))
X = list(map(lambda x: [-1] * (max_len - len(x)), X))
X = np.array(X)

y = LabelEncoder().fit_transform(data.classification.values)

In [12]:
data

Unnamed: 0_level_0,classification,sequence
structureId,Unnamed: 1_level_1,Unnamed: 2_level_1
10GS,TRANSFERASE/TRANSFERASE INHIBITOR,PPYTVVYFPVRGRCAALRMLLADQGQSWKEEVVTVETWQEGSLKAS...
10GS,TRANSFERASE/TRANSFERASE INHIBITOR,PPYTVVYFPVRGRCAALRMLLADQGQSWKEEVVTVETWQEGSLKAS...
117E,HYDROLASE,TYTTRQIGAKNTLEYKVYIEKDGKPVSAFHDIPLYADKENNIFNMV...
117E,HYDROLASE,TYTTRQIGAKNTLEYKVYIEKDGKPVSAFHDIPLYADKENNIFNMV...
11AS,LIGASE,MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQD...
11AS,LIGASE,MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQD...
11BA,HYDROLASE,KESAAAKFERQHMDSGNSPSSSSNYCNLMMCCRKMTQGKCKPVNTF...
11BA,HYDROLASE,KESAAAKFERQHMDSGNSPSSSSNYCNLMMCCRKMTQGKCKPVNTF...
11BG,HYDROLASE,KESAAAKFERQHMDSGNSPSSSSNYCNLMMCCRKMTQGKCKPVNTF...
11BG,HYDROLASE,KESAAAKFERQHMDSGNSPSSSSNYCNLMMCCRKMTQGKCKPVNTF...


In [None]:
X