In [1]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report, accuracy_score

In [2]:
df = pd.read_csv('protein_df.csv')

In [3]:
df.head()

Unnamed: 0,classification,sequence
0,TRANSFERASE/TRANSFERASE INHIBITOR,PPYTVVYFPVRGRCAALRMLLADQGQSWKEEVVTVETWQEGSLKAS...
1,TRANSFERASE/TRANSFERASE INHIBITOR,PPYTVVYFPVRGRCAALRMLLADQGQSWKEEVVTVETWQEGSLKAS...
2,HYDROLASE,TYTTRQIGAKNTLEYKVYIEKDGKPVSAFHDIPLYADKENNIFNMV...
3,HYDROLASE,TYTTRQIGAKNTLEYKVYIEKDGKPVSAFHDIPLYADKENNIFNMV...
4,LIGASE,MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQD...


In [4]:
counts = df['classification'].value_counts()
types = np.asarray(counts[(counts > 1000)].index)
data = df[df.classification.isin(types)]

In [5]:
types

array(['HYDROLASE', 'TRANSFERASE', 'OXIDOREDUCTASE', 'IMMUNE SYSTEM',
       'LYASE', 'HYDROLASE/HYDROLASE INHIBITOR', 'TRANSCRIPTION',
       'VIRAL PROTEIN', 'TRANSPORT PROTEIN', 'VIRUS', 'SIGNALING PROTEIN',
       'ISOMERASE', 'LIGASE', 'MEMBRANE PROTEIN', 'PROTEIN BINDING',
       'STRUCTURAL PROTEIN', 'CHAPERONE',
       'STRUCTURAL GENOMICS, UNKNOWN FUNCTION', 'SUGAR BINDING PROTEIN',
       'DNA BINDING PROTEIN', 'PHOTOSYNTHESIS', 'ELECTRON TRANSPORT',
       'TRANSFERASE/TRANSFERASE INHIBITOR', 'METAL BINDING PROTEIN',
       'CELL ADHESION', 'UNKNOWN FUNCTION', 'PROTEIN TRANSPORT', 'TOXIN',
       'CELL CYCLE', 'RNA BINDING PROTEIN', 'DE NOVO PROTEIN', 'HORMONE',
       'GENE REGULATION', 'OXIDOREDUCTASE/OXIDOREDUCTASE INHIBITOR',
       'APOPTOSIS', 'MOTOR PROTEIN', 'PROTEIN FIBRIL', 'METAL TRANSPORT',
       'VIRAL PROTEIN/IMMUNE SYSTEM', 'CONTRACTILE PROTEIN',
       'FLUORESCENT PROTEIN', 'TRANSLATION', 'BIOSYNTHETIC PROTEIN'],
      dtype=object)

In [6]:
df.isnull().sum()

classification    0
sequence          3
dtype: int64

In [7]:
df = df.dropna()

In [8]:
X = df.sequence
y = df.classification
# Split Data
X_train, X_test,y_train,y_test = train_test_split(X, y, test_size = 0.25, random_state = 1)

In [9]:
# Create a Count Vectorizer to gather the unique elements in sequence
vect = CountVectorizer(analyzer = 'char_wb', ngram_range = (3,3))

#fit and transform
X_train = vect.fit_transform(X_train)
X_test = vect.transform(X_test)

### Model 1 - Naive Bayes classifier

In [10]:
from sklearn.naive_bayes import MultinomialNB

In [11]:
nb = MultinomialNB()

In [12]:
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)

In [13]:
accuracy_score(y_pred, y_test)

0.5613121620264785

In [None]:
nb.predict()

In [14]:
confusion_matrix(y_pred, y_test)

array([[ 155,    0,    0, ...,    5,    6,    2],
       [   1,  165,    1, ...,    0,    0,    0],
       [   0,    0,  476, ...,   40,    0,   32],
       ...,
       [   0,    0,    9, ...,  943,   29,   53],
       [   0,    0,    1, ...,  114,  165,    4],
       [   1,    0,    0, ...,  148,    2, 1051]])

In [15]:
print(classification_report(y_test, y_pred, target_names= types))

                                         precision    recall  f1-score   support

                              HYDROLASE       0.36      0.49      0.42       314
                            TRANSFERASE       0.54      0.64      0.59       257
                         OXIDOREDUCTASE       0.36      0.65      0.46       732
                          IMMUNE SYSTEM       0.50      0.38      0.43       675
                                  LYASE       0.85      0.55      0.67      1074
          HYDROLASE/HYDROLASE INHIBITOR       0.66      0.71      0.68       285
                          TRANSCRIPTION       0.45      0.76      0.56       397
                          VIRAL PROTEIN       0.48      0.48      0.48       784
                      TRANSPORT PROTEIN       0.58      0.60      0.59       741
                                  VIRUS       0.93      0.93      0.93       259
                      SIGNALING PROTEIN       0.77      0.48      0.59       412
                           

### Model 2 - AdaBoost


In [16]:
from sklearn.ensemble import AdaBoostClassifier

In [17]:
ab = AdaBoostClassifier()

In [18]:
ab.fit(X_train,y_train)
y_pred = ab.predict(X_test)
accuracy_score(y_pred, y_test)

0.21230116040563995

In [19]:
confusion_matrix(y_pred, y_test)

array([[  0,   0,   0, ...,   0,   0,   0],
       [  0,   0,   0, ...,   0,   0,   0],
       [  0,   0,   0, ...,   0,   0,   0],
       ...,
       [  0,   0,   0, ...,  33,   2,   0],
       [  0,   0,   0, ...,   0,   0,   0],
       [  0,   0,   6, ...,  41,   0, 604]])

In [20]:
print(classification_report(y_test, y_pred, target_names= types))

  _warn_prf(average, modifier, msg_start, len(result))


                                         precision    recall  f1-score   support

                              HYDROLASE       0.00      0.00      0.00       314
                            TRANSFERASE       0.00      0.00      0.00       257
                         OXIDOREDUCTASE       0.00      0.00      0.00       732
                          IMMUNE SYSTEM       0.00      0.00      0.00       675
                                  LYASE       0.00      0.00      0.00      1074
          HYDROLASE/HYDROLASE INHIBITOR       0.53      0.30      0.38       285
                          TRANSCRIPTION       0.00      0.00      0.00       397
                          VIRAL PROTEIN       0.00      0.00      0.00       784
                      TRANSPORT PROTEIN       0.00      0.00      0.00       741
                                  VIRUS       0.23      0.48      0.31       259
                      SIGNALING PROTEIN       0.58      0.40      0.47       412
                           