In [111]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.feature_extraction import FeatureHasher
from sklearn.preprocessing import MinMaxScaler

In [112]:
csv_train = pd.read_csv('../data/train.csv')
csv_predict = pd.read_csv('../data/test.csv')
csv_train

Unnamed: 0,Sequence,Active
0,DKWL,0
1,FCHN,0
2,KDQP,0
3,FNWI,0
4,NKRM,0
...,...,...
111995,GSME,0
111996,DLPT,0
111997,SGHC,0
111998,KIGT,0


In [113]:
print('Value count:')
print(csv_train.Active.value_counts())
print('%.2f %% of all proteins are active' % (csv_train.Active.value_counts()[1]/csv_train.Active.value_counts()[0]*100))

Value count:
0    107787
1      4213
Name: Active, dtype: int64
3.91 % of all proteins are active


In [114]:
# reduce data size for faster prototyping
data_train = csv_train.iloc[0:10000,:]
data_train.head()

Unnamed: 0,Sequence,Active
0,DKWL,0
1,FCHN,0
2,KDQP,0
3,FNWI,0
4,NKRM,0


In [117]:
# seperate features from labels
y = data_train.drop(['Sequence'], axis=1).values.ravel()

# feature engineering: every site seperate feature
sites = pd.DataFrame({'site_0':[],
                      'site_1':[],
                      'site_2':[],
                      'site_3':[]})
i = 0
for sequence in  data_train['Sequence']:
    sites.loc[i,'site_0'] = sequence[0]
    sites.loc[i,'site_1'] = sequence[1]
    sites.loc[i,'site_2'] = sequence[2]
    sites.loc[i,'site_3'] = sequence[3]
    i += 1

sites.head()

Unnamed: 0,site_0,site_1,site_2,site_3
0,D,K,W,L
1,F,C,H,N
2,K,D,Q,P
3,F,N,W,I
4,N,K,R,M


In [118]:
# feature extracion
h = FeatureHasher(n_features=4)
X = h.transform(sites.to_dict('records')).toarray()
X

array([[ 0., -1.,  0.,  1.],
       [ 0.,  0., -2.,  0.],
       [-1.,  0.,  0.,  1.],
       ...,
       [ 1.,  1., -1., -1.],
       [ 0., -1., -1.,  2.],
       [ 0.,  2., -1., -1.]])

In [119]:
# split data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print('shapes:')
print('X_train ', np.shape(X_train))
print('X_test  ', np.shape(X_test))
print('y_train ', np.shape(y_train))
print('y_test  ', np.shape(y_test))

shapes:
X_train  (8000, 4)
X_test   (2000, 4)
y_train  (8000,)
y_test   (2000,)


In [120]:
# scaling data
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [121]:
from sklearn.svm import SVC

clf = SVC(kernel='linear', 
          class_weight='balanced', # penalize
          probability=True)
clf.fit(X_train, y_train)
accuracy = clf.score(X_test, y_test)
y_predict = clf.predict(X_test)
fscore = f1_score(y_test, y_predict)
print('accuracy : ', accuracy)
print('how many zeros and ones: ', np.unique( y_predict ) )
print('F1 score: %.3f' % fscore)

accuracy :  0.706
how many zeros and ones:  [0 1]
F1 score: 0.140


In [27]:
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, GradientBoostingClassifier

models = [
    ('SVM', SVC()),
    ('KNN', KNeighborsClassifier()),
    ('AB', AdaBoostClassifier()),
    ('RF', RandomForestClassifier()),
    ('GB', GradientBoostingClassifier()),
]

In [28]:
from sklearn.metrics import f1_score
for name, model in models:
    clf = model
    clf.fit(X_train, y_train)
    accuracy = clf.score(X_test, y_test)
    print(name, accuracy)
    
    y_predict = clf.predict(X_test)
    fscore = f1_score(y_test, y_predict)
    print('F1 score: ', name, fscore)
    

SVM 0.96325
F1 score:  SVM 0.0
KNN 0.96305
F1 score:  KNN 0.008053691275167786
AB 0.96315
F1 score:  AB 0.0
RF 0.96315
F1 score:  RF 0.0
GB 0.96315
F1 score:  GB 0.0


In [None]:
from sklearn.model_selection import RandomizedSearchCV

parameters = {'n_estimators': np.linspace(1, 100, num=40, dtype=int), 'learning_rate': np.linspace(0.01, 10, num=100)}

clf = RandomizedSearchCV(AdaBoostClassifier(), parameters)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))
print(clf.best_params_)