In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.feature_extraction import FeatureHasher
from sklearn.preprocessing import MinMaxScaler

In [2]:
data_train = pd.read_csv('../data/train.csv')
data_predict = pd.read_csv('../data/test.csv')
data_train

Unnamed: 0,Sequence,Active
0,DKWL,0
1,FCHN,0
2,KDQP,0
3,FNWI,0
4,NKRM,0
...,...,...
111995,GSME,0
111996,DLPT,0
111997,SGHC,0
111998,KIGT,0


Unnamed: 0,Sequence,Active
0,DKWL,0
1,FCHN,0
2,KDQP,0
3,FNWI,0
4,NKRM,0
...,...,...
111995,GSME,0
111996,DLPT,0
111997,SGHC,0
111998,KIGT,0


In [3]:
print('Value count:')
print(data_train.Active.value_counts())
print('%.2f %% of all proteins are active' % (data_train.Active.value_counts()[1]/data_train.Active.value_counts()[0]*100))

Value count:
0    107787
1      4213
Name: Active, dtype: int64
3.91 % of all proteins are active
Value count:
0    107787
1      4213
Name: Active, dtype: int64
3.91 % of all proteins are active


In [4]:
from sklearn.utils import resample

# Separate majority and minority classes
majority = data_train[data_train.Active==0]
minority = data_train[data_train.Active==1]

# Upsample minority class
minority_upsampled = resample(minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=majority.Active.value_counts()[0],    # to match majority class
                                 random_state=123) # reproducible results
# Combine majority class with upsampled minority class
data_train = pd.concat([majority, minority_upsampled])
# shuffle rows
data_train = data_train.sample(frac=1)
data_train.Active.value_counts()

1    107787
0    107787
Name: Active, dtype: int64

1    107787
0    107787
Name: Active, dtype: int64

In [None]:
# reduce data size for faster prototyping
data_train = data_train.iloc[0:5000,:]

data_train.head()

In [5]:
# seperate features from labels
y = data_train.drop(['Sequence'], axis=1).values.ravel()

def seperate_sites(data):
    
    # feature engineering: every site seperate feature
    sites = pd.DataFrame({'site_0':[],
                          'site_1':[],
                          'site_2':[],
                          'site_3':[]})
    i = 0
    for sequence in  data['Sequence']:
        sites.loc[i,'site_0'] = sequence[0]
        sites.loc[i,'site_1'] = sequence[1]
        sites.loc[i,'site_2'] = sequence[2]
        sites.loc[i,'site_3'] = sequence[3]
        i += 1
        
    return sites
    
sites_train = seperate_sites(data_train)

print(sites_train.head())

  site_0 site_1 site_2 site_3
0      S      S      R      C
1      L      F      H      T
2      W      S      P      Y
3      A      V      P      C
4      Q      W      L      T
  site_0 site_1 site_2 site_3
0      S      S      R      C
1      L      F      H      T
2      W      S      P      Y
3      A      V      P      C
4      Q      W      L      T


In [9]:
# export seperated features
sites_train.to_csv('../data/sites_train.csv')

In [10]:
# feature extracion
h = FeatureHasher(n_features=4)
X = h.transform(sites_train.to_dict('records')).toarray()
X

array([[ 0., -1.,  0., -1.],
       [ 1.,  0.,  1.,  0.],
       [-1., -1.,  1., -1.],
       ...,
       [ 0., -1.,  0., -1.],
       [ 3.,  0.,  0., -1.],
       [-1.,  0.,  1.,  0.]])

array([[ 0., -1.,  0., -1.],
       [ 1.,  0.,  1.,  0.],
       [-1., -1.,  1., -1.],
       ...,
       [ 0., -1.,  0., -1.],
       [ 3.,  0.,  0., -1.],
       [-1.,  0.,  1.,  0.]])

In [11]:
# split data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print('shapes:')
print('X_train ', np.shape(X_train))
print('X_test  ', np.shape(X_test))
print('y_train ', np.shape(y_train))
print('y_test  ', np.shape(y_test))

shapes:
X_train  (172459, 4)
X_test   (43115, 4)
y_train  (172459,)
y_test   (43115,)
shapes:
X_train  (172459, 4)
X_test   (43115, 4)
y_train  (172459,)
y_test   (43115,)


In [12]:
# scaling data
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [13]:
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(hidden_layer_sizes=(10, 10, 10), max_iter=1000)
mlp.fit(X_train, y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(10, 10, 10), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=1000,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(10, 10, 10), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=1000,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [14]:
from sklearn.metrics import classification_report, confusion_matrix
y_predict = mlp.predict(X_test)
print(confusion_matrix(y_test, y_predict))
print(classification_report(y_test, y_predict))

[[13280  8407]
 [ 5567 15861]]
              precision    recall  f1-score   support

           0       0.70      0.61      0.66     21687
           1       0.65      0.74      0.69     21428

    accuracy                           0.68     43115
   macro avg       0.68      0.68      0.67     43115
weighted avg       0.68      0.68      0.67     43115

[[13280  8407]
 [ 5567 15861]]
              precision    recall  f1-score   support

           0       0.70      0.61      0.66     21687
           1       0.65      0.74      0.69     21428

    accuracy                           0.68     43115
   macro avg       0.68      0.68      0.67     43115
weighted avg       0.68      0.68      0.67     43115



In [18]:
sites_predict = seperate_sites(data_predict)

KeyboardInterrupt: 

KeyboardInterrupt: 

In [17]:
# make prediction on given data set
prediction = mlp.predict(sites_predict)

ValueError: could not convert string to float: 'H'

ValueError: could not convert string to float: 'H'

In [None]:
# export results
prediction.csv_write('../data/results_1.csv')

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import f1_score
clf = SVC(kernel='linear', 
          class_weight='balanced', # penalize
          probability=True)
clf.fit(X_train, y_train)
accuracy = clf.score(X_test, y_test)
y_predict = clf.predict(X_test)
fscore = f1_score(y_test, y_predict)
print('accuracy : ', accuracy)
print('how many zeros and ones: ', np.unique( y_predict ) )
print('F1 score: %.3f' % fscore)

In [None]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC

print(__doc__)


# Set the parameters by cross-validation
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(
        SVC(), tuned_parameters, scoring='%s_macro' % score
    )
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()

In [None]:
from sklearn.ensemble import RandomForestClassifier

clf_2 = RandomForestClassifier(random_state=0, n_jobs=-1, class_weight="balanced")

clf_2.fit(X_train, y_train)
accuracy = clf_2.score(X_test, y_test)
y_predict = clf_2.predict(X_test)
fscore = f1_score(y_test, y_predict)

print('accuracy : ', accuracy)
print('how many zeros and ones: ', np.unique( y_predict ) )
print('F1 score: %.3f' % fscore)

In [None]:
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, GradientBoostingClassifier

models = [
    ('SVM', SVC()),
    ('KNN', KNeighborsClassifier()),
    ('AB', AdaBoostClassifier()),
    ('RF', RandomForestClassifier()),
    ('GB', GradientBoostingClassifier()),
]

In [None]:
from sklearn.metrics import f1_score
for name, model in models:
    clf = model
    clf.fit(X_train, y_train)
    accuracy = clf.score(X_test, y_test)
    print(name, accuracy)
    
    y_predict = clf.predict(X_test)
    fscore = f1_score(y_test, y_predict)
    print('F1 score: ', name, fscore)
    

In [None]:
from sklearn.model_selection import RandomizedSearchCV

parameters = {'n_estimators': np.linspace(1, 100, num=40, dtype=int), 'learning_rate': np.linspace(0.01, 10, num=100)}

clf = RandomizedSearchCV(AdaBoostClassifier(), parameters)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))
print(clf.best_params_)