In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import resample
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neural_network import MLPClassifier


In [2]:
data_train = pd.read_csv('../data/train.csv')
data_predict = pd.read_csv('../data/test.csv')
data_train

Unnamed: 0,Sequence,Active
0,DKWL,0
1,FCHN,0
2,KDQP,0
3,FNWI,0
4,NKRM,0
...,...,...
111995,GSME,0
111996,DLPT,0
111997,SGHC,0
111998,KIGT,0


Unnamed: 0,Sequence,Active
0,DKWL,0
1,FCHN,0
2,KDQP,0
3,FNWI,0
4,NKRM,0
...,...,...
111995,GSME,0
111996,DLPT,0
111997,SGHC,0
111998,KIGT,0


In [3]:
print('Value count:')
print(data_train.Active.value_counts())
print('%.2f %% of all proteins are active' % (data_train.Active.value_counts()[1]/data_train.Active.value_counts()[0]*100))

Value count:
0    107787
1      4213
Name: Active, dtype: int64
3.91 % of all proteins are active
Value count:
0    107787
1      4213
Name: Active, dtype: int64
3.91 % of all proteins are active


In [4]:
# Separate majority and minority classes
majority = data_train[data_train.Active==0]
minority = data_train[data_train.Active==1]

# Upsample minority class
minority_upsampled = resample(minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=majority.Active.value_counts()[0],    # to match majority class
                                 random_state=123) # reproducible results
# Combine majority class with upsampled minority class
data_train = pd.concat([majority, minority_upsampled])
# shuffle rows
data_train = data_train.sample(frac=1)
data_train.Active.value_counts()

1    107787
0    107787
Name: Active, dtype: int64

1    107787
0    107787
Name: Active, dtype: int64

In [None]:
# # reduce data size for faster prototyping
# data_train = data_train.iloc[0:5000,:]
# data_train.head()

In [5]:
# seperate features from labels
y = data_train.drop(['Sequence'], axis=1).values.ravel()

def seperate_sites(data):
    
    # feature engineering: every site seperate feature
    sites = pd.DataFrame({'site_0':[],
                          'site_1':[],
                          'site_2':[],
                          'site_3':[]})
    i = 0
    for sequence in  data['Sequence']:
        sites.loc[i,'site_0'] = sequence[0]
        sites.loc[i,'site_1'] = sequence[1]
        sites.loc[i,'site_2'] = sequence[2]
        sites.loc[i,'site_3'] = sequence[3]
        i += 1
        
    return sites
    
sites_train = seperate_sites(data_train)

print(sites_train.head())

  site_0 site_1 site_2 site_3
0      S      S      R      C
1      L      F      H      T
2      W      S      P      Y
3      A      V      P      C
4      Q      W      L      T
  site_0 site_1 site_2 site_3
0      S      S      R      C
1      L      F      H      T
2      W      S      P      Y
3      A      V      P      C
4      Q      W      L      T


In [91]:
# # export seperated features
# sites_train.to_csv('../data/sites_train.csv')

# import seperated data again
sites_train = pd.read_csv('../data/sites_train.csv').drop(['Unnamed: 0'], axis=1)
sites_train

Unnamed: 0,site_0,site_1,site_2,site_3
0,S,S,R,C
1,L,F,H,T
2,W,S,P,Y
3,A,V,P,C
4,Q,W,L,T
...,...,...,...,...
215569,Y,H,F,C
215570,F,I,L,V
215571,Y,N,Q,G
215572,A,V,F,G


Unnamed: 0,site_0,site_1,site_2,site_3
0,S,S,R,C
1,L,F,H,T
2,W,S,P,Y
3,A,V,P,C
4,Q,W,L,T
...,...,...,...,...
215569,Y,H,F,C
215570,F,I,L,V
215571,Y,N,Q,G
215572,A,V,F,G


In [93]:
def feature_extraction(data):
    # turn X into dict
    X_dict = data.to_dict(orient='records') # turn each row as key-value pairs
    # instantiate a Dictvectorizer object for X
    dv_X = DictVectorizer(sparse=False) 
    # sparse = False makes the output is not a sparse matrix

    # apply dv_X on X_dict
    X_encoded = dv_X.fit_transform(X_dict)

    return X_encoded

X = feature_extraction(sites_train)

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [94]:
# split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
print('shapes:')
print('X_train ', np.shape(X_train))
print('X_test  ', np.shape(X_test))
print('y_train ', np.shape(y_train))
print('y_test  ', np.shape(y_test))

shapes:
X_train  (194016, 80)
X_test   (21558, 80)
y_train  (194016,)
y_test   (21558,)
shapes:
X_train  (194016, 80)
X_test   (21558, 80)
y_train  (194016,)
y_test   (21558,)


In [95]:
# scaling data
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [96]:
mlp = MLPClassifier(hidden_layer_sizes=(300,), alpha=0.0001, max_iter=500, \
                     random_state = 42)
mlp.fit(X_train, y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(300,), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=500,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=42, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(300,), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=500,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=42, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [85]:
y_predict = mlp.predict(X_test)
print(confusion_matrix(y_test, y_predict))
print(classification_report(y_test, y_predict))

[[10788    49]
 [    0 10721]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     10837
           1       1.00      1.00      1.00     10721

    accuracy                           1.00     21558
   macro avg       1.00      1.00      1.00     21558
weighted avg       1.00      1.00      1.00     21558

[[10788    49]
 [    0 10721]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     10837
           1       1.00      1.00      1.00     10721

    accuracy                           1.00     21558
   macro avg       1.00      1.00      1.00     21558
weighted avg       1.00      1.00      1.00     21558



In [97]:
sites_predict = seperate_sites(data_predict)
sites_predict

Unnamed: 0,site_0,site_1,site_2,site_3
0,H,W,F,K
1,M,W,P,W
2,A,L,D,V
3,N,T,L,G
4,L,H,Y,Y
...,...,...,...,...
47995,N,R,W,M
47996,M,M,M,K
47997,A,F,N,M
47998,C,R,Y,I


Unnamed: 0,site_0,site_1,site_2,site_3
0,H,W,F,K
1,M,W,P,W
2,A,L,D,V
3,N,T,L,G
4,L,H,Y,Y
...,...,...,...,...
47995,N,R,W,M
47996,M,M,M,K
47997,A,F,N,M
47998,C,R,Y,I


In [98]:
# # export seperated features
# sites_train.to_csv('../data/sites_predict.csv')

# import seperated data again
sites_train = pd.read_csv('../data/sites_predict.csv').drop(['Unnamed: 0'], axis=1)
sites_train

Unnamed: 0,site_0,site_1,site_2,site_3
0,S,S,R,C
1,L,F,H,T
2,W,S,P,Y
3,A,V,P,C
4,Q,W,L,T
...,...,...,...,...
215569,Y,H,F,C
215570,F,I,L,V
215571,Y,N,Q,G
215572,A,V,F,G


Unnamed: 0,site_0,site_1,site_2,site_3
0,S,S,R,C
1,L,F,H,T
2,W,S,P,Y
3,A,V,P,C
4,Q,W,L,T
...,...,...,...,...
215569,Y,H,F,C
215570,F,I,L,V
215571,Y,N,Q,G
215572,A,V,F,G


In [99]:
X_predict = feature_extraction(sites_predict)

In [101]:
# make prediction on given data set
prediction = mlp.predict(X_predict)

In [102]:
# export results
result = pd.DataFrame.from_records(prediction.reshape(-1,1))
result.to_csv('../results/results.csv', index=False, header=False)
result

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0
...,...
47995,0
47996,0
47997,0
47998,0


Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0
...,...
47995,0
47996,0
47997,0
47998,0
