# Machine Learning Model of QSAR biodegradation Classification Problem

## 1.0 Input data

In [39]:
# importing standard modules  
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score,recall_score,precision_score, f1_score,confusion_matrix,classification_report
import warnings
warnings.filterwarnings('ignore')

In [40]:
# read data
dataset = pd.read_csv('biodeg.csv', sep=';')

In [41]:
dataset.head(10)

Unnamed: 0,SpMax_L,J_Dz(e),nHM,F01[N-N],F04[C-N],NssssC,nCb-,C%,nCp,nO,...,C-026,F02[C-N],nHDon,SpMax_B(m),Psi_i_A,nN,SM6_B(m),nArCOOR,nX,experimental_class
0,3.919,2.6909,0,0,0,0,0,31.4,2,0,...,0,0,0,2.949,1.591,0,7.253,0,0,RB
1,4.17,2.1144,0,0,0,0,0,30.8,1,1,...,0,0,0,3.315,1.967,0,7.257,0,0,RB
2,3.932,3.2512,0,0,0,0,0,26.7,2,4,...,0,0,1,3.076,2.417,0,7.601,0,0,RB
3,3.0,2.7098,0,0,0,0,0,20.0,0,2,...,0,0,1,3.046,5.0,0,6.69,0,0,RB
4,4.236,3.3944,0,0,0,0,0,29.4,2,4,...,0,0,0,3.351,2.405,0,8.003,0,0,RB
5,4.236,3.4286,0,0,0,0,0,28.6,2,4,...,0,0,0,3.351,2.556,0,7.904,0,0,RB
6,5.0,5.0476,1,0,0,0,0,11.1,0,3,...,0,0,1,4.712,4.583,0,9.303,0,0,RB
7,4.525,3.8301,0,0,0,0,0,31.6,3,2,...,0,0,0,3.379,2.143,0,7.95,0,0,RB
8,4.596,3.0777,0,0,0,0,2,44.4,2,0,...,0,0,0,3.626,1.917,0,7.939,0,0,RB
9,5.04,3.6112,0,0,1,0,2,41.2,0,4,...,1,2,1,3.888,3.5,1,8.706,0,0,RB


In [42]:
dataset.shape

(1055, 42)

In [43]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1055 entries, 0 to 1054
Data columns (total 42 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   SpMax_L             1055 non-null   float64
 1   J_Dz(e)             1055 non-null   float64
 2   nHM                 1055 non-null   int64  
 3   F01[N-N]            1055 non-null   int64  
 4   F04[C-N]            1055 non-null   int64  
 5   NssssC              1055 non-null   int64  
 6   nCb-                1055 non-null   int64  
 7   C%                  1055 non-null   float64
 8   nCp                 1055 non-null   int64  
 9   nO                  1055 non-null   int64  
 10  F03[C-N]            1055 non-null   int64  
 11  SdssC               1055 non-null   float64
 12  HyWi_B(m)           1055 non-null   float64
 13  LOC                 1055 non-null   float64
 14  SM6_L               1055 non-null   float64
 15  F03[C-O]            1055 non-null   int64  
 16  Me    

Dataset is complete and ready to proceed now.

## 2.0 Separating variables and outputs (targets)

In [44]:
from sklearn.feature_selection import f_classif, SelectKBest

In [45]:
# x is feature attributes
x = dataset.iloc[:,:-1]
# y is target variable
y = dataset.iloc[:,-1]
# turning RB = 1 ; NRB =0 for easier processing
y = y.replace(['RB','NRB'],[1,0])

All feature attribures are now in x. And, outputs(targets) are now in y where RB is represented by 1 and NRB is represented by 0.

## 3.0 Feature Selection

Since there are up to 40 feature attributes which are a lot and a huge dataset ( more than 1000 sets of data ), we would not be using wrapper method. Instead we use filter method - ANOVA. The reason of choosing ANOVA is that it is suitable for classification with numeric inputs and categorical output.
Different options of k such as 1/2 (20) , 2/3 (27), 3/4 (30) of numbers of attributes are tested.
2/3(27) has achieved the highest accuracy_score. Therefore, it is adopted.

In [46]:
selectedFeatures = SelectKBest(score_func=f_classif, k=27) .fit_transform(x,y)
print(selectedFeatures.shape)
print(selectedFeatures)

(1055, 27)
[[ 3.919  0.     0.    ...  7.253  0.     0.   ]
 [ 4.17   0.     0.    ...  7.257  0.     0.   ]
 [ 3.932  0.     0.    ...  7.601  0.     0.   ]
 ...
 [ 4.869  0.     9.    ...  9.537  1.     0.   ]
 [ 5.158  2.    36.    ... 11.055  0.     1.   ]
 [ 5.076  2.     0.    ...  9.13   0.     2.   ]]


## 4.0 Normalizing data

In [47]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
scaler = MinMaxScaler()
X_transform = scaler.fit_transform(selectedFeatures)
X_transform

array([[0.42682384, 0.        , 0.        , ..., 0.23878156, 0.        ,
        0.        ],
       [0.48265125, 0.        , 0.        , ..., 0.23919043, 0.        ,
        0.        ],
       [0.4297153 , 0.        , 0.        , ..., 0.27435347, 0.        ,
        0.        ],
       ...,
       [0.63812278, 0.        , 0.25      , ..., 0.47224778, 0.25      ,
        0.        ],
       [0.70240214, 0.16666667, 1.        , ..., 0.6274149 , 0.        ,
        0.03703704],
       [0.6841637 , 0.16666667, 0.        , ..., 0.430645  , 0.        ,
        0.07407407]])

## 5.0 Splitting data

In [48]:
from sklearn.model_selection import train_test_split

In [49]:
seed_num = 80
X_train, X_test, y_train, y_test = train_test_split(X_transform, y, test_size=0.3, random_state=seed_num) # random_state is set to a value for reproducible output.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train,test_size=0.2, random_state=seed_num)
print(X_train.shape)
print(X_test.shape)
print(X_val.shape)

(590, 27)
(317, 27)
(148, 27)


## 6.0 Finding optimal machine learning model

### 6.1 K-Nearest Neighbour

In [50]:
# Find the best k
from sklearn.neighbors import KNeighborsClassifier
scores = []
max_score = 0
best_k = 0
for k in range(1,20):
    model_knn = KNeighborsClassifier(n_neighbors=k)
    model_knn.fit(X_train, y_train)
    score = model_knn.score(X_val, y_val)
    if score > max_score:
        max_score = score
        best_k = k
    scores.append(score)
    
print(best_k)

12


In [51]:
# build model
model_knn = KNeighborsClassifier(n_neighbors=best_k)
model_knn.fit(X_train, y_train)

# start prediction
y_pred = model_knn.predict(X_val)

In [52]:
# print results
print(accuracy_score(y_val, y_pred))
print(recall_score(y_val, y_pred))
print(precision_score(y_val, y_pred))
print(f1_score(y_val, y_pred))
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))

0.8986486486486487
0.84
0.8571428571428571
0.8484848484848485
[[91  7]
 [ 8 42]]
              precision    recall  f1-score   support

           0       0.92      0.93      0.92        98
           1       0.86      0.84      0.85        50

    accuracy                           0.90       148
   macro avg       0.89      0.88      0.89       148
weighted avg       0.90      0.90      0.90       148



### 6.2 Perceptron

In [53]:
# build model
from sklearn.linear_model import SGDClassifier
model_pcpt = SGDClassifier(loss='perceptron', eta0=1, learning_rate='constant',penalty=None, max_iter=200, random_state=seed_num)
model_pcpt.fit(X_train, y_train)

# start prediction
y_pred = model_pcpt.predict(X_val)

In [54]:
# print results
print(accuracy_score(y_val, y_pred))
print(recall_score(y_val, y_pred))
print(precision_score(y_val, y_pred))
print(f1_score(y_val, y_pred))
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))

0.8513513513513513
0.64
0.8888888888888888
0.7441860465116279
[[94  4]
 [18 32]]
              precision    recall  f1-score   support

           0       0.84      0.96      0.90        98
           1       0.89      0.64      0.74        50

    accuracy                           0.85       148
   macro avg       0.86      0.80      0.82       148
weighted avg       0.86      0.85      0.84       148



### 6.3 Logistic Regression

In [55]:
# build model
from sklearn.linear_model import SGDClassifier
model_logr = SGDClassifier(loss='log', eta0=0.01, learning_rate='constant',penalty=None, max_iter=200, random_state=seed_num)
model_logr.fit(X_train, y_train)

# start prediction
y_pred = model_logr.predict(X_val)

In [56]:
# print results
print(accuracy_score(y_val, y_pred))
print(recall_score(y_val, y_pred))
print(precision_score(y_val, y_pred))
print(f1_score(y_val, y_pred))
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))

0.8716216216216216
0.78
0.8297872340425532
0.8041237113402062
[[90  8]
 [11 39]]
              precision    recall  f1-score   support

           0       0.89      0.92      0.90        98
           1       0.83      0.78      0.80        50

    accuracy                           0.87       148
   macro avg       0.86      0.85      0.85       148
weighted avg       0.87      0.87      0.87       148



### 6.4 Decision Tree

In [57]:
# Select a set of parameters that gives optimal result
from sklearn.tree import DecisionTreeClassifier
depths = np.arange(5,15,1)
parameters={'criterion':('entropy','gini'), 'max_depth':depths}
clf = DecisionTreeClassifier(random_state=seed_num)
gsc = GridSearchCV(clf, parameters, cv = 5, scoring = ['accuracy','precision','recall','f1'], n_jobs = 3, refit = 'f1')
gsc.fit(X_train, y_train)
print(gsc.best_params_)

{'criterion': 'entropy', 'max_depth': 9}


In [58]:
# Build model
model_dt = DecisionTreeClassifier(criterion='entropy',  max_depth= 9, random_state=seed_num)
model_dt.fit(X_train, y_train)

#start prediction
y_pred = model_dt.predict(X_val)

In [59]:
# print results
print(accuracy_score(y_val, y_pred))
print(recall_score(y_val, y_pred))
print(precision_score(y_val, y_pred))
print(f1_score(y_val, y_pred))
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))

0.831081081081081
0.7
0.7777777777777778
0.7368421052631577
[[88 10]
 [15 35]]
              precision    recall  f1-score   support

           0       0.85      0.90      0.88        98
           1       0.78      0.70      0.74        50

    accuracy                           0.83       148
   macro avg       0.82      0.80      0.81       148
weighted avg       0.83      0.83      0.83       148



### 6.5 Support Vector Machine

In [60]:
# Select a set of parameters that gives optimal result
from sklearn.svm import SVC
parameters={'kernel':('linear','poly','rbf','sigmoid'), 'C':[5,10,15], 'degree':[2,3,4]}
clf = SVC()
gsc = GridSearchCV(clf, parameters, cv = 5, scoring = ['accuracy','precision','recall','f1'], n_jobs = 3, refit = 'f1')
gsc.fit(X_train, y_train)
print(gsc.best_params_)

{'C': 10, 'degree': 2, 'kernel': 'rbf'}


In [61]:
# Build model
model_svc = SVC(kernel='rbf', C=10, gamma='scale')
model_svc.fit(X_train, y_train)

# start prediction
y_pred = model_svc.predict(X_val)

In [62]:
# print results
print(accuracy_score(y_val, y_pred))
print(recall_score(y_val, y_pred))
print(precision_score(y_val, y_pred))
print(f1_score(y_val, y_pred))
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))

0.8783783783783784
0.8
0.8333333333333334
0.816326530612245
[[90  8]
 [10 40]]
              precision    recall  f1-score   support

           0       0.90      0.92      0.91        98
           1       0.83      0.80      0.82        50

    accuracy                           0.88       148
   macro avg       0.87      0.86      0.86       148
weighted avg       0.88      0.88      0.88       148



Based on the results, the optimal machine learning model is model of K-Nearest Neighbour. It has achieved the best result in overall.

## 7.0 Confirm the machine learning model

We are going to test with model of **K-Nearest Neighbour** again. This time, we are going to use the test set.

In [63]:
# start prediction
y_pred = model_knn.predict(X_test)

In [64]:
# print results
print(accuracy_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(f1_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.8422712933753943
0.7663551401869159
0.7663551401869159
0.766355140186916
[[185  25]
 [ 25  82]]
              precision    recall  f1-score   support

           0       0.88      0.88      0.88       210
           1       0.77      0.77      0.77       107

    accuracy                           0.84       317
   macro avg       0.82      0.82      0.82       317
weighted avg       0.84      0.84      0.84       317



Oops! It seems like the variance of testing using test set and validation set is significant and the result with test case is much lower than some of the models. Now, we are going to test with the model that achieved second highest performance above which is model of **Support Vector Machine** again. This time, we are going to use the test set.

In [65]:
# start prediction
y_pred = model_svc.predict(X_test)

In [66]:
# print results
print(accuracy_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(f1_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.8706624605678234
0.8037383177570093
0.8113207547169812
0.807511737089202
[[190  20]
 [ 21  86]]
              precision    recall  f1-score   support

           0       0.90      0.90      0.90       210
           1       0.81      0.80      0.81       107

    accuracy                           0.87       317
   macro avg       0.86      0.85      0.86       317
weighted avg       0.87      0.87      0.87       317



The result of support vector machine is very similar to the result we get from validation set. And, this model is stable with different sets of dataset. In the average, it is more accurate.

To be safer, we are going to get another model which has close performance to support vector machine above tested again with test set, which is **Logistic Regression**.

In [67]:
# start prediction
y_pred = model_logr.predict(X_test)

# print results
print(accuracy_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(f1_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.8359621451104101
0.7009345794392523
0.7894736842105263
0.7425742574257426
[[190  20]
 [ 32  75]]
              precision    recall  f1-score   support

           0       0.86      0.90      0.88       210
           1       0.79      0.70      0.74       107

    accuracy                           0.84       317
   macro avg       0.82      0.80      0.81       317
weighted avg       0.83      0.84      0.83       317



From the results of using both validation set and test set, **Support Vector Machine** has achieved the best performance in overall.

## 8.0 Conclusion

The best suited machine learning model is model of support vector machine.