In [3]:
import pandas as pd
import numpy as np

In [4]:
raw_data = pd.read_csv('yeast.ssv', sep='\s+',
                    names=["sequence_name", "mcg", "gvh", "alm", "mit", "erl", "pox", "vac", "nuc", "protein_local"])
raw_data.head

<bound method NDFrame.head of      sequence_name   mcg   gvh   alm   mit  erl  pox   vac   nuc protein_local
0       ADT1_YEAST  0.58  0.61  0.47  0.13  0.5  0.0  0.48  0.22           MIT
1       ADT2_YEAST  0.43  0.67  0.48  0.27  0.5  0.0  0.53  0.22           MIT
2       ADT3_YEAST  0.64  0.62  0.49  0.15  0.5  0.0  0.53  0.22           MIT
3       AAR2_YEAST  0.58  0.44  0.57  0.13  0.5  0.0  0.54  0.22           NUC
4       AATM_YEAST  0.42  0.44  0.48  0.54  0.5  0.0  0.48  0.22           MIT
...            ...   ...   ...   ...   ...  ...  ...   ...   ...           ...
1479    YUR1_YEAST  0.81  0.62  0.43  0.17  0.5  0.0  0.53  0.22           ME2
1480    ZIP1_YEAST  0.47  0.43  0.61  0.40  0.5  0.0  0.48  0.47           NUC
1481    ZNRP_YEAST  0.67  0.57  0.36  0.19  0.5  0.0  0.56  0.22           ME2
1482    ZUO1_YEAST  0.43  0.40  0.60  0.16  0.5  0.0  0.53  0.39           NUC
1483    G6PD_YEAST  0.65  0.54  0.54  0.13  0.5  0.0  0.53  0.22           CYT

[1484 rows x 10 colum

In [5]:
X = raw_data.iloc[:, 1:-1].to_numpy()
X

array([[0.58, 0.61, 0.47, ..., 0.  , 0.48, 0.22],
       [0.43, 0.67, 0.48, ..., 0.  , 0.53, 0.22],
       [0.64, 0.62, 0.49, ..., 0.  , 0.53, 0.22],
       ...,
       [0.67, 0.57, 0.36, ..., 0.  , 0.56, 0.22],
       [0.43, 0.4 , 0.6 , ..., 0.  , 0.53, 0.39],
       [0.65, 0.54, 0.54, ..., 0.  , 0.53, 0.22]])

In [6]:
y = raw_data.iloc[:, -1].to_numpy()
y

array(['MIT', 'MIT', 'MIT', ..., 'ME2', 'NUC', 'CYT'], dtype=object)

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import recall_score

inner_kf = KFold(n_splits=5)
outer_skf = StratifiedKFold(n_splits=5)
scaler = StandardScaler()
p_grid = {"knn__n_neighbors":range(2, 40, 1)}
knn = KNeighborsClassifier()
pipe = Pipeline(steps=[('scaler',scaler), ('knn', knn)])
clf=GridSearchCV(estimator=pipe, param_grid=p_grid, cv=inner_kf)
scoring = ['accuracy', 'f1_macro', 'recall_macro', 'precision_micro']
scores = cross_validate(clf, X=X, y=y, cv=outer_skf, scoring=scoring, return_estimator=True)

print("F1_macro; mean is: {}, deviation is:{}".format(scores['test_f1_macro'].mean(), scores['test_f1_macro'].std()))
print("Accuracy; mean is: {}, deviation is:{}".format(scores['test_accuracy'].mean(), scores['test_accuracy'].std()))
print("Precision_micro; mean is: {}, deviation is:{}".format(scores['test_precision_micro'].mean(), scores['test_precision_micro'].std()))
print("Recall_macro; mean is: {}, deviation is:{}".format(scores['test_recall_macro'].mean(), scores['test_recall_macro'].std()))

for estimator in scores['estimator']:
    print(estimator.best_params_)

F1_macro; mean is: 0.5163793726546485, deviation is:0.06349570529567317
Accuracy; mean is: 0.5727818727818728, deviation is:0.02575678124267269
Precision_micro; mean is: 0.5727818727818728, deviation is:0.02575678124267269
Recall_macro; mean is: 0.5242628683664616, deviation is:0.07177279178446955
{'knn__n_neighbors': 9}
{'knn__n_neighbors': 12}
{'knn__n_neighbors': 14}
{'knn__n_neighbors': 18}
{'knn__n_neighbors': 14}
