In [30]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler

In [31]:
data = pd.read_csv("../data/water_potability_preprocessed.csv")

In [32]:
data

Unnamed: 0.1,Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,0,7.080795,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.990970,2.963135,0
1,1,3.716080,129.422921,18630.057858,6.635246,333.775777,592.885359,15.180013,56.329076,4.500656,0
2,2,8.099124,224.236259,19909.541732,9.275884,333.775777,418.606213,16.868637,66.420093,3.055934,0
3,3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0
4,4,9.092223,181.101509,17978.986339,6.546600,310.135738,398.410813,11.558279,31.997993,4.075075,0
...,...,...,...,...,...,...,...,...,...,...,...
3271,3271,4.668102,193.681735,47580.991603,7.166639,359.948574,526.424171,13.894419,66.687695,4.435821,1
3272,3272,7.808856,193.553212,17329.802160,8.061362,333.775777,392.449580,19.903225,66.396293,2.798243,1
3273,3273,9.419510,175.762646,33155.578218,7.350233,333.775777,432.044783,11.039070,69.845400,3.298875,1
3274,3274,5.126763,230.603758,11983.869376,6.303357,333.775777,402.883113,11.168946,77.488213,4.708658,1


In [33]:
data = data.iloc[:, 1:]

In [34]:
X = data.iloc[:, :-1].values # все признаки, кроме целевого
y = data['Potability'].values # целевой признак

разделяем выборки на обучающие и тестовые

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [36]:
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [37]:
X_train

array([[ 1.0729948 ,  0.16742632,  0.50384474, ...,  1.01209176,
         0.14624917,  1.68345311],
       [ 0.90453363, -0.25695512, -1.29207712, ..., -0.27787928,
        -0.46851873, -1.45553843],
       [-0.41057844, -0.58935292,  0.97012894, ...,  0.69613181,
         1.13563031, -1.07887797],
       ...,
       [-0.33476357,  0.65411399,  0.27831179, ..., -0.86136649,
        -1.12651818, -0.58011974],
       [-0.33219616,  0.00179971, -0.61234337, ..., -0.54045194,
         1.2231124 , -0.0521499 ],
       [ 0.81835842, -0.83463487, -1.00028502, ..., -0.58108188,
         0.81332674,  0.5751491 ]])

**Метод k-ближайших соседей (K-Nearest Neighbors)**

In [38]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
params = {'n_neighbors': np.arange(1,6,1)}
clf = GridSearchCV(knn, params)
clf.fit(X_train, y_train)
clf.best_params_

{'n_neighbors': 4}

In [39]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(clf.score(X_test, y_test))

[[351  62]
 [180  63]]
              precision    recall  f1-score   support

           0       0.66      0.85      0.74       413
           1       0.50      0.26      0.34       243

    accuracy                           0.63       656
   macro avg       0.58      0.55      0.54       656
weighted avg       0.60      0.63      0.60       656

0.6310975609756098


своя реализация kNN

In [40]:
def euclidian_metric(a, b):  # реализуем Евклидову метрику через норму
    return np.sqrt(np.sum((a - b) ** 2, axis=1))

In [41]:
def find_neighbours(k, y, distances):
    neighbours = y[np.argsort(distances)][:k]
    neighbours_distances = distances[neighbours][:k]

    return neighbours, neighbours_distances

In [42]:
def get_closest_classes(neighbours):
    arr = np.unique(neighbours, return_counts=True)
    best_classes = arr[0][arr[1] == np.amax(arr[1])]

    return best_classes

In [43]:
def choose_best_class(best_classes, neighbours, neighbouring_distances):
    min_mean_dist = np.inf
    best_class = None

    for elem in np.unique(neighbours):
        mean_dist = np.mean(neighbouring_distances[neighbours == elem])
        if mean_dist < min_mean_dist:
            best_class = elem
            min_mean_dist = mean_dist

    return best_class

In [44]:
def nearest_neighbours_classify(x, y, k, x_pred):
    res = np.zeros(x_pred.shape[0], dtype=y.dtype)

    for i in range(x_pred.shape[0]):
        distances = euclidian_metric(x, x_pred[i])  # считаем расстояния до классов

        neighbours, neighbouring_distances = find_neighbours(k, y, distances)  # находим ровно k соседей этой точки
            
        best_classes = get_closest_classes(neighbours)  # обнаруживаем классы, которые имеются среди соседей

        res[i] = choose_best_class(best_classes, neighbours, neighbouring_distances)  # выбираем наиболее релевантный класс по среднему расстоянию до него среди соседей
    return res

In [45]:
y_pred = nearest_neighbours_classify(X_train, y_train, clf.best_params_['n_neighbors'], X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(clf.score(X_test, y_test))

[[236 177]
 [106 137]]
              precision    recall  f1-score   support

           0       0.69      0.57      0.63       413
           1       0.44      0.56      0.49       243

    accuracy                           0.57       656
   macro avg       0.56      0.57      0.56       656
weighted avg       0.60      0.57      0.58       656

0.6310975609756098


результат получился одинаковым

**Классификатор дерева решений (Decision Tree Classifier)**

In [46]:
from sklearn import tree

dtc = tree.DecisionTreeClassifier()
params = {'max_depth': np.arange(1,15,1),
          'max_features': np.arange(5,X.shape[1]-1,1)}
clf = GridSearchCV(dtc, params)
clf = clf.fit(X_train, y_train)
clf.best_params_

{'max_depth': 10, 'max_features': 6}

In [47]:
y_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(clf.score(X_test, y_test))

[[325  88]
 [151  92]]
              precision    recall  f1-score   support

           0       0.68      0.79      0.73       413
           1       0.51      0.38      0.43       243

    accuracy                           0.64       656
   macro avg       0.60      0.58      0.58       656
weighted avg       0.62      0.64      0.62       656

0.635670731707317


**Наивный байесовский классификатор (Naive Bayes)**

In [48]:
from sklearn.naive_bayes import GaussianNB, BernoulliNB

g_nb = GaussianNB()
b_nb = BernoulliNB()

In [49]:
g_nb.fit(X_train, y_train)

GaussianNB()

In [50]:
y_pred = g_nb.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(g_nb.score(X_test, y_test))

[[367  46]
 [189  54]]
              precision    recall  f1-score   support

           0       0.66      0.89      0.76       413
           1       0.54      0.22      0.31       243

    accuracy                           0.64       656
   macro avg       0.60      0.56      0.54       656
weighted avg       0.62      0.64      0.59       656

0.6417682926829268


In [51]:
b_nb.fit(X_train, y_train)

BernoulliNB()

In [52]:
y_pred = b_nb.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(b_nb.score(X_test, y_test))

[[413   0]
 [243   0]]
              precision    recall  f1-score   support

           0       0.63      1.00      0.77       413
           1       0.00      0.00      0.00       243

    accuracy                           0.63       656
   macro avg       0.31      0.50      0.39       656
weighted avg       0.40      0.63      0.49       656

0.6295731707317073


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Наивный байесовский классификатор с распределением Бернулли показал лучший результат по сравнению с нормальным распределением

**Метод опорных векторов (Support Vector Machines)**

In [53]:
from sklearn import svm

svc = svm.SVC()
params = {'C': np.arange(0.1,1.1,0.1),
          'degree': np.arange(2,5,1)}
clf = GridSearchCV(svc, params)
clf.fit(X_train, y_train)
clf.best_params_

{'C': 0.9, 'degree': 2}

In [54]:
y_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(clf.score(X_test, y_test))

[[378  35]
 [176  67]]
              precision    recall  f1-score   support

           0       0.68      0.92      0.78       413
           1       0.66      0.28      0.39       243

    accuracy                           0.68       656
   macro avg       0.67      0.60      0.59       656
weighted avg       0.67      0.68      0.64       656

0.6783536585365854


**Логистическая регрессия (Logistic Regression)**

In [55]:
from sklearn.linear_model import LogisticRegression

logReg = LogisticRegression()
params = {'C': np.arange(0.1,1.1,0.1)}
clf = GridSearchCV(logReg, params)
clf.fit(X_train, y_train)
clf.best_params_

{'C': 0.2}

In [56]:
y_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(clf.score(X_test, y_test))

[[413   0]
 [241   2]]
              precision    recall  f1-score   support

           0       0.63      1.00      0.77       413
           1       1.00      0.01      0.02       243

    accuracy                           0.63       656
   macro avg       0.82      0.50      0.40       656
weighted avg       0.77      0.63      0.49       656

0.6326219512195121


**Perceptron**

In [57]:
from sklearn.linear_model import Perceptron

perc = Perceptron()
params = {'alpha': np.array([0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5]),
          'l1_ratio': np.arange(0,1.05,0.05)}
clf = GridSearchCV(perc, params)
clf.fit(X_train, y_train)
clf.best_params_

{'alpha': 0.0001, 'l1_ratio': 0.0}

In [58]:
y_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(clf.score(X_test, y_test))

[[284 129]
 [145  98]]
              precision    recall  f1-score   support

           0       0.66      0.69      0.67       413
           1       0.43      0.40      0.42       243

    accuracy                           0.58       656
   macro avg       0.55      0.55      0.55       656
weighted avg       0.58      0.58      0.58       656

0.5823170731707317


**Вывод:** самый лучший классификатор по точности - SVM