In [44]:
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, precision_score, recall_score


In [45]:
df= pd.read_csv("diabetes.csv")
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [46]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [47]:
not_accepted=["Glucose", "BloodPressure", "SkinThickness", "BMI", "Insulin"]

In [48]:

imputer = SimpleImputer(missing_values=0)

imputer.fit(df[not_accepted])
imputed_values = pd.DataFrame(
    imputer.transform(df[not_accepted]),
    # index is important to ensure we can concatenate with other columns
    index=df.index,
    columns=[not_accepted]
)

df[not_accepted] = imputed_values
df.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

Selecting Columns and Splitting the data

In [49]:
X= df.drop(["Outcome"], axis= 1)
y=df["Outcome"]
X_train, X_test, y_train, y_test=train_test_split(X,y, test_size = 0.2, random_state=0)

Normalizing the data

In [50]:
scaler= MinMaxScaler()
scaled_train =scaler.fit_transform(X_train, y_train)
scaled_test =scaler.transform(X_test)

In [51]:
# scaler= StandardScaler()
# scaled_train=scaler.fit_transform(X_train, y_train)
# scaled_test=scaler.transform(X_test)


In [52]:
scaled_train=pd.DataFrame(scaled_train, columns=X_train.columns)
scaled_train

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.411765,0.688312,0.551020,0.239130,0.134615,0.347648,0.262169,0.550000
1,0.235294,0.344156,0.367347,0.173913,0.170130,0.204499,0.155850,0.016667
2,0.000000,0.785714,0.673469,0.282609,0.800481,0.697342,0.149018,0.033333
3,0.058824,0.422078,0.326531,0.152174,0.145433,0.143149,0.322374,0.033333
4,0.470588,0.493506,0.493930,0.240798,0.170130,0.241309,0.044833,0.283333
...,...,...,...,...,...,...,...,...
609,0.588235,0.370130,0.530612,0.445652,0.199519,0.300613,0.039710,0.700000
610,0.411765,0.746753,0.428571,0.240798,0.170130,0.249489,0.130231,0.250000
611,0.235294,0.324675,0.418367,0.163043,0.170130,0.132924,0.029889,0.000000
612,0.647059,0.266234,0.510204,0.240798,0.170130,0.243354,0.094791,0.233333


In [53]:
scaled_test=pd.DataFrame(scaled_test, columns=X_test.columns)
scaled_test

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.058824,1.006494,0.530612,0.391304,0.170130,0.505112,0.561913,0.016667
1,0.117647,0.409091,0.510204,0.250000,0.103365,0.314928,0.139197,0.033333
2,0.235294,0.207792,0.387755,0.240798,0.170130,0.323108,0.133646,0.066667
3,0.294118,0.792208,0.489796,0.130435,0.193510,0.155419,0.217336,0.500000
4,0.000000,0.435065,0.418367,0.240798,0.170130,0.130879,0.248506,0.166667
...,...,...,...,...,...,...,...,...
149,0.117647,0.396104,0.571429,0.413043,0.212740,0.316973,0.270282,0.133333
150,0.235294,0.266234,0.346939,0.163043,0.042067,0.196319,0.097353,0.116667
151,0.235294,0.636364,0.632653,0.240798,0.170130,0.527607,0.242101,0.016667
152,0.176471,0.467532,0.510204,0.086957,0.109375,0.165644,0.012383,0.050000


Fitting the Model

KNN is unique compared to other classifiers in that it does almost nothing during the "fit" step, and all the work during the "predict" step. During the "fit" step, KNN just stores all the training data and corresponding labels. No distances are calculated at this point.


In [54]:
clf=KNeighborsClassifier(p=2, metric="euclidean")
clf.fit(scaled_train, y_train)


Predicting the Model using the test set

In [55]:
y_pred= clf.predict(scaled_test)

Evaluating the Model
Evaluating classification performance for KNN works the same as evaluating performance for any other classification algorithm -- you need a set of predictions, and the corresponding ground-truth labels for each of the points you made a prediction on. You can then compute evaluation metrics such as Precision, Recall, Accuracy, F1-Score etc.


In [56]:
cm=confusion_matrix(y_test, y_pred)
cm

array([[90, 17],
       [16, 31]], dtype=int64)

In [57]:
print(f"F1_score",f1_score(y_test, y_pred))
print(f"Accuracy",accuracy_score(y_test, y_pred))
print(f"Recall", recall_score(y_test,y_pred))
print(f"Precision", precision_score(y_test,y_pred))



F1_score 0.6526315789473683
Accuracy 0.7857142857142857
Recall 0.6595744680851063
Precision 0.6458333333333334


In [58]:
np.sqrt(len(y_test))

12.409673645990857

Selecting the values of K

In [59]:
def find_best_k(X_train, y_train, X_test, y_test, min_k=1, max_k=25):
    best_k = 0
    best_score = 0.0
  
    for k in range(min_k, max_k+1, 2):
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(X_train, y_train)
        preds = knn.predict(X_test)
        acc=accuracy_score(y_test, preds)
        precs=precision_score(y_test,y_pred)
        rec=recall_score(y_test, y_pred)
        f1 = f1_score(y_test, preds)
        if f1 > best_score:
            best_k = k
            best_score = f1
            precision=precs
            recall=rec
            accuracy=acc
    
    print("Best Value for k: {}".format(best_k))
    print("F1-Score: {}".format(best_score))
    print("Accuracy: {}".format(accuracy))
    print("Precision: {}".format(precision))
    print("Recall: {}".format(recall))


In [60]:
find_best_k(scaled_train, y_train, scaled_test, y_test)

Best Value for k: 11
F1-Score: 0.723404255319149
Accuracy: 0.8311688311688312
Precision: 0.6458333333333334
Recall: 0.6595744680851063


appendix

Strengths

Simplicity: KNN is easy to understand and implement. Its intuitive nature makes it an excellent starting point for beginners in machine learning.

No Training Phase: KNN is a lazy learner, meaning it does not require an explicit training phase. It simply stores the training data and makes predictions during runtime.

Versatility: KNN can be used for both classification and regression tasks. It can also handle multi-class classification problems.

Adaptability: The algorithm can adapt to various types of distance metrics (e.g., Euclidean, Manhattan, Minkowski), making it flexible for different types of data and problem domains.

Non-Parametric: KNN does not assume a specific distribution of the data, making it suitable for problems where the data distribution is unknown or complex.

Weaknesses

Computationally Intensive: KNN can be slow for large datasets because it needs to compute the distance between the test instance and all training instances. This can make real-time predictions impractical.

Memory Intensive: KNN stores all the training data, which can require significant memory, especially for large datasets.

Sensitivity to Irrelevant Features: KNN's performance can degrade if the data contains many irrelevant features, as all features contribute equally to the distance calculation.

Curse of Dimensionality: As the number of dimensions increases, the distance metrics used by KNN become less effective. This can lead to poor performance and the need for dimensionality reduction techniques.

Choice of 𝐾 and Distance Metric: The performance of KNN heavily depends on the choice of 𝐾 and the distance metric. Selecting inappropriate values can lead to poor model performance.

Opportunities

Hybrid Models: Combining KNN with other machine learning algorithms (e.g., using KNN for initial screening and another algorithm for final decision-making) can enhance performance.

Improved Distance Metrics: Developing or adopting advanced distance metrics that are more robust to the curse of dimensionality or tailored to specific applications can improve KNN's effectiveness.

Feature Selection and Engineering: Techniques that improve feature selection and engineering can mitigate some of KNN's weaknesses, particularly its sensitivity to irrelevant features.

Parallel and Distributed Computing: Implementing KNN in a parallel or distributed computing environment can address its computational and memory intensity, making it more scalable.

Integration with Big Data Technologies: Leveraging big data frameworks (e.g., Apache Hadoop, Spark) can help in managing and processing large datasets efficiently, enabling the use of KNN on bigger datasets.

Threats

Competition from Advanced Algorithms: More advanced algorithms (e.g., random forests, gradient boosting machines, deep learning) often outperform KNN in terms of accuracy and efficiency, particularly on complex tasks.

Scalability Issues: As datasets grow larger and more complex, KNN's scalability limitations become more pronounced, making it less competitive compared to other scalable algorithms.

High Dimensional Data: The increasing prevalence of high-dimensional data in fields like genomics and text mining poses challenges for KNN, which struggles with the curse of dimensionality.

Dependence on Distance Metrics: KNN's performance is highly dependent on the choice of distance metric, which might not always be straightforward to determine and may require extensive domain knowledge.

Data Quality: KNN is highly sensitive to noisy data and outliers. Poor data quality can significantly impact its performance, requiring robust data preprocessing techniques.