### 1. Importing Libraries

In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import matplotlib.backends.backend_pdf


### 2. Loading the Dataset


In [22]:
# Load the whole dataset into a DataFrame; 'data.csv' is a relative path, so it
# is resolved against the notebook's current working directory.
data = pd.read_csv('data.csv')

### 3. Preprocessing the Data


In [23]:
# The target column is named 'label'; every other column is treated as a feature
X = data.drop(columns='label')
y = data['label']

# Normalize/standardize features if necessary
# NOTE: as written, the scaler below would be fit on all of X before the
# train/test split; if it is enabled, consider fitting it on the training split
# only so that test-set statistics do not leak into training.
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# X = scaler.fit_transform(X)

<!-- X contains the features (independent variables).
y contains the target variable (dependent variable).
Normalization/standardization (commented out) ensures that all features contribute equally to the distance calculations in KNN. -->

### 4. Defining Train-Test Splits and K Values


In [24]:
# Fraction of the rows used for training in each scenario
# (the disabled sweep passes test_size = 1 - split to train_test_split).
train_test_splits = [0.60, 0.70, 0.75, 0.80, 0.90, 0.95]
# Candidate neighbour counts, passed as n_neighbors to KNeighborsClassifier.
k_values = [2, 4, 5, 6, 7, 10]

<!-- Lists of different train-test splits and K values for KNN to be evaluated. -->


### 5. Evaluating KNN


In [25]:
def evaluate_knn(X_train, X_test, y_train, y_test, k):
    """Fit a k-nearest-neighbours classifier and score it on held-out data.

    Parameters
    ----------
    X_train, y_train
        Features and labels the classifier is fitted on.
    X_test, y_test
        Features and labels used only for evaluation.
    k
        Number of neighbours, forwarded as ``n_neighbors``.

    Returns
    -------
    tuple
        ``(accuracy, conf_matrix)`` as produced by ``accuracy_score`` and
        ``confusion_matrix`` for the test labels versus the predictions.
    """
    # fit() returns the estimator itself, so construction and fitting chain.
    model = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    predictions = model.predict(X_test)
    return (
        accuracy_score(y_test, predictions),
        confusion_matrix(y_test, predictions),
    )


<!-- A function to train and evaluate the KNN model:
X_train, X_test, y_train, y_test: Training and testing data.
k: The number of neighbors for KNN.
Returns the accuracy and confusion matrix of the model. -->

### 6. Running the Evaluations and Storing Results


In [26]:
# Maps (split, k) -> (accuracy, conf_matrix) for every evaluated scenario.
# It stays empty for as long as the sweep below remains commented out.
results = {}

# for split in train_test_splits:
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1-split, random_state=42)
#     for k in k_values:
#         accuracy, conf_matrix = evaluate_knn(X_train, X_test, y_train, y_test, k)
#         results[(split, k)] = (accuracy, conf_matrix)


<!-- results: A dictionary to store the accuracy and confusion matrix for each combination of train-test split and K value.
Loop through each train-test split, create training and testing sets, and evaluate the KNN model for each value of K.
Store the results in the results dictionary. -->

### 7. Implementing One Scenario and Commenting Out the Rest


In [27]:
# Worked example: hold out 30% of the rows for testing and classify with K=2.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42
)
accuracy, conf_matrix = evaluate_knn(X_train, X_test, y_train, y_test, k=2)
print(f"Accuracy: {accuracy}", f"Confusion Matrix:\n{conf_matrix}", sep="\n")

# The complete sweep over every split / K pair is left disabled because it
# takes a long time to run; uncomment it to populate `results`.
# for split in train_test_splits:
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1-split, random_state=42)
#     for k in k_values:
#         accuracy, conf_matrix = evaluate_knn(X_train, X_test, y_train, y_test, k)
#         results[(split, k)] = (accuracy, conf_matrix)


Accuracy: 0.9600793650793651
Confusion Matrix:
[[1197    0    0    0    0    0    3    0    0    0]
 [   0 1387    1    0    1    0    0    0    0    0]
 [  15   20 1240    3    1    1    1   10    3    0]
 [   1    6   15 1305    0    9    1    6    9    3]
 [   2   12    0    0 1190    0    4    3    0   11]
 [   1    3    0   36    4 1031    5    0    2    3]
 [  13    2    1    1    4    7 1228    0    0    0]
 [   0   24    9    0    3    1    0 1311    0   11]
 [   5   13   14   35    3   35    6    5 1085    8]
 [   8    4    1   16   36    3    0   38    2 1123]]


<!-- Example scenario for a 70/30 train-test split (test_size=0.30) and K=2. The rest of the scenarios are commented out to demonstrate a single case. -->

### 8. Saving Results to a PDF


<!-- Creates a PDF file named knn_results.pdf to store the confusion matrices and accuracy scores.
For each combination of train-test split and K value:
Creates a plot of the confusion matrix.
Annotates the plot with accuracy and confusion matrix values.
Saves the plot to the PDF. -->

In [50]:

# Export the confusion matrix of the example scenario to a one-page PDF.
#
# `accuracy` and `conf_matrix` are the values produced by the example-scenario
# cell (test_size=0.30, K=2), so that cell has to be run before this one.


def save_confusion_matrix_page(pdf, conf_matrix, title):
    """Append one annotated confusion-matrix plot to an open PDF.

    Parameters
    ----------
    pdf : matplotlib.backends.backend_pdf.PdfPages
        Open multi-page PDF that the figure is appended to.
    conf_matrix : numpy.ndarray
        2-D array of counts; rows are drawn along the y axis (true label)
        and columns along the x axis (predicted label).
    title : str
        Text placed above the plot.
    """
    fig, ax = plt.subplots()
    try:
        ax.matshow(conf_matrix, cmap=plt.cm.Blues, alpha=0.3)
        # Write each count at the centre of its cell.
        for (row, col), count in np.ndenumerate(conf_matrix):
            ax.text(x=col, y=row, s=count, va='center', ha='center')
        ax.set_xlabel('Predicted label')
        ax.set_ylabel('True label')
        ax.set_title(title)
        pdf.savefig(fig)
    finally:
        # Close this specific figure even if drawing or saving fails, so
        # figures do not accumulate when many pages are produced.
        plt.close(fig)


# The context manager closes the PDF even if rendering a page raises, instead
# of leaving it open as a manual pdf.close() after the plotting code would.
with matplotlib.backends.backend_pdf.PdfPages("knn_results1.pdf") as pdf:
    save_confusion_matrix_page(
        pdf,
        conf_matrix,
        # 0.3 is the test fraction and 2 the K used by the example scenario.
        f"Train-Test Split: {0.3}, K: {2}\nAccuracy: {accuracy:.2f}",
    )

# To export every scenario instead, populate `results` with the full sweep
# and run the following (one page per (split, k) pair):
#
# with matplotlib.backends.backend_pdf.PdfPages("knn_results.pdf") as pdf:
#     for (split, k), (accuracy, conf_matrix) in results.items():
#         save_confusion_matrix_page(
#             pdf,
#             conf_matrix,
#             f"Train-Test Split: {split}, K: {k}\nAccuracy: {accuracy:.2f}",
#         )

<!-- 

### Creating a Plot

```python
    fig, ax = plt.subplots()
```
- This line creates a new figure and axes object using Matplotlib's `subplots` function. 
- `fig` is the figure object, and `ax` is the axes object where the confusion matrix will be plotted.

### Plotting the Confusion Matrix

```python
    ax.matshow(conf_matrix, cmap=plt.cm.Blues, alpha=0.3)
```
- This line plots the confusion matrix on the axes `ax` using the `matshow` function.
- `conf_matrix` is the matrix to be plotted.
- `cmap=plt.cm.Blues` specifies the color map to use, in this case, shades of blue.
- `alpha=0.3` sets the transparency level of the plot.

### Annotating the Plot

```python
    for i in range(conf_matrix.shape[0]):
        for j in range(conf_matrix.shape[1]):
            ax.text(x=j, y=i, s=conf_matrix[i, j], va='center', ha='center')
```
- This nested loop iterates over each cell in the confusion matrix.
- `conf_matrix.shape[0]` gives the number of rows, and `conf_matrix.shape[1]` gives the number of columns.
- `ax.text(x=j, y=i, s=conf_matrix[i, j], va='center', ha='center')` adds text annotations to each cell:
  - `x=j` and `y=i` specify the position of the text.
  - `s=conf_matrix[i, j]` specifies the text to display, which is the value in the confusion matrix at position (i, j).
  - `va='center'` and `ha='center'` center the text vertically and horizontally within the cell.


### Summary

- **Initialize PDF**: Create a `PdfPages` object to handle multiple pages in a PDF.
- **Iterate Results**: Loop through each result (train-test split and K value).
- **Extract Values**: Get the specific train-test split, K value, accuracy, and confusion matrix.
- **Create Plot**: Initialize a new plot.
- **Plot Confusion Matrix**: Visualize the confusion matrix with colors.
- **Annotate Plot**: Add text to each cell of the confusion matrix.
- **Labels and Title**: Add axis labels and a title to the plot.
- **Save Plot**: Save the current plot to the PDF.
- **Close Plot**: Close the plot to free memory.
- **Finalize PDF**: Close the PDF file after all plots are saved. -->

### 9. Analysis of Results


In [29]:
# NOTE(review): only the single example scenario (test_size=0.30, K=2) is
# executed in this notebook -- the full sweep is commented out -- so the text
# below states general expectations about KNN rather than findings measured
# here. In particular, a very small test set mainly makes the accuracy estimate
# noisy; confirm the wording about overfitting and the "5 to 10" range against
# the full sweep before relying on it.
analysis = """
The performance of the KNN model depends on both the train-test split and the value of K. Generally, a higher proportion of training data can lead to better model performance due to more information being available for the model to learn from. However, this may also lead to overfitting if the test set is too small. The value of K also plays a crucial role, with too small a K value leading to high variance and too large a K value leading to high bias. The optimal value of K often lies between 5 and 10, balancing bias and variance effectively.
"""

print(analysis)


The performance of the KNN model depends on both the train-test split and the value of K. Generally, a higher proportion of training data can lead to better model performance due to more information being available for the model to learn from. However, this may also lead to overfitting if the test set is too small. The value of K also plays a crucial role, with too small a K value leading to high variance and too large a K value leading to high bias. The optimal value of K often lies between 5 and 10, balancing bias and variance effectively.

