# $k$-NN Classifier Using the MAGIC Gamma Telescope Dataset

## Balancing g and h Classes

In [187]:
import random

# Balance the two classes by undersampling, then verify the file that was written.
#
# A dedicated, seeded generator keeps the sampling and shuffle below identical
# on every fresh run of the notebook.
SEED = 42
rng = random.Random(SEED)

file_path = "magic.txt"
with open(file_path, "r") as f:
    # Guarantee every line ends with a newline: if the file's final record has
    # no trailing "\n", shuffling it into the middle and writing it back with
    # writelines() would glue it onto the following record.
    lines = [line if line.endswith("\n") else line + "\n" for line in f]

# Records are assigned to a class by their trailing ",g" / ",h" label.
g_records = [line for line in lines if line.strip().endswith(",g")]
h_records = [line for line in lines if line.strip().endswith(",h")]

g_count = len(g_records)
h_count = len(h_records)

print(f"Original 'g' records: {g_count}")
print(f"Original 'h' records: {h_count}")

# Undersample both classes to the size of the smaller one. Sampling the
# smaller class at its full size simply keeps all of its records, so this
# works whichever class is the majority (sampling more than the population
# would raise ValueError).
target_count = min(g_count, h_count)
balanced_g_records = rng.sample(g_records, target_count)
balanced_h_records = rng.sample(h_records, target_count)

balanced_dataset = balanced_g_records + balanced_h_records

rng.shuffle(balanced_dataset)

output_file = "balanced_dataset.txt"
with open(output_file, "w") as f:
    f.writelines(balanced_dataset)

print(f"Balanced dataset saved to {output_file}")

# Re-read the file that was just written and confirm the class counts match.
with open(output_file, "r") as f:
    lines = f.readlines()

g_records = [line for line in lines if line.strip().endswith(",g")]
h_records = [line for line in lines if line.strip().endswith(",h")]

g_count = len(g_records)
h_count = len(h_records)

assert g_count == h_count, "balanced file should contain equal class counts"

# These counts describe the balanced file, not the original one.
print(f"Balanced 'g' records: {g_count}")
print(f"Balanced 'h' records: {h_count}")

Original 'g' records: 12332
Original 'h' records: 6688
Balanced dataset saved to balanced_dataset.txt
Original 'g' records: 6688
Original 'h' records: 6688


## Splitting Dataset

In [188]:
# Split the balanced dataset into train / validation / test files.
#
# The fractions and the seed are named so a reader can tune them in one place;
# whatever is left after the train and validation slices becomes the test set.
SPLIT_SEED = 42
TRAIN_FRACTION = 0.7
VAL_FRACTION = 0.15

file_path = "balanced_dataset.txt"
with open(file_path, "r") as f:
    # Skip blank lines so they are not counted as records, and make sure every
    # record ends with a newline so that shuffling cannot glue a final,
    # unterminated record onto its neighbour when written with writelines().
    lines = [
        line if line.endswith("\n") else line + "\n"
        for line in f
        if line.strip()
    ]

# A private seeded generator makes the split reproducible and keeps re-running
# this cell idempotent, without touching the global random state.
split_rng = random.Random(SPLIT_SEED)
split_rng.shuffle(lines)

total_records = len(lines)
train_size = int(TRAIN_FRACTION * total_records)
val_size = int(VAL_FRACTION * total_records)

train_set = lines[:train_size]
val_set = lines[train_size : train_size + val_size]
test_set = lines[train_size + val_size :]

# The three slices must partition the data exactly (no loss, no overlap).
assert len(train_set) + len(val_set) + len(test_set) == total_records

with open("train_set.txt", "w") as f:
    f.writelines(train_set)

with open("validation_set.txt", "w") as f:
    f.writelines(val_set)

with open("test_set.txt", "w") as f:
    f.writelines(test_set)

print("Dataset split completed:")
print(f"Training set: {len(train_set)} records")
print(f"Validation set: {len(val_set)} records")
print(f"Test set: {len(test_set)} records")

Dataset split completed:
Training set: 9363 records
Validation set: 2006 records
Test set: 2007 records


## Classifier Training

In [189]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from art import text2art
import numpy as np


def load_data(file_path):
    """Read a header-less CSV file and split it into features and labels.

    Parameters
    ----------
    file_path : path of a comma-separated file with no header row; the last
        column is treated as the label and all preceding columns as features.

    Returns
    -------
    (X, y) : the feature columns as a 2-D array and the last column as a
        1-D array, in file order.
    """
    table = pd.read_csv(file_path, header=None)
    label_pos = table.shape[1] - 1  # positional index of the final column
    features = table.iloc[:, :label_pos].values
    labels = table.iloc[:, label_pos].values
    return features, labels


# Load the three splits in train -> validation -> test order.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    load_data(split_path)
    for split_path in ("train_set.txt", "validation_set.txt", "test_set.txt")
)

# Encode labels as integers: "g" becomes 1, every other label becomes 0.
y_train, y_val, y_test = (
    np.where(raw_labels == "g", 1, 0) for raw_labels in (y_train, y_val, y_test)
)

### Evaluation Metrics

#### 1. **Accuracy**
   - Accuracy is the proportion of correctly classified samples out of the total samples. 
   - **Formula:**
     $$\text{Accuracy} = \frac{\text{TP} + \text{TN}}{\text{TP} + \text{TN} + \text{FP} + \text{FN}}$$
   - Where:
     - TP = True Positives
     - TN = True Negatives
     - FP = False Positives
     - FN = False Negatives

#### 2. **Precision**
   - Precision measures the ratio of correctly predicted positive observations to the total predicted positives.
   - **Formula:**
     $$\text{Precision} = \frac{\text{TP}}{\text{TP} + \text{FP}}$$
   - Precision is helpful to assess how many selected items are relevant, especially in cases where false positives matter (e.g., medical diagnoses).

#### 3. **Recall (Sensitivity or True Positive Rate)**
   - Recall is the ratio of correctly predicted positive observations to all observations that actually belong to the positive class.
   - **Formula:**
     $$\text{Recall} = \frac{\text{TP}}{\text{TP} + \text{FN}}$$
   - Recall is helpful for understanding how many relevant items were selected, particularly in scenarios where false negatives are costly.

#### 4. **F1 Score**
   - The F1 Score is the harmonic mean of Precision and Recall, providing a balance between them. It’s especially useful when the dataset is imbalanced.
   - **Formula:**
     $$\text{F1 Score} = 2 \times \frac{\text{Precision} \times \text{Recall}}{\text{Precision} + \text{Recall}}$$

#### 5. **Confusion Matrix**
   - The Confusion Matrix is a summary of prediction results, showing the distribution of actual versus predicted classes.
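   - It’s typically structured as follows (note that scikit-learn's `confusion_matrix` orders rows and columns by sorted label, so with h encoded as 0 and g as 1 the matrices printed in this notebook read `[[TN, FP], [FN, TP]]` when g is taken as the positive class):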

     |                | Predicted Positive | Predicted Negative |
     |----------------|--------------------|--------------------|
     | Actual Positive| TP                 | FN                 |
     | Actual Negative| FP                 | TN                 |

   - Each element of the matrix corresponds to:
     - **TP**: True Positives (correctly classified positives)
     - **FP**: False Positives (negatives incorrectly classified as positives)
     - **FN**: False Negatives (positives incorrectly classified as negatives)
     - **TN**: True Negatives (correctly classified negatives)

### Predicting on Validation Set

In [190]:
# Sweep k = 1..40: fit a k-NN model on the training split for each k, report
# training and validation error, and remember the k with the highest
# validation accuracy (the strict ">" keeps the smallest k on ties).
best_k, best_accuracy = None, 0

for k in range(1, 41):

    print(text2art(f"k={k}", font="slant"))

    # fit() returns the fitted estimator, so construction and fitting chain.
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)

    # Error on the data the model was fitted on
    train_predictions = knn.predict(X_train)
    training_accuracy = accuracy_score(y_train, train_predictions)
    print(f"Training error: {1 - training_accuracy:.3f}\n")

    # Error on the validation split, which drives the choice of k
    val_predictions = knn.predict(X_val)
    validation_accuracy = accuracy_score(y_val, val_predictions)
    print(f"Validation error: {1 - validation_accuracy:.3f}\n")

    if validation_accuracy > best_accuracy:
        best_k = k
        best_accuracy = validation_accuracy

    # Confusion matrix for the validation predictions
    cm = confusion_matrix(y_val, val_predictions)
    print("Confusion Matrix:", cm, sep="\n")

    # Per-class precision / recall / F1 for the validation predictions
    print("\nClassification Report on Validation Set:")
    print(classification_report(y_val, val_predictions, target_names=["h", "g"]), "\n")

print(
    f"The k value that gives the best accuracy is k={best_k} with an accuracy of {best_accuracy:.3f}"
)

    __            ___
   / /__  _____  <  /
  / //_/ /____/  / / 
 / ,<   /____/  / /  
/_/|_|         /_/   
                     

Training error: 0.000

Validation error: 0.246

Confusion Matrix:
[[750 282]
 [211 763]]

Classification Report on Validation Set:
              precision    recall  f1-score   support

           h       0.78      0.73      0.75      1032
           g       0.73      0.78      0.76       974

    accuracy                           0.75      2006
   macro avg       0.76      0.76      0.75      2006
weighted avg       0.76      0.75      0.75      2006
 

    __            ___ 
   / /__  _____  |__ \
  / //_/ /____/  __/ /
 / ,<   /____/  / __/ 
/_/|_|         /____/ 
                      

Training error: 0.105

Validation error: 0.247

Confusion Matrix:
[[877 155]
 [341 633]]

Classification Report on Validation Set:
              precision    recall  f1-score   support

           h       0.72      0.85      0.78      1032
           g       0.80     

### Note on k Value Behavior

Starting with k=1, the model exhibits overfitting, achieving a training error of 0 while the validation error remains high. As 
k increases, the model's performance improves, resulting in better validation accuracy until it reaches a peak at a specific 
k value. Beyond this point, validation accuracy begins to decline, accompanied by an increase in training error, indicating that the model is starting to underfit the data.


## Model Evaluation Using Test Set

In [191]:
# Refit k-NN on the training split using the k selected on the validation
# split, then report its performance on the test split.
knn = KNeighborsClassifier(n_neighbors=best_k).fit(X_train, y_train)

print(text2art(f"k={best_k}", font="slant"))

# Accuracy (reported as error) on the test set
test_predictions = knn.predict(X_test)
test_accuracy = accuracy_score(y_test, test_predictions)
print(f"Test error: {1 - test_accuracy:.3f}\n")

# Confusion matrix for the test predictions
cm = confusion_matrix(y_test, test_predictions)
print("Confusion Matrix:", cm, sep="\n")

# Per-class precision / recall / F1 for the test predictions
print(
    "\nClassification Report on Test Set:",
    classification_report(y_test, test_predictions, target_names=["h", "g"]),
    sep="\n",
)

    __            ____ 
   / /__  _____  / __ \
  / //_/ /____/ / /_/ /
 / ,<   /____/  \__, / 
/_/|_|         /____/  
                       

Test error: 0.240

Confusion Matrix:
[[666 316]
 [165 860]]

Classification Report on Test Set:
              precision    recall  f1-score   support

           h       0.80      0.68      0.73       982
           g       0.73      0.84      0.78      1025

    accuracy                           0.76      2007
   macro avg       0.77      0.76      0.76      2007
weighted avg       0.77      0.76      0.76      2007

