<a href="https://colab.research.google.com/github/kendo58/uni/blob/main/KNN_Classifier_with_Jaccard_Distance_on_Prog1data_csv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import csv
import math
import random
from collections import Counter
# Make sure to install matplotlib: pip install matplotlib
import matplotlib.pyplot as plt

class KNNClassifier:
    """
    A K-Nearest Neighbors classifier implementation that supports multiple distance metrics
    and data normalization.

    Rows are sequences whose last element is the class label and whose preceding
    elements are the features. With the 'euclidean' metric features are converted
    to float; with the 'jaccard' metric they are stored unchanged and compared as sets.
    """

    def __init__(self, k=5, metric='euclidean'):
        """
        Initializes the KNNClassifier.

        Args:
            k (int): The number of neighbors to use for classification.
            metric (str): The distance metric to use ('euclidean' or 'jaccard').

        Raises:
            ValueError: If k is not positive or the metric is not supported.
        """
        if k <= 0:
            raise ValueError("k must be a positive integer.")
        if metric not in ('euclidean', 'jaccard'):
            raise ValueError("Metric must be either 'euclidean' or 'jaccard'.")

        self.k = k
        self.metric = metric
        self.X_train = []
        self.y_train = []

        # Internal state for normalization: per-feature (min, max) pairs computed
        # from the training data, and whether X_train has already been rescaled.
        self._is_normalized = False
        self._min_max = []

    def _euclidean_distance(self, p1, p2):
        """Calculates the Euclidean distance between two numeric points."""
        return math.sqrt(sum((e1 - e2)**2 for e1, e2 in zip(p1, p2)))

    def _jaccard_distance(self, p1, p2):
        """
        Calculates the Jaccard distance between two categorical points.

        Each point is treated as a set of values, so element order and duplicates
        are ignored. Two empty points are defined to have distance 0.
        """
        set1 = set(p1)
        set2 = set(p2)
        union = len(set1 | set2)
        if union == 0:
            return 0
        return 1 - (len(set1 & set2) / union)

    def _normalize_row(self, row):
        """Min-max scales one numeric row using the stored per-feature bounds."""
        scaled = []
        for value, (min_v, max_v) in zip(row, self._min_max):
            span = max_v - min_v
            # A constant feature carries no information; map it to 0 instead of
            # dividing by zero.
            scaled.append(0.0 if span == 0 else (value - min_v) / span)
        return scaled

    def train(self, training_data):
        """
        Trains the model by loading and storing the data.
        Converts features to float for Euclidean, keeps as-is for Jaccard.

        Rows that cannot be processed (missing features or label, non-numeric
        features under 'euclidean', or a feature count that differs from the first
        valid row under 'euclidean') are skipped with a printed warning.

        Args:
            training_data: Sequence of rows; the last element of each row is the label.

        Raises:
            ValueError: If no row could be loaded.
        """
        if not training_data:
            print("Warning: The provided training data is empty.")
            return

        self.X_train = []
        self.y_train = []
        # Freshly loaded data is raw: discard any scaling state left over from a
        # previous train()/normalize_data() cycle so predict() does not rescale
        # queries with stale bounds.
        self._is_normalized = False
        self._min_max = []

        expected_width = None  # feature count of the first valid Euclidean row

        # --- Error Handling for Malformed Input ---
        # This loop skips any row that cannot be processed correctly.
        for row in training_data:
            try:
                features = row[:-1]
                raw_label = row[-1] if features else None
                # A label made only of whitespace is as useless as a missing one.
                if not features or not raw_label or not raw_label.strip():
                    print(f"Warning: Skipping row with missing values: {row}")
                    continue

                # For Euclidean distance, features must be numeric
                if self.metric == 'euclidean':
                    features = [float(x) for x in features]
                    # Rows of differing width would be silently truncated by the
                    # distance computation and would break normalization.
                    if expected_width is None:
                        expected_width = len(features)
                    elif len(features) != expected_width:
                        print(f"Warning: Skipping row with {len(features)} features "
                              f"(expected {expected_width}): {row}")
                        continue

                self.X_train.append(features)
                self.y_train.append(raw_label.strip())
            except (ValueError, IndexError, TypeError, AttributeError):
                print(f"Warning: Skipping malformed row that could not be processed: {row}")
                continue

        if not self.X_train:
            raise ValueError("No valid data was loaded for training. Please check the input file.")

        # --- Error Handling for Fewer than k items ---
        # Checks if the number of valid training samples is less than k.
        if len(self.X_train) < self.k:
            print(f"CRITICAL WARNING: The number of valid training samples ({len(self.X_train)}) is less than k ({self.k}).")
            print("The model will use all available samples for prediction, but results may be unreliable.")

    def normalize_data(self):
        """
        Applies min-max normalization to the loaded training data.
        This method should only be used with Euclidean distance.

        Calling it again on already-normalized data is a no-op, so the stored
        bounds always describe the original (raw) training data.

        Raises:
            RuntimeError: If no training data has been loaded.
        """
        if self.metric != 'euclidean':
            print("Warning: Normalization is only applicable for the 'euclidean' metric.")
            return

        if not self.X_train:
            raise RuntimeError("Must train the model before normalizing data.")

        if self._is_normalized:
            # Recomputing bounds from scaled data would yield (0, 1) and make
            # predict() stop scaling queries the same way as the training set.
            print("Training data is already normalized.")
            return

        # zip(*rows) yields one tuple per feature column.
        self._min_max = [(min(col), max(col)) for col in zip(*self.X_train)]
        self.X_train = [self._normalize_row(row) for row in self.X_train]

        self._is_normalized = True
        print("Training data has been normalized.")

    def predict(self, new_data):
        """
        Predicts the class for a list of new data points.

        Args:
            new_data: Sequence of feature rows (without labels).

        Returns:
            list: One predicted label per input row, chosen by majority vote among
            the k nearest training samples (ties go to the label seen first among
            the nearest neighbors).

        Raises:
            RuntimeError: If the classifier has not been trained.
            ValueError: Under 'euclidean', if a row has non-numeric features or a
                feature count different from the training data.
        """
        if not self.X_train:
            raise RuntimeError("Classifier has not been trained. Call train() first.")

        is_euclidean = self.metric == 'euclidean'
        distance_func = self._euclidean_distance if is_euclidean else self._jaccard_distance
        expected_width = len(self.X_train[0])

        processed_data = []
        for row in new_data:
            p_row = row
            if is_euclidean:
                p_row = [float(x) for x in row]
                if len(p_row) != expected_width:
                    raise ValueError(
                        f"Expected {expected_width} features but got {len(p_row)}: {row}"
                    )
                if self._is_normalized:
                    p_row = self._normalize_row(p_row)
            processed_data.append(p_row)

        # Use min(self.k, len(self.X_train)) to avoid errors if k > num_samples
        num_neighbors = min(self.k, len(self.X_train))

        predictions = []
        for test_row in processed_data:
            distances = [
                (label, distance_func(test_row, train_row))
                for label, train_row in zip(self.y_train, self.X_train)
            ]
            # Sort on distance only; the stable sort keeps training order for ties.
            distances.sort(key=lambda pair: pair[1])
            neighbors = [label for label, _ in distances[:num_neighbors]]

            if not neighbors:
                predictions.append(None)  # Should not happen if train data exists
                continue

            predictions.append(Counter(neighbors).most_common(1)[0][0])

        return predictions

# --- Example Usage ---
# Loads Prog1data.csv, performs a shuffled 80/20 train/test split, and reports the
# test-set accuracy of the classifier with both supported distance metrics.
if __name__ == '__main__':

    filepath = "Prog1data.csv"

    try:
        # The csv module expects files opened with newline='' so that it can
        # handle line endings (including newlines inside quoted fields) itself.
        with open(filepath, 'r', newline='') as f:
            # Drop completely empty lines; csv.reader yields [] for them.
            full_dataset = [row for row in csv.reader(f) if row]
    except FileNotFoundError:
        print(f"Error: The file '{filepath}' was not found.")
        full_dataset = []

    if full_dataset:
        random.shuffle(full_dataset)
        split_idx = int(len(full_dataset) * 0.8)
        training_set = full_dataset[:split_idx]
        testing_set = full_dataset[split_idx:]

        # The last column is the label; everything before it is a feature.
        X_test = [row[:-1] for row in testing_set]
        y_test = [row[-1].strip() for row in testing_set]

        print("--- 1. Euclidean Distance Example ---")
        euclidean_classifier = KNNClassifier(k=5, metric='euclidean')
        euclidean_classifier.train(training_set)
        euclidean_classifier.normalize_data()
        euclidean_predictions = euclidean_classifier.predict(X_test)
        correct_euclidean = sum(
            1 for predicted, actual in zip(euclidean_predictions, y_test) if predicted == actual
        )
        accuracy_euclidean = (correct_euclidean / len(y_test)) * 100 if y_test else 0
        print(f"Accuracy on Prog1data.csv test set (Euclidean): {accuracy_euclidean:.2f}%")

        print("\n--- 2. Jaccard Distance Example ---")
        jaccard_classifier = KNNClassifier(k=5, metric='jaccard')
        jaccard_classifier.train(training_set)
        jaccard_predictions = jaccard_classifier.predict(X_test)
        correct_jaccard = sum(
            1 for predicted, actual in zip(jaccard_predictions, y_test) if predicted == actual
        )
        accuracy_jaccard = (correct_jaccard / len(y_test)) * 100 if y_test else 0
        print(f"Accuracy on Prog1data.csv test set (Jaccard): {accuracy_jaccard:.2f}%")

        print("\n--- 3. Example of k > number of samples warning ---")
        # Create a tiny training set
        tiny_training_set = [['1', '1', '+'], ['2', '2', '-']]
        # Use a k that is too large
        faulty_classifier = KNNClassifier(k=5, metric='euclidean')
        faulty_classifier.train(tiny_training_set) # This will now print a warning.



--- 1. Euclidean Distance Example ---
Training data has been normalized.
Accuracy on Prog1data.csv test set (Euclidean): 99.00%

--- 2. Jaccard Distance Example ---
Accuracy on Prog1data.csv test set (Jaccard): 76.75%

The model will use all available samples for prediction, but results may be unreliable.
