# Requirements

In [11]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Creating the DataSet

In [12]:
# Sepehr Moniri : 981813205

# Generate a synthetic classification problem with a fixed seed so that the
# same samples are produced on every run.
#   x -> feature matrix (one row per sample, 8 feature columns)
#   y -> class label of each sample
x, y = make_classification(
    n_samples=20000,
    n_features=8,
    random_state=981813205,
)

In [13]:
# Positional hold-out split: the leading 80% of the rows form the training set
# and the remaining 20% form the test set. Rows are taken in their existing
# order (no shuffling here); sklearn's train_test_split is imported above but
# the split is done by hand instead.
split = 0.8  # fraction of the samples used for training
split_number = int(np.ceil(x.shape[0] * split))  # index of the first test row

x_train, x_test = x[:split_number, :], x[split_number:, :]
y_train, y_test = y[:split_number], y[split_number:]

# Implementing the Algorithm

In [14]:
class KNN:
    """Brute-force k-nearest-neighbours classifier.

    Labels must be non-negative integers because votes are tallied with
    ``np.bincount``.

    Parameters
    ----------
    n_neighbors : int, default 5
        Number of closest training samples that vote on each prediction.
    weights : {'uniform', 'distance'}, default 'uniform'
        'uniform' gives every neighbour one vote; 'distance' weights each
        neighbour's vote by ``1 / (distance + 1e-10)``.
    metric : {'euclidean', 'manhattan'}, default 'euclidean'
        Distance used to rank the training samples.
    """

    _VALID_WEIGHTS = ('uniform', 'distance')
    _VALID_METRICS = ('euclidean', 'manhattan')

    def __init__(self, n_neighbors=5, weights='uniform', metric='euclidean'):
        self.n_neighbors = n_neighbors
        self.weights = weights
        self.metric = metric

    def euclidean_distance(self, x1, x2):
        """Return the Euclidean (L2) distance between two samples."""
        return np.sqrt(np.sum((x1 - x2) ** 2))

    def manhattan_distance(self, x1, x2):
        """Return the Manhattan (L1) distance between two samples."""
        return np.sum(np.abs(x1 - x2))

    def get_distance(self, x1, x2):
        """Return the distance between two samples using ``self.metric``.

        Raises
        ------
        ValueError
            If ``self.metric`` is neither 'euclidean' nor 'manhattan'.
        """
        if self.metric == 'euclidean':
            return self.euclidean_distance(x1, x2)

        elif self.metric == 'manhattan':
            return self.manhattan_distance(x1, x2)

        else:
            raise ValueError("Invalid metric")

    def _distances_to_train(self, x):
        """Return the distance from sample ``x`` to every training sample.

        Vectorised equivalent of calling ``get_distance`` once per training
        row: ``x`` is broadcast against the whole training array and each
        row's differences are reduced to a single number.
        """
        # Flatten everything but the sample axis so each row reduces to one value.
        diff = (self.X_train - x).reshape(self.X_train.shape[0], -1)

        if self.metric == 'euclidean':
            return np.sqrt(np.sum(diff ** 2, axis=1))
        if self.metric == 'manhattan':
            return np.sum(np.abs(diff), axis=1)
        raise ValueError("Invalid metric")

    def fit(self, X_train, y_train):
        """Store the training samples and their labels.

        Inputs are converted with ``np.asarray`` so that plain Python lists
        can be used as well as arrays.

        Raises
        ------
        ValueError
            If the number of samples and the number of labels differ.
        """
        X_train = np.asarray(X_train)
        y_train = np.asarray(y_train)

        if len(X_train) != len(y_train):
            raise ValueError("X_train and y_train must have the same length")

        self.X_train = X_train
        self.y_train = y_train

    def predict(self, X_test):
        """Predict a label for every sample in ``X_test``.

        Returns
        -------
        list
            One predicted label per test sample, in input order.

        Raises
        ------
        ValueError
            If ``weights`` or ``metric`` is not a supported value, if
            ``n_neighbors`` is smaller than 1, or if the training set is empty.
        """
        # Validate the configuration once, before any work is done, so a bad
        # setting fails with a clear message instead of an unbound local.
        if self.weights not in self._VALID_WEIGHTS:
            raise ValueError("Invalid weights")
        if self.metric not in self._VALID_METRICS:
            raise ValueError("Invalid metric")
        if self.n_neighbors < 1:
            raise ValueError("n_neighbors must be at least 1")
        if len(self.X_train) == 0:
            raise ValueError("Cannot predict with an empty training set")

        predictions = []
        for x in np.asarray(X_test):
            distances = self._distances_to_train(x)

            # Indices of the k closest training samples, nearest first.
            k_nearest_indices = np.argsort(distances)[:self.n_neighbors]
            k_nearest_labels = self.y_train[k_nearest_indices]

            if self.weights == 'uniform':
                votes = np.bincount(k_nearest_labels)
            else:
                # Closer neighbours get larger votes; the small constant
                # avoids division by zero for exact matches.
                vote_weights = 1 / (distances[k_nearest_indices] + 1e-10)
                vote_weights /= np.sum(vote_weights)
                votes = np.bincount(k_nearest_labels, weights=vote_weights)

            # argmax returns the first maximum, so ties go to the smaller label.
            predictions.append(np.argmax(votes))

        return predictions

# Algorithm setup and Prediction

In [15]:
# Configure the classifier: 3 neighbours, equal votes, Manhattan distance.
# (Alternative tried: weights="distance" with the same k and metric.)
knn = KNN(
    metric='manhattan',
    weights='uniform',
    n_neighbors=3,
)
knn.fit(X_train=x_train, y_train=y_train)

In [16]:
# Classify every held-out sample with the fitted model, then print the full
# list of predicted labels (one entry per test row).
y_pred = knn.predict(X_test=x_test)
print("KNN predictions:", y_pred)

KNN predictions: [0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1

# Validation

In [17]:
def accuracy(y_true, y_pred):
    """Return the fraction of predictions that equal the true labels.

    Both inputs are converted to NumPy arrays first, so the comparison is
    element-wise even when two plain Python lists are passed (comparing two
    lists with ``==`` would otherwise yield a single boolean).

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, same shape as ``y_true``.

    Returns
    -------
    float
        Mean of the element-wise equality, i.e. a value between 0 and 1.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Guard against silent broadcasting (e.g. (n, 1) vs (n,)) which would
    # produce a meaningless score.
    if y_true.shape != y_pred.shape:
        raise ValueError("y_true and y_pred must have the same shape")

    return np.mean(y_true == y_pred)

In [18]:
# Score the predictions against the held-out labels and report the result.
acc = accuracy(y_true=y_test, y_pred=y_pred)
print("Accuracy:", acc)

Accuracy: 0.8725


# Improvements
To improve the implemented algorithm, we try every combination of the parameters (`weights` and `metric`) together with a fixed set of neighbor counts (here `[3, 5, 7, 9]`) and keep the combination that gives the highest accuracy.
At the end, the `improvment` function returns the combination of parameters with the best accuracy, along with that accuracy.

In [19]:
def improvment(param_grid, accuracy_func, imp_alg,
               train_X=None, train_y=None, eval_X=None, eval_y=None):
    """Exhaustively search ``param_grid`` and return the best configuration.

    Every combination of ``n_neighbors``, ``weights`` and ``metric`` is used to
    build a model, fit it on the training data and score its predictions on
    the evaluation data.

    Note: the configuration is selected using the same evaluation data that
    produces the reported score, so that score is an optimistic estimate.

    Parameters
    ----------
    param_grid : dict
        Must contain the keys 'n_neighbors', 'weights' and 'metric', each
        mapping to an iterable of candidate values.
    accuracy_func : callable
        Called as ``accuracy_func(true_labels, predicted_labels)``; higher
        return values are treated as better.
    imp_alg : callable
        Model factory accepting ``n_neighbors``, ``weights`` and ``metric``
        keyword arguments and returning an object with ``fit`` and ``predict``.
    train_X, train_y, eval_X, eval_y : optional
        Data to fit and score on. Any argument left as ``None`` falls back to
        the notebook-level ``x_train`` / ``y_train`` / ``x_test`` / ``y_test``.

    Returns
    -------
    tuple
        ``(best_params, best_accuracy)``. For an empty grid this is ``({}, 0)``.
    """
    # Fall back to the notebook's global split so existing three-argument
    # calls keep working unchanged.
    train_X = x_train if train_X is None else train_X
    train_y = y_train if train_y is None else train_y
    eval_X = x_test if eval_X is None else eval_X
    eval_y = y_test if eval_y is None else eval_y

    best_accuracy = 0
    best_params = {}

    for n_neighbors in param_grid['n_neighbors']:
        for weights in param_grid['weights']:
            for metric in param_grid['metric']:
                # Build and evaluate a model for the current combination.
                model = imp_alg(n_neighbors=n_neighbors, weights=weights, metric=metric)
                model.fit(train_X, train_y)
                score = accuracy_func(eval_y, model.predict(eval_X))

                # Always record the first combination so a result is returned
                # even when every score is 0; afterwards keep strict improvements.
                if not best_params or score > best_accuracy:
                    best_accuracy = score
                    best_params = {
                        'n_neighbors': n_neighbors,
                        'weights': weights,
                        'metric': metric
                    }

    return best_params, best_accuracy

In [20]:
# Candidate values for each hyper-parameter; every combination is evaluated.
param_grid = dict(
    n_neighbors=[3, 5, 7, 9],
    weights=['uniform', 'distance'],
    metric=['euclidean', 'manhattan'],
)

# Run the search and show the (best parameters, best accuracy) pair.
print(improvment(param_grid, accuracy, KNN))


({'n_neighbors': 9, 'weights': 'uniform', 'metric': 'manhattan'}, 0.885)


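The outputs shown here were produced with a dataset of 2000 samples. Note that the dataset cell above currently sets `n_samples=20000`, so re-running the notebook will give different numbers.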