In [1]:
import numpy as np

# Part A: Model Code
# 1. Euclidean Distance
def euclidean_distance(vector1, vector2):
    """Return the Euclidean (L2) distance between two vectors.

    Parameters
    ----------
    vector1, vector2 : array-like
        Numeric sequences or NumPy arrays of the same (or broadcastable) shape.

    Returns
    -------
    numpy.float64
        Square root of the summed squared element-wise differences.
    """
    # Cast to float64 before subtracting: plain lists have no element-wise
    # `-`, and unsigned-integer arrays would wrap around on negative
    # differences (and overflow when squared).
    a = np.asarray(vector1, dtype=np.float64)
    b = np.asarray(vector2, dtype=np.float64)
    diff = a - b
    return np.sqrt(np.sum(diff * diff))


In [2]:
# 2. Manhattan Distance
def manhattan_distance(vector1, vector2):
    """Return the Manhattan (L1) distance between two vectors.

    Parameters
    ----------
    vector1, vector2 : array-like
        Numeric sequences or NumPy arrays of the same (or broadcastable) shape.

    Returns
    -------
    numpy scalar
        Sum of the absolute element-wise differences. Signed-integer inputs
        give an integer result; float inputs give a float result.
    """
    a = np.asarray(vector1)
    b = np.asarray(vector2)
    # Promote to a common *signed* dtype before subtracting. Including int8
    # in the promotion turns unsigned (and bool) inputs into a signed type
    # wide enough to hold negative differences, so e.g. uint8 0 - 200 is
    # -200 rather than wrapping to 56. Signed and float dtypes are unchanged.
    common = np.result_type(a, b, np.dtype(np.int8))
    diff = a.astype(common, copy=False) - b.astype(common, copy=False)
    return np.sum(np.abs(diff))


In [3]:
# 3. Accuracy and Generalization Error
def accuracy(y_true, y_pred):
    """Return the fraction of predictions that equal the true labels.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted labels; must have the same shape.

    Returns
    -------
    numpy.float64
        Mean of the element-wise equality. Empty inputs give ``nan``
        (NumPy's mean of an empty array).

    Raises
    ------
    ValueError
        If the two inputs differ in shape.
    """
    # Convert first: comparing two Python lists with `==` yields a single
    # bool, which would make the result 0.0 or 1.0 instead of a fraction.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, got {y_true.shape} and {y_pred.shape}"
        )
    return np.mean(y_true == y_pred)

def generalization_error(y_true, y_pred):
    """Return the misclassification rate, i.e. one minus ``accuracy``.

    Parameters
    ----------
    y_true, y_pred
        True and predicted labels, forwarded unchanged to ``accuracy``.

    Returns
    -------
    The complement of the value returned by ``accuracy(y_true, y_pred)``.
    """
    correct_fraction = accuracy(y_true, y_pred)
    return 1 - correct_fraction


In [4]:
# 4. Precision, Recall, and F1 Score
def precision(y_true, y_pred):
    """Return the precision of binary predictions: TP / (TP + FP).

    Label 1 is the positive class and label 0 the negative class.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted labels.

    Returns
    -------
    float
        Share of predicted positives that are truly positive, or 0.0 when
        nothing was predicted positive.
    """
    # Convert first: `some_list == 1` is a plain False, which would make
    # every count zero for list inputs.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    predicted_positive = y_pred == 1
    tp = np.sum(predicted_positive & (y_true == 1))
    fp = np.sum(predicted_positive & (y_true == 0))
    denominator = tp + fp
    return tp / denominator if denominator > 0 else 0.0

def recall(y_true, y_pred):
    """Return the recall of binary predictions: TP / (TP + FN).

    Label 1 is the positive class and label 0 the negative class.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted labels.

    Returns
    -------
    float
        Share of true positives that were predicted positive, or 0.0 when
        there are no true positives to find.
    """
    # Convert first: `some_list == 1` is a plain False, which would make
    # every count zero for list inputs.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    actually_positive = y_true == 1
    tp = np.sum(actually_positive & (y_pred == 1))
    fn = np.sum(actually_positive & (y_pred == 0))
    denominator = tp + fn
    return tp / denominator if denominator > 0 else 0.0

def f1_score(y_true, y_pred):
    """Return the F1 score: the harmonic mean of ``precision`` and ``recall``.

    Parameters
    ----------
    y_true, y_pred
        True and predicted labels, forwarded unchanged to ``precision`` and
        ``recall``.

    Returns
    -------
    ``2 * P * R / (P + R)``, or 0 when ``P + R`` is not positive.
    """
    prec = precision(y_true, y_pred)
    rec = recall(y_true, y_pred)
    total = prec + rec
    if total > 0:
        return 2 * (prec * rec) / total
    # Both components are zero: the harmonic mean is undefined, report 0.
    return 0


In [5]:
# 5. Confusion Matrix
def confusion_matrix(y_true, y_pred):
    """Return the 2x2 confusion matrix for binary labels 0 and 1.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted labels.

    Returns
    -------
    numpy.ndarray of shape (2, 2)
        ``[[TN, FP], [FN, TP]]`` -- rows are the true class, columns the
        predicted class, each ordered 0 then 1. Labels other than 0 and 1
        are not counted in any cell.
    """
    # Convert first: `some_list == 1` is a plain False, which would yield an
    # all-zero matrix for list inputs.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    true_pos, true_neg = y_true == 1, y_true == 0
    pred_pos, pred_neg = y_pred == 1, y_pred == 0
    tn = np.sum(true_neg & pred_neg)
    fp = np.sum(true_neg & pred_pos)
    fn = np.sum(true_pos & pred_neg)
    tp = np.sum(true_pos & pred_pos)
    return np.array([[tn, fp], [fn, tp]])


In [6]:

# 6. ROC Curve
def roc_curve(y_true, y_score):
    """Compute the points of the ROC curve for binary labels.

    Label 1 is the positive class; every other label counts as negative.
    A sample is predicted positive when its score is >= the threshold.

    Parameters
    ----------
    y_true : array-like
        True labels.
    y_score : array-like
        Scores for the positive class (higher means more likely positive);
        same length as ``y_true``.

    Returns
    -------
    fpr : numpy.ndarray
        False-positive rate at each threshold, starting at 0.
    tpr : numpy.ndarray
        True-positive rate at each threshold, starting at 0.
    thresholds : numpy.ndarray
        Decreasing thresholds, one per distinct score, preceded by ``inf``
        (the point where nothing is predicted positive).

    A rate is filled with ``nan`` when its class has no samples.

    Raises
    ------
    ValueError
        If the inputs are empty or differ in length.
    """
    y_true = np.asarray(y_true).ravel()
    y_score = np.asarray(y_score, dtype=np.float64).ravel()
    if y_true.shape != y_score.shape:
        raise ValueError("y_true and y_score must have the same number of elements")
    if y_true.size == 0:
        raise ValueError("y_true and y_score must not be empty")

    # Walk the samples from the highest score down; the stable sort keeps
    # the result deterministic when scores tie.
    order = np.argsort(-y_score, kind="stable")
    sorted_scores = y_score[order]
    is_positive = y_true[order] == 1

    # Tied scores share one threshold, so only the last index of every run
    # of equal scores produces a curve point.
    group_end = np.r_[np.flatnonzero(np.diff(sorted_scores)), sorted_scores.size - 1]
    tps = np.cumsum(is_positive)[group_end]
    fps = (group_end + 1) - tps  # predicted positives minus true positives

    # Prepend the (0, 0) origin, reached with a threshold above every score.
    tps = np.r_[0, tps]
    fps = np.r_[0, fps]
    thresholds = np.r_[np.inf, sorted_scores[group_end]]

    n_pos, n_neg = tps[-1], fps[-1]
    tpr = tps / n_pos if n_pos > 0 else np.full(tps.shape, np.nan)
    fpr = fps / n_neg if n_neg > 0 else np.full(fps.shape, np.nan)
    return fpr, tpr, thresholds


In [7]:
# 7. AUC for ROC Curve
def auc(fpr, tpr):
    """Return the area under a curve using the trapezoidal rule.

    Parameters
    ----------
    fpr : array-like
        x coordinates (e.g. false-positive rates); must be monotonic,
        either non-decreasing or non-increasing.
    tpr : array-like
        y coordinates (e.g. true-positive rates); same length as ``fpr``.

    Returns
    -------
    float
        The area; it is non-negative for non-negative ``tpr`` regardless of
        whether ``fpr`` runs left-to-right or right-to-left.

    Raises
    ------
    ValueError
        If the inputs differ in length, hold fewer than two points, or
        ``fpr`` is not monotonic.
    """
    x = np.asarray(fpr, dtype=np.float64).ravel()
    y = np.asarray(tpr, dtype=np.float64).ravel()
    if x.shape != y.shape:
        raise ValueError("fpr and tpr must have the same number of elements")
    if x.size < 2:
        raise ValueError("at least two points are needed to compute an area")

    steps = np.diff(x)
    if np.all(steps >= 0):
        direction = 1.0
    elif np.all(steps <= 0):
        # x runs right-to-left (e.g. recall on a precision-recall curve);
        # flip the sign so the area stays positive.
        direction = -1.0
    else:
        raise ValueError("fpr must be monotonic (all increasing or all decreasing)")

    # Each trapezoid is its width times the mean of its two heights.
    return float(direction * np.sum(steps * (y[1:] + y[:-1]) / 2.0))


In [8]:
# 8. Precision-Recall Curve
def precision_recall_curve(y_true, y_score):
    """Compute precision/recall pairs for every distinct score threshold.

    Label 1 is the positive class; every other label counts as negative.
    A sample is predicted positive when its score is >= the threshold.

    Parameters
    ----------
    y_true : array-like
        True labels.
    y_score : array-like
        Scores for the positive class (higher means more likely positive);
        same length as ``y_true``.

    Returns
    -------
    precision : numpy.ndarray
        Precision at each threshold, followed by a final 1.0.
    recall : numpy.ndarray
        Recall at each threshold (non-increasing), followed by a final 0.0.
        Filled with ``nan`` (except the final 0.0) when ``y_true`` has no
        positive sample.
    thresholds : numpy.ndarray
        Increasing distinct scores; one element shorter than the other two
        arrays because the closing (precision=1, recall=0) point has no
        threshold.

    Raises
    ------
    ValueError
        If the inputs are empty or differ in length.
    """
    y_true = np.asarray(y_true).ravel()
    y_score = np.asarray(y_score, dtype=np.float64).ravel()
    if y_true.shape != y_score.shape:
        raise ValueError("y_true and y_score must have the same number of elements")
    if y_true.size == 0:
        raise ValueError("y_true and y_score must not be empty")

    # Walk the samples from the highest score down; the stable sort keeps
    # the result deterministic when scores tie.
    order = np.argsort(-y_score, kind="stable")
    sorted_scores = y_score[order]
    is_positive = y_true[order] == 1

    # Tied scores share one threshold, so only the last index of every run
    # of equal scores produces a curve point.
    group_end = np.r_[np.flatnonzero(np.diff(sorted_scores)), sorted_scores.size - 1]
    tps = np.cumsum(is_positive)[group_end]
    n_predicted_positive = group_end + 1  # always >= 1, so no division by zero

    prec_values = tps / n_predicted_positive
    n_pos = tps[-1]
    rec_values = tps / n_pos if n_pos > 0 else np.full(tps.shape, np.nan)

    # Report thresholds in increasing order (recall decreasing) and close
    # the curve at the conventional (recall=0, precision=1) end point.
    prec_values = np.r_[prec_values[::-1], 1.0]
    rec_values = np.r_[rec_values[::-1], 0.0]
    thresholds = sorted_scores[group_end][::-1]
    return prec_values, rec_values, thresholds


In [9]:
# 9. KNN_Classifier Model Class
class KNN_Classifier:
    """k-nearest-neighbours classifier using Euclidean distance.

    ``fit`` only memorises the training data and hyper-parameters; all the
    work happens in ``predict``, which votes among the ``n_neighbors``
    closest training samples.
    """

    def __init__(self):
        self.X_train = None        # training features, set by fit
        self.y_train = None        # training labels, set by fit
        self.n_neighbors = None    # number of neighbours that vote, set by fit
        self.weights = 'uniform'   # voting scheme: 'uniform' or 'distance'

    def fit(self, X, Y, n_neighbors=5, weights='uniform'):
        """Store the training set and the voting configuration.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features.
        Y : array-like of length n_samples
            Training labels.
        n_neighbors : int, default 5
            Number of nearest neighbours that vote; must be at least 1.
        weights : {'uniform', 'distance'}, default 'uniform'
            'uniform' gives every neighbour one vote; 'distance' weights
            each vote by the inverse of the neighbour's distance.

        Raises
        ------
        ValueError
            On an unknown ``weights`` value, a non-positive or non-integer
            ``n_neighbors``, or mismatched lengths of ``X`` and ``Y``.
        """
        if weights not in ('uniform', 'distance'):
            raise ValueError(f"weights must be 'uniform' or 'distance', got {weights!r}")
        if not isinstance(n_neighbors, (int, np.integer)) or n_neighbors < 1:
            raise ValueError(f"n_neighbors must be a positive integer, got {n_neighbors!r}")
        if len(X) != len(Y):
            raise ValueError("X and Y must contain the same number of samples")
        self.X_train = X
        self.y_train = Y
        self.n_neighbors = n_neighbors
        self.weights = weights

    def predict(self, X):
        """Predict a label for every row of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_queries, n_features)
            Samples to classify.

        Returns
        -------
        numpy.ndarray of shape (n_queries,)
            Predicted labels, with the dtype of the training labels. Vote
            ties go to the smallest label.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        ValueError
            If ``self.weights`` is not 'uniform' or 'distance'.
        """
        if self.X_train is None or self.y_train is None:
            raise RuntimeError("fit must be called before predict")
        if self.weights not in ('uniform', 'distance'):
            raise ValueError(f"weights must be 'uniform' or 'distance', got {self.weights!r}")

        train = np.asarray(self.X_train, dtype=np.float64)
        queries = np.asarray(X, dtype=np.float64)
        # Encode labels as 0..n_classes-1 so np.bincount also works for
        # float, negative or string labels.
        classes, encoded = np.unique(np.asarray(self.y_train).ravel(), return_inverse=True)
        encoded = encoded.ravel()

        predictions = np.empty(queries.shape[0], dtype=classes.dtype)
        for i, sample in enumerate(queries):
            distances = np.sqrt(np.sum((train - sample) ** 2, axis=1))
            # Stable sort: equidistant neighbours are taken in training order.
            nearest = np.argsort(distances, kind='stable')[:self.n_neighbors]

            if self.weights == 'uniform':
                votes = np.bincount(encoded[nearest], minlength=classes.size)
            else:
                nearest_dist = distances[nearest]
                exact = nearest_dist == 0
                if exact.any():
                    # A query that coincides with training points lets only
                    # those points vote, avoiding a division by zero.
                    vote_weights = exact.astype(np.float64)
                else:
                    vote_weights = 1.0 / nearest_dist
                votes = np.bincount(encoded[nearest], weights=vote_weights,
                                    minlength=classes.size)

            predictions[i] = classes[np.argmax(votes)]
        return predictions


In [10]:
import pandas as pd
import numpy as np

# Part B: Data Processing

# 10. Read the winequality-white.csv file into a pandas DataFrame.
# The path is relative, so the file must sit in the kernel's working
# directory (the recorded run of this cell failed with FileNotFoundError).
# sep=None with the python engine makes pandas sniff the delimiter: the UCI
# distribution of this file is ';'-separated while other copies use ',',
# and the default sep=',' would load the former as a single column.
df = pd.read_csv('winequality-white.csv', sep=None, engine='python')


FileNotFoundError: [Errno 2] No such file or directory: 'winequality-white.csv'

In [None]:
# 11. Turn the quality score into a binary label: 1 when quality > 5, else 0.
# NOTE: this overwrites the original scores in place, so running the cell a
# second time (on the already binarized column) sets every label to 0.
df['quality'] = (df['quality'] > 5).astype('int64')


In [None]:
# 12. Summary statistics per column: count, mean, standard deviation, min,
# the three quartiles and max. The percentiles are spelled out explicitly;
# they are the same ones describe() uses by default.
summary = df.describe(percentiles=[0.25, 0.5, 0.75])


In [None]:
# 13. Shuffle the rows reproducibly.
df_shuffled = (
    df.sample(frac=1, random_state=1)  # frac=1 draws every row once, i.e. a permutation
      .reset_index(drop=True)          # discard the old row labels, renumber from 0
)



In [None]:
# 14. Generating pair plots to identify redundant features.
import seaborn as sns
# sns.pairplot(df_shuffled)  # disabled; uncomment to draw the pair plots



In [None]:
# 15. Remove the features judged redundant from the pair plots.
# NOTE(review): 'redundant_feature1' / 'redundant_feature2' look like
# placeholder names -- drop() raises KeyError for labels that are not in the
# frame, so replace them with real column names. For the same reason the cell
# cannot be run twice: the second run no longer finds the columns.
df_shuffled = df_shuffled.drop(['redundant_feature1', 'redundant_feature2'], axis=1)


In [None]:
# 16. Partition function to split data into training and test sets.
def partition(X, Y, t):
    """Split features and labels into a leading training part and a trailing test part.

    The data is not shuffled here; rows keep their order.

    Parameters
    ----------
    X : sliceable sequence (e.g. numpy.ndarray)
        Feature rows.
    Y : sliceable sequence
        Labels, same length as ``X``.
    t : float
        Fraction of the rows assigned to the test set, between 0 and 1.

    Returns
    -------
    tuple
        ``(X_train, X_test, Y_train, Y_test)`` where the training part holds
        the first ``floor((1 - t) * n)`` rows.

    Raises
    ------
    ValueError
        If ``t`` is outside [0, 1] or ``X`` and ``Y`` differ in length.
    """
    if not 0 <= t <= 1:
        raise ValueError(f"t must be between 0 and 1, got {t!r}")
    n_samples = len(X)
    if len(Y) != n_samples:
        raise ValueError("X and Y must contain the same number of samples")
    # The small tolerance absorbs float rounding before flooring: e.g.
    # (1 - 0.9) * 10 evaluates to 0.9999999999999998, which would otherwise
    # truncate to 0 training rows instead of 1.
    train_size = int(np.floor((1 - t) * n_samples + 1e-9))
    return X[:train_size], X[train_size:], Y[:train_size], Y[train_size:]

# Separate the binary target from the feature matrix (as NumPy arrays), then
# hold out the last 20% of the shuffled rows as the test set.
Y = df_shuffled['quality'].values
X = df_shuffled.drop(columns='quality').values
X_train, X_test, Y_train, Y_test = partition(X, Y, 0.2)


In [None]:
# Part C: Model Evaluation
# Implement a basic KNN classifier

class KNN_Classifier:
    """k-nearest-neighbours classifier using Euclidean distance.

    ``fit`` only memorises the training data and hyper-parameters; all the
    work happens in ``predict``, which votes among the ``n_neighbors``
    closest training samples.
    """

    def __init__(self):
        self.X_train = None        # training features, set by fit
        self.y_train = None        # training labels, set by fit
        self.n_neighbors = None    # number of neighbours that vote, set by fit
        self.weights = 'uniform'   # voting scheme; same default as fit()

    def fit(self, X, Y, n_neighbors=5, weights='uniform'):
        """Store the training set and the voting configuration.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features.
        Y : array-like of length n_samples
            Training labels.
        n_neighbors : int, default 5
            Number of nearest neighbours that vote; must be at least 1.
        weights : {'uniform', 'distance'}, default 'uniform'
            'uniform' gives every neighbour one vote; 'distance' weights
            each vote by the inverse of the neighbour's distance.

        Raises
        ------
        ValueError
            On an unknown ``weights`` value, a non-positive or non-integer
            ``n_neighbors``, or mismatched lengths of ``X`` and ``Y``.
        """
        if weights not in ('uniform', 'distance'):
            raise ValueError(f"weights must be 'uniform' or 'distance', got {weights!r}")
        if not isinstance(n_neighbors, (int, np.integer)) or n_neighbors < 1:
            raise ValueError(f"n_neighbors must be a positive integer, got {n_neighbors!r}")
        if len(X) != len(Y):
            raise ValueError("X and Y must contain the same number of samples")
        self.X_train = X
        self.y_train = Y
        self.n_neighbors = n_neighbors
        self.weights = weights

    def predict(self, X):
        """Predict a label for every row of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_queries, n_features)
            Samples to classify.

        Returns
        -------
        numpy.ndarray of shape (n_queries,)
            Predicted labels, with the dtype of the training labels. Vote
            ties go to the smallest label.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        ValueError
            If ``self.weights`` is not 'uniform' or 'distance' (previously
            such a value silently produced an empty prediction array).
        """
        if self.X_train is None or self.y_train is None:
            raise RuntimeError("fit must be called before predict")
        if self.weights not in ('uniform', 'distance'):
            raise ValueError(f"weights must be 'uniform' or 'distance', got {self.weights!r}")

        train = np.asarray(self.X_train, dtype=np.float64)
        queries = np.asarray(X, dtype=np.float64)
        # Encode labels as 0..n_classes-1 so np.bincount also works for
        # float, negative or string labels (it only accepts non-negative ints).
        classes, encoded = np.unique(np.asarray(self.y_train).ravel(), return_inverse=True)
        encoded = encoded.ravel()

        predictions = np.empty(queries.shape[0], dtype=classes.dtype)
        for i, sample in enumerate(queries):
            # Euclidean distance from this query to every training sample.
            distances = np.sqrt(np.sum((train - sample) ** 2, axis=1))
            # Stable sort: equidistant neighbours are taken in training order.
            nearest = np.argsort(distances, kind='stable')[:self.n_neighbors]

            if self.weights == 'uniform':
                # Plain majority vote among the nearest neighbours.
                votes = np.bincount(encoded[nearest], minlength=classes.size)
            else:
                nearest_dist = distances[nearest]
                exact = nearest_dist == 0
                if exact.any():
                    # A query that coincides with training points lets only
                    # those points vote, avoiding a division by zero.
                    vote_weights = exact.astype(np.float64)
                else:
                    vote_weights = 1.0 / nearest_dist
                votes = np.bincount(encoded[nearest], weights=vote_weights,
                                    minlength=classes.size)

            predictions[i] = classes[np.argmax(votes)]
        return predictions

# Build the classifier and let it memorise the training split
# (5 neighbours, every neighbour's vote counts equally).
knn = KNN_Classifier()
knn.fit(X_train, Y_train, n_neighbors=5, weights='uniform')

# Predict a label for every held-out sample.
Y_pred = knn.predict(X_test)

# Score the predictions against the true test labels. The metrics treat
# label 1 as the positive class and label 0 as the negative class.
conf_matrix = confusion_matrix(y_true=Y_test, y_pred=Y_pred)
accuracy_val = accuracy(y_true=Y_test, y_pred=Y_pred)
precision_val = precision(y_true=Y_test, y_pred=Y_pred)
recall_val = recall(y_true=Y_test, y_pred=Y_pred)
f1_val = f1_score(y_true=Y_test, y_pred=Y_pred)
