In [180]:
import sys

# Setup

In [181]:
# To mount your own Google Drive, run this cell.
# from google.colab import drive
# drive.mount('/content/drive')

In [182]:
# Please append your own directory after '/content/drive/My Drive/'
# where you have nutil.py and adult_subsample.csv
### ========== TODO : START ========== ###
# sys.path += ['/content/drive/My Drive/cm146/pset1']
# sys.path += ['/content/drive/']

### ========== TODO : END ========== ###


In [183]:
from nutil import *

In [184]:
# Use only the provided packages!
import math
import csv

from collections import Counter

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit
import numpy as np
import warnings

# NumPy 2.0 removed the top-level np.VisibleDeprecationWarning alias; from
# NumPy 1.25 on the class lives in np.exceptions. Look it up in whichever
# namespace provides it so this cell runs on both old and new NumPy releases.
_visible_deprecation_warning = getattr(
    getattr(np, 'exceptions', np), 'VisibleDeprecationWarning', None
)
if _visible_deprecation_warning is not None:
    warnings.filterwarnings('ignore', category=_visible_deprecation_warning)

# Models

In [185]:
######################################################################
# Immutable classes
######################################################################

class Classifier:
    """
    Abstract classifier interface.

    Subclasses are expected to override both fit and predict; the versions
    defined here only raise NotImplementedError.
    """

    def fit(self, X, y):
        """Train on samples X with targets y (must be overridden)."""
        raise NotImplementedError()

    def predict(self, X):
        """Predict targets for samples X (must be overridden)."""
        raise NotImplementedError()


class MajorityVoteClassifier(Classifier) :

    def __init__(self) :
        """
        A classifier that always predicts the majority class.

        Attributes
        --------------------
            prediction_ -- majority class (None until fit is called)
        """
        self.prediction_ = None

    def fit(self, X, y) :
        """
        Build a majority vote classifier from the training set (X, y).

        Parameters
        --------------------
            X    -- numpy array of shape (n,d), samples (not used by this model)
            y    -- numpy array of shape (n,), target classes

        Returns
        --------------------
            self -- an instance of self
        """
        # most_common(1) yields [(label, count)] for the most frequent label
        majority_val = Counter(y).most_common(1)[0][0]
        self.prediction_ = majority_val
        return self

    def predict(self, X) :
        """
        Predict class values.

        Parameters
        --------------------
            X    -- numpy array of shape (n,d), samples

        Returns
        --------------------
            y    -- numpy array of shape (n,), predicted classes

        Raises
        --------------------
            Exception -- if fit has not been called yet
        """
        if self.prediction_ is None :
            raise Exception("Classifier not initialized. Perform a fit first.")

        # Only the number of rows matters; return an ndarray (as documented)
        # rather than a Python list, consistent with RandomClassifier.predict.
        n = X.shape[0]
        return np.full(n, self.prediction_)



In [186]:
######################################################################
# Mutable classes
######################################################################

class RandomClassifier(Classifier) :

    def __init__(self) :
        """
        A classifier that predicts according to the distribution of the classes.

        Attributes
        --------------------
            probabilities_ -- class distribution dict (key = class, val = probability of class);
                              empty until fit is called
        """
        self.probabilities_ = dict()

    def fit(self, X, y) :
        """
        Build a random classifier from the training set (X, y).

        The empirical frequency of every label present in y is stored, so the
        classifier is not restricted to the labels 0 and 1.

        Parameters
        --------------------
            X    -- numpy array of shape (n,d), samples (not used by this model)
            y    -- numpy array of shape (n,), target classes

        Returns
        --------------------
            self -- an instance of self

        Raises
        --------------------
            ValueError -- if y contains no labels
        """
        counts = Counter(y)
        if not counts :
            raise ValueError("Cannot fit RandomClassifier on an empty target array.")

        total = sum(counts.values())
        # Sorted label order keeps the key order (and therefore the seeded
        # draws in predict) deterministic: 0 before 1 for binary labels.
        self.probabilities_ = {label: counts[label] / total for label in sorted(counts)}

        return self

    def predict(self, X, seed=1234) :
        """
        Predict class values by sampling from the fitted class distribution.

        Parameters
        --------------------
            X    -- numpy array of shape (n,d), samples
            seed -- integer, random seed

        Returns
        --------------------
            y    -- numpy array of shape (n,), predicted classes

        Raises
        --------------------
            Exception -- if fit has not been called yet
        """
        # probabilities_ starts as an empty dict, so test for emptiness
        # (an `is None` check could never detect the unfitted state).
        if not self.probabilities_ :
            raise Exception("Classifier not initialized. Perform a fit first.")

        # Seeding the global generator makes repeated calls reproducible.
        np.random.seed(seed)
        classes = list(self.probabilities_.keys())
        probabilities = list(self.probabilities_.values())

        n = X.shape[0]
        y = np.random.choice(classes, size=n, p=probabilities)

        return y


# Auxiliary functions

In [187]:
######################################################################
# Immutable functions
######################################################################

def plot_histograms(X, y, Xnames, yname) :
    """
    Draws one per-class histogram for every feature column of X on a single
    figure (3 subplots per row) and saves it to 'histograms.pdf'.

    Parameters
    --------------------
        X      -- numpy array of shape (n,d), feature values
        y      -- numpy array of shape (n,), target classes
        Xnames -- sequence of d strings, feature names
        yname  -- string, name of target
    """
    _, num_features = X.shape
    figure = plt.figure(figsize=(20,15))
    columns = 3
    rows = num_features // columns + 1

    for col in range(num_features) :
        feature_name = Xnames[col]
        figure.add_subplot(rows, columns, col + 1)
        # show=False: only fetch the grouped data and binning, draw it here
        grouped, bin_spec, alignment, legend_labels = plot_histogram(
            X[:,col], y, Xname=feature_name, yname=yname, show = False)
        plt.hist(grouped, bins=bin_spec, align=alignment, alpha=0.5, label=legend_labels)
        plt.xlabel(feature_name)
        plt.ylabel('Frequency')
        plt.legend()

    plt.savefig ('histograms.pdf')


def plot_histogram(X, y, Xname, yname, show = True) :
    """
    Groups the values of a single feature by target class, chooses a binning,
    and optionally draws the resulting histogram.

    Parameters
    --------------------
        X     -- sequence of n feature values (one feature)
        y     -- sequence of n target classes
        Xname -- string, name of feature
        yname -- string, name of target
        show  -- if True, draw and display the histogram

    Returns
    --------------------
        data   -- list of lists, feature values for each class (classes sorted)
        bins   -- bin edges for integer-valued features with fewer than 10
                  distinct values covering a contiguous range, otherwise 10;
                  when show is True, the bin edges returned by plt.hist
        align  -- 'left' for the integer binning, 'mid' otherwise
        labels -- list of legend strings, one per class
    """

    # group the feature values by class, in sorted class order
    classes = sorted(set(y))
    data = [[X[idx] for idx in range(len(y)) if y[idx] == cls] for cls in classes]
    labels = ['%s = %s' % (yname, cls) for cls in classes]

    # choose the binning: one bin per integer when the distinct values are
    # exactly a short contiguous run of integers, else 10 equal-width bins
    distinct = set(X)
    low = int(math.floor(min(distinct)))
    high = int(math.ceil(max(distinct)))
    int_range = list(range(low, high + 1))
    if len(distinct) < 10 and sorted(distinct) == int_range :
        bins = int_range + [int_range[-1] + 1]  # closing edge of the last bin
        align = 'left'
    else :
        bins, align = 10, 'mid'

    # draw
    if show == True :
        plt.figure()
        # keep the bin edges reported by matplotlib as the returned bins
        bins = plt.hist(data, bins=bins, align=align, alpha=0.5, label=labels)[1]
        plt.xlabel(Xname)
        plt.ylabel('Frequency')
        plt.legend()
        plt.show()

    return data, bins, align, labels

In [188]:
######################################################################
# Mutable functions
######################################################################

def error(clf, X, y, ntrials=100, test_size=0.2) :
    """
    Computes the classifier error over a random split of the data,
    averaged over ntrials runs.

    Splits are drawn with StratifiedShuffleSplit using a fixed random_state
    of 0, so repeated calls with the same arguments use the same splits.
    The classifier is refit in place on every split.

    Parameters
    --------------------
        clf         -- classifier
        X           -- numpy array of shape (n,d), features values
        y           -- numpy array of shape (n,), target classes
        ntrials     -- integer, number of trials
        test_size   -- test_size argument forwarded to StratifiedShuffleSplit
                       (e.g. the fraction of the data held out for testing)

    Returns
    --------------------
        train_error -- float, training error
        test_error  -- float, test error
        f1_score    -- float, test "micro" averaged f1 score
    """

    # Uses the module-level imports (StratifiedShuffleSplit, metrics, np)
    # instead of re-importing inside the function.
    splitter = StratifiedShuffleSplit(n_splits=ntrials, test_size=test_size, random_state=0)

    train_errors = []
    test_errors = []
    f1_scores = []

    for train_index, test_index in splitter.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        clf.fit(X_train, y_train)

        y_train_pred = clf.predict(X_train)
        y_test_pred = clf.predict(X_test)

        train_errors.append(1 - metrics.accuracy_score(y_train, y_train_pred))
        test_errors.append(1 - metrics.accuracy_score(y_test, y_test_pred))
        f1_scores.append(metrics.f1_score(y_test, y_test_pred, average='micro'))

    # Distinct names for the averages so the metric function is never shadowed.
    train_error = np.mean(train_errors)
    test_error = np.mean(test_errors)
    mean_f1 = np.mean(f1_scores)

    return train_error, test_error, mean_f1

In [189]:
######################################################################
# Immutable functions
######################################################################


def write_predictions(y_pred, filename, yname=None) :
    """
    Write out predictions to csv file, one prediction per row.

    Parameters
    --------------------
        y_pred   -- iterable of predictions
        filename -- string, path of the csv file to create (overwritten if it exists)
        yname    -- string or None, optional header written as the first row
    """
    # csv.writer emits str, so the file must be opened in text mode ('wb'
    # raises TypeError on Python 3); newline='' lets the csv module control
    # line endings. The with-block closes the file even if a write fails.
    with open(filename, 'w', newline='') as out :
        writer = csv.writer(out)
        if yname :
            writer.writerow([yname])
        writer.writerows([pred] for pred in y_pred)

# Evaluation

In [None]:

######################################################################
# main
######################################################################

def _training_error(clf, X, y) :
    """
    Fit a classifier on (X, y) and measure its error on that same data.

    Parameters
    --------------------
        clf -- classifier exposing fit(X, y) and predict(X)
        X   -- numpy array of shape (n,d), samples
        y   -- numpy array of shape (n,), target classes

    Returns
    --------------------
        train_error -- float, 1 - accuracy of clf on (X, y)
    """
    clf.fit(X, y)
    y_pred = clf.predict(X)
    return 1 - metrics.accuracy_score(y, y_pred, normalize=True)


def run_experiment(apply_scaling=False) :
    """
    Run the full set of experiments on the adult_subsample dataset:
    feature histograms, training error of the baseline / decision tree /
    k-NN classifiers, cross-validated comparison, selection of k and of the
    tree depth, and learning curves over the training set size.

    Parameters
    --------------------
        apply_scaling -- bool, standardize the features with StandardScaler
                         before any classifier is trained
    """
    # load adult_subsample dataset with correct file path
    data_file =  "./adult_subsample.csv"

    # load_data is not defined in this notebook (presumably provided by nutil)
    data = load_data(data_file, header=1, predict_col=-1)

    X = data.X; Xnames = data.Xnames
    y = data.y; yname = data.yname

    #========================================
    # part a: plot histograms of each feature
    print('Plotting...')
    plot_histograms (X, y, Xnames=Xnames, yname=yname)
    plt.show()

    #========================================
    # part i: Preprocess X (e.g., normalize)
    if apply_scaling:
        # NOTE(review): the scaler is fit on the whole dataset, so the
        # cross-validated test folds below influence the scaling statistics.
        X = StandardScaler().fit_transform(X)

    #========================================
    # train Majority Vote classifier on data
    print('Classifying using Majority Vote...')
    train_error = _training_error(MajorityVoteClassifier(), X, y)
    print('\t-- training error: %.3f' % train_error)

    #========================================
    # part b: evaluate training error of Random classifier
    print('Classifying using Random...')
    train_error = _training_error(RandomClassifier(), X, y)
    print('\t-- training error: %.3f' % train_error)

    #========================================
    # part c: evaluate training error of Decision Tree classifier
    print('Classifying using Decision Tree...')
    train_error = _training_error(DecisionTreeClassifier(criterion='entropy'), X, y)
    print('\t-- training error: %.3f' % train_error)

    #========================================
    # part d: evaluate training error of k-Nearest Neighbors classifier
    # use k = 3, 5, 7 for n_neighbors
    print('Classifying using k-Nearest Neighbors...')
    for k in [3, 5, 7]:
        train_error = _training_error(KNeighborsClassifier(n_neighbors=k), X, y)
        print(f'\t-- training error for k={k}: {train_error:.3f}')

    #========================================
    # part e: use cross-validation to compute average training and test error of classifiers
    print('Investigating various classifiers...')
    models = {
        'MajorityVote': MajorityVoteClassifier(),
        'Random': RandomClassifier(),
        'DecisionTree': DecisionTreeClassifier(criterion='entropy'),
        'KNN (k=5)': KNeighborsClassifier(n_neighbors=5)
    }

    for model_name, model in models.items():
        train_err, test_err, f1 = error(model, X, y, ntrials=20)
        print(f'{model_name} -- Train Error: {train_err:.3f}, Test Error: {test_err:.3f}, F1 Score: {f1:.3f}')

    #========================================
    # part f: use 10-fold cross-validation to find the best value of k for k-Nearest Neighbors classifier
    print('Finding the best k...')
    k_values = range(1, 51, 2)  # odd k from 1 to 49
    validation_errors = []

    for k in k_values:
        clf = KNeighborsClassifier(n_neighbors=k)
        scores = cross_val_score(clf, X, y, cv=10)
        validation_errors.append(1 - np.mean(scores))

    plt.plot(k_values, validation_errors)
    plt.xlabel('Number of Neighbors (k)')
    plt.ylabel('Validation Error')
    plt.title('KNN: Validation Error vs Number of Neighbors')
    plt.show()

    best_k = k_values[np.argmin(validation_errors)]
    print(f'Best value of k: {best_k}, error: {np.min(validation_errors)}')

    #========================================
    # part g: investigate decision tree classifier with various depths
    print('Investigating depths...')
    depths = range(1, 21)
    train_errors = []
    test_errors = []

    for depth in depths:
        clf = DecisionTreeClassifier(criterion='entropy', max_depth=depth)
        train_err, test_err, _ = error(clf, X, y)
        train_errors.append(train_err)
        test_errors.append(test_err)

    best_d = depths[np.argmin(test_errors)]
    print(f'Best value of depth: {best_d}, test error: {np.min(test_errors)}')
    plt.plot(depths, train_errors, label='Training Error')
    plt.plot(depths, test_errors, label='Test Error')
    plt.xlabel('Tree Depth')
    plt.ylabel('Error')
    plt.title('Decision Tree: Error vs Tree Depth')
    plt.legend()
    plt.show()

    #========================================
    # part h: investigate decision tree and k-Nearest Neighbors classifier with various training set sizes
    print('Investigating training set sizes...')

    d_tree_test = []
    d_tree_train = []
    knn_test = []
    knn_train = []

    # Use the hyperparameters selected above (parts f and g) rather than
    # hard-coded values, so the plot labels always match the models.
    d_tree = DecisionTreeClassifier(criterion='entropy', max_depth=best_d)
    knn = KNeighborsClassifier(n_neighbors=best_k)

    # Fractions of the data used for TRAINING. error() is parameterized by
    # the test fraction, so it receives the complement of each value; the
    # x-axis then really shows the amount of training data.
    train_fractions = [x / 10.0 for x in range(1, 10)]

    for fraction in train_fractions:
        held_out = round(1.0 - fraction, 1)

        # Compute errors for Decision Tree
        d_train_error, d_test_error, _ = error(d_tree, X, y, ntrials=100, test_size=held_out)
        d_tree_train.append(d_train_error)
        d_tree_test.append(d_test_error)

        # Compute errors for K-Nearest Neighbors
        k_train_error, k_test_error, _ = error(knn, X, y, ntrials=100, test_size=held_out)
        knn_train.append(k_train_error)
        knn_test.append(k_test_error)

    # Plotting results
    plt.figure(figsize=(10, 6))
    d_train_out, = plt.plot(train_fractions, d_tree_train, label=f'Decision Tree Training Error (max_depth={best_d})')
    d_test_out, = plt.plot(train_fractions, d_tree_test, label=f'Decision Tree Test Error (max_depth={best_d})')
    k_train_out, = plt.plot(train_fractions, knn_train, label=f'KNN Training Error (k={best_k})')
    k_test_out, = plt.plot(train_fractions, knn_test, label=f'KNN Test Error (k={best_k})')

    plt.title('Amount of Training Data vs. Decision Tree and KNN Error')
    plt.xlabel('Fraction of Data Used for Training')
    plt.ylabel('Error')
    plt.legend(handles=[d_train_out, d_test_out, k_train_out, k_test_out])
    plt.grid(True)
    plt.show()

    print('Done')


if __name__ == "__main__":
    # Run the whole experiment twice: first on raw features, then standardized.
    for banner, use_scaling in (
        ("Running without scaling...", False),
        ("\nRunning with scaling...", True),
    ):
        print(banner)
        run_experiment(apply_scaling=use_scaling)
