In [6]:
import numpy as np
import xgboost as xgb
import csv

ModuleNotFoundError: No module named 'xgboost'

In [36]:
class KNearestNeighbor(object):
    """ a kNN classifier with L2 distance """

    def __init__(self):
        pass

    def train(self, X, y):
        """
        Train the classifier. For k-nearest neighbors this is just 
        memorizing the training data.

        Inputs:
        - X: A numpy array of shape (num_train, D) containing the training data
          consisting of num_train samples each of dimension D.
        - y: A numpy array of shape (N,) containing the training labels, where
             y[i] is the label for X[i].
        """
        self.X_train = X
        self.y_train = y

    def predict(self, X, k=1, num_loops=0):
        """
        Predict labels for test data using this classifier.

        Inputs:
        - X: A numpy array of shape (num_test, D) containing test data consisting
             of num_test samples each of dimension D.
        - k: The number of nearest neighbors that vote for the predicted labels.
        - num_loops: Determines which implementation to use to compute distances
          between training points and testing points.

        Returns:
        - y: A numpy array of shape (num_test,) containing predicted labels for the
          test data, where y[i] is the predicted label for the test point X[i].  
        """
        if num_loops == 0:
            dists = self.compute_distances_no_loops(X)
        elif num_loops == 1:
            dists = self.compute_distances_one_loop(X)
        elif num_loops == 2:
            dists = self.compute_distances_two_loops(X)
        else:
            raise ValueError('Invalid value %d for num_loops' % num_loops)

        return self.predict_labels(dists, k=k)

    def compute_distances(self, X):
        """
        Compute the distance between each test point in X and each training point
        in self.X_train using no explicit loops.

        Input / Output: Same as compute_distances_two_loops
        """
        num_test = X.shape[0]
        num_train = self.X_train.shape[0]
        dists = np.zeros((num_test, num_train)) 
        # expand the square differences into quadratic form, and re-arrange the terms
        # get dist_11 = sqrt(x1^2...+x1_tr^2...-2(x1*x1_tr+...)) across first row, sum over columns
        X_terms = np.sum(np.square(X), axis=1)
        Xtr_terms = np.sum(np.square(self.X_train), axis=1)
        X_Xtr = np.dot(X,np.transpose(self.X_train)) #transpose to element wise multiply x12 with x_tr12 e.g.

        dists = np.sqrt(X_terms[:,np.newaxis]+Xtr_terms[np.newaxis,:]-2*X_Xtr)
        return dists

    def predict_labels(self, dists, k=1):
        """
        Given a matrix of distances between test points and training points,
        predict a label for each test point.

        Inputs:
        - dists: A numpy array of shape (num_test, num_train) where dists[i, j]
          gives the distance betwen the ith test point and the jth training point.

        Returns:
        - y: A numpy array of shape (num_test,) containing predicted labels for the
          test data, where y[i] is the predicted label for the test point X[i].  
        """
        num_test = dists.shape[0]
        y_pred = np.zeros(num_test)
        for i in range(num_test):
            # A list of length k storing the labels of the k nearest neighbors to
            # the ith test point.
            closest_y = np.array(self.y_train[np.argsort(dists[i,:],axis=0)[:k]])
            y_pred[i] = np.bincount(closest_y).argmax()

        return y_pred

In [37]:
# load csv file
data_pathname = "/Users/Jason/Documents/CS 221/project/dataset/ASR2016-clean.csv"
data_file = open(data_pathname, 'r')
reader = csv.reader(data_file)
headers = next(reader, None) # skip headers in csv

# northeast, south, midwest, west
X = [[], [], [], []] 
y = [[], [], [], []]

# load regional data and labels
for row in reader:
    row = [float(item) if item !=' ' else -1.0 for item in row]
    
    region = int(row[0] - 1)
    
    label = row[-1] == float('1') # is employed
    features = row[1:-1] # 32 features
    
    X[region].append(features)
    y[region].append(label)

In [38]:
# split regions into train-test-validation sets
NUM_REGIONS = 4
PERCENT_TEST = 0.8

X_train = [[], [], [], []] 
X_test = [[], [], [], []] 
y_train = [[], [], [], []]
y_test = [[], [], [], []]

for region in range(NUM_REGIONS):
    num_training = int(PERCENT_TEST*len(y[region]))
    
    X_train[region] = np.asarray(X[region])[:num_training]
    X_test[region] = np.asarray(X[region])[num_training + 1:]
    
    y_train[region] = np.asarray(y[region])[:num_training]
    y_test[region] = np.asarray(y[region])[num_training + 1:]

In [39]:
# train classification on each region 
classifiers = [[], [], [], []] 
for region in range(NUM_REGIONS):
    num_test = X_test[region].shape[0]
    classifiers[region] = KNearestNeighbor()
    classifiers[region].train(X_train[region], y_train[region])

In [40]:
# evaluate on each region
for region in range(NUM_REGIONS):
    dists =  classifiers[region].compute_distances(X_test[region])
    y_test_pred = classifiers[region].predict_labels(dists, k=25)
    num_correct = np.sum(y_test_pred == y_test[region])
    accuracy = float(num_correct) / num_test
    print('Got %d / %d correct => accuracy: %f' % (num_correct, num_test, accuracy))

Got 111 / 230 correct => accuracy: 0.482609
Got 227 / 230 correct => accuracy: 0.986957
Got 210 / 230 correct => accuracy: 0.913043
Got 186 / 230 correct => accuracy: 0.808696
