In [1]:
import numpy as np
import pandas as pd

def matrix_prep(A):
    """
    Demeans and normalizes each column of A.

    Parameters
    ----------
    A : ndarray (m,n)
        Input data matrix.

    Returns
    -------
    A : ndarray (m,n)
        Column demeaned and column normalized matrix.
    """
    
    # Standardize
    mu = np.mean(A, axis=0)
    A  = A - mu

    # Normalize
    A /= np.linalg.norm(A, axis=0)
    return A

In [23]:
def pc_var(A,k):
    """
    Computes the first k V-coordinates and the cumulative variance of A.

    Parameters
    ----------
    A : ndarray (m,n)
        Preprocessed data matrix.

    k : int
        Number of V-coordinates to return.

    Returns
    -------
    V_coordinates_k : ndarray (m, k)
        First k columns of A @ V, where V comes from the SVD of A.

    cumulative_var : ndarray
        Cumulative variance based on singular values.
    """

    A = matrix_prep(A)
    U, s, Vt = np.linalg.svd(A)
    V = Vt.T
    
    # Get the V coordinates 
    V_coordinates_k = A[:k,:]
    V_coordinates = V_coordinates_k @ V
    
    s2 = s**2
    cumulative_var = np.cumsum(s2) / np.sum(s2)
    return V_coordinates, cumulative_var

In [27]:
def single_predictor(data, label, x):
    """
    Predict the label of x using nearest neighbor.

    Parameters
    ----------
    data : ndarray (m, n)
    label : ndarray (m,)
    x : ndarray (n,)

    Returns
    -------
    scalar
        Label of the nearest row in data.
    """
    rowx = data[x]
    data = np.vstack((data[:x,:], data[x+1:,:]))
    
    distance = np.linalg.norm(data - x, axis=0)
    index = np.argmin(distance)

    return label[index]

In [55]:
def comparison(train_data, train_label, test_data, test_label):
    """
    Evaluate nearest-neighbor accuracy on a test set.

    Parameters
    ----------
    train_data : ndarray (n_train, n_features)
    train_label : ndarray (n_train,)
    test_data : ndarray (n_test, n_features)
    test_label : ndarray (n_test,)

    Returns
    -------
    accuracy : float
        Percentage of correct predictions.
    """
    
    correct = 0
    total = len(test_data)

    for i in range(total):
        x = test_data[i]
        y_true = test_labels[i]
        y_pred = single_predictor(train_data, train_labels, x)

        if y_pred == y_true:
            correct += 1

    return 100* (correct / total)

In [18]:
df = pd.read_csv('433pitch.csv', index_col=0)
A = np.array(df)

In [19]:
A

array([[  0.37317784,  95.57745361,  -4.35503129,   9.59396008,
          0.        ],
       [  0.3520521 ,  93.56331534,  -6.38268757,   8.46389037,
          0.        ],
       [  0.52958333,  94.84846647,  -6.51860449,   8.62331085,
          0.        ],
       ...,
       [  0.54345794,  98.04195944,  -3.69483739,   9.44401943,
          0.        ],
       [  0.57236084, 100.40067226,  -5.74658958,  10.05535303,
          0.        ],
       [  0.14924052,  95.52746962,  -7.54267693,   7.35886207,
          0.        ]], shape=(648, 5))

In [20]:
matrix_prep(A)

array([[ 0.01045451,  0.01022673, -0.01720613,  0.01726237, -0.01567479],
       [ 0.00574066,  0.00430681, -0.03376774, -0.00405412, -0.01567479],
       [ 0.04535373,  0.00808411, -0.03487789, -0.00104697, -0.01567479],
       ...,
       [ 0.04844961,  0.01747036, -0.01181376,  0.01443404, -0.01567479],
       [ 0.0548988 ,  0.02440305, -0.02857218,  0.02596562, -0.01567479],
       [-0.0395133 ,  0.01007982, -0.04324237, -0.02489825, -0.01567479]],
      shape=(648, 5))

In [24]:
pc_var(A,10)

(array([[-2.13495840e-02, -2.00097078e-02, -1.23510418e-02,
         -4.61260667e-03,  4.85842958e-03],
        [-3.66862220e-03, -3.57538320e-02, -3.87466127e-03,
         -1.05184524e-02, -6.03760100e-03],
        [-2.80989749e-02, -2.98233806e-02, -9.38414182e-03,
         -3.73278942e-02, -2.06193023e-02],
        [ 5.44129610e-02,  7.63868948e-02, -4.00946218e-02,
          4.47089310e-02, -4.33225984e-02],
        [-2.75147256e-04, -4.56191549e-02,  3.90769999e-05,
          1.04983434e-02,  6.16245008e-03],
        [-3.82916387e-02,  4.69788111e-02, -3.96551679e-02,
         -1.72250942e-02,  7.24239240e-03],
        [ 5.22227801e-03, -5.67488373e-02,  4.96361566e-03,
         -3.96229928e-02, -5.19250496e-02],
        [ 2.45307077e-02,  6.79292780e-02, -3.95285569e-02,
          8.33414937e-03, -1.71259450e-02],
        [-4.63048721e-02, -1.27569897e-02, -1.78057875e-02,
         -3.41014763e-02,  9.08723573e-03],
        [-2.89339059e-02,  2.30258494e-02,  9.34189382e-02,
    

In [28]:
single_predictor(A[:,:-1], A[:,-1], A[18])

IndexError: arrays used as indices must be of integer (or boolean) type