In [5]:
import numpy as np
import pandas as pd

def matrix_prep(A):
    """
    Demeans and normalizes each column of A.

    Parameters
    ----------
    A : ndarray (m,n)
        Input data matrix.

    Returns
    -------
    A : ndarray (m,n)
        Column demeaned and column normalized matrix.
    """
    
    # Standardize
    mu = np.mean(A, axis=0)
    A  = A - mu

    # Normalize
    A /= np.linalg.norm(A, axis=0)
    return A

In [29]:
def pc_var(A, k):
    """
    Computes the first k V-coordinates and the cumulative variance of A.

    Parameters
    ----------
    A : ndarray (m, n)
        Raw numeric data matrix.

    k : int
        Number of V-coordinates to return.

    Returns
    -------
    V_coordinates_k : ndarray (m, k)
        First k columns.

    cumulative_var : ndarray
        Cumulative variance based on singular values.
    """
    # 1) Standardize/normalize columns
    A_prep = matrix_prep(A)

    # 2) SVD
    U, s, Vt = np.linalg.svd(A_prep, full_matrices=False)
    V = Vt.T

    # 3) V-coordinates
    V_coordinates = A_prep @ V
    V_coordinates_k = V_coordinates[:, :k]

    # 4) Cumulative variance from singular values
    s2 = s**2
    cumulative_var = np.cumsum(s2) / np.sum(s2)

    return V_coordinates_k, cumulative_var

In [14]:
def single_predictor(data, label, x):
    """
    Predict the label of x using nearest neighbor.

    Parameters
    ----------
    data : ndarray (m, n)
        Training data.
    label : ndarray (m,)
        Labels for each training row.
    x : ndarray (n,)

    Returns
    -------
    scalar
        Label of the nearest row in data.
    """
    distances = np.linalg.norm(data - x, axis=1)
    index = np.argmin(distances)
    return label[index]

In [39]:
def comparison(train_data, train_label, test_data, test_label):
    """
    Nearest neighbor accuracy on a test set.

    Parameters
    ----------
    train_data : ndarray
    train_label : ndarray
    test_data : ndarray
    test_label : ndarray

    Returns
    -------
    accuracy : float
        Percentage of correct predictions.
    """

    correct = 0
    total = len(test_data)

    for i in range(total):
        x = test_data[i]
        y_true = test_label[i]
        y_pred = single_predictor(train_data, train_label, x)

        if y_pred == y_true:
            correct += 1

    return 100 * (correct / total)

In [16]:
def split_pitch_data():
    """Splits the data from 433pitch.csv
    into 2 dataframes. The first df has a random 70% sample of the data
    from 433pitch.csv; the 'training' data.
    The second df has the remaining 30%; the 'testing' data. 

    Parameters
    ----------
    None

    Returns
    -------
    two dfs
    """
    df = pd.read_csv('433pitch.csv', index_col=0)
    df0 = df.loc[df['injury'] < 0.5] #slice out the uninjured samples
    df1 = df.loc[df['injury'] > 0.5] #slice out the injured samples
    
    size0 = df0.shape[0] 
    size1 = df1.shape[0] 
    rand0 = np.random.permutation(size0) #randomly shuffle the inj samples
    rand1 = np.random.permutation(size1) #randomly shuffle the uninj samples

    per0 = int(0.7 * size0) 
    per1 = int(0.7 * size1)
    
    df_train = pd.concat([df0.iloc[rand0[:per0]],df1.iloc[rand1[:per1]]]) #stack the uninjured on top of the injured
    df_test = df.drop(df_train.index) #df_test is 30% left over

    return df_train, df_test

In [40]:
def final_test():
    """
    Tests PCA and nearest neighbor for k = 1,2,3,4.
    Returns a 1x4 numpy array of mean accuracies.
    """
    ks = [1, 2, 3, 4]
    acc_means = np.zeros(4)

    for i in range(4):
        k = ks[i]
        accs = []

        for t in range(100):
            # Split the data
            df_train, df_test = split_pitch_data()

            # Labels
            y_train = df_train["injury"].to_numpy()
            y_test  = df_test["injury"].to_numpy()

            # Numeric
            X_train_raw = df_train.drop(columns=["injury"]).to_numpy(float)
            X_test_raw  = df_test.drop(columns=["injury"]).to_numpy(float)

            # Normalize and standardize
            X_train = matrix_prep(X_train_raw.copy())
            X_test  = matrix_prep(X_test_raw.copy())

            # V from SVD of training data
            U, s, Vt = np.linalg.svd(X_train, full_matrices=False)
            V = Vt.T

            # V-coordinates
            train_coords = X_train @ V
            test_coords  = X_test  @ V

            # First k V-coordinates
            train_data = train_coords[:, :k]
            test_data  = test_coords[:, :k]

            # Accuracy
            acc = comparison(train_data, y_train, test_data, y_test)
            accs.append(acc)

        # Average over 100 runs
        acc_means[i] = np.mean(accs)

    return acc_means

In [41]:
results = final_test()
print("Mean accuracy for k = 1, 2, 3, 4:", results)

Mean accuracy for k = 1, 2, 3, 4: [76.98974359 78.22564103 76.41025641 75.72307692]


In [42]:
df_full = pd.read_csv("433pitch.csv", index_col=0)
X_full = df_full.drop(columns=["injury"]).to_numpy(float)

_, cumvar_full = pc_var(X_full, k=4)
print("Cumulative variance for first 4 PCs:", cumvar_full[:4])

Cumulative variance for first 4 PCs: [0.49800592 0.75080345 0.9231557  1.        ]
