In [28]:
# Load the Parkinson's telemonitoring CSV into a DataFrame.
# The packages brought in here (Random, Statistics, LinearAlgebra) are used by the
# data-preparation helpers defined in later cells (randperm, mean, norm).
using CSV, DataFrames, Random, Statistics, LinearAlgebra
df = CSV.read("parkinsons_updrs.csv", DataFrame)

# how many rows and columns?
# NOTE(review): this value is computed but not printed; the recorded output below
# is the table from the final `df` expression, so presumably the notebook renders
# only the last expression of a cell. Wrap it in `println`/`@show` to see it.
size(df)

# Last expression of the cell: displays the table itself.
df


Row,subject#,age,sex,test_time,motor_UPDRS,total_UPDRS,Jitter(%),Jitter(Abs),Jitter:RAP,Jitter:PPQ5,Jitter:DDP,Shimmer,Shimmer(dB),Shimmer:APQ3,Shimmer:APQ5,Shimmer:APQ11,Shimmer:DDA,NHR,HNR,RPDE,DFA,PPE
Unnamed: 0_level_1,Int64,Int64,Int64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,1,72,0,5.6431,28.199,34.398,0.00662,3.38e-5,0.00401,0.00317,0.01204,0.02565,0.23,0.01438,0.01309,0.01662,0.04314,0.01429,21.64,0.41888,0.54842,0.16006
2,1,72,0,12.666,28.447,34.894,0.003,1.68e-5,0.00132,0.0015,0.00395,0.02024,0.179,0.00994,0.01072,0.01689,0.02982,0.011112,27.183,0.43493,0.56477,0.1081
3,1,72,0,19.681,28.695,35.389,0.00481,2.462e-5,0.00205,0.00208,0.00616,0.01675,0.181,0.00734,0.00844,0.01458,0.02202,0.02022,23.047,0.46222,0.54405,0.21014
4,1,72,0,25.647,28.905,35.81,0.00528,2.657e-5,0.00191,0.00264,0.00573,0.02309,0.327,0.01106,0.01265,0.01963,0.03317,0.027837,24.445,0.4873,0.57794,0.33277
5,1,72,0,33.642,29.187,36.375,0.00335,2.014e-5,0.00093,0.0013,0.00278,0.01703,0.176,0.00679,0.00929,0.01819,0.02036,0.011625,26.126,0.47188,0.56122,0.19361
6,1,72,0,40.652,29.435,36.87,0.00353,2.29e-5,0.00119,0.00159,0.00357,0.02227,0.214,0.01006,0.01337,0.02263,0.03019,0.009438,22.946,0.53949,0.57243,0.195
7,1,72,0,47.649,29.682,37.363,0.00422,2.404e-5,0.00212,0.00221,0.00637,0.04352,0.445,0.02376,0.02621,0.03488,0.07128,0.01326,22.506,0.4925,0.54779,0.17563
8,1,72,0,54.64,29.928,37.857,0.00476,2.471e-5,0.00226,0.00259,0.00678,0.02191,0.212,0.00979,0.01462,0.01911,0.02937,0.027969,22.929,0.47712,0.54234,0.23844
9,1,72,0,61.669,30.177,38.353,0.00432,2.854e-5,0.00156,0.00207,0.00468,0.04296,0.371,0.01774,0.02134,0.03451,0.05323,0.013381,22.078,0.51563,0.61864,0.20037
10,1,72,0,68.688,30.424,38.849,0.00496,2.702e-5,0.00258,0.00253,0.00773,0.0361,0.31,0.0203,0.0197,0.02569,0.06089,0.018021,22.606,0.50032,0.58673,0.20117


In [41]:
"""
    prepare_data(df; target_col="total_UPDRS", test_size=0.15, val_size=0.15, seed=15095)

Shuffle the rows of `df`, split them into training/validation/test sets and normalize
the feature columns using statistics computed on the training set only.

With the default sizes this is the lecture's 70/15/15 split.

# Keywords
- `target_col`: name of the response column; it is always removed from the features.
- `test_size`, `val_size`: fractions of the rows assigned to the test and validation
  sets. The training set gets `floor(n * (1 - val_size - test_size))` rows, the
  validation set `floor(n * val_size)` rows, and the test set whatever remains.
- `seed`: seed for the row shuffle, making the split reproducible. Pass `nothing` to
  shuffle with the global default RNG instead.

# Returns
`(X_train, y_train, X_val, y_val, X_test, y_test, feature_cols, μ, norms)` where `μ`
and `norms` are the per-column training means and ℓ2-norms used for normalization.

# Throws
- `ArgumentError` if the fractions are negative or do not leave room for training data.
"""
function prepare_data(df; target_col="total_UPDRS", test_size=0.15, val_size=0.15, seed = 15095)
    if !(test_size >= 0 && val_size >= 0 && test_size + val_size < 1)
        throw(ArgumentError(
            "test_size and val_size must be non-negative and sum to less than 1, " *
            "got test_size=$test_size, val_size=$val_size",
        ))
    end

    # Remove non-feature columns: the subject identifier, both UPDRS scores, and
    # whichever column was chosen as the target (so it can never leak into X).
    exclude_cols = ["subject#", "motor_UPDRS", "total_UPDRS", string(target_col)]
    feature_cols = setdiff(names(df), exclude_cols)

    # Extract features and target
    X = Matrix(df[:, feature_cols])
    y = df[:, target_col]
    n = size(X, 1)

    # Split sizes; the test set takes the remainder so every row is used exactly once.
    n_train = floor(Int, n * (1 - val_size - test_size))
    n_val = floor(Int, n * val_size)

    # Shuffle row indices with a dedicated, seeded RNG so the split is reproducible
    # and the global RNG state is left untouched.
    # Note: rows are shuffled individually, so recordings of the same `subject#`
    # can end up in different splits.
    rng = seed === nothing ? Random.default_rng() : MersenneTwister(seed)
    indices = randperm(rng, n)
    train_idx = indices[1:n_train]
    val_idx = indices[n_train+1:n_train+n_val]
    test_idx = indices[n_train+n_val+1:end]

    # Split data
    X_train_raw = X[train_idx, :]
    y_train = y[train_idx]
    X_val_raw = X[val_idx, :]
    y_val = y[val_idx]
    X_test_raw = X[test_idx, :]
    y_test = y[test_idx]

    # Normalize training set: zero mean, unit ℓ2-norm per column
    # (as specified in lecture slide 8)
    X_train, μ, norms = normalize_features(X_train_raw)

    # Apply the *training* statistics to validation and test sets so that no
    # information from held-out rows enters the normalization.
    X_val = apply_normalization(X_val_raw, μ, norms)
    X_test = apply_normalization(X_test_raw, μ, norms)

    return X_train, y_train, X_val, y_val, X_test, y_test, feature_cols, μ, norms
end

"""
    normalize_features(X)

Center every column of the matrix `X` to zero mean and scale it to unit ℓ2-norm.

`X` itself is not modified. Integer input is promoted to floating point so the
scaled values can be stored. Columns whose centered norm is at most `1e-10`
(i.e. constant columns) are centered but left unscaled to avoid dividing by zero.

# Returns
`(X_norm, μ, norms)`: the normalized copy of `X`, the vector of column means, and the
vector of ℓ2-norms of the centered columns (before scaling).
"""
function normalize_features(X)
    # `float.` both copies X and promotes integer element types; a plain `copy`
    # would keep an Int eltype and the in-place division below would throw.
    X_norm = float.(X)
    p = size(X_norm, 2)
    μ = zeros(p)
    norms = zeros(p)

    for j in axes(X_norm, 2)
        col = view(X_norm, :, j)  # operate in place on the copy, no per-column allocation

        # Zero mean
        μ[j] = mean(col)
        col .-= μ[j]

        # Unit ℓ2-norm
        norms[j] = norm(col)
        if norms[j] > 1e-10  # Avoid division by zero
            col ./= norms[j]
        end
    end

    return X_norm, μ, norms
end

"""
    apply_normalization(X, μ, norms)

Apply pre-computed normalization parameters to the columns of the matrix `X`.

Column `j` has `μ[j]` subtracted and is then divided by `norms[j]`, unless
`norms[j]` is at most `1e-10`, in which case the column is only shifted. `X` itself
is not modified; integer input is promoted to floating point.

# Returns
The normalized copy of `X`.

# Throws
- `DimensionMismatch` if `μ` or `norms` does not have one entry per column of `X`.
"""
function apply_normalization(X, μ, norms)
    p = size(X, 2)
    if !(length(μ) == p && length(norms) == p)
        throw(DimensionMismatch(
            "X has $p columns but μ has length $(length(μ)) and norms has length " *
            "$(length(norms))",
        ))
    end

    # `float.` both copies X and promotes integer element types so the scaled
    # values can be stored without an InexactError.
    X_norm = float.(X)

    for j in axes(X_norm, 2)
        col = view(X_norm, :, j)
        col .-= μ[j]
        if norms[j] > 1e-10  # same zero-norm guard as used when the parameters were fit
            col ./= norms[j]
        end
    end

    return X_norm
end

# Run the full pipeline once; the resulting globals are what later cells work with.
X_train, y_train, X_val, y_val, X_test, y_test, feature_names, μ, norms = prepare_data(df)

# Summary of the split sizes and the feature count.
println("Data shapes:")
println("Training: $(size(X_train)), Validation: $(size(X_val)), Test: $(size(X_test))")
println("Number of features: $(length(feature_names))")

# Numbered list of the columns that ended up in the design matrices.
println("\nFeature names:")
foreach(enumerate(feature_names)) do (i, name)
    println("  $i. $name")
end

In [44]:
# Shape checks for the six arrays produced by the data-preparation cell.
# NOTE(review): these are bare expressions, and the recorded output below holds a
# single value, so presumably the notebook renders only the last one
# (`size(y_test)`). Wrap each call in `@show`/`println` to see all of them.
# shape of X_train
size(X_train)  # (number of training samples, number of features)
# shape of y_train
size(y_train)  # (number of training samples,)
# shape of X_val
size(X_val)    # (number of validation samples, number of features)
# shape of y_val
size(y_val)    # (number of validation samples,)
# shape of X_test
size(X_test)   # (number of test samples, number of features)
# shape of y_test
size(y_test)   # (number of test samples,)

(882,)