In [34]:
import csv
import numpy as np

# Load CSV data into NumPy arrays
def load_data(file_path, selected_columns, target_column):
    data = []
    with open(file_path, 'r') as file:
        reader = csv.reader(file)
        headers = next(reader)  # Read the header row
        selected_indices = [headers.index(col) for col in selected_columns]
        target_index = headers.index(target_column)
        
        for row in reader:
            data.append([float(row[i]) if row[i].isdigit() else row[i] for i in selected_indices + [target_index]])
    data = np.array(data, dtype=object)
    X = np.array(data[:, :-1], dtype=float)  # Features
    y = np.array(data[:, -1], dtype=float)  # Target variable
    return X, y


In [None]:
def normalize(X, y):
    """Normalize features and target to have mean 0 and standard deviation 1."""
    X_mean = np.mean(X, axis=0) # Mean of each column
    X_std = np.std(X, axis=0) # Standard deviation of each column
    y_mean = np.mean(y) # Mean of target
    y_std = np.std(y) # Standard deviation of target
    
    X_normalized = (X - X_mean) / X_std # Normalized features
    y_normalized = (y - y_mean) / y_std # Normalized target
    return X_normalized, y_normalized, X_mean, X_std, y_mean, y_std # Return the normalization parameters



def lasso_regression(X, y, alpha, iterations=1000, learning_rate=0.001):
    m, n = X.shape
    theta = np.zeros(n) # Initialize the weights
    
    for _ in range(iterations):
        predictions = X @ theta # Make predictions
        mse_loss = np.mean((y - predictions) ** 2)  # MSE
        gradient = (-2 / m) * (X.T @ (y - predictions)) + (alpha / m) * np.sign(theta) # L1 regularization, L1 norm
        theta -= learning_rate * gradient # Update the weights
    
    return theta


def ridge_regression(X, y, alpha, iterations=1000, learning_rate=0.001):
    m, n = X.shape
    theta = np.zeros(n)
    
    for _ in range(iterations):
        predictions = X @ theta
        mse_loss = np.mean((y - predictions) ** 2)  # MSE
        gradient = (-2 / m) * (X.T @ (y - predictions)) + (2 * alpha / m) * theta # L2 regularization, squared L2 norm
        theta -= learning_rate * gradient
    
    return theta



In [None]:
def polynomial_features(X, degree):
    """Generate polynomial features up to the given degree."""
    X_poly = X
    for d in range(2, degree + 1):
        X_poly = np.hstack((X_poly, X ** d)) # Add higher-order features
    return X_poly

def polynomial_regression(X, y, degree):
    """Train polynomial regression using the normal equation."""
    X_poly = polynomial_features(X, degree) # Generate polynomial features
    X_poly, _, _, _, _, _ = normalize(X_poly, y)  # Normalize expanded features
    theta = np.linalg.inv(X_poly.T @ X_poly) @ X_poly.T @ y # Normal equation
    return theta




In [37]:
def cross_validation_with_theta(X, y, model, model_params, folds=5, y_mean=0, y_std=1):
    fold_size = len(X) // folds
    errors = []
    best_theta = None
    
    for i in range(folds):
        # Split data into train and validation sets
        X_val = X[i * fold_size:(i + 1) * fold_size]
        y_val = y[i * fold_size:(i + 1) * fold_size]
        X_train = np.vstack((X[:i * fold_size], X[(i + 1) * fold_size:]))
        y_train = np.hstack((y[:i * fold_size], y[(i + 1) * fold_size:]))
        
        # Train the model
        if model == "lasso":
            theta = lasso_regression(X_train, y_train, **model_params)
        elif model == "ridge":
            theta = ridge_regression(X_train, y_train, **model_params)
        elif model == "polynomial":
            degree = model_params['degree']
            theta = polynomial_regression(X_train, y_train, degree)
        
        # Predict and calculate error (denormalize predictions)
        if model == "polynomial":
            X_val_poly = polynomial_features(X_val, model_params['degree'])
            X_val_poly, _, _, _, _, _ = normalize(X_val_poly, y_val)
            predictions = X_val_poly @ theta
        else:
            predictions = X_val @ theta
        
        predictions = predictions * y_std + y_mean  # Denormalize predictions
        error = np.mean((y_val * y_std + y_mean - predictions) ** 2)  # Calculate denormalized MSE
        errors.append(error)
        
        # Store the last theta (trained on the last fold)
        best_theta = theta
    
    print(f"Model: {model}, Fold Errors: {errors}")
    return np.mean(errors), best_theta


In [38]:
if __name__ == "__main__":
    # File path and columns
    file_path = '/data/haoyang27/data_mining/StudentPerformanceFactors_Cleaned.csv'  # Replace with your file path
    selected_columns = ['Hours_Studied', 'Attendance']  # Replace with your chosen features
    target_column = 'Exam_Score'  # Replace with your target column
    
    # Load and normalize data
    X, y = load_data(file_path, selected_columns, target_column)
    X, y, X_mean, X_std, y_mean, y_std = normalize(X, y)
    
    # Train and save theta for Lasso
    lasso_error, theta_lasso = cross_validation_with_theta(
        X, y, model="lasso", model_params={"alpha": 0.1}, y_mean=y_mean, y_std=y_std
    )
    print("Lasso Regression Error:", lasso_error)
    
    # Train and save theta for Ridge
    ridge_error, theta_ridge = cross_validation_with_theta(
        X, y, model="ridge", model_params={"alpha": 0.1}, y_mean=y_mean, y_std=y_std
    )
    print("Ridge Regression Error:", ridge_error)
    
    # Train and save theta for Polynomial Regression
    poly_error, theta_poly = cross_validation_with_theta(
        X, y, model="polynomial", model_params={"degree": 2}, y_mean=y_mean, y_std=y_std
    )
    print("Polynomial Regression Error:", poly_error)


Model: lasso, Fold Errors: [8.630143682013461, 7.375695718310576, 7.268288748071142, 7.203875734980572, 5.755651636194513]
Lasso Regression Error: 7.246731103914053
Model: ridge, Fold Errors: [8.630134278861314, 7.3756850929335815, 7.268282009963958, 7.203865559227341, 5.755636498761594]
Ridge Regression Error: 7.246720687949558
Model: polynomial, Fold Errors: [8.549712865731061, 7.2398728295203965, 7.1850819893227404, 7.053220144742656, 5.515896936720181]
Polynomial Regression Error: 7.108756953207407


In [39]:
def predict(X_input, model, model_params, theta, X_mean, X_std, y_mean, y_std):
    """
    Predict the exam score given input features.
    
    Parameters:
    - X_input: Raw input features (not normalized)
    - model: The model type ("lasso", "ridge", or "polynomial")
    - model_params: The parameters used to train the model
    - theta: The trained parameters
    - X_mean, X_std: The mean and std of the features used during training
    - y_mean, y_std: The mean and std of the target variable during training
    
    Returns:
    - Predicted exam score (denormalized)
    """
    # Normalize the input features
    X_input_normalized = (X_input - X_mean) / X_std

    # For polynomial regression, expand features
    if model == "polynomial":
        degree = model_params['degree']
        X_input_normalized = polynomial_features(X_input_normalized, degree)

    # Compute predictions
    predictions_normalized = X_input_normalized @ theta
    
    # Denormalize the predictions
    predictions = predictions_normalized * y_std + y_mean
    return predictions


In [41]:
# Example input: Hours_Studied and Attendance for a new student
X_input = np.array([[24, 98]])  # Replace with actual input features

# Predict using Lasso
predicted_lasso = predict(
    X_input,
    model="lasso",
    model_params={"alpha": 0.1},
    theta=theta_lasso,
    X_mean=X_mean,
    X_std=X_std,
    y_mean=y_mean,
    y_std=y_std,
)
print("Predicted Exam Score (Lasso):", predicted_lasso)

# Predict using Ridge
predicted_ridge = predict(
    X_input,
    model="ridge",
    model_params={"alpha": 0.1},
    theta=theta_ridge,
    X_mean=X_mean,
    X_std=X_std,
    y_mean=y_mean,
    y_std=y_std,
)
print("Predicted Exam Score (Ridge):", predicted_ridge)

# Predict using Polynomial
predicted_poly = predict(
    X_input,
    model="polynomial",
    model_params={"degree": 2},
    theta=theta_poly,
    X_mean=X_mean,
    X_std=X_std,
    y_mean=y_mean,
    y_std=y_std,
)
print("Predicted Exam Score (Polynomial):", predicted_poly)


Predicted Exam Score (Lasso): [71.28409414]
Predicted Exam Score (Ridge): [71.28411431]
Predicted Exam Score (Polynomial): [71.84380941]
