Assignment 4 of the course “Introduction to Machine Learning” at the University of Leoben.
Author: Fotios Lygerakis
Semester: SS 2022/2023

Import the libraries

In [215]:
import pandas as pd
import numpy as np

Create the Regression Models

In [216]:
class Predictor:
    def __init__(self):
        self.coefficients = None

    def fit(self, X, y):
        pass

    def predict(self, X):
        pass

class LinearRegression(Predictor):
    def fit(self, X, y):
        # Add a column of ones to X for the bias term
        X = np.hstack((np.ones((X.shape[0], 1)), X))

        # Compute the coefficients using the normal equation
        self.coefficients = np.linalg.inv(X.T.dot(X)).dot(X.T).dot(y)

    def predict(self, X):
        # Add a column of ones to X for the bias term
        X = np.hstack((np.ones((X.shape[0], 1)), X))

        # Predict the target variable
        y_pred = X.dot(self.coefficients)
        return y_pred

class RidgeRegression(Predictor):
    def __init__(self, alpha=1):
        self.alpha = alpha

    def fit(self, X, y):
        # Add a column of ones to X for the bias term
        X = np.hstack((np.ones((X.shape[0], 1)), X))

        # Compute the coefficients using the ridge regression formula
        identity = np.eye(X.shape[1])
        self.coefficients = np.linalg.inv(X.T.dot(X) + self.alpha * identity).dot(X.T).dot(y)

    def predict(self, X):
        # Add a column of ones to X for the bias term
        X = np.hstack((np.ones((X.shape[0], 1)), X))

        # Predict the target variable
        y_pred = X.dot(self.coefficients)
        return y_pred


class LassoRegression(Predictor):
    def __init__(self, alpha=1, num_iters=10000, lr=0.001):
        self.alpha = alpha
        self.num_iters = num_iters
        self.lr = lr

    def fit(self, X, y):
        # Add a column of ones to X for the bias term
        X = np.hstack((np.ones((X.shape[0], 1)), X))

        # Set random seed for reproducibility
        np.random.seed(42)

        # Initialize the coefficients
        self.coefficients = np.zeros(X.shape[1])

        # Perform gradient descent to find the optimal coefficients
        for _ in range(self.num_iters):
            gradient = self.calculate_gradient(X, y)
            self.coefficients -= self.lr * gradient

    def calculate_gradient(self, X, y):
        num_samples = X.shape[0]
        errors = X.dot(self.coefficients) - y
        gradient = X.T.dot(errors) / num_samples + self.alpha * np.sign(self.coefficients)
        return gradient

    def predict(self, X):
        # Add a column of ones to X for the bias term
        X = np.hstack((np.ones((X.shape[0], 1)), X))

        # Predict the target variable
        y_pred = X.dot(self.coefficients)
        return y_pred

Data Preprocessing and Data loading functions

In [217]:
def preprocess(df):
    # Handle missing values
    df.fillna(df.mean(), inplace=True)

    # Remove outliers
    z_scores = np.abs((df - df.mean()) / df.std())
    df = df[(z_scores < 3).all(axis=1)]

    # Normalize the data
    df = (df - df.mean()) / df.std()

    return df

def train_test_split(X, y, test_size=0.2, random_state=42):
    # Set random seed for reproducibility
    np.random.seed(random_state)

    # Calculate the number of samples for the test set
    num_test_samples = int(test_size * X.shape[0])

    # Randomly shuffle the data
    shuffled_indices = np.random.permutation(X.shape[0])
    X_shuffled = X[shuffled_indices]
    y_shuffled = y[shuffled_indices]

    # Split the data into training and test sets
    X_train = X_shuffled[:-num_test_samples]
    y_train = y_shuffled[:-num_test_samples]
    X_test = X_shuffled[-num_test_samples:]
    y_test = y_shuffled[-num_test_samples:]

    return X_train, X_test, y_train, y_test

def load_data():
    # Load the diabetes dataset
    df = pd.read_csv("diabetes.csv")

    # Preprocess the dataset
    df = preprocess(df)

    # Split the features and target variable
    X = df.drop("target", axis=1).values
    y = df["target"].values

    # Split the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    return X_train, X_test, y_train, y_test

In [218]:
# Load the diabetes dataset
df = pd.read_csv("diabetes.csv")

# Preprocess the dataset
df = preprocess(df)

Load the data

In [219]:
# Load the data
X_train, X_test, y_train, y_test = load_data()

Fit the models

In [220]:
# Fit the linear regression
linear_regression = LinearRegression()
linear_regression.fit(X_train, y_train)
linear_regression_predictions = linear_regression.predict(X_test)
linear_regression_coefficients = linear_regression.coefficients

In [221]:
# Fit the ridge regression
ridge_regression = RidgeRegression(alpha=1)
ridge_regression.fit(X_train, y_train)
ridge_regression_predictions = ridge_regression.predict(X_test)
ridge_regression_coefficients = ridge_regression.coefficients

In [222]:
# Fit the lasso regression
lasso_regression = LassoRegression(alpha=1, num_iters=10000, lr=0.001)
lasso_regression.fit(X_train, y_train)
lasso_regression_predictions = lasso_regression.predict(X_test)
lasso_regression_coefficients = lasso_regression.coefficients

Evaluate the models

In [223]:
# Evaluate the models
linear_regression_predictions = linear_regression.predict(X_test)
ridge_regression_predictions = ridge_regression.predict(X_test)
lasso_regression_predictions = lasso_regression.predict(X_test)

# Evaluate the performance of the models using metrics such as MSE and R-squared
def evaluate(y_true, y_pred):
    if y_pred is None:
        return None, None
    mse = np.mean((y_true - y_pred) ** 2)
    r2 = 1 - np.sum((y_true - y_pred) ** 2) / np.sum((y_true - np.mean(y_true)) ** 2)
    return mse, r2

linear_regression_mse, linear_regression_r2 = evaluate(y_test, linear_regression_predictions)
ridge_regression_mse, ridge_regression_r2 = evaluate(y_test, ridge_regression_predictions)
lasso_regression_mse, lasso_regression_r2 = evaluate(y_test, lasso_regression_predictions)

# Print the evaluation results
print("Linear Regression:")
print("MSE:", linear_regression_mse)
print("R-squared:", linear_regression_r2)

print("\nRidge Regression:")
print("MSE:", ridge_regression_mse)
print("R-squared:", ridge_regression_r2)

print("\nLasso Regression:")
print("MSE:", lasso_regression_mse)
print("R-squared:", lasso_regression_r2)

# Print the coefficients
print("Linear Regression Coefficients:")
print(linear_regression_coefficients)

print("\nRidge Regression Coefficients:")
print(ridge_regression_coefficients)

print("\nLasso Regression Coefficients:")
print(lasso_regression_coefficients)

Linear Regression:
MSE: 0.45655844989286215
R-squared: 0.49648680873991846

Ridge Regression:
MSE: 0.4538881699787352
R-squared: 0.49943171351921944

Lasso Regression:
MSE: 0.9353291468042304
R-squared: -0.03152304747942458
Linear Regression Coefficients:
[ 0.01352522 -0.01787645 -0.13537619  0.28798257  0.2314548  -0.55763991
  0.28243652  0.14633152  0.21855619  0.43763473  0.07599883]

Ridge Regression Coefficients:
[ 0.01347934 -0.01716758 -0.13364481  0.28866895  0.2294461  -0.42512131
  0.18070361  0.08770086  0.19869395  0.38786468  0.07691122]

Lasso Regression Coefficients:
[ 4.85115384e-04 -4.19393335e-04  2.72609196e-04  2.01691563e-04
  4.17269671e-04 -1.58782240e-04 -2.65640671e-04 -3.34430014e-04
 -8.71494306e-05  4.99946716e-04  6.08148304e-04]


In [224]:
# Get the absolute values of the coefficients
linear_regression_abs_coefficients = np.abs(linear_regression.coefficients)
ridge_regression_abs_coefficients = np.abs(ridge_regression.coefficients)
lasso_regression_abs_coefficients = np.abs(lasso_regression.coefficients)

# Find the indices of the top k features
k = 4  # Number of top features to consider
linear_regression_top_features = np.argsort(linear_regression_abs_coefficients)[-k:][::-1]
ridge_regression_top_features = np.argsort(ridge_regression_abs_coefficients)[-k:][::-1]
lasso_regression_top_features = np.argsort(lasso_regression_abs_coefficients)[-k:][::-1]

# Exclude the index of 'target' column if present
target_index = df.columns.get_loc('target')
linear_regression_top_features = linear_regression_top_features[linear_regression_top_features != target_index]
ridge_regression_top_features = ridge_regression_top_features[ridge_regression_top_features != target_index]
lasso_regression_top_features = lasso_regression_top_features[lasso_regression_top_features != target_index]

# Print the top features for each model
print("Linear Regression Top Features:")
for feature_idx in linear_regression_top_features:
    print(df.columns[feature_idx])

print("\nRidge Regression Top Features:")
for feature_idx in ridge_regression_top_features:
    print(df.columns[feature_idx])

print("\nLasso Regression Top Features:")
for feature_idx in lasso_regression_top_features[:k]:
    print(df.columns[feature_idx])


Linear Regression Top Features:
s2
s6
bp
s3

Ridge Regression Top Features:
s2
s6
bp
s1

Lasso Regression Top Features:
s6
age
sex
