In [12]:
import numpy as np
from numpy import genfromtxt
from sklearn.preprocessing import MinMaxScaler

# Question 1

In [13]:
A = genfromtxt('life_expectancy_X.csv', delimiter=',')
A_test = genfromtxt('life_expectancy_X_test.csv', delimiter=',')

y = genfromtxt('life_expectancy_y.csv', delimiter=',')
y_test = genfromtxt('life_expectancy_y_test.csv', delimiter=',')

# Scale
scaler = MinMaxScaler()
A = scaler.fit_transform(A)
A_test = scaler.transform(A_test)

### 1)

In [14]:
def f_prime(w, phi, y, l1_lambda):
    return (1/len(phi))*(phi.T @ (phi@w - y)) + l1_lambda * np.sign(w)

def gradient_descent(phi, y, w, eta, num_iterations, l1_lambda):
	for i in range(num_iterations):
		w = w - eta * f_prime(w, phi, y, l1_lambda)  
            
	return w

def mse_func(phi, w, y):
	return np.mean((phi@w - y)**2)

def sorted_features_importance(feature_names_list, w):
	sorted_w = w[:-1]

	sorted_indices = np.argsort(-np.abs(sorted_w))
	sorted_features = [feature_names_list[i] for i in sorted_indices]
	sorted_w = w[sorted_indices]

	for feature, weight in zip(sorted_features, sorted_w):
		print(f"{feature}: {weight:.4f}")

In [74]:
phi = np.hstack((np.ones((A.shape[0], 1)), A))
w0 = np.zeros(phi.shape[1])
eta = 0.02
l1_lambda = 0.01

w = gradient_descent(phi, y, w0, eta, 10000, l1_lambda)

phi_test = np.hstack((np.ones((A_test.shape[0], 1)), A_test))

print(f'MSE: {mse_func(phi_test, w, y_test)}')

MSE: 0.005947909788639809


### 2)

In [39]:
feature_names = [
    "Exercise amount",
    "Amount of supportive relationships",
    "Number of siblings",
    "Alcohol / Drugs / Smoking consumption",
    "Height",
    "Attractiveness",
    "work ethics"
]

sorted_features_importance(feature_names, w)

Exercise amount: 57.7919
Amount of supportive relationships: 27.2238
Number of siblings: 12.1992
Height: -1.2059
Attractiveness: 0.0121
Alcohol / Drugs / Smoking consumption: 0.0086
work ethics: 0.0040


Biggest Positive Impact: Exercise Amount, Supportive Relationshops, Number of Siblings  
Negative Impact: Height  
Attractiveness, Alcohol/Drugs/Smoking, and Work Ethic all have a minimal impact on longetivty  

### 3)

In [None]:
def center_func(n):
      C = np.eye(n) - (1/n) * np.ones((n, n))
      return C

def pca_reduc_func(X, num_dimensions):
    n = len(X)

    C = center_func(n)
    Q = X.T @ C @ X

    [D, V] = np.linalg.eigh(Q)

    v = V[:, -num_dimensions:]

    X_hat = (C @ X) @ v 

    return X_hat, v 

def f_prime(w, phi, y):
    return (1/len(phi))*(phi.T @ (phi@w - y))

def gradient_descent(phi, y, w, eta, num_iterations):
    for i in range(num_iterations):
        w = w - eta * f_prime(w, phi, y)  

        if i % 1000 == 0:
            mse = np.mean((phi @ w - y)**2)
            print(f"Iter {i}, MSE={mse:.2f}")

    return w

In [16]:
pca_A, pca_v = pca_reduc_func(A, 4)

pca_phi = np.hstack((np.ones((pca_A.shape[0], 1)), pca_A))
w0 = np.zeros(pca_phi.shape[1])
eta = 0.05

pca_w = gradient_descent(pca_phi, y, w0, eta, 20000)

A_test_C = center_func(len(A_test))
pca_A_test = A_test @ pca_v

pca_phi_test = np.hstack((np.ones((pca_A_test.shape[0], 1)), pca_A_test))
# print(f'MSE: {mse_func(pca_phi_test, pca_w, y_test)}')
print(f'MSE: {mse_func(pca_phi, pca_w, y)}')

Iter 0, MSE=5471.17
Iter 1000, MSE=54.17
Iter 2000, MSE=54.16
Iter 3000, MSE=54.16
Iter 4000, MSE=54.16
Iter 5000, MSE=54.16
Iter 6000, MSE=54.16
Iter 7000, MSE=54.16
Iter 8000, MSE=54.16
Iter 9000, MSE=54.16
Iter 10000, MSE=54.16
Iter 11000, MSE=54.16
Iter 12000, MSE=54.16
Iter 13000, MSE=54.16
Iter 14000, MSE=54.16
Iter 15000, MSE=54.16
Iter 16000, MSE=54.16
Iter 17000, MSE=54.16
Iter 18000, MSE=54.16
Iter 19000, MSE=54.16
MSE: 54.160849392578356


In [101]:
def pca_reduc_func(X, num_dimensions):
    mean = np.mean(X, axis=0)
    X_centered = X - mean

    Q = X_centered.T @ X_centered
    D, V = np.linalg.eigh(Q)
    v = V[:, -num_dimensions:]

    X_hat = X_centered @ v
    return X_hat, v, mean

def apply_pca(X, v, mean):
    return (X - mean) @ v

pca_A, pca_v, pca_mean = pca_reduc_func(A, 4)

# Add bias term AFTER projection
pca_phi = np.hstack((np.ones((pca_A.shape[0], 1)), pca_A))

# Initialize weights to 0
w0 = np.zeros(pca_phi.shape[1])
eta = 0.05
pca_w = gradient_descent(pca_phi, y, w0, eta, 20000)

# Predict on training set
train_mse = mse_func(pca_phi, pca_w, y)
print(f"Train MSE: {train_mse:.4f}")

# Transform test set using same PCA basis and mean
pca_A_test = apply_pca(A_test, pca_v, pca_mean)
pca_phi_test = np.hstack((np.ones((pca_A_test.shape[0], 1)), pca_A_test))
test_mse = mse_func(pca_phi_test, pca_w, y_test)
print(f"Test MSE: {test_mse:.4f}")


Train MSE: 54.1608
Test MSE: 50.2073


### 4)
PCA is a good idea for this problem because it projects the data onto the most important dimensions. For this problem, the linear regression weights of attractiveness, alcohol/drugs consumption, and work ethic are all very small and don't have much significance to the predicted label. There are 4 weights that do make an impact, so projecting the data down to just 4 dimensions make sense. Once the data is projected, the noise and less impactful dimensions are removed, and it is easier to run linear regression and other algorithms.

In [18]:
corr_list = []
for xi in A.T:
    corr = np.corrcoef(xi, y)[0, 1]
    corr_list.append(corr)

print(corr_list)

[0.850862923961672, 0.43643566943178946, 0.0024597524148604446, -0.31227345199114964, -0.03625414381613679, -0.034824150075934056, 0.031364032485239474]


In [None]:
# Check correlations with feature names
feature_names = [
    "Exercise amount",
    "Amount of supportive relationships",
    "Number of siblings",
    "Alcohol / Drugs / Smoking consumption",
    "Height",
    "Attractiveness",
    "work ethics"
]

sorted_features_importance(feature_names, np.array(corr_list))

Exercise amount: 0.8509
Amount of supportive relationships: 0.4364
Alcohol / Drugs / Smoking consumption: -0.3123
Height: -0.0363
Attractiveness: -0.0348
Number of siblings: 0.0025
