In [5]:
import numpy as np
import pandas as pd
import os 

In [6]:
# Load the saved embedding array and the annotation table. The annotation
# columns are renamed to short snake_case names in the same step.
embeddings = np.load("chest-xray14_embeddings.npy")
annotation = pd.read_csv('Data_Entry_2017.csv').rename(
    columns={
        'Image Index': 'img_id',
        'Finding Labels': 'class_name',
        'Patient ID': 'patient_id',
        'Patient Age': 'age',
        'Patient Gender': 'gender',
        'View Position': 'view_position',
    }
)


In [7]:
# Read the list of selected image paths and keep only the file names, in the
# order they appear in the file.
file_path = "selected_png_list.txt"
ids = []

with open(file_path, 'r') as file:
    for line in file:
        full_path = line.strip()
        if not full_path:
            # A blank line (e.g. a trailing newline at the end of the file)
            # carries no image path; without this guard it would become the
            # id '' and fail the label lookup below.
            continue
        filename = os.path.basename(full_path)
        ids.append(filename)

# Map every annotated image id to its finding-label string.
label_dict = annotation.set_index('img_id')['class_name'].to_dict()

# Binary target: 0 when the label is exactly 'No Finding', 1 otherwise.
# An id that is missing from the annotation table raises KeyError.
# The loop variable is deliberately not called `id`, which would shadow the
# builtin for every later cell.
labels = [0 if label_dict[img_id] == 'No Finding' else 1 for img_id in ids]

In [8]:

def soft_thresholding(x, lambda_):
    """
    Soft-thresholding operator: shrink `x` towards zero by `lambda_`.

    Values above `lambda_` are reduced by `lambda_`, values below `-lambda_`
    are increased by `lambda_`, and everything else maps to zero.

    Parameters:
        x : float or array-like
            Value(s) to shrink.
        lambda_ : float
            Threshold.

    Returns:
        float when `x` is a scalar, otherwise a numpy array with the same
        shape as `x`.
    """
    values = np.asarray(x, dtype=float)

    # The branch order (upper test first, then lower, then zero) mirrors the
    # scalar if/elif/else form. Selecting a literal 0.0 for the middle band
    # keeps the result a positive zero rather than -0.0.
    shrunk = np.where(
        values > lambda_,
        values - lambda_,
        np.where(values < -lambda_, values + lambda_, 0.0),
    )

    # Scalars in, plain float out; arrays keep their shape.
    return shrunk.item() if shrunk.ndim == 0 else shrunk

def lasso_coordinate_descent(X, y, lambda_, num_iters=100):
    """
    Lasso regression using cyclic coordinate descent.

    Features are standardised internally (zero mean, unit population
    standard deviation). Each coordinate update is
    ``beta_j = soft_thresholding(x_j . r_j / n_samples, lambda_)`` where
    ``r_j`` is the residual with feature j's contribution removed. The
    fitted coefficients are rescaled to the original feature units before
    being returned. No intercept is returned.

    Parameters:
        X : array-like, shape (n_samples, n_features)
            Design matrix. It is not modified.
        y : array-like, shape (n_samples,)
            Response vector.
        lambda_ : float
            Regularization parameter controlling the strength of the L1 penalty.
        num_iters : int
            Number of full sweeps over all features.

    Returns:
        numpy array, shape (n_features,)
            Coefficients of the Lasso regression, on the original feature
            scale. Features with zero variance always get coefficient 0.

    Raises:
        ValueError
            If X is not 2-D, has no rows, or y does not have shape
            (n_samples,).
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float)

    if X.ndim != 2:
        raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
    n_samples, n_features = X.shape
    if n_samples == 0:
        raise ValueError("X must contain at least one sample")
    if y.shape != (n_samples,):
        raise ValueError("y must have shape (n_samples,)")

    X_mean = np.mean(X, axis=0)
    X_std = np.std(X, axis=0)
    # A constant feature has zero standard deviation. Dividing by it would
    # yield NaN, which then spreads through every residual and wipes out all
    # coefficients. Using 1 instead leaves the centred column at all zeros,
    # so that feature simply ends up with coefficient 0.
    safe_std = np.where(X_std > 0, X_std, 1.0)
    X_scaled = (X - X_mean) / safe_std

    # Squared norm of every standardised column (n_samples, or 0 for a
    # constant column); needed to add a feature's own contribution back.
    col_sq_norm = (X_scaled ** 2).sum(axis=0)

    beta = np.zeros(n_features)
    # Residual y - X_scaled @ beta, kept up to date incrementally so that a
    # coordinate update costs O(n_samples) instead of a full matrix product.
    residual = y.copy()

    for _ in range(num_iters):
        for j in range(n_features):
            column = X_scaled[:, j]
            old_coef = beta[j]

            # Correlation of feature j with the residual that excludes
            # feature j's own current contribution.
            rho = column @ residual + old_coef * col_sq_norm[j]

            new_coef = soft_thresholding(rho / n_samples, lambda_)

            if new_coef != old_coef:
                residual -= (new_coef - old_coef) * column
                beta[j] = new_coef

    # Undo the standardisation so coefficients apply to the raw features.
    return beta / safe_std


# Fit the Lasso on the embeddings against the binary labels.
lambda_ = 0.001
np.random.seed(0)
X, y = embeddings, labels

coefficients = lasso_coordinate_descent(X, y, lambda_)
print("Lasso coefficients:", coefficients)

# Position of the largest (signed, not absolute) coefficient.
print(coefficients.argmax())


Lasso coefficients: [-1.91515091e-01  0.00000000e+00 -1.24607246e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  4.68805419e-01
  0.00000000e+00 -5.34409700e-01 -1.38630110e+00  1.28913679e+00
  5.12257783e-01 -6.41400284e-01 -4.32624572e-01  3.25910472e-01
 -1.86215872e-01  0.00000000e+00  0.00000000e+00  0.00000000e+00
  5.13204148e-01  6.44275448e-01 -1.74549922e-03 -4.48767594e-01
  0.00000000e+00  0.00000000e+00 -6.60371151e-01  0.00000000e+00
 -1.21308290e+00  0.00000000e+00  9.90084164e-02  0.00000000e+00
 -5.67972584e-01  0.00000000e+00  3.09595619e-02  0.00000000e+00
  0.00000000e+00  0.00000000e+00  5.02365048e-01  0.00000000e+00
  0.00000000e+00 -7.22361956e-01  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  1.58338239e-01  0.00000000e+00
 -4.76798876e-01  1.11613853e+00  0.00000000e+00 -1.81702130e-02
 -1.09452315e+00  0.00000000e+00  0.00000000e+00  7.80188514e-01
 -2.16939544e-01  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.0

In [None]:
# Rank the features by coefficient magnitude, largest first.
abs_coefficients = np.abs(coefficients)
sorted_indices = np.argsort(-abs_coefficients)
sorted_beta = coefficients[sorted_indices]

# Number of coefficients whose magnitude exceeds 1.
count = sum(1 for coeff in sorted_beta if abs(coeff) > 1)
print(count)

# Keep the 45 top-ranked feature columns, listed in ascending column order.
# NOTE(review): 45 equals the count printed above in the recorded run;
# confirm whether this cut-off is meant to follow `count`.
lasso_reduced_indicies = sorted(sorted_indices[:45])
print(len(lasso_reduced_indicies))

print(embeddings.shape)
reduced_embeddings = embeddings[:, lasso_reduced_indicies]
print(reduced_embeddings.shape)

# Persist the column-reduced embeddings.
np.save("lasso_reduced_embeddings.npy", reduced_embeddings)


45
45
(9600, 512)
(9600, 45)
