In [1]:
import numpy as np
from helpers import *
from implementations import *

In [2]:
# You have to change the path for it to work
data_path = 'D:\\EPFL\\MA1\\Machine Learning\\Projet 1\\dataset'

In [3]:
x_train, x_test, y_train, train_ids, test_ids, headers_train = load_csv_data(data_path, sub_sample=False)

test


In [4]:
x_train.shape

(328135, 321)

In [5]:
x_test.shape

(109379, 321)

In [6]:
y_train.shape

(328135,)

## Data Preprocessing

In [7]:
def standardize(x):
    """Stadartize the input data x

    Args:
        x: numpy array of shape=(num_samples, num_features)

    Returns:
        standartized data, shape=(num_samples, num_features)

    >>> standardize(np.array([[1, 2], [3, 4], [5, 6]]))
    array([[-1.22474487, -1.22474487],
           [ 0.        ,  0.        ],
           [ 1.22474487,  1.22474487]])
    """
    # ***************************************************
    mean = np.mean(x, 0)
    std = np.std(x, 0)
    std_data = np.zeros(x.shape)
    for i in range(x.shape[0]):
        std_data[i,:] = (x[i,:]-mean)/std
    #centered_data = x - np.mean(x, axis=0)
    #std_data = centered_data / np.std(centered_data, axis=0) with this you get close results, applying the std to the centered data does not change a lot
    # ***************************************************
    return std_data

In [8]:
# Function to find the count of missing values in each columns
def find_missing_values(data, headers=None):
    num_rows, num_cols = data.shape
    missing_count = np.zeros(num_cols, dtype=int)  
    columns = np.linspace(0,num_cols, num_cols+1)
    
    for col in range(num_cols):
        # Count the missing values
        missing_count[col] = np.sum(np.isnan(data[:, col]))
    if headers : 
        # Returning only the columns with missing values
        missing_info = {headers[col]: missing_count[col] for col in range(num_cols) if missing_count[col] > 0}
    else : 
        missing_info = {columns[col]: missing_count[col] for col in range(num_cols) if missing_count[col] > 0}
    return missing_info

In [9]:
# Function to remove columns with missing values
def remove_high_missing_columns(data):
    num_rows, num_cols = data.shape
    threshold = 0  # For now, i put this value, I don't know what is a good treshold
    missing_count = find_missing_values(data) 

    # Create a mask for columns to keep
    columns_to_keep = [col for col in range(num_cols) if missing_count.get(col, 0) <= threshold]

    # Return a new array with only the columns that meet the criteria
    return data[:, columns_to_keep], columns_to_keep

In [10]:
def get_info_on_data(data, headers):
# Créer un dictionnaire pour stocker les informations
    column_info = {}
    if headers :
        columns = headers
    else : 
        columns = np.linspace(0,num_cols, num_cols+1)
    num_rows, num_cols = data.shape

    for col in range(data.shape[1]):
        # Récupérer la colonne
        column_values = data[:, col]

        # Déterminer le type de variable
        unique_values = set(column_values)  # Récupérer les valeurs uniques
        length = len(unique_values)
        min_value = min(unique_values)
        max_value = max(unique_values)
        
        # Ajouter des informations sur la colonne
        column_info[columns[col]] = {
            'colonne' : columns[col],
            'length': length,
            #'unique_values': unique_values
            'min value' : min_value,
            'max value' : max_value    
        }

    return column_info

In [11]:
column_info = get_info_on_data(x_train, headers_train)
print(headers_train)

['Id', '_STATE', 'FMONTH', 'IDATE', 'IMONTH', 'IDAY', 'IYEAR', 'DISPCODE', 'SEQNO', '_PSU', 'CTELENUM', 'PVTRESD1', 'COLGHOUS', 'STATERES', 'CELLFON3', 'LADULT', 'NUMADULT', 'NUMMEN', 'NUMWOMEN', 'CTELNUM1', 'CELLFON2', 'CADULT', 'PVTRESD2', 'CCLGHOUS', 'CSTATE', 'LANDLINE', 'HHADULT', 'GENHLTH', 'PHYSHLTH', 'MENTHLTH', 'POORHLTH', 'HLTHPLN1', 'PERSDOC2', 'MEDCOST', 'CHECKUP1', 'BPHIGH4', 'BPMEDS', 'BLOODCHO', 'CHOLCHK', 'TOLDHI2', 'CVDSTRK3', 'ASTHMA3', 'ASTHNOW', 'CHCSCNCR', 'CHCOCNCR', 'CHCCOPD1', 'HAVARTH3', 'ADDEPEV2', 'CHCKIDNY', 'DIABETE3', 'DIABAGE2', 'SEX', 'MARITAL', 'EDUCA', 'RENTHOM1', 'NUMHHOL2', 'NUMPHON2', 'CPDEMO1', 'VETERAN3', 'EMPLOY1', 'CHILDREN', 'INCOME2', 'INTERNET', 'WEIGHT2', 'HEIGHT3', 'PREGNANT', 'QLACTLM2', 'USEEQUIP', 'BLIND', 'DECIDE', 'DIFFWALK', 'DIFFDRES', 'DIFFALON', 'SMOKE100', 'SMOKDAY2', 'STOPSMK2', 'LASTSMK2', 'USENOW3', 'ALCDAY5', 'AVEDRNK2', 'DRNK3GE5', 'MAXDRNKS', 'FRUITJU1', 'FRUIT1', 'FVBEANS', 'FVGREEN', 'FVORANG', 'VEGETAB1', 'EXERANY2', 'EXR

In [12]:
print(column_info["_STATE"])

{'colonne': '_STATE', 'length': 12, 'min value': np.float64(1.0), 'max value': np.float64(12.0)}


In [13]:
column_with_nan = find_missing_values(x_train)

In [14]:
sliced_x_train, columns_to_keep = remove_high_missing_columns(x_train)

In [15]:
sliced_x_train.shape

(328135, 82)

In [16]:
#print(column_with_nan)
print(x_train.shape)
print(len(column_with_nan))
print(max(column_with_nan.values()))
print(min(column_with_nan.values()))

(328135, 321)
239
328103
1


## Implementations

In [17]:
def split_data(x, y, ratio, seed=1):
    """
    split the dataset based on the split ratio. If ratio is 0.8
    you will have 80% of your data set dedicated to training
    and the rest dedicated to testing. If ratio times the number of samples is not round
    you can use np.floor. Also check the documentation for np.random.permutation,
    it could be useful.

    Args:
        x: numpy array of shape (N,), N is the number of samples.
        y: numpy array of shape (N,).
        ratio: scalar in [0,1]
        seed: integer.

    Returns:
        x_tr: numpy array containing the train data.
        x_te: numpy array containing the test data.
        y_tr: numpy array containing the train labels.
        y_te: numpy array containing the test labels.

    >>> split_data(np.arange(13), np.arange(13), 0.8, 1)
    (array([ 2,  3,  4, 10,  1,  6,  0,  7, 12,  9]), array([ 8, 11,  5]), array([ 2,  3,  4, 10,  1,  6,  0,  7, 12,  9]), array([ 8, 11,  5]))
    """
    # set seed
    np.random.seed(seed)
    # ***************************************************
    indices = np.random.permutation(len(x))
    x_perm = x[indices]
    y_perm = y[indices]
    split = int(np.floor(ratio*len(x)))
    x_tr = x_perm[0:split]
    x_te = x_perm[split:len(x_perm)]
    y_tr = y_perm[0:split]
    y_te = y_perm[split:len(x_perm)]
    # ***************************************************
    return x_tr, x_te, y_tr, y_te

In [18]:
std_data_train = standardize(sliced_x_train)

In [19]:
ratio = 0.8

In [20]:
x_tr, x_val, y_tr, y_val = split_data(x = std_data_train, y = y_train, ratio = ratio, seed=1)

In [21]:
x_tr.shape

(262508, 82)

In [22]:
print(f"After the split of {ratio*100} we have, for the training set a shape of {x_tr.shape} and for the validation set {x_val.shape}")

After the split of 80.0 we have, for the training set a shape of (262508, 82) and for the validation set (65627, 82)


### Least Squares

In [23]:
def classification_metrics(y_true, y_pred_proba, threshold=0):
    """Calculate and print classification metrics given the true labels and predicted probabilities.
    
    Args:
        y_val: numpy array of shape (N,), N is the number of samples. True labels.
        y_pred_proba: numpy array of shape (N,), predicted probabilities for class 1.
        threshold: scalar. The threshold to convert probabilities to binary predictions.
    
    Returns:
        accuracy, precision, recall (TPR), FPR, F1-score
    """
    # Step 1: Convert probabilities to binary predictions using the threshold
    y_pred = np.where(y_pred_proba >= threshold, 1, -1)
    
    # Step 2: Calculate TP, FP, TN, FN
    TP = np.sum((y_true == 1) & (y_pred == 1))
    FP = np.sum((y_true == 1) & (y_pred == -1))
    TN = np.sum((y_true == -1) & (y_pred == -1))
    FN = np.sum((y_true == 1) & (y_pred == -1))
    
    # Step 3: Calculate metrics
    accuracy = (TP + TN) / (TP + TN + FP + FN) if (TP + TN + FP + FN) > 0 else 0
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0  # True Positive Rate
    fpr = FP / (FP + TN) if (FP + TN) > 0 else 0  # False Positive Rate
    f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    
    # Print the results
    print(f"Threshold: {threshold}")
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall (TPR): {recall}")
    print(f"FPR: {fpr}")
    print(f"F1-Score: {f1}")
    
    return accuracy, precision, recall, fpr, f1

In [24]:
def predict(tx, w):
    """Make predictions using the learned weights."""
    return tx @ w

In [37]:
w, train_mse = least_squares(y_tr, x_tr)
#print(f"Trained weights: {w}")
print(f"Training MSE: {train_mse}")

Training MSE: 0.4803011171652391


In [38]:
# Make predictions on the trainig set
y_pred = predict(x_val, w)

# Compute MSE on the test set
test_mse = compute_MSE(y_val, x_val, w)
print(f"Test MSE: {test_mse}")

Test MSE: 0.48191135689480163


In [39]:
print(y_pred[:10])

[-0.17758127  0.06641111  0.27531845 -0.09796116 -0.21646513  0.10909002
 -0.17809572  0.06971775 -0.25512041 -0.11519682]


In [40]:
print(y_val[:5])

[-1 -1 -1 -1 -1]


In [41]:
accuracy, precision, recall, fpr, f1 = classification_metrics(y_val, y_pred, threshold=0)

Threshold: 0
Accuracy: 0.9677994742771311
Precision: 0.8788627935723115
Recall (TPR): 0.8788627935723115
FPR: 0.018568141832453647
F1-Score: 0.8788627935723115


In [42]:
x_test_sliced = x_test[:, columns_to_keep]

In [43]:
print(x_test_sliced.shape)

(109379, 82)


In [44]:
# Make predictions on the test set
y_pred_subm = predict(x_test_sliced, w)

In [47]:
print(y_pred_subm[:20])
print(np.mean(y_pred_subm))
print(np.min(y_pred_subm))

[-1.08867459e+09 -6.24998961e+08 -2.74831505e+09 -1.07912041e+09
 -6.37052305e+09 -5.89932078e+09 -6.24076083e+08 -3.17680476e+09
 -4.19301824e+09 -1.12069487e+09 -3.21821134e+09 -5.26872186e+09
 -1.63224584e+09 -1.57994191e+09 -2.66575254e+09 -3.14583416e+09
 -1.57889649e+09 -1.60085937e+09 -1.17270502e+09 -2.64951253e+09]
-3425780992.976299
-6428292305.751772


In [32]:
y_pred_submission = np.where(y_pred >= 0, 1, -1)

In [48]:
print(y_pred_submission[:20])
print(np.mean(y_pred_submission))

[-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]
-1.0


In [31]:
#create_csv_submission(test_ids, y_pred, "submission_ls")