In [1]:
# LDA (linear discriminant analysis) Model

# 5 Methods:
# Chi-Squared
# Extra Trees
# L1 Regularization - Lasso  <- the method fitted in this notebook
# Random Forest
# Mutual Info

In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Columns removed before modelling: the image identifier plus a fixed set of
# attribute columns that are excluded from the feature matrix.
EXCLUDED_COLUMNS = ['image_id', '5_o_Clock_Shadow', 'Arched_Eyebrows', 'Bags_Under_Eyes', 'Bushy_Eyebrows', 'Bangs', 'Blurry', 'Smiling', 'Mouth_Slightly_Open', 'Wearing_Earrings', 'Wearing_Hat', 'Wearing_Lipstick', 'Wearing_Necklace', 'Wearing_Necktie']
TARGET_COLUMN = "Attractive"

# Load the attribute table (path is relative to the notebook's working
# directory), re-code 1 -> 1.0 and -1 -> 0.0, then drop the excluded columns.
# Chained, non-mutating calls keep the cell free of in-place edits.
celeb_data = (
    pd.read_csv('../list_attr_celeba.csv')
    .replace(1, 1.0)
    .replace(-1, 0.0)
    .drop(columns=EXCLUDED_COLUMNS)
)

# Build the feature frame once and derive both the names and the matrix from it,
# so the two can never disagree on column order.
feature_frame = celeb_data.drop(columns=TARGET_COLUMN)
column_names = feature_frame.columns
X = feature_frame.values
y = celeb_data[TARGET_COLUMN].values

# Preview the cleaned table. This must be the last expression in the cell;
# a bare expression anywhere else is evaluated and silently discarded.
celeb_data.head()

In [3]:
# Fit a logistic regression with an L1 ("lasso") penalty on the full X / y.
# The liblinear solver is used because it supports the l1 penalty.
lasso = LogisticRegression(random_state=42, solver='liblinear', penalty='l1')
lasso.fit(X, y)

# flatten() collapses coef_ into a 1-D array so it can be zipped with the
# feature names below.
lasso_coefficients = lasso.coef_.flatten()

# Pair every feature name with its coefficient, then order the pairs by
# coefficient magnitude, largest first (sign is ignored for ranking only).
sorted_lasso = list(zip(column_names, lasso_coefficients))
sorted_lasso.sort(key=lambda pair: abs(pair[1]), reverse=True)

# Report the ranked coefficients, signed, to four decimal places.
print("\nLasso Coefficients (sorted by absolute value):")
for feature, coefficient in sorted_lasso:
    print(f"{feature}: {coefficient:.4f}")



Lasso Coefficients (sorted by absolute value):
Chubby: -1.8687
Young: 1.5701
Gray_Hair: -1.4311
Heavy_Makeup: 1.4144
Bald: -1.2632
Eyeglasses: -1.1990
Double_Chin: -0.9540
Pale_Skin: 0.8839
Receding_Hairline: -0.8623
Pointy_Nose: 0.5691
Oval_Face: 0.5552
Straight_Hair: 0.4262
Narrow_Eyes: -0.4219
Wavy_Hair: 0.3916
Sideburns: 0.3800
High_Cheekbones: 0.3357
Brown_Hair: 0.3279
Big_Nose: -0.2772
Rosy_Cheeks: 0.2633
Mustache: -0.2084
Blond_Hair: 0.2078
Goatee: -0.1970
No_Beard: -0.1931
Male: -0.1620
Black_Hair: 0.1222
Big_Lips: -0.0115


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Hold out 30% of the rows for evaluation (fixed seed for a repeatable split).
# NOTE(review): X is the full feature matrix built in the data-loading cell;
# no feature selection (Lasso, ExtraTrees, ...) is applied before this split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit LDA on the training portion; fit() returns the estimator itself.
lda = LinearDiscriminantAnalysis().fit(X_train, y_train)

# Class predictions for the held-out rows.
y_pred_test = lda.predict(X_test)

# Score the predictions. average='binary' reports each metric for the
# positive class only.
accuracy = accuracy_score(y_test, y_pred_test)
precision = precision_score(y_test, y_pred_test, average='binary')
recall = recall_score(y_test, y_pred_test, average='binary')
f1 = f1_score(y_test, y_pred_test, average='binary')

# Print the four metrics in a fixed order, six decimal places each.
print(f"Accuracy: {accuracy:.6f}")
print(f"Precision: {precision:.6f}")
print(f"Recall: {recall:.6f}")
print(f"F1 Score: {f1:.6f}")

Accuracy: 0.757470
Precision: 0.774117
Recall: 0.741577
F1 Score: 0.757497
