In [9]:
# LDA (linear discriminant analysis) Model

# 5 feature-selection methods:
# Chi-Squared
# Extra Trees
# L1 Regularization - Lasso
# Random Forest (used in the cells below)
# Mutual Info

In [10]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Attribute columns (plus the image identifier) excluded from this analysis.
DROPPED_COLUMNS = [
    'image_id', '5_o_Clock_Shadow', 'Arched_Eyebrows', 'Bags_Under_Eyes',
    'Bushy_Eyebrows', 'Bangs', 'Blurry', 'Smiling', 'Mouth_Slightly_Open',
    'Wearing_Earrings', 'Wearing_Hat', 'Wearing_Lipstick', 'Wearing_Necklace',
    'Wearing_Necktie',
]

# Load the attribute table, recode the -1 labels to 0 (1 stays 1), and drop
# the excluded columns. Built as one chain so the cell never mutates a frame
# in place and can be re-run safely.
celeb_data = (
    pd.read_csv('../list_attr_celeba.csv')
    .replace(1, 1.0)
    .replace(-1, 0.0)
    .drop(columns=DROPPED_COLUMNS)
)

# Separate the predictors from the "Attractive" target once, then derive the
# column labels and the raw arrays used by the models from that single frame.
feature_frame = celeb_data.drop(columns="Attractive")
column_names = feature_frame.columns
X = feature_frame.values
y = celeb_data["Attractive"].values

In [11]:
# Fit a random forest on the full feature matrix; the fixed seed makes the
# fitted model reproducible across runs.
rf = RandomForestClassifier(random_state=42).fit(X, y)

# One importance score per column of X, as exposed by the fitted forest.
feature_importances = rf.feature_importances_

# Pair every column name with its score, then rank from highest to lowest.
sorted_rf = list(zip(column_names, feature_importances))
sorted_rf.sort(key=lambda pair: pair[1], reverse=True)

# Report the ranking, one "name: score" line per feature.
print("Random Forest Feature Importances (sorted):")
for feature, importance in sorted_rf:
    print(f"{feature}: {importance:.4f}")

Random Forest Feature Importances (sorted):
Heavy_Makeup: 0.2593
Young: 0.1524
Male: 0.1108
Big_Nose: 0.0500
Pointy_Nose: 0.0406
Chubby: 0.0381
Oval_Face: 0.0359
Wavy_Hair: 0.0313
Eyeglasses: 0.0312
High_Cheekbones: 0.0246
Straight_Hair: 0.0226
Big_Lips: 0.0220
Receding_Hairline: 0.0220
Narrow_Eyes: 0.0185
Black_Hair: 0.0183
Brown_Hair: 0.0177
Gray_Hair: 0.0162
No_Beard: 0.0152
Pale_Skin: 0.0142
Double_Chin: 0.0127
Blond_Hair: 0.0124
Rosy_Cheeks: 0.0086
Goatee: 0.0077
Sideburns: 0.0076
Mustache: 0.0065
Bald: 0.0039


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Keep only features whose random-forest importance exceeds this cut-off.
IMPORTANCE_THRESHOLD = 0.01

# Split BEFORE selecting features so the held-out rows never influence which
# columns are kept. Ranking features with a forest fitted on every row and
# then scoring on a subset of those same rows leaks test labels into the
# evaluated model and makes the reported accuracy optimistic.
X_train_all, X_test_all, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Rank features with a forest fitted on the training rows only. A separate
# name is used so the forest fitted in the earlier cell (`rf`) is left intact.
rf_train = RandomForestClassifier(random_state=42)
rf_train.fit(X_train_all, y_train)
importances = rf_train.feature_importances_

# Boolean mask over the columns of X: True where the feature is kept.
keep_mask = importances > IMPORTANCE_THRESHOLD
if not keep_mask.any():
    # Fail here with a clear message instead of fitting LDA on zero columns.
    raise ValueError(f"No feature has importance above {IMPORTANCE_THRESHOLD}")
selected_features = column_names[keep_mask]

# Slice the arrays directly with the mask; this picks the same columns, in
# the same order, as selecting by name through a temporary DataFrame.
X_selected = X[:, keep_mask]
X_train = X_train_all[:, keep_mask]
X_test = X_test_all[:, keep_mask]

# training the LDA model on the selected features of the training rows
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)

# predicting on the held-out test rows
y_pred_test = lda.predict(X_test)

# fraction of test rows whose predicted label matches the true label
lda_accuracy = accuracy_score(y_test, y_pred_test)

print(f"The accuracy of the LDA model with RandomForest-selected features is: {lda_accuracy:.6f}")


The accuracy of the LDA model with RandomForest-selected features is: 0.757848
