In [1]:
# Logistic Regression Model

# 5 Methods:
# Chi-Squared
# Extra Trees
# L1 Regularization - Lasso
# Random Forest 
# Mutual Info #

In [2]:
import pandas as pd
from sklearn.feature_selection import mutual_info_classif

celeb_data = pd.read_csv('../list_attr_celeba.csv')

celeb_data.replace(1, 1.0, inplace=True)
celeb_data.replace(-1, 0.0, inplace=True)
celeb_data.drop(columns = ['image_id', '5_o_Clock_Shadow', 'Arched_Eyebrows', 'Bags_Under_Eyes', 'Bushy_Eyebrows', 'Bangs', 'Blurry', 'Smiling', 'Mouth_Slightly_Open', 'Wearing_Earrings', 'Wearing_Hat', 'Wearing_Lipstick', 'Wearing_Necklace', 'Wearing_Necktie'], inplace=True)

celeb_data.head()

column_names = celeb_data.drop("Attractive", axis=1).columns
X = celeb_data.drop("Attractive", axis=1).values
y = celeb_data["Attractive"].values

In [3]:
mi_scores = mutual_info_classif(X, y)

# Sorting and printing Mutual Information scores based on scores
mi_results = sorted(
    zip(column_names, mi_scores),
    key=lambda x: x[1],  # Sort by score
    reverse=True  # Higher scores first
)

print("\nMutual Information Scores (sorted by score):")
for feature, score in mi_results:
    print(f"{feature}: {score:.2f}")


Mutual Information Scores (sorted by score):
Heavy_Makeup: 0.14
Young: 0.10
Male: 0.09
Big_Nose: 0.04
No_Beard: 0.04
Chubby: 0.03
Pointy_Nose: 0.03
Wavy_Hair: 0.03
Eyeglasses: 0.03
Gray_Hair: 0.03
Oval_Face: 0.03
Double_Chin: 0.03
High_Cheekbones: 0.02
Rosy_Cheeks: 0.01
Receding_Hairline: 0.01
Blond_Hair: 0.01
Goatee: 0.01
Brown_Hair: 0.01
Bald: 0.01
Mustache: 0.01
Sideburns: 0.01
Big_Lips: 0.01
Narrow_Eyes: 0.01
Black_Hair: 0.00
Straight_Hair: 0.00
Pale_Skin: 0.00


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression

selected_features = [feature for feature, score in mi_results if score > 0.01]
X_selected = pd.DataFrame(X, columns=column_names)[selected_features].values

# Splitting dataset using Mutual Information-selected features
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)

# Training logistic regression model
log_reg = LogisticRegression(random_state=42, max_iter=1000)
log_reg.fit(X_train, y_train)

# Predicting on test set
y_pred_test = log_reg.predict(X_test)

# Accuracy
accuracy = accuracy_score(y_test, y_pred_test)
print(f"Accuracy: {accuracy:.6f}")

# precision
precision = precision_score(y_test, y_pred_test, average='binary')
print(f"Precision: {precision:.6f}")

# recall
recall = recall_score(y_test, y_pred_test, average='binary')
print(f"Recall: {recall:.6f}")

# F1 score
f1 = f1_score(y_test, y_pred_test, average='binary')
print(f"F1 Score: {f1:.6f}")

Accuracy: 0.753208
Precision: 0.760639
Recall: 0.754171
F1 Score: 0.757391
