In [1]:
# SVM (Support Vector Machine) Model

# 5 Methods:
# Chi-Squared 
# Extra Trees
# L1 Regularization - Lasso
# Random Forest  (the method used in this notebook)
# Mutual Info 

In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score

# Load the CelebA attribute table and build the feature matrix / target vector.

# Columns removed before modelling: the image identifier plus attributes that
# are not used as features in this notebook.
DROPPED_COLUMNS = [
    'image_id', '5_o_Clock_Shadow', 'Arched_Eyebrows', 'Bags_Under_Eyes',
    'Bushy_Eyebrows', 'Bangs', 'Blurry', 'Smiling', 'Mouth_Slightly_Open',
    'Wearing_Earrings', 'Wearing_Hat', 'Wearing_Lipstick', 'Wearing_Necklace',
    'Wearing_Necktie',
]

# One chained expression instead of repeated in-place mutation, so re-running
# the cell always rebuilds `celeb_data` from the file. Dropping the unused
# columns first means the two replace passes only scan columns that are kept.
# The replacement values are kept exactly as before (-1 -> 0, 1 stays 1).
# NOTE(review): the float replacement values suggest a float dtype was intended,
# but the saved classification report shows integer labels 0/1, so the columns
# appear to stay integer-typed -- confirm which dtype is wanted.
celeb_data = (
    pd.read_csv('../list_attr_celeba.csv')
    .drop(columns=DROPPED_COLUMNS)
    .replace(1, 1.0)
    .replace(-1, 0.0)
)

# Every remaining column except the target is a feature; compute that view once.
feature_frame = celeb_data.drop("Attractive", axis=1)
column_names = feature_frame.columns
X = feature_frame.values
y = celeb_data["Attractive"].values

In [3]:
# random forest feature selection + linear SVM
#
# The data is split BEFORE features are ranked. Fitting the ranking forest on
# all of X/y would let the held-out rows influence which features are chosen
# (data leakage) and make the test metrics optimistic.

# number of top-ranked features passed on to the SVM
N_SELECTED_FEATURES = 10

# keep column labels attached so selected features can be picked by name
X_frame = pd.DataFrame(X, columns=column_names)

# training-testing split (70/30)
X_train_full, X_test_full, y_train, y_test = train_test_split(
    X_frame, y, test_size=0.3, random_state=42
)

# rank features using the training rows only
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_full, y_train)

# extracting feature importances
feature_importances = rf.feature_importances_

# selecting the most important features, highest importance first
selected_indices = feature_importances.argsort()[-N_SELECTED_FEATURES:][::-1]
selected_features = column_names[selected_indices]

print("selected features:", selected_features)

# restrict the full frame and both partitions to the selected columns
X_new = X_frame[selected_features]
X_train = X_train_full[selected_features]
X_test = X_test_full[selected_features]

# training svm model
svm = SVC(random_state=42, kernel='linear')
svm.fit(X_train, y_train)

# predicting on test set
y_pred = svm.predict(X_test)

# classification report
print("\nclassification report:")
print(classification_report(y_test, y_pred))

selected features: Index(['Heavy_Makeup', 'Young', 'Male', 'Big_Nose', 'Pointy_Nose', 'Chubby',
       'Oval_Face', 'Wavy_Hair', 'Eyeglasses', 'High_Cheekbones'],
      dtype='object')

classification report:
              precision    recall  f1-score   support

           0       0.68      0.86      0.76     29734
           1       0.82      0.61      0.70     31046

    accuracy                           0.73     60780
   macro avg       0.75      0.73      0.73     60780
weighted avg       0.75      0.73      0.73     60780



In [4]:
# Headline metrics for the SVM predictions on the held-out split, each printed
# to six decimal places.
# NOTE(review): the saved output for this cell shows a differently worded
# accuracy line than the print below, so it predates this code -- re-run the
# cell to refresh it.

# fraction of test rows predicted correctly
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.6f}")

# precision, binary averaging
precision = precision_score(y_true=y_test, y_pred=y_pred, average='binary')
print(f"Precision: {precision:.6f}")

# recall, binary averaging
recall = recall_score(y_true=y_test, y_pred=y_pred, average='binary')
print(f"Recall: {recall:.6f}")

# F1 score, binary averaging
f1 = f1_score(y_true=y_test, y_pred=y_pred, average='binary')
print(f"F1 Score: {f1:.6f}")

the accuracy of the svm model with random forest feature selection is: 0.730339
Precision: 0.815129
Recall: 0.610546
F1 Score: 0.698158
