In [3]:
# %pip install xgboost --quiet  # uncomment if xgboost is missing from the kernel's environment

In [4]:
import os

import numpy as np
import torch
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from torchvision import datasets, transforms
from xgboost import XGBClassifier

In [5]:
def _dataset_to_arrays(dataset):
    """Convert an iterable of ``(image_tensor, label)`` pairs into NumPy arrays.

    Returns:
        ``(features, labels)`` where ``features`` has one row per sample with
        the image flattened to 1-D, and ``labels`` is an int64 vector.
    """
    images, labels = [], []
    for image, label in dataset:
        images.append(image.numpy())
        labels.append(label)
    # np.stack copies the per-image arrays straight into one contiguous block.
    # The previous torch.tensor(<tuple of ndarrays>).numpy() round trip built
    # the same array via PyTorch's slow sequence-of-ndarrays conversion path.
    features = np.stack(images).reshape(len(images), -1)
    return features, np.asarray(labels, dtype=np.int64)


def load_prof_for_ml(train_path, test_path):
    """Load two image folders as flat feature matrices for classical ML models.

    Every image is resized, converted to a tensor, channel-normalised and then
    flattened into a single feature row.

    Args:
        train_path: Root directory of the training images, laid out one
            sub-directory per class as ``datasets.ImageFolder`` expects.
        test_path: Root directory of the test images, same layout.

    Returns:
        ``(X_train, X_test, y_train, y_test)``: the ``X_*`` arrays are 2-D with
        one flattened image per row, the ``y_*`` arrays hold the integer class
        indices assigned by ``ImageFolder``.
    """
    transform = transforms.Compose([
        # NOTE(review): 226 rather than the more common 224 — confirm intended.
        transforms.Resize((226, 226)),
        transforms.ToTensor(),
        # Per-channel mean/std widely used for ImageNet-pretrained models.
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    traindataset = datasets.ImageFolder(root=train_path, transform=transform)
    testdataset = datasets.ImageFolder(root=test_path, transform=transform)

    X_train, y_train = _dataset_to_arrays(traindataset)
    X_test, y_test = _dataset_to_arrays(testdataset)

    return X_train, X_test, y_train, y_test


In [None]:
# Dataset root. Defaults to the original checkout location; set the
# IDENPROF_DIR environment variable to run the notebook on another machine.
IDENPROF_DIR = os.environ.get('IDENPROF_DIR', '/home/ubuntu/W210-Capstone/notebooks/idenprof')
train_path = os.path.join(IDENPROF_DIR, 'train')
test_path = os.path.join(IDENPROF_DIR, 'test')
X_train, X_test, y_train, y_test = load_prof_for_ml(train_path, test_path)

# Both splits go through the same transform, so their feature widths must agree.
assert X_train.shape[1] == X_test.shape[1], "train/test feature dimensions differ"

In [None]:
# Fit and score a Random Forest on the flattened image features.
# random_state pins the forest so the reported metrics are reproducible across
# kernel restarts; n_jobs=-1 only parallelises tree building.
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
}

for name, model in models.items():
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    accuracy = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions, average='weighted')
    recall = recall_score(y_test, predictions, average='weighted')
    precision = precision_score(y_test, predictions, average='weighted')

    # ROC AUC measures how well the scores rank samples, so it must be fed
    # class probabilities. Binarised hard predictions (the previous approach)
    # collapse every score to 0/1 and discard that ranking information.
    probabilities = model.predict_proba(X_test)
    if probabilities.shape[1] == 2:
        # Binary case: roc_auc_score expects the positive-class score only.
        auc = roc_auc_score(y_test, probabilities[:, 1])
    else:
        auc = roc_auc_score(y_test, probabilities, multi_class='ovr',
                            average='macro', labels=model.classes_)

    print(f"{name} - Accuracy: {accuracy}, F1: {f1}, Recall: {recall}, Precision: {precision}, AUC: {auc}")


In [None]:
# Fit and score an XGBoost classifier on the flattened image features.
# - use_label_encoder is dropped: it is deprecated and has no effect in
#   current XGBoost releases.
# - 'mlogloss' is the multi-class log loss; 'logloss' is the binary variant.
#   NOTE(review): assumes more than two classes in the dataset — confirm.
# - random_state pins the model so the reported metrics are reproducible.
models = {
    'XGBoost': XGBClassifier(eval_metric='mlogloss', random_state=42),
}

for name, model in models.items():
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    accuracy = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions, average='weighted')
    recall = recall_score(y_test, predictions, average='weighted')
    precision = precision_score(y_test, predictions, average='weighted')

    # ROC AUC measures how well the scores rank samples, so it must be fed
    # class probabilities. Binarised hard predictions (the previous approach)
    # collapse every score to 0/1 and discard that ranking information.
    probabilities = model.predict_proba(X_test)
    if probabilities.shape[1] == 2:
        # Binary case: roc_auc_score expects the positive-class score only.
        auc = roc_auc_score(y_test, probabilities[:, 1])
    else:
        auc = roc_auc_score(y_test, probabilities, multi_class='ovr',
                            average='macro', labels=model.classes_)

    print(f"{name} - Accuracy: {accuracy}, F1: {f1}, Recall: {recall}, Precision: {precision}, AUC: {auc}")


In [None]:
# Fit and score a Gradient Boosting classifier on the flattened image features.
# random_state pins the model so the reported metrics are reproducible across
# kernel restarts.
models = {
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
}

for name, model in models.items():
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    accuracy = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions, average='weighted')
    recall = recall_score(y_test, predictions, average='weighted')
    precision = precision_score(y_test, predictions, average='weighted')

    # ROC AUC measures how well the scores rank samples, so it must be fed
    # class probabilities. Binarised hard predictions (the previous approach)
    # collapse every score to 0/1 and discard that ranking information.
    probabilities = model.predict_proba(X_test)
    if probabilities.shape[1] == 2:
        # Binary case: roc_auc_score expects the positive-class score only.
        auc = roc_auc_score(y_test, probabilities[:, 1])
    else:
        auc = roc_auc_score(y_test, probabilities, multi_class='ovr',
                            average='macro', labels=model.classes_)

    print(f"{name} - Accuracy: {accuracy}, F1: {f1}, Recall: {recall}, Precision: {precision}, AUC: {auc}")


In [None]:
# Fit and score an AdaBoost classifier on the flattened image features.
# random_state pins the model so the reported metrics are reproducible across
# kernel restarts.
models = {
    'AdaBoost': AdaBoostClassifier(n_estimators=100, random_state=42)
}

for name, model in models.items():
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    accuracy = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions, average='weighted')
    recall = recall_score(y_test, predictions, average='weighted')
    precision = precision_score(y_test, predictions, average='weighted')

    # ROC AUC measures how well the scores rank samples, so it must be fed
    # class probabilities. Binarised hard predictions (the previous approach)
    # collapse every score to 0/1 and discard that ranking information.
    probabilities = model.predict_proba(X_test)
    if probabilities.shape[1] == 2:
        # Binary case: roc_auc_score expects the positive-class score only.
        auc = roc_auc_score(y_test, probabilities[:, 1])
    else:
        auc = roc_auc_score(y_test, probabilities, multi_class='ovr',
                            average='macro', labels=model.classes_)

    print(f"{name} - Accuracy: {accuracy}, F1: {f1}, Recall: {recall}, Precision: {precision}, AUC: {auc}")
