In [3]:
# !pip install xgboost --quiet
# !pip install --upgrade xgboost


In [3]:
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import LabelEncoder

from xgboost import XGBClassifier
import torch
from torchvision import datasets, transforms
from PIL import Image
import numpy as np
import os

In [4]:
# # Pytorch - Dataloader

# def load_prof_for_ml(train_path, test_path):

#     transform = transforms.Compose([
#         transforms.Resize((226, 226)),
#         transforms.ToTensor(),
#         transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
#     ])
    
#     traindataset = datasets.ImageFolder(root=train_path, transform=transform)
#     testdataset = datasets.ImageFolder(root=test_path, transform=transform)
    
#     # to a list of tuples with images and labels
#     train_data = [(image.numpy(), label) for image, label in traindataset]
#     test_data = [(image.numpy(), label) for image, label in testdataset]
    
#     # features and labels
#     X_train, y_train = zip(*train_data)
#     X_test, y_test = zip(*test_data)
    
#     # to numpy arrays
#     X_train = torch.tensor(X_train).numpy().reshape(len(X_train), -1)
#     X_test = torch.tensor(X_test).numpy().reshape(len(X_test), -1)
#     y_train = torch.tensor(y_train).numpy()
#     y_test = torch.tensor(y_test).numpy()
    
#     return X_train, X_test, y_train, y_test


In [5]:
# Without Pytorch

def is_image_file(filename):
    valid_extensions = [".jpg", ".jpeg", ".png", ".gif", ".bmp"]
    return any(filename.lower().endswith(ext) for ext in valid_extensions)

def load_images_from_folder(folder):
    images = []
    labels = []
    for filename in os.listdir(folder):
        if is_image_file(filename):
            label = os.path.basename(folder)
            img_path = os.path.join(folder, filename)
            with Image.open(img_path) as img:
                img = img.convert('RGB')
                img = img.resize((226, 226))
                images.append(np.array(img))
                labels.append(label)
    return images, labels


def load_prof_for_ml(train_path, test_path):
    X_train, y_train = [], []
    X_test, y_test = [], []

    for class_folder in os.listdir(train_path):
        imgs, lbls = load_images_from_folder(os.path.join(train_path, class_folder))
        X_train.extend(imgs)
        y_train.extend(lbls)

    for class_folder in os.listdir(test_path):
        imgs, lbls = load_images_from_folder(os.path.join(test_path, class_folder))
        X_test.extend(imgs)
        y_test.extend(lbls)

    X_train = np.array(X_train).reshape(len(X_train), -1)
    X_test = np.array(X_test).reshape(len(X_test), -1)

    le = LabelEncoder()
    y_train = le.fit_transform(y_train)
    y_test = le.transform(y_test)
    
    return X_train, X_test, y_train, y_test


In [6]:
train_path = '/home/ubuntu/W210-Capstone/notebooks/idenprof/train'
test_path = '/home/ubuntu/W210-Capstone/notebooks/idenprof/test'
X_train, X_test, y_train, y_test = load_prof_for_ml(train_path, test_path)

In [7]:
X_train

array([[172, 222, 255, ..., 129, 106,  64],
       [116,  87,  53, ...,  57,  56,  54],
       [151, 193, 251, ..., 181, 207, 244],
       ...,
       [235, 230, 224, ..., 110, 102,  91],
       [193, 199, 189, ..., 217, 229, 241],
       [ 12,  13,   0, ...,  98,  98,  96]], dtype=uint8)

In [8]:
len(y_train)

9000

In [6]:
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100),
    # 'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    # 'Gradient Boosting': GradientBoostingClassifier(n_estimators=100),
    # 'AdaBoost': AdaBoostClassifier(n_estimators=100)
}

for name, model in models.items():
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    accuracy = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions, average='weighted')
    recall = recall_score(y_test, predictions, average='weighted')
    precision = precision_score(y_test, predictions, average='weighted')
    
    lb = LabelBinarizer()
    y_test_binarized = lb.fit_transform(y_test)
    predictions_binarized = lb.transform(predictions)
    auc = roc_auc_score(y_test_binarized, predictions_binarized, average='macro')

    print(f"{name} - Accuracy: {accuracy}, F1: {f1}, Recall: {recall}, Precision: {precision}, AUC: {auc}")


Random Forest - Accuracy: 0.467, F1: 0.4597919427256182, Recall: 0.467, Precision: 0.4665723225481096, AUC: 0.7038888888888889


In [None]:
# models = {
#     'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
# }

# for name, model in models.items():
#     model.fit(X_train, y_train)
#     predictions = model.predict(X_test)

#     accuracy = accuracy_score(y_test, predictions)
#     f1 = f1_score(y_test, predictions, average='weighted')
#     recall = recall_score(y_test, predictions, average='weighted')
#     precision = precision_score(y_test, predictions, average='weighted')
    
#     lb = LabelBinarizer()
#     y_test_binarized = lb.fit_transform(y_test)
#     predictions_binarized = lb.transform(predictions)
#     auc = roc_auc_score(y_test_binarized, predictions_binarized, average='macro')

#     print(f"{name} - Accuracy: {accuracy}, F1: {f1}, Recall: {recall}, Precision: {precision}, AUC: {auc}")


In [7]:
# xgb_model = XGBClassifier(
#     n_estimators=10,  # kernel keeps dying n_estimators=100
#     learning_rate=0.1,
#     gamma=2,
#     subsample=0.9,
#     colsample_bytree=0.5,
#     use_label_encoder=False,
#     eval_metric='logloss',
#     n_jobs=1, # kernel keeps dying
#     early_stopping_rounds = 10
# )


# eval_set = [(X_train, y_train), (X_test, y_test)]  
# # xgb_model.fit(X_train, y_train, early_stopping_rounds=10, eval_set=eval_set, verbose=True)
# # eval_set = [(X_test, y_test)]
# xgb_model.fit(X_train, y_train, eval_set=eval_set, verbose=True)

# predictions = xgb_model.predict(X_test)

# accuracy = accuracy_score(y_test, predictions)
# f1 = f1_score(y_test, predictions, average='weighted')
# recall = recall_score(y_test, predictions, average='weighted')
# precision = precision_score(y_test, predictions, average='weighted')

# lb = LabelBinarizer()
# y_test_binarized = lb.fit_transform(y_test)
# predictions_binarized = lb.transform(predictions)
# auc = roc_auc_score(y_test_binarized, predictions_binarized, average='macro')

# print(f"XGBoost - Accuracy: {accuracy}, F1: {f1}, Recall: {recall}, Precision: {precision}, AUC: {auc}")


In [4]:
xgb_model = XGBClassifier(
    n_estimators=10,
    learning_rate=0.1,
    use_label_encoder=False,  
    eval_metric='mlogloss',   # multi-class classification
    n_jobs=1,                  # To prevent the kernel from dying
    early_stopping_rounds=10
)

eval_set = [(X_train, y_train), (X_test, y_test)]  
xgb_model.fit(
    X_train, 
    y_train,
    eval_set=eval_set, 
    verbose=True
)

predictions = xgb_model.predict(X_test)

accuracy = accuracy_score(y_test, predictions)
f1 = f1_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
precision = precision_score(y_test, predictions, average='weighted')

lb = LabelBinarizer()
y_test_binarized = lb.fit_transform(y_test)
predictions_binarized = lb.transform(predictions)
auc = roc_auc_score(y_test_binarized, predictions_binarized, average='macro')

print(f"Accuracy: {accuracy}, F1: {f1}, Recall: {recall}, Precision: {precision}, AUC: {auc}")


[0]	validation_0-mlogloss:2.14401	validation_1-mlogloss:2.23760
[1]	validation_0-mlogloss:2.01155	validation_1-mlogloss:2.18098
[2]	validation_0-mlogloss:1.89377	validation_1-mlogloss:2.13118
[3]	validation_0-mlogloss:1.79101	validation_1-mlogloss:2.08732
[4]	validation_0-mlogloss:1.69771	validation_1-mlogloss:2.05028
[5]	validation_0-mlogloss:1.61259	validation_1-mlogloss:2.01462
[6]	validation_0-mlogloss:1.53474	validation_1-mlogloss:1.98281
[7]	validation_0-mlogloss:1.46093	validation_1-mlogloss:1.95400
[8]	validation_0-mlogloss:1.39292	validation_1-mlogloss:1.92931
[9]	validation_0-mlogloss:1.32978	validation_1-mlogloss:1.90691
Accuracy: 0.433, F1: 0.4279137152355479, Recall: 0.433, Precision: 0.42957091853074925, AUC: 0.6849999999999999


In [7]:
## need to look into this code, but this takes FOREVER!!!!!

models = {
    'Gradient Boosting': GradientBoostingClassifier(
        n_estimators=3, # initally had 100, and tried 5 and keep losing connection - you can try with more estimators if that works for you
        learning_rate=0.1,
        n_iter_no_change=5,  
        validation_fraction=0.1, 
        verbose=True,
        warm_start=True  # Enable warm start
    )
}
for name, model in models.items():
    if name == 'Gradient Boosting':
        # eval_set = [(X_train, y_train), (X_test, y_test)]  
        # model.fit(X_train, y_train, verbose=True)
        model.fit(X_train, y_train)
    else:
        # eval_set = [(X_train, y_train), (X_test, y_test)]  
        model.fit(X_train, y_train)
        
    predictions = model.predict(X_test)

    accuracy = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions, average='weighted')
    recall = recall_score(y_test, predictions, average='weighted')
    precision = precision_score(y_test, predictions, average='weighted')
    
    # AUC
    lb = LabelBinarizer()
    y_test_binarized = lb.fit_transform(y_test)
    predictions_binarized = lb.transform(predictions)
    auc = roc_auc_score(y_test_binarized, predictions_binarized, average='macro')

    print(f"{name} - Accuracy: {accuracy}, F1: {f1}, Recall: {recall}, Precision: {precision}, AUC: {auc}")


      Iter       Train Loss   Remaining Time 
         1           2.1911           79.43m
         2           2.1014           39.76m
         3           2.0262            0.00s
Gradient Boosting - Accuracy: 0.3385, F1: 0.32295341310197223, Recall: 0.3385, Precision: 0.32798180543599725, AUC: 0.6325000000000001


In [1]:
# ### Testing 

# from sklearn.ensemble import AdaBoostClassifier
# from sklearn.model_selection import GridSearchCV

# # Define the model
# ada_boost = AdaBoostClassifier()

# # Set up the parameters grid
# param_grid = {
#     'n_estimators': [10],
#     'learning_rate': [0.01, 0.1, 1.0]
# }

# # Perform grid search
# grid_search = GridSearchCV(estimator=ada_boost, param_grid=param_grid, cv=5, scoring='accuracy')

# # Fit the model
# grid_search.fit(X_train, y_train)

# # Print the best parameters
# print(f"Best parameters: {grid_search.best_params_}")

# # Use the best estimator for further predictions
# best_model = grid_search.best_estimator_


In [9]:
models = {
    'AdaBoost': AdaBoostClassifier(
        n_estimators=10,
        learning_rate = 0.1
        
    )}

for name, model in models.items():
    if name == 'Gradient Boosting':
        # eval_set = [(X_train, y_train), (X_test, y_test)]  
        # model.fit(X_train, y_train, verbose=True)
        model.fit(X_train, y_train)
    else:
        # eval_set = [(X_train, y_train), (X_test, y_test)]  
        model.fit(X_train, y_train)
        
    predictions = model.predict(X_test)

    accuracy = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions, average='weighted')
    recall = recall_score(y_test, predictions, average='weighted')
    precision = precision_score(y_test, predictions, average='weighted')
    
    # AUC
    lb = LabelBinarizer()
    y_test_binarized = lb.fit_transform(y_test)
    predictions_binarized = lb.transform(predictions)
    auc = roc_auc_score(y_test_binarized, predictions_binarized, average='macro')

    print(f"{name} - Accuracy: {accuracy}, F1: {f1}, Recall: {recall}, Precision: {precision}, AUC: {auc}")


AdaBoost - Accuracy: 0.2375, F1: 0.18981870885516053, Recall: 0.2375, Precision: 0.1837460971640944, AUC: 0.576388888888889


In [11]:
models = {
    'AdaBoost': AdaBoostClassifier(
        n_estimators=10,
        learning_rate = 0.2
        
    )}

for name, model in models.items():
    if name == 'Gradient Boosting':
        # eval_set = [(X_train, y_train), (X_test, y_test)]  
        # model.fit(X_train, y_train, verbose=True)
        model.fit(X_train, y_train)
    else:
        # eval_set = [(X_train, y_train), (X_test, y_test)]  
        model.fit(X_train, y_train)
        
    predictions = model.predict(X_test)

    accuracy = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions, average='weighted')
    recall = recall_score(y_test, predictions, average='weighted')
    precision = precision_score(y_test, predictions, average='weighted')
    
    # AUC
    lb = LabelBinarizer()
    y_test_binarized = lb.fit_transform(y_test)
    predictions_binarized = lb.transform(predictions)
    auc = roc_auc_score(y_test_binarized, predictions_binarized, average='macro')

    print(f"{name} - Accuracy: {accuracy}, F1: {f1}, Recall: {recall}, Precision: {precision}, AUC: {auc}")


AdaBoost - Accuracy: 0.2455, F1: 0.23398605757347404, Recall: 0.2455, Precision: 0.23345162230307406, AUC: 0.5808333333333333


In [None]:
# models = {
#     'Gradient Boosting': GradientBoostingClassifier(n_estimators=100),
#     'AdaBoost': AdaBoostClassifier(n_estimators=100)
# }

# for name, model in models.items():
#     model.fit(X_train, y_train)
#     predictions = model.predict(X_test)

#     accuracy = accuracy_score(y_test, predictions)
#     f1 = f1_score(y_test, predictions, average='weighted')
#     recall = recall_score(y_test, predictions, average='weighted')
#     precision = precision_score(y_test, predictions, average='weighted')
    
#     lb = LabelBinarizer()
#     y_test_binarized = lb.fit_transform(y_test)
#     predictions_binarized = lb.transform(predictions)
#     auc = roc_auc_score(y_test_binarized, predictions_binarized, average='macro')

#     print(f"{name} - Accuracy: {accuracy}, F1: {f1}, Recall: {recall}, Precision: {precision}, AUC: {auc}")
