In [25]:
import pandas as pd
from sklearn.model_selection import cross_val_predict, RandomizedSearchCV, GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, ConfusionMatrixDisplay, confusion_matrix
import matplotlib.pyplot as plt
from time import time
import numpy as np

In [10]:
# Load embeddings and labels from disk.
# SECURITY: unpickling can execute arbitrary code; only load pickle files
# that come from a trusted source.
data = pd.read_pickle('pifsc_embeddings2.pickle')
features = dict(data)
data = pd.read_pickle('pifsc_embeddings.pkl')
features.update(data)  # on duplicate keys the second file's entry wins

# First label file: already uses the target column names.
cols = ["species","image","distinctiveness","quality"]
labels1 = pd.read_csv('pifsc_labels_detail.csv')[cols]
# Exchange the values of the two columns.
# NOTE(review): presumably the two columns are mislabeled in this CSV -- confirm.
labels1[['quality', 'distinctiveness']] = labels1[['distinctiveness', 'quality']]

# Second label file: the image identifier column is called "filename" there.
cols = ["species","filename","distinctiveness","quality"]
labels2 = pd.read_csv('pifsc_labels2.csv')[cols].rename(columns={"filename": "image"})

# Stack both label tables under a fresh 0..n-1 index.
labels = pd.concat([labels1, labels2], ignore_index=True)

In [11]:
# clean labels
# Keep only rows that have a quality label, then keep the first row for
# each image so that every image appears exactly once.
is_na = labels.quality.isna()
labels = labels.loc[~is_na]
labels = labels.drop_duplicates(subset='image', keep='first')

# clean features
# Restrict the embeddings to the labelled images. Membership is tested
# against a set so the filter is linear instead of scanning the whole
# image list once per embedding.
imgs = labels.image.tolist()
labelled_images = set(imgs)
features = {k: v for k, v in features.items() if k in labelled_images}

# Every labelled image must have an embedding; report offenders if not.
assert len(features) == len(labels), (
    f"{len(labels)} labelled images but {len(features)} embeddings; "
    f"examples without embedding: {sorted(map(str, labelled_images.difference(features)))[:5]}"
)

In [20]:
# Assemble the design matrix and the target vector.
Y = labels.quality

# Integer code per species, used as an extra leading feature column.
species_code = labels.species.astype('category').cat.codes

# One embedding row per label row, in label order, with the species code
# inserted as column 0.
X = np.insert(
    np.array([features[name] for name in labels.image]),
    0,
    species_code,
    axis=1,
)

# NOTE(review): these display names must be in the same order as the class
# labels used by the metrics below -- confirm against the values in Y.
target_names = ['high', 'medium', 'low']

In [26]:
# Base MLP classifier (not fitted here); seeded for reproducibility.
model = MLPClassifier(
    early_stopping=True,
    max_iter=300,
    random_state=42,
)

In [31]:
# hyperparameter tuning
# Search space for the MLP. Every hidden_layer_sizes entry is a tuple;
# note that `(150)` is just the int 150, a one-element tuple needs the
# trailing comma.
# NOTE(review): scikit-learn documents `learning_rate` as only used with
# solver='sgd'; `model` does not set a solver, so this axis may be a no-op
# that doubles the number of fits -- confirm whether it is intended.
params = {
    'hidden_layer_sizes': [(50,), (100,), (150,), (100, 50)],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant', 'adaptive'],
}

# The grid is finite, so never ask for more candidates than it contains;
# otherwise the search silently runs fewer iterations than n_iter says.
n_grid_points = int(np.prod([len(values) for values in params.values()]))
n_iter = min(20, n_grid_points)
random_search = RandomizedSearchCV(model, params, n_iter=n_iter, n_jobs=-1, cv=5, random_state=42, verbose=3)

In [32]:
def report(results, n_top=3):
    """Print a summary of the best-ranked candidates of a parameter search.

    Parameters
    ----------
    results : mapping
        Must provide the keys "rank_test_score", "mean_test_score",
        "std_test_score" and "params", each indexable by candidate position.
    n_top : int, default 3
        Ranks 1 through ``n_top`` are printed; every candidate tied at a
        rank gets its own entry.

    Returns
    -------
    None
        The summary is written to standard output.
    """
    for rank in range(1, n_top + 1):
        # Positions of all candidates that share this rank.
        tied = np.flatnonzero(results["rank_test_score"] == rank)
        for idx in tied:
            print("Model with rank: {0}".format(rank))
            mean_score = results["mean_test_score"][idx]
            std_score = results["std_test_score"][idx]
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(mean_score, std_score))
            print("Parameters: {0}".format(results["params"][idx]))
            print("")

# Fit the randomized search and report how long it took.
t0 = time()
random_search.fit(X, Y)
elapsed = time() - t0

# Report the number of candidates that were actually evaluated rather than
# the requested n_iter: the two differ when the parameter grid is smaller
# than n_iter.
n_evaluated = len(random_search.cv_results_["params"])
print(
    "RandomizedSearchCV took %.2f seconds for %d candidates parameter settings."
    % (elapsed, n_evaluated)
)
report(random_search.cv_results_)

Fitting 5 folds for each of 16 candidates, totalling 80 fits




NameError: name 'start' is not defined

In [None]:
# Out-of-fold predictions for the best configuration found by the search,
# followed by a per-class text report.
best = random_search.best_estimator_
pred = cross_val_predict(
    best,
    X,
    Y,
    n_jobs=-1,
    verbose=1,
)
summary = classification_report(Y, pred, target_names=target_names)
print(summary)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    9.2s remaining:   13.7s


In [None]:
# Confusion matrix of true labels vs. cross-validated predictions.
cm = confusion_matrix(Y, pred)
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=target_names,
)
disp.plot()
plt.tight_layout()