In [None]:
import os
from PIL import Image
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm  


# --- Dataset configuration -------------------------------------------------
# Size handed to PIL's resize() for every image before it is stacked.
IMAGE_SIZE = (224, 224)

# Root folder; load_images() looks for one sub-directory per class inside it.
DATA_DIR = "Images"

# Class sub-directory names. load_images() enumerates this list, so the
# position of a name is the integer label assigned to its images.
CLASSES = [
    "Baroque",
    "Contemporary_Realism",
    "Expressionism",
]

def load_images(data_dir, classes, image_size):
    """Load every image under ``data_dir/<class_name>`` into NumPy arrays.

    Each file is opened with PIL, converted to RGB, resized to ``image_size``
    and divided by 255.0. Files that cannot be loaded are reported with
    ``print`` and skipped (best effort), so the result may contain fewer
    entries than there are files on disk.

    Args:
        data_dir: Directory that contains one sub-directory per class.
        classes: Sequence of sub-directory names. The index of a name in this
            sequence is used as the integer label of the images inside it.
        image_size: Size passed unchanged to ``PIL.Image.Image.resize``.

    Returns:
        A tuple ``(images, labels)`` of NumPy arrays holding one entry per
        successfully loaded image, in matching order.
    """
    images = []
    labels = []
    for label, class_name in enumerate(classes):
        class_dir = os.path.join(data_dir, class_name)
        # os.listdir() returns entries in arbitrary order. Sorting makes the
        # sample order -- and therefore the seeded train/test split that is
        # built from these arrays -- reproducible across machines and runs.
        image_files = sorted(os.listdir(class_dir))
        for image_name in tqdm(image_files, desc=f"Loading {class_name}", unit="image"):
            image_path = os.path.join(class_dir, image_name)
            try:
                # The context manager releases the file handle even when the
                # decode/convert step raises part-way through.
                with Image.open(image_path) as raw_image:
                    img = raw_image.convert("RGB").resize(image_size)
                img_array = np.array(img) / 255.0
            except Exception as e:
                # Deliberately best effort: report the bad file and move on.
                print(f"Error loading image {image_path}: {e}")
                continue
            images.append(img_array)
            labels.append(label)
    return np.array(images), np.array(labels)


# Read the whole dataset into memory, then hold out 20% of it for testing.
X, y = load_images(DATA_DIR, CLASSES, IMAGE_SIZE)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(f"Loaded {len(X_train)} training and {len(X_test)} testing images.")

# Write every split to its own .npy file so other cells can read them back.
for npy_path, split_array in (
    ("X_train.npy", X_train),
    ("X_test.npy", X_test),
    ("y_train.npy", y_train),
    ("y_test.npy", y_test),
):
    np.save(npy_path, split_array)

# Reload the four splits from the files that were just written.
X_train, X_test, y_train, y_test = (
    np.load(npy_path)
    for npy_path in ("X_train.npy", "X_test.npy", "y_train.npy", "y_test.npy")
)
print(f"Loaded cached images: {X_train.shape}, {X_test.shape}")


ImportError: DLL load failed while importing _multiarray_umath: The specified module could not be found.

ImportError: numpy._core.multiarray failed to import

In [None]:
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.models import Model
from tensorflow.keras.applications.resnet50 import preprocess_input

# Load ResNet50 without the classification head
base_model = ResNet50(weights="imagenet", include_top=False, input_shape=(224, 224, 3))
feature_extractor = Model(inputs=base_model.input, outputs=base_model.output)

# Preprocess the images for ResNet50.
# X_train / X_test hold pixels scaled to [0, 1] (the loader divides by 255.0),
# but the ResNet50 preprocess_input expects values in the [0, 255] range
# before it re-orders the channels and subtracts the ImageNet channel means.
# Passing [0, 1] data would make every input almost equal to the negative
# mean, so the pixels are scaled back to [0, 255] first.
# The multiplication also allocates a new (temporary) array, so
# preprocess_input -- which may write its result over a float input -- no
# longer modifies X_train / X_test in place.
X_train_preprocessed = preprocess_input(X_train * 255.0)
X_test_preprocessed = preprocess_input(X_test * 255.0)

# Extract features using the pretrained ResNet50
train_features = feature_extractor.predict(X_train_preprocessed, batch_size=32, verbose=1)
test_features = feature_extractor.predict(X_test_preprocessed, batch_size=32, verbose=1)

# Flatten features for use in traditional classifiers
# (one row per image, all remaining dimensions collapsed into one).
train_features = train_features.reshape(len(train_features), -1)
test_features = test_features.reshape(len(test_features), -1)

print(f"Train features shape: {train_features.shape}, Test features shape: {test_features.shape}")


# Persist the flattened features, then read them back from disk.
np.save("train_features.npy", train_features)
np.save("test_features.npy", test_features)

train_features = np.load("train_features.npy")
test_features = np.load("test_features.npy")
print(f"Loaded cached features: {train_features.shape}, {test_features.shape}")




ImportError: DLL load failed while importing _multiarray_umath: The specified module could not be found.

ImportError: numpy._core.multiarray failed to import

In [None]:
import numpy as np
from sklearn.decomposition import PCA
import pandas as pd
import matplotlib.pyplot as plt

from joblib import dump

train_features = np.load("train_features.npy")
test_features = np.load("test_features.npy")

# Fit a PCA with all components once to see how the explained variance
# accumulates as components are added.
pca = PCA()
pca.fit(train_features)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

# argmax() gives the first index at which the threshold is reached; adding 1
# converts that 0-based index into a number of components.
optimal_components = np.argmax(cumulative_variance >= 0.95) + 1
print(f"Number of components to explain 95% variance: {optimal_components}")

# Plot against the 1-based component count so the curve, the x-axis label and
# the vertical marker at `optimal_components` all use the same scale (plotting
# the bare array would put the curve on a 0-based index, one step to the left).
component_counts = np.arange(1, len(cumulative_variance) + 1)

plt.figure(figsize=(10, 6))
plt.plot(component_counts, cumulative_variance, label="Cumulative Variance", color="blue", linewidth=2)
plt.axhline(y=0.95, color="red", linestyle="--", linewidth=1, label="95% Variance Threshold")
plt.axvline(x=optimal_components, color="green", linestyle="--", linewidth=1, label=f"Optimal Components: {optimal_components}")
# Anchor the threshold label to the right edge of the x-range instead of a
# fixed "+5000" offset, which could fall outside the axes depending on how
# many components the data yields.
plt.text(len(cumulative_variance), 0.96, "95% Variance", color="red", fontsize=12, ha="right")
plt.text(optimal_components + 2, 0.5, f"{optimal_components} Components", color="green", fontsize=12, rotation=90)
plt.title("Explained Variance vs. Number of Components", fontsize=16, fontweight="bold")
plt.xlabel("Number of Components", fontsize=14)
plt.ylabel("Cumulative Explained Variance", fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.grid(color="gray", linestyle="--", linewidth=0.5, alpha=0.7)
plt.ylim(0, 1.02)
plt.xlim(0, len(cumulative_variance))
plt.legend(fontsize=12)
plt.tight_layout()
plt.show()


# Refit with only the selected number of components. random_state pins the
# result in case scikit-learn picks a randomized solver for this truncated fit
# (the default solver choice depends on the data shape and library version).
pca = PCA(n_components=optimal_components, random_state=42)
train_features_pca = pca.fit_transform(train_features)
# The test set is only transformed, never used for fitting.
test_features_pca = pca.transform(test_features)

np.save("train_features_pca.npy", train_features_pca)
np.save("test_features_pca.npy", test_features_pca)

pca_model_path = "pca_model.joblib"
dump(pca, pca_model_path)
print(f"PCA model saved to {pca_model_path}")
print(f"Reduced train features shape: {train_features_pca.shape}")
print(f"Reduced test features shape: {test_features_pca.shape}")


In [23]:
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from joblib import dump
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier


# Load the PCA-reduced features and the labels written by earlier cells.
train_features_pca = np.load("train_features_pca.npy")
test_features_pca = np.load("test_features_pca.npy")
y_train = np.load("y_train.npy")
y_test = np.load("y_test.npy")

os.makedirs("results", exist_ok=True)

# Classifiers to compare; the dict key is also used in output file names.
models = {
    "SVM": SVC(kernel="linear", class_weight="balanced", random_state=42),
    "Random Forest": RandomForestClassifier(
        n_estimators=300, max_depth=30, min_samples_split=2, max_features="sqrt", random_state=42
    ),
    "KNN": KNeighborsClassifier(n_neighbors=25, weights="distance", metric="minkowski"),
    "Naive Bayes": GaussianNB(),
    "AdaBoost": AdaBoostClassifier(
        n_estimators=200, learning_rate=0.1, estimator=DecisionTreeClassifier(max_depth=3), random_state=42
    ),
}

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

cv_results = {}        # model name -> array of per-fold CV accuracies
test_set_results = {}  # model name -> accuracy on the held-out test set

for model_name, model in models.items():
    print(f"Training {model_name}...")

    # Cross-validated accuracy on the training split.
    cv_scores = cross_val_score(model, train_features_pca, y_train, cv=cv, scoring="accuracy")
    cv_results[model_name] = cv_scores
    np.savetxt(f"results/{model_name}_cv_scores.csv", cv_scores, delimiter=",")
    print(f"{model_name} Mean CV Accuracy: {cv_scores.mean():.4f}, Std Dev: {cv_scores.std():.4f}")

    # Refit on the whole training split, persist, and score on the test set.
    model.fit(train_features_pca, y_train)

    dump(model, f"results/{model_name}_model.joblib")
    y_pred = model.predict(test_features_pca)
    test_accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} Test Set Accuracy: {test_accuracy:.4f}")

    report = classification_report(y_test, y_pred, output_dict=True)
    # Transposed so the CSV has one row per class/average, matching the layout
    # the timed-evaluation cell writes to this same file path.
    pd.DataFrame(report).transpose().to_csv(f"results/{model_name}_classification_report.csv")

    conf_matrix = confusion_matrix(y_test, y_pred)
    pd.DataFrame(conf_matrix).to_csv(f"results/{model_name}_confusion_matrix.csv")
    test_set_results[model_name] = test_accuracy

svm_mean_accuracy = cv_results["SVM"].mean()  # SVM Mean Accuracy

# Ratio of the SVM's mean CV accuracy to each model's test accuracy: a higher
# test accuracy gives a smaller ratio, hence "Lower is Better" in the plot.
normalized_accuracy = {}
for model_name, accuracy in test_set_results.items():
    normalized_accuracy[model_name] = svm_mean_accuracy / accuracy

pd.DataFrame.from_dict(normalized_accuracy, orient="index", columns=["Normalized Accuracy"]).to_csv(
    "results/normalized_accuracy.csv"
)

normalized_df = pd.DataFrame(list(normalized_accuracy.items()), columns=["Model", "Normalized Accuracy"])
plt.figure(figsize=(8, 6))
# NOTE(review): recent seaborn releases warn when `palette` is passed without
# `hue` -- confirm against the installed version before changing the call.
sns.barplot(data=normalized_df, x="Normalized Accuracy", y="Model", palette="viridis")
plt.title("Normalized Accuracy (Lower is Better)")
plt.xlabel("Normalized Accuracy")
plt.ylabel("Model")
plt.tight_layout()
plt.savefig("results/normalized_accuracy_plot.png")
plt.show()


Total number of samples in the dataset: 11434


In [None]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from joblib import dump
import os
import pandas as pd
import time
import seaborn as sns
import matplotlib.pyplot as plt

# Load PCA-reduced features and labels
train_features_pca = np.load("train_features_pca.npy")
test_features_pca = np.load("test_features_pca.npy")
y_train = np.load("y_train.npy")
y_test = np.load("y_test.npy")

os.makedirs("results", exist_ok=True)

# Classifiers to compare; the dict key is also used in output file names.
models = {
    "SVM": SVC(kernel="linear", class_weight="balanced", random_state=42),
    "Random Forest": RandomForestClassifier(
        n_estimators=300, max_depth=30, min_samples_split=2, max_features="sqrt", random_state=42
    ),
    "KNN": KNeighborsClassifier(n_neighbors=25, weights="distance", metric="minkowski"),
    "Naive Bayes": GaussianNB(),
    "AdaBoost": AdaBoostClassifier(
        n_estimators=200, learning_rate=0.1, estimator=DecisionTreeClassifier(max_depth=3), random_state=42
    ),
}

# Single source of truth for the loop bounds AND the progress messages.
# The loop previously ran 3 repetitions while its comment and message claimed
# 5; the executed value (3) is kept so the runtime does not change.
N_SPLITS = 10
N_REPETITIONS = 3

# NOTE: `cv` uses a fixed integer random_state, so every repetition iterates
# over the same folds; the repetitions mainly add more timing samples.
cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

all_results = {}  # model name -> per-fold timings and accuracies

for model_name, model in models.items():
    print(f"Evaluating {model_name}...")

    training_times = []
    testing_times = []
    fold_accuracies = []

    for repetition in range(N_REPETITIONS):
        print(f"Repetition {repetition + 1}/{N_REPETITIONS}")
        for fold_idx, (train_idx, val_idx) in enumerate(cv.split(train_features_pca, y_train)):
            print(f"  Fold {fold_idx + 1}/{N_SPLITS}")

            X_train_fold, X_val_fold = train_features_pca[train_idx], train_features_pca[val_idx]
            y_train_fold, y_val_fold = y_train[train_idx], y_train[val_idx]

            # perf_counter() is a monotonic, high-resolution clock meant for
            # measuring short intervals; time.time() can jump with the wall clock.
            start_time = time.perf_counter()
            model.fit(X_train_fold, y_train_fold)
            training_time = time.perf_counter() - start_time
            training_times.append(training_time)

            start_time = time.perf_counter()
            y_pred_fold = model.predict(X_val_fold)
            testing_time = time.perf_counter() - start_time
            testing_times.append(testing_time)

            fold_accuracy = accuracy_score(y_val_fold, y_pred_fold)
            fold_accuracies.append(fold_accuracy)
    all_results[model_name] = {
        "Training Times": training_times,
        "Testing Times": testing_times,
        "Fold Accuracies": fold_accuracies,
    }

    # One CSV row per (repetition, fold) for this model.
    pd.DataFrame({
        "Training Times": training_times,
        "Testing Times": testing_times,
        "Fold Accuracies": fold_accuracies,
    }).to_csv(f"results/{model_name}_cv_times_and_accuracies.csv", index=False)

    # Final model: refit on the whole training split, persist, then evaluate
    # once on the held-out test set.
    model.fit(train_features_pca, y_train)

    dump(model, f"results/{model_name}_final_model.joblib")
    y_pred_test = model.predict(test_features_pca)
    test_accuracy = accuracy_score(y_test, y_pred_test)
    report = classification_report(y_test, y_pred_test, output_dict=True)
    conf_matrix = confusion_matrix(y_test, y_pred_test)
    pd.DataFrame(report).transpose().to_csv(f"results/{model_name}_classification_report.csv")
    pd.DataFrame(conf_matrix).to_csv(f"results/{model_name}_confusion_matrix.csv")

    print(f"{model_name} Test Accuracy: {test_accuracy:.4f}")

# Collapse the per-fold measurements into one row of means per model.
summary = {
    "Model": [],
    "Mean Training Time (s)": [],
    "Mean Testing Time (s)": [],
    "Mean Accuracy": [],
}

for model_name, result in all_results.items():
    summary["Model"].append(model_name)
    summary["Mean Training Time (s)"].append(np.mean(result["Training Times"]))
    summary["Mean Testing Time (s)"].append(np.mean(result["Testing Times"]))
    summary["Mean Accuracy"].append(np.mean(result["Fold Accuracies"]))

summary_df = pd.DataFrame(summary)
summary_df.to_csv("results/model_comparison_summary.csv", index=False)

# One horizontal bar chart per summary metric, each saved under results/.
plt.figure(figsize=(10, 6))
sns.barplot(data=summary_df, x="Mean Accuracy", y="Model", palette="viridis")
plt.title("Model Mean Accuracy")
plt.xlabel("Mean Accuracy")
plt.ylabel("Model")
plt.tight_layout()
plt.savefig("results/model_accuracy_comparison.png")
plt.show()

plt.figure(figsize=(10, 6))
sns.barplot(data=summary_df, x="Mean Training Time (s)", y="Model", palette="magma")
plt.title("Model Mean Training Time")
plt.xlabel("Mean Training Time (s)")
plt.ylabel("Model")
plt.tight_layout()
plt.savefig("results/model_training_time_comparison.png")
plt.show()

plt.figure(figsize=(10, 6))
sns.barplot(data=summary_df, x="Mean Testing Time (s)", y="Model", palette="coolwarm")
plt.title("Model Mean Testing Time")
plt.xlabel("Mean Testing Time (s)")
plt.ylabel("Model")
plt.tight_layout()
plt.savefig("results/model_testing_time_comparison.png")
plt.show()


In [None]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
from sklearn.preprocessing import label_binarize
from joblib import load
import numpy as np

# Inputs: PCA-reduced test features and their labels, read from disk.
test_features_pca = np.load("test_features_pca.npy")
y_test = np.load("y_test.npy")

# Fitted estimators saved under results/ with the "_final_model" suffix.
# NOTE(review): joblib.load unpickles its input -- only load trusted files.
models = {
    "SVM": load("results/SVM_final_model.joblib"),
    "Random Forest": load("results/Random Forest_final_model.joblib"),
    "KNN": load("results/KNN_final_model.joblib"),
    "Naive Bayes": load("results/Naive Bayes_final_model.joblib"),
    "AdaBoost": load("results/AdaBoost_final_model.joblib"),
}

# One indicator column per label value that occurs in y_test.
y_test_bin = label_binarize(y_test, classes=np.unique(y_test))
n_classes = y_test_bin.shape[1]

fig, ax = plt.subplots(figsize=(10, 8))

for model_name, model in models.items():
    # Use class probabilities when the estimator exposes them; otherwise fall
    # back to its decision-function scores.
    y_score = (
        model.predict_proba(test_features_pca)
        if hasattr(model, "predict_proba")
        else model.decision_function(test_features_pca)
    )

    # One curve per (model, class): column i of the indicator matrix against
    # column i of the scores.
    for i in range(n_classes):
        fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_score[:, i])
        roc_auc = auc(fpr, tpr)
        ax.plot(fpr, tpr, label=f"{model_name} (Class {i} AUC = {roc_auc:.2f})")

# Diagonal reference line from (0, 0) to (1, 1).
ax.plot([0, 1], [0, 1], color="gray", linestyle="--")
ax.set_title("ROC Curves for Multi-Class Models")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.legend(loc="lower right")
fig.tight_layout()
fig.savefig("results/roc_curve_comparison.png")
plt.show()
