Imports and Configuration

In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
import os
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from glob import glob
import pickle

# --- Optional dependency: scikit-image (needed for LBP / HOG features) ---
# The flag starts out False and is flipped only after both imports succeed.
SKIMAGE_AVAILABLE = False
try:
    from skimage.feature import local_binary_pattern, hog
    from skimage.color import rgb2gray
except ImportError:
    print("scikit-image is not installed. LBP and HOG features will be skipped.")
else:
    SKIMAGE_AVAILABLE = True

# --- Optional dependency: TensorFlow / Keras (needed for deep VGG16 features) ---
# The feature extractor is built inside the try block so that an ImportError
# raised while constructing it is reported the same way as a missing package.
TENSORFLOW_AVAILABLE = False
try:
    from tensorflow.keras.applications import VGG16
    from tensorflow.keras.applications.vgg16 import preprocess_input
    from tensorflow.keras.models import Model
    base_model = VGG16(weights='imagenet', include_top=False)
    model = Model(inputs=base_model.input, outputs=base_model.get_layer('block5_pool').output)
except ImportError:
    print("TensorFlow is not installed. Deep features will be skipped.")
else:
    TENSORFLOW_AVAILABLE = True

# --- Notebook configuration ---
IMAGE_FOLDER = "E:/Coding/Advanced ML/train_data"  # directory holding the training images
LABELS_FILE = "E:/Coding/Advanced ML/train.csv"    # CSV read later for file_name -> label
IMG_SIZE = (64, 64)                                # size every image is resized to on load
PCA_CACHE_FILE = "pca_results_combined.pkl"        # pickle cache for the PCA pipeline results

Image Loading Function

In [16]:
def load_images_from_folder(folder, size=IMG_SIZE, patterns=("*.jpg",)):
    """Load every matching image in ``folder``, resized and converted to RGB.

    Args:
        folder: Directory searched (non-recursively) for image files.
        size: Target size handed to ``cv2.resize``.
        patterns: Glob patterns matched inside ``folder``. Defaults to JPEG
            files only; pass e.g. ``("*.jpg", "*.png")`` to include more types.

    Returns:
        ``(images, filenames)``: two parallel lists holding the RGB image
        arrays and the corresponding base file names. Files that OpenCV
        cannot decode are reported with a warning and left out of both lists.
    """
    # glob() yields paths in an unspecified, OS-dependent order. Sorting makes
    # the row order of every downstream table (features, PCA, clusters)
    # reproducible; the set removes duplicates when patterns overlap.
    paths = sorted({
        path
        for pattern in patterns
        for path in glob(os.path.join(folder, pattern))
    })

    images = []
    filenames = []
    for path in paths:
        img = cv2.imread(path)
        if img is None:  # cv2.imread signals an unreadable file by returning None
            print(f"Warning: Could not read image {path}. Skipping.")
            continue
        img = cv2.resize(img, size)
        # OpenCV decodes to BGR; the feature extractors below expect RGB.
        images.append(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        filenames.append(os.path.basename(path))
    return images, filenames

Feature Extraction: RGBW, LBP, Edge, HOG, Color Histogram, and Deep Features

In [17]:
def extract_rgbw_features(image):
    """Return the mean R, G, B and "white" intensity of an RGB image.

    The white component of a pixel is the average of its three channels.

    Args:
        image: Array whose last axis holds the R, G, B channels in that order.

    Returns:
        ``np.ndarray`` of four floats: ``[mean_r, mean_g, mean_b, mean_w]``.
    """
    # Work in float64: adding uint8 channels directly wraps around modulo 256,
    # which corrupts the white component for any bright pixel.
    pixels = np.asarray(image, dtype=np.float64)
    r, g, b = pixels[..., 0], pixels[..., 1], pixels[..., 2]
    w = (r + g + b) / 3  # White component as average of RGB
    return np.array([r.mean(), g.mean(), b.mean(), w.mean()])

def extract_lbp_features(image, num_points=8, radius=1):
    """Return the normalized histogram of uniform Local Binary Pattern codes.

    Args:
        image: RGB image array.
        num_points: Number of circularly symmetric neighbour points (``P``).
        radius: Radius of the neighbourhood circle (``R``).

    Returns:
        1-D float array with ``num_points + 2`` entries that sum to ~1, or an
        empty array when scikit-image is not available (mirroring
        ``extract_hog_features``), so the result can always be ``hstack``-ed.
    """
    if not SKIMAGE_AVAILABLE:
        return np.array([])  # LBP is skipped when scikit-image is missing

    # Quantize to 8-bit before computing LBP (same conversion the edge
    # extractor uses) so tiny float differences between neighbouring pixels
    # do not flip pattern bits.
    gray = (rgb2gray(image) * 255).astype(np.uint8)
    lbp = local_binary_pattern(gray, P=num_points, R=radius, method="uniform")

    # The "uniform" method emits num_points + 2 distinct codes (0 .. P + 1).
    # One bin per code needs P + 3 edges; with fewer edges the last two codes
    # would be merged into a single bin.
    n_codes = num_points + 2
    hist, _ = np.histogram(lbp, bins=np.arange(0, n_codes + 1), range=(0, n_codes))
    hist = hist.astype("float")
    hist /= (hist.sum() + 1e-6)  # Normalize; epsilon guards against division by zero
    return hist

def extract_edge_features(image, low_threshold=100, high_threshold=200):
    """Return the mean value of the Canny edge map of an RGB image.

    Args:
        image: RGB image array.
        low_threshold: Lower hysteresis threshold passed to ``cv2.Canny``.
        high_threshold: Upper hysteresis threshold passed to ``cv2.Canny``.

    Returns:
        Mean of the edge map returned by ``cv2.Canny`` (a single scalar).
    """
    if SKIMAGE_AVAILABLE:
        # Same conversion as before: scikit-image grayscale scaled to 8-bit.
        gray_u8 = (rgb2gray(image) * 255).astype(np.uint8)
    else:
        # rgb2gray is undefined without scikit-image; fall back to OpenCV's
        # grayscale conversion instead of failing with a NameError.
        gray_u8 = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
    edges = cv2.Canny(gray_u8, low_threshold, high_threshold)
    return edges.mean()

def extract_hog_features(image):
    """Return the HOG (Histogram of Oriented Gradients) descriptor of an image.

    Args:
        image: RGB image array.

    Returns:
        1-D HOG feature vector, or an empty array when scikit-image is not
        available so the result can still be ``hstack``-ed with other features.
    """
    if not SKIMAGE_AVAILABLE:
        return np.array([])  # Return an empty array if scikit-image is not available
    gray = rgb2gray(image)
    # Only the descriptor is needed. Requesting the visualization would render
    # a full HOG image for every input just to throw it away.
    return hog(gray, orientations=8, pixels_per_cell=(16, 16),
               cells_per_block=(1, 1), channel_axis=None)  # grayscale input: no channel axis

def extract_color_histogram(image, bins=(8, 8, 8)):
    """Compute a flattened, normalized 3-D color histogram of an image.

    Args:
        image: Three-channel image array.
        bins: Number of histogram bins per channel.

    Returns:
        1-D array containing the normalized joint histogram over all three
        channels.
    """
    channel_indices = [0, 1, 2]
    value_ranges = [0, 256] * 3  # [0, 256) for each of the three channels
    histogram = cv2.calcHist([image], channel_indices, None, bins, value_ranges)
    cv2.normalize(histogram, histogram)  # normalizes in place (dst is the same array)
    return histogram.flatten()

def extract_deep_features(image, model):
    """Return the flattened output of ``model`` for a single RGB image.

    Args:
        image: RGB image array.
        model: Keras model used as the feature extractor (the notebook passes
            a VGG16 truncated at ``block5_pool``).

    Returns:
        1-D array holding the flattened model output for this image.
    """
    resized = cv2.resize(image, (224, 224))  # Resize for VGG16
    # Cast explicitly and add the batch axis first so preprocess_input works
    # on a private float32 copy rather than on the caller's array.
    batch = np.expand_dims(resized.astype("float32"), axis=0)
    batch = preprocess_input(batch)
    # verbose=0: predict() is called once per image, and the default progress
    # bar would otherwise print one line of output for every image.
    features = model.predict(batch, verbose=0)
    return features.flatten()

Load Pre-trained Model

In [18]:
# Load pre-trained VGG16 model truncated at the last pooling layer (block5_pool).
# The imports cell already builds this model when TensorFlow is installed, so
# rebuild it only if it is missing. Skip the cell entirely when TensorFlow is
# unavailable, because VGG16 and Model would then be undefined names.
if TENSORFLOW_AVAILABLE and globals().get('model') is None:
    base_model = VGG16(weights='imagenet', include_top=False)
    model = Model(inputs=base_model.input, outputs=base_model.get_layer('block5_pool').output)

Load Labels

In [19]:
# Read the label table and build a lookup from its 'file_name' column to its
# 'label' column (a later duplicate file_name overrides an earlier one).
labels_df = pd.read_csv(LABELS_FILE)
labels_dict = {
    file_name: label
    for file_name, label in zip(labels_df['file_name'], labels_df['label'])
}

Process Images and Perform PCA with Caching

In [None]:
from joblib import Parallel, delayed
import time

def extract_features(img):
    """Concatenate every per-image feature into one flat vector.

    The parts appear in this order: RGBW means, LBP histogram, edge mean,
    HOG descriptor, color histogram, then deep features from the module-level
    ``model`` (an empty array when TensorFlow is unavailable).

    Args:
        img: RGB image array.

    Returns:
        1-D array produced by ``np.hstack`` over all feature parts.
    """
    feature_parts = [
        extract_rgbw_features(img),
        extract_lbp_features(img),
        extract_edge_features(img),
        extract_hog_features(img),
        extract_color_histogram(img),
        # Skip deep features if TensorFlow is not available
        extract_deep_features(img, model) if TENSORFLOW_AVAILABLE else np.array([]),
    ]
    return np.hstack(feature_parts)

def process_images_and_pca(image_folder, labels_dict, img_size=IMG_SIZE, cache_file=PCA_CACHE_FILE,
                           n_components=20):
    """Extract features for every image, standardize them and run PCA (cached).

    If ``cache_file`` exists its contents are returned as-is. Otherwise the
    images are loaded, features are extracted, scaled and projected, and the
    results are pickled to ``cache_file``.

    Args:
        image_folder: Directory containing the images.
        labels_dict: Mapping from ``"train_data/<file name>"`` to a label.
        img_size: Size every image is resized to on load.
        cache_file: Path of the pickle cache. The cache is keyed only by this
            path, so delete the file after changing the images, ``img_size``
            or the feature extractors.
        n_components: Requested number of principal components; capped at
            ``min(n_samples, n_features)``, the maximum PCA can produce.

    Returns:
        ``(df_pca, df_features, scaler, pca)``: the PCA scores (columns
        ``PC1`` .. ``PCk`` plus ``Image`` and ``Label``), the raw feature
        table, the fitted ``StandardScaler`` and the fitted ``PCA``.

    Raises:
        ValueError: If no readable image is found in ``image_folder``.
    """
    if os.path.exists(cache_file):
        print("Loading PCA results from cache...")
        # SECURITY: pickle.load can execute arbitrary code. Only load cache
        # files that this notebook wrote itself.
        with open(cache_file, 'rb') as f:
            df_pca, df_features, scaler, pca = pickle.load(f)
        return df_pca, df_features, scaler, pca

    print("Processing images and performing PCA...")
    start_time = time.time()

    # Load images
    images, filenames = load_images_from_folder(image_folder, img_size)
    print(f"Loaded {len(images)} images in {time.time() - start_time:.2f} seconds.")
    if not images:
        raise ValueError(f"No readable images found in {image_folder!r}.")

    # Feature extraction. extract_features uses the module-level Keras model
    # when deep features are enabled; joblib's default workers are separate
    # processes, each of which would need its own copy of that model, so run
    # in-process in that case and parallelize only the lightweight features.
    feature_extraction_start = time.time()
    n_jobs = 1 if TENSORFLOW_AVAILABLE else -1
    features_list = Parallel(n_jobs=n_jobs)(delayed(extract_features)(img) for img in images)
    print(f"Feature extraction completed in {time.time() - feature_extraction_start:.2f} seconds.")

    # Create DataFrame
    features = np.array(features_list)
    df_features = pd.DataFrame(features)
    df_features['Image'] = filenames
    # Label keys carry a "train_data/" prefix in front of the bare file name.
    df_features['Label'] = df_features['Image'].map(lambda x: labels_dict.get(f"train_data/{x}", np.nan))

    # Standardize features
    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(features)
    print(f"Feature scaling completed in {time.time() - start_time:.2f} seconds.")

    # Perform PCA; it cannot produce more components than min(samples, features).
    n_components = min(n_components, *features_scaled.shape)
    pca = PCA(n_components=n_components)
    pca_result = pca.fit_transform(features_scaled)
    print(f"PCA completed in {time.time() - start_time:.2f} seconds.")

    # Create PCA DataFrame with one named column per component actually computed.
    # A fixed two-name list would not match the number of PCA output columns.
    pc_columns = [f'PC{i}' for i in range(1, pca_result.shape[1] + 1)]
    df_pca = pd.DataFrame(pca_result, columns=pc_columns)
    df_pca['Image'] = filenames
    df_pca['Label'] = df_features['Label']

    # Cache results
    with open(cache_file, 'wb') as f:
        pickle.dump((df_pca, df_features, scaler, pca), f)
    print(f"Total time: {time.time() - start_time:.2f} seconds.")

    return df_pca, df_features, scaler, pca

Execute PCA

In [23]:
# Run the feature-extraction + PCA pipeline (or load its cached results).
df_pca, df_features, scaler, pca = process_images_and_pca(
    image_folder=IMAGE_FOLDER,
    labels_dict=labels_dict,
)

Processing images and performing PCA...


KeyboardInterrupt: 

Elbow Method

In [None]:
# Elbow method: fit K-Means on the first two principal components for
# k = 1..29 and record the inertia of each fit.
cluster_range = range(1, 30)
inertia = []

for k in cluster_range:
    kmeans = KMeans(n_clusters=k, random_state=42).fit(df_pca[['PC1', 'PC2']])
    inertia.append(kmeans.inertia_)

# Plot inertia against k; the "elbow" of the curve suggests a suitable k.
fig, ax = plt.subplots(figsize=(20, 6))
ax.plot(cluster_range, inertia, marker='o', linestyle='--')
ax.set_xlabel('Number of Clusters (k)')
ax.set_ylabel('Inertia (Sum of Squared Distances)')
ax.set_title('Elbow Method for Optimal k')
plt.show()

Silhouette Score

In [None]:
from sklearn.metrics import silhouette_score

# Silhouette analysis on the first two principal components for k = 2..9
# (the silhouette score is undefined for a single cluster, hence k >= 2).
silhouette_k_values = range(2, 10)
sil_scores = []

for k in silhouette_k_values:
    kmeans = KMeans(n_clusters=k, random_state=42)
    cluster_labels = kmeans.fit_predict(df_pca[['PC1', 'PC2']])
    score = silhouette_score(df_pca[['PC1', 'PC2']], cluster_labels)
    sil_scores.append(score)

# Plot the score against k.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(silhouette_k_values, sil_scores, marker='o', linestyle='--')
ax.set_xlabel('Number of Clusters (k)')
ax.set_ylabel('Silhouette Score')
ax.set_title('Silhouette Score for Optimal k')
plt.show()

Clustering

In [None]:
# Final clustering: 2-cluster K-Means on the first two principal components.
# The cluster id of every image is stored in df_pca['Cluster'].
kmeans = KMeans(n_clusters=2, random_state=42)
df_pca['Cluster'] = kmeans.fit_predict(df_pca[['PC1', 'PC2']])

# Scatter plot of the images in PC space, colored by assigned cluster
# (cast to str so the hue is treated as categorical).
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=df_pca,
    x='PC1',
    y='PC2',
    hue=df_pca['Cluster'].astype(str),
    palette='coolwarm',
    ax=ax,
)
ax.set_title('K-Means Clustering on PCA of Combined Image Features')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
plt.show()