In [1]:
# TPR6223 Pattern Recognition Group Project
# Automobile Brand Recognition System
# 
# Group: Byte Me
# Lab Section: 1B
# Group Leader: Do Wai Lung
# Members: 
#   1. Doris Heng
#   2. Eldeena Lim Huey Yinn  
#   3. Kong Yi Xuan
#
# This project recognizes automobile brands from logo images using machine learning
# We implemented all the techniques we learned in our labs this semester

In [3]:
import numpy as np
import cv2
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier  # Lab 8 method
from sklearn.svm import SVC  # Lab 10 method
from sklearn.neural_network import MLPClassifier  # Lab 11 method
from sklearn.ensemble import VotingClassifier  # Extra credit
import pandas as pd
import warnings
warnings.filterwarnings('ignore')  # Keep output clean

In [4]:
def preprocess_logo_image(image_path, size):
    """
    Load and preprocess an automobile logo image.
    We had to figure out how to handle different lighting and image sizes.
    
    Args:
        image_path: path to image file
        size: target size (width, height)
    
    Returns:
        processed image or None if failed
    """
    try:
        # Load image with OpenCV
        img = cv2.imread(image_path)
        if img is None:
            return None

        # Convert BGR to RGB (OpenCV uses BGR by default)
        if len(img.shape) == 3:
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        
        # Resize to standard size
        resized = cv2.resize(img, size, interpolation=cv2.INTER_AREA)
        
        # Improve contrast for different lighting conditions
        # We use CLAHE which we learned about in image processing
        if len(resized.shape) == 3:
            # Convert to LAB color space
            lab = cv2.cvtColor(resized, cv2.COLOR_RGB2LAB)
            l, a, b = cv2.split(lab)
            
            # Apply CLAHE to lightness channel
            clahe = cv2.createCLAHE(clipLimit=2.5, tileGridSize=(8,8))
            l_enhanced = clahe.apply(l)
            
            # Combine back to RGB
            lab_enhanced = cv2.merge((l_enhanced, a, b))
            enhanced = cv2.cvtColor(lab_enhanced, cv2.COLOR_LAB2RGB)
        else:
            clahe = cv2.createCLAHE(clipLimit=2.5, tileGridSize=(8,8))
            enhanced = clahe.apply(resized)
        
        # Convert to grayscale for our analysis
        if len(enhanced.shape) == 3:
            gray = cv2.cvtColor(enhanced, cv2.COLOR_RGB2GRAY)
        else:
            gray = enhanced
        
        # Small blur to reduce noise from phone cameras
        blurred = cv2.GaussianBlur(gray, (3, 3), 0)
        
        # Normalize to 0-1 range
        normalized = blurred.astype(np.float32) / 255.0
        
        return normalized
        
    except Exception as e:
        print(f"Error processing {image_path}: {e}")
        return None

In [5]:
def load_automobile_logos(folders, labels, img_size):
    """
    Load all our collected automobile logo images.
    We manually collected and cropped these photos as required.
    
    Args:
        folders: dict of brand -> folder path
        labels: dict of brand -> numeric label
        img_size: target image size
    
    Returns:
        images array, labels array, file paths
    """
    print("Loading our automobile logo dataset...")
    print("These are photos we collected ourselves around Melaka")
    
    images = []
    image_labels = []
    file_paths = []
    
    # Load each brand's images
    for brand, folder in folders.items():
        if not os.path.exists(folder):
            print(f"Warning: {folder} not found for {brand}")
            continue
        
        print(f"\nLoading {brand.upper()} logos from {folder}...")
        
        # Get all image files
        image_files = [f for f in os.listdir(folder) 
                      if f.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp'))]
        
        print(f"Found {len(image_files)} {brand} images")
        
        # Process each image
        loaded_count = 0
        for img_file in image_files:
            full_path = os.path.join(folder, img_file)
            processed = preprocess_logo_image(full_path, img_size)
            
            if processed is not None:
                images.append(processed)
                image_labels.append(labels[brand])
                file_paths.append(full_path)
                loaded_count += 1
        
        print(f"Successfully loaded {loaded_count} {brand} images")
    
    if len(images) == 0:
        print("ERROR: No images loaded!")
        return None, None, None
    
    # Convert to numpy arrays
    images_array = np.array(images)
    labels_array = np.array(image_labels)
    
    print(f"\nDataset summary:")
    print(f"Total images: {len(images_array)}")
    print(f"Image shape: {images_array.shape}")
    
    # Show distribution
    unique_labels, counts = np.unique(labels_array, return_counts=True)
    brand_names = {v: k for k, v in labels.items()}
    print("\nBrand distribution:")
    for label, count in zip(unique_labels, counts):
        print(f"  {brand_names[label].upper()}: {count} images")
    
    return images_array, labels_array, file_paths

In [6]:
def balance_dataset_with_augmentation(images, labels):
    """
    Balance our dataset by adding augmented images.
    This helps prevent bias toward brands with more photos.
    
    Args:
        images: original images
        labels: corresponding labels
    
    Returns:
        balanced images and labels
    """
    print("\nBalancing dataset with augmentation...")
    
    unique_labels, counts = np.unique(labels, return_counts=True)
    brand_names = {0: 'Toyota', 1: 'Honda', 2: 'Mazda', 3: 'Perodua'}
    
    print("Original distribution:")
    for label, count in zip(unique_labels, counts):
        print(f"  {brand_names[label]}: {count} images")
    
    # Set target number per brand
    max_count = max(counts)
    target = min(max_count + 15, 160)  # Not too many to avoid overfitting
    
    print(f"Target per brand: {target} images")
    
    balanced_images = []
    balanced_labels = []
    
    for label in unique_labels:
        # Get images for this brand
        brand_mask = labels == label
        brand_images = images[brand_mask]
        current_count = len(brand_images)
        
        print(f"\nProcessing {brand_names[label]}...")
        
        # Add all original images
        for img in brand_images:
            balanced_images.append(img)
            balanced_labels.append(label)
        
        # Add augmented images if needed
        if current_count < target:
            needed = target - current_count
            print(f"  Adding {needed} augmented images")
            
            for i in range(needed):
                # Pick random original image
                base_img = brand_images[i % current_count]
                
                # Create augmented version
                augmented = simple_augment(base_img)
                if augmented is not None:
                    balanced_images.append(augmented)
                    balanced_labels.append(label)
        
        final_count = len([l for l in balanced_labels if l == label])
        print(f"  Final {brand_names[label]} count: {final_count}")
    
    return np.array(balanced_images), np.array(balanced_labels)

In [7]:
def simple_augment(image):
    """
    Apply simple augmentation to an image.
    We keep changes small to preserve logo recognition.
    
    Args:
        image: input image
    
    Returns:
        augmented image
    """
    try:
        h, w = image.shape
        
        # Small rotation (-5 to +5 degrees)
        angle = np.random.uniform(-5, 5)
        center = (w//2, h//2)
        M = cv2.getRotationMatrix2D(center, angle, 1.0)
        rotated = cv2.warpAffine(image, M, (w, h), borderMode=cv2.BORDER_REFLECT)
        
        # Slight brightness change (0.85 to 1.15 times original)
        brightness = np.random.uniform(0.85, 1.15)
        adjusted = np.clip(rotated * brightness, 0, 1)
        
        # Small scaling (0.95 to 1.05 times original)
        scale = np.random.uniform(0.95, 1.05)
        new_h, new_w = int(h * scale), int(w * scale)
        scaled = cv2.resize(adjusted, (new_w, new_h))
        
        # Crop or pad back to original size
        if new_h >= h and new_w >= w:
            # Crop from center
            start_y = (new_h - h) // 2
            start_x = (new_w - w) // 2
            final = scaled[start_y:start_y+h, start_x:start_x+w]
        else:
            # Pad to original size
            pad_y = max(0, (h - new_h) // 2)
            pad_x = max(0, (w - new_w) // 2)
            final = np.pad(scaled, ((pad_y, h-new_h-pad_y), (pad_x, w-new_w-pad_x)), mode='edge')
            if final.shape != (h, w):
                final = cv2.resize(final, (w, h))
        
        return final.astype(np.float32)
        
    except:
        return None

In [8]:
def extract_features_pca_lda(X_train, X_test, y_train):
    """
    Extract features using PCA and LDA methods from our labs.
    
    Args:
        X_train: training data (flattened images)
        X_test: test data (flattened images)
        y_train: training labels
    
    Returns:
        dictionary with all feature sets and models
    """
    print("Extracting features using PCA and LDA...")
    
    # Step 1: Standardize the data
    print("  Standardizing data...")
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Step 2: Apply PCA (Lab 6 method)
    print("  Applying PCA (Lab 6)...")
    
    # Find number of components for 92% variance
    pca_full = PCA(random_state=42)
    pca_full.fit(X_train_scaled)
    
    cumvar = np.cumsum(pca_full.explained_variance_ratio_)
    n_components = np.argmax(cumvar >= 0.92) + 1
    
    print(f"    Using {n_components} components for 92% variance")
    
    # Apply PCA
    pca = PCA(n_components=n_components, whiten=True, random_state=42)
    pca.fit(X_train_scaled)
    
    X_train_pca = pca.transform(X_train_scaled)
    X_test_pca = pca.transform(X_test_scaled)
    
    print(f"    PCA: {X_train_scaled.shape[1]} → {n_components} features")
    
    # Step 3: Apply LDA (Lab 7 method)
    print("  Applying LDA (Lab 7)...")
    
    # LDA can create max (n_classes - 1) features
    n_classes = len(np.unique(y_train))
    lda_components = n_classes - 1  # 3 for our 4 brands
    
    lda = LinearDiscriminantAnalysis(n_components=lda_components, solver='svd')
    lda.fit(X_train_scaled, y_train)
    
    X_train_lda = lda.transform(X_train_scaled)
    X_test_lda = lda.transform(X_test_scaled)
    
    print(f"    LDA: {X_train_scaled.shape[1]} → {lda_components} features")
    
    return {
        'scaler': scaler,
        'pca': pca,
        'lda': lda,
        'X_train_pca': X_train_pca,
        'X_test_pca': X_test_pca,
        'X_train_lda': X_train_lda,
        'X_test_lda': X_test_lda,
        'n_pca_components': n_components
    }

In [9]:
def train_all_models(features, y_train, y_test):
    """
    Train all classification models from our labs.
    
    Args:
        features: extracted features dictionary
        y_train: training labels
        y_test: test labels
    
    Returns:
        dictionary with all model results
    """
    print("Training all classification models...")
    
    # Get feature sets
    X_train_pca = features['X_train_pca']
    X_test_pca = features['X_test_pca']
    X_train_lda = features['X_train_lda']
    X_test_lda = features['X_test_lda']
    
    # Cross-validation setup
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    
    results = {}
    
    # k-NN models (Lab 8)
    print("\n  Training k-NN models (Lab 8)...")
    
    # k-NN with PCA
    knn_pca = KNeighborsClassifier(n_neighbors=5, weights='distance')
    knn_pca.fit(X_train_pca, y_train)
    knn_pca_pred = knn_pca.predict(X_test_pca)
    knn_pca_cv = cross_val_score(knn_pca, X_train_pca, y_train, cv=cv)
    
    results['knn_pca'] = {
        'model': knn_pca,
        'feature_type': 'PCA',
        'test_accuracy': accuracy_score(y_test, knn_pca_pred),
        'cv_mean': knn_pca_cv.mean(),
        'cv_std': knn_pca_cv.std(),
        'predictions': knn_pca_pred
    }
    print(f"    k-NN + PCA: {results['knn_pca']['test_accuracy']:.3f}")
    
    # k-NN with LDA
    knn_lda = KNeighborsClassifier(n_neighbors=7, weights='distance')
    knn_lda.fit(X_train_lda, y_train)
    knn_lda_pred = knn_lda.predict(X_test_lda)
    knn_lda_cv = cross_val_score(knn_lda, X_train_lda, y_train, cv=cv)
    
    results['knn_lda'] = {
        'model': knn_lda,
        'feature_type': 'LDA',
        'test_accuracy': accuracy_score(y_test, knn_lda_pred),
        'cv_mean': knn_lda_cv.mean(),
        'cv_std': knn_lda_cv.std(),
        'predictions': knn_lda_pred
    }
    print(f"    k-NN + LDA: {results['knn_lda']['test_accuracy']:.3f}")
    
    # SVM models (Lab 10)
    print("\n  Training SVM models (Lab 10)...")
    
    # Linear SVM with PCA
    svm_linear_pca = SVC(kernel='linear', C=5.0, random_state=42)
    svm_linear_pca.fit(X_train_pca, y_train)
    svm_linear_pca_pred = svm_linear_pca.predict(X_test_pca)
    svm_linear_pca_cv = cross_val_score(svm_linear_pca, X_train_pca, y_train, cv=cv)
    
    results['svm_linear_pca'] = {
        'model': svm_linear_pca,
        'feature_type': 'PCA',
        'test_accuracy': accuracy_score(y_test, svm_linear_pca_pred),
        'cv_mean': svm_linear_pca_cv.mean(),
        'cv_std': svm_linear_pca_cv.std(),
        'predictions': svm_linear_pca_pred
    }
    print(f"    Linear SVM + PCA: {results['svm_linear_pca']['test_accuracy']:.3f}")
    
    # RBF SVM with PCA
    svm_rbf_pca = SVC(kernel='rbf', C=100, gamma='scale', random_state=42)
    svm_rbf_pca.fit(X_train_pca, y_train)
    svm_rbf_pca_pred = svm_rbf_pca.predict(X_test_pca)
    svm_rbf_pca_cv = cross_val_score(svm_rbf_pca, X_train_pca, y_train, cv=cv)
    
    results['svm_rbf_pca'] = {
        'model': svm_rbf_pca,
        'feature_type': 'PCA',
        'test_accuracy': accuracy_score(y_test, svm_rbf_pca_pred),
        'cv_mean': svm_rbf_pca_cv.mean(),
        'cv_std': svm_rbf_pca_cv.std(),
        'predictions': svm_rbf_pca_pred
    }
    print(f"    RBF SVM + PCA: {results['svm_rbf_pca']['test_accuracy']:.3f}")
    
    # Linear SVM with LDA
    svm_linear_lda = SVC(kernel='linear', C=1.0, random_state=42)
    svm_linear_lda.fit(X_train_lda, y_train)
    svm_linear_lda_pred = svm_linear_lda.predict(X_test_lda)
    svm_linear_lda_cv = cross_val_score(svm_linear_lda, X_train_lda, y_train, cv=cv)
    
    results['svm_linear_lda'] = {
        'model': svm_linear_lda,
        'feature_type': 'LDA',
        'test_accuracy': accuracy_score(y_test, svm_linear_lda_pred),
        'cv_mean': svm_linear_lda_cv.mean(),
        'cv_std': svm_linear_lda_cv.std(),
        'predictions': svm_linear_lda_pred
    }
    print(f"    Linear SVM + LDA: {results['svm_linear_lda']['test_accuracy']:.3f}")
    
    # Neural Network models (Lab 11)
    print("\n  Training Neural Networks (Lab 11)...")
    
    # MLP with PCA
    mlp_pca = MLPClassifier(
        hidden_layer_sizes=(100, 50),
        activation='relu',
        solver='adam',
        alpha=0.01,
        learning_rate='adaptive',
        max_iter=500,
        early_stopping=True,
        validation_fraction=0.1,
        random_state=42
    )
    mlp_pca.fit(X_train_pca, y_train)
    mlp_pca_pred = mlp_pca.predict(X_test_pca)
    mlp_pca_cv = cross_val_score(mlp_pca, X_train_pca, y_train, cv=cv)
    
    results['mlp_pca'] = {
        'model': mlp_pca,
        'feature_type': 'PCA',
        'test_accuracy': accuracy_score(y_test, mlp_pca_pred),
        'cv_mean': mlp_pca_cv.mean(),
        'cv_std': mlp_pca_cv.std(),
        'predictions': mlp_pca_pred
    }
    print(f"    Neural Network + PCA: {results['mlp_pca']['test_accuracy']:.3f}")
    
    # MLP with LDA
    mlp_lda = MLPClassifier(
        hidden_layer_sizes=(20, 10),
        activation='relu',
        solver='adam',
        alpha=0.01,
        max_iter=300,
        early_stopping=True,
        validation_fraction=0.1,
        random_state=42
    )
    mlp_lda.fit(X_train_lda, y_train)
    mlp_lda_pred = mlp_lda.predict(X_test_lda)
    mlp_lda_cv = cross_val_score(mlp_lda, X_train_lda, y_train, cv=cv)
    
    results['mlp_lda'] = {
        'model': mlp_lda,
        'feature_type': 'LDA',
        'test_accuracy': accuracy_score(y_test, mlp_lda_pred),
        'cv_mean': mlp_lda_cv.mean(),
        'cv_std': mlp_lda_cv.std(),
        'predictions': mlp_lda_pred
    }
    print(f"    Neural Network + LDA: {results['mlp_lda']['test_accuracy']:.3f}")
    
    return results

In [10]:
def create_ensemble(model_results, features, y_train, y_test):
    """
    Create ensemble from best models (extra credit attempt).
    
    Args:
        model_results: individual model results
        features: feature data
        y_train: training labels
        y_test: test labels
    
    Returns:
        ensemble results dictionary
    """
    print("\n  Creating ensemble (extra credit)...")
    
    # Select models with reasonable performance
    good_models = []
    
    for name, result in model_results.items():
        cv_score = result['cv_mean']
        test_score = result['test_accuracy']
        gap = abs(test_score - cv_score)
        
        # Include if decent performance and not too much overfitting
        if cv_score > 0.3 and test_score > 0.3 and gap < 0.3:
            good_models.append((name, result))
    
    if len(good_models) >= 2:
        # Use top 3 models for ensemble
        ensemble_models = [(name, result['model']) for name, result in good_models[:3]]
        ensemble = VotingClassifier(estimators=ensemble_models, voting='hard')
        
        # Train on PCA features
        X_train_pca = features['X_train_pca']
        X_test_pca = features['X_test_pca']
        
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        ensemble_cv = cross_val_score(ensemble, X_train_pca, y_train, cv=cv)
        
        ensemble.fit(X_train_pca, y_train)
        ensemble_pred = ensemble.predict(X_test_pca)
        ensemble_accuracy = accuracy_score(y_test, ensemble_pred)
        
        print(f"    Ensemble: {ensemble_accuracy:.3f}")
        
        return {
            'ensemble': {
                'model': ensemble,
                'feature_type': 'PCA',
                'test_accuracy': ensemble_accuracy,
                'cv_mean': ensemble_cv.mean(),
                'cv_std': ensemble_cv.std(),
                'predictions': ensemble_pred
            }
        }
    else:
        print("    Not enough good models for ensemble")
        return {}

In [19]:
def analyze_results(all_results):
    """
    Analyze and display all results clearly.
    
    Args:
        all_results: dictionary of all model results
    
    Returns:
        pandas DataFrame with organized results
    """
    print("\n" + "="*60)
    print("FINAL RESULTS ANALYSIS")
    print("="*60)
    
    # Organize results
    results_data = []
    for model_name, result in all_results.items():
        if 'cv_mean' in result:
            # Calculate overfitting gap
            gap = abs(result['test_accuracy'] - result['cv_mean'])
            
            # Determine generalization quality
            if gap < 0.1:
                quality = 'Excellent'
            elif gap < 0.2:
                quality = 'Good'
            else:
                quality = 'Poor'
            
            results_data.append({
                'Model': model_name,
                'Features': result['feature_type'],
                'CV_Score': result['cv_mean'],
                'Test_Score': result['test_accuracy'],
                'Gap': gap,
                'Generalization': quality
            })
    
    # Create and sort results table
    df = pd.DataFrame(results_data)
    df = df.sort_values('Test_Score', ascending=False)
    
    print("Model Performance Summary:")
    print("-" * 60)
    for _, row in df.iterrows():
        print(f"{row['Model']:15} | {row['Features']:3} | "
              f"CV: {row['CV_Score']:.3f} | Test: {row['Test_Score']:.3f} | "
              f"Gap: {row['Gap']:.3f} | {row['Generalization']}")
    
    # Find best model
    good_models = df[df['Generalization'].isin(['Excellent', 'Good'])]
    if len(good_models) > 0:
        best = good_models.iloc[0]
        print(f"\n BEST MODEL: {best['Model']}")
        print(f"   Test Accuracy: {best['Test_Score']:.3f} ({best['Test_Score']*100:.1f}%)")
        print(f"   CV Score: {best['CV_Score']:.3f}")
        print(f"   Generalization: {best['Generalization']}")
        
        # Performance interpretation
        if best['Test_Score'] >= 0.7:
            print("   Excellent performance for logo recognition.")
        elif best['Test_Score'] >= 0.5:
            print("   Good performance")
        else:
            print("   Decent performance")
    
    return df

In [21]:
def show_confusion_matrix(all_results, y_test):
    """
    Show confusion matrix for best model.
    
    Args:
        all_results: all model results
        y_test: true test labels
    """
    # Find best model
    best_model = None
    best_score = 0
    
    for name, result in all_results.items():
        if 'test_accuracy' in result:
            gap = abs(result['test_accuracy'] - result['cv_mean'])
            if result['test_accuracy'] > best_score and gap < 0.25:
                best_score = result['test_accuracy']
                best_model = name
    
    if best_model:
        print(f"\nConfusion Matrix - {best_model}")
        print("-" * 40)
        
        predictions = all_results[best_model]['predictions']
        cm = confusion_matrix(y_test, predictions)
        
        brands = ['Toyota', 'Honda', 'Mazda', 'Perodua']
        
        # Print confusion matrix
        print("        Predicted:")
        print("        ", "  ".join(f"{b[:6]:>6}" for b in brands))
        print("Actual:")
        for i, actual_brand in enumerate(brands):
            row = f"{actual_brand[:6]:>6}: "
            for j in range(len(brands)):
                row += f"{cm[i][j]:>6}  "
            print(row)
        
        # Calculate per-brand accuracy
        print("\nPer-brand accuracy:")
        for i, brand in enumerate(brands):
            if cm[i].sum() > 0:
                acc = cm[i][i] / cm[i].sum()
                print(f"  {brand}: {acc:.3f} ({acc*100:.1f}%)")

In [23]:
def main_project():
    """
    Main function that runs our automobile brand recognition project.
    This implements all requirements and follows our lab exercises.
    """
    print("="*80)
    print("AUTOMOBILE BRAND RECOGNITION SYSTEM")
    print("TPR6223 Pattern Recognition Group Project")
    print("")
    print("Group: Byte Me (Lab Section 1B)")
    print("Leader: Do Wai Lung")
    print("Members: Doris Heng, Eldeena Lim Huey Yinn, Kong Yi Xuan")
    print("="*80)
    
    # Project settings
    image_size = (128, 128)
    
    # Brand mapping
    brands = {
        'toyota': 0,
        'honda': 1, 
        'mazda': 2,
        'perodua': 3
    }
    
    # Our data folders
    folders = {
        'toyota': "Project/Manual_Cropped_Logo/Toyota",
        'honda': "Project/Manual_Cropped_Logo/Honda",
        'mazda': "Project/Manual_Cropped_Logo/Mazda", 
        'perodua': "Project/Manual_Cropped_Logo/Perodua"
    }
    
    print("\nSTEP 1: Loading automobile logo dataset")
    print("-" * 50)
    
    # Load our collected images
    images, labels, file_paths = load_automobile_logos(folders, brands, image_size)
    
    if images is None:
        print("ERROR: Failed to load dataset!")
        return
    
    print("\nSTEP 2: Balancing dataset")
    print("-" * 50)
    
    # Balance dataset with augmentation
    balanced_images, balanced_labels = balance_dataset_with_augmentation(images, labels)
    
    print("\nSTEP 3: Train-test split (60/40)")
    print("-" * 50)
    
    # Flatten images for sklearn
    num_pixels = image_size[0] * image_size[1]
    X_flat = balanced_images.reshape(len(balanced_images), num_pixels)
    
    # Split data as required (60% train, 40% test)
    X_train, X_test, y_train, y_test = train_test_split(
        X_flat, balanced_labels,
        test_size=0.4,
        random_state=42,
        stratify=balanced_labels
    )
    
    print(f"Training set: {X_train.shape[0]} images")
    print(f"Testing set: {X_test.shape[0]} images")
    
    # Show distribution
    train_unique, train_counts = np.unique(y_train, return_counts=True)
    test_unique, test_counts = np.unique(y_test, return_counts=True)
    brand_names = ['Toyota', 'Honda', 'Mazda', 'Perodua']
    
    print("\nTrain distribution:")
    for label, count in zip(train_unique, train_counts):
        print(f"  {brand_names[label]}: {count}")
    
    print("Test distribution:")
    for label, count in zip(test_unique, test_counts):
        print(f"  {brand_names[label]}: {count}")
    
    print("\nSTEP 4: Feature extraction")
    print("-" * 50)
    
    # Extract features using PCA and LDA
    features = extract_features_pca_lda(X_train, X_test, y_train)
    
    print("\nSTEP 5: Training classifiers")
    print("-" * 50)
    
    # Train all required models
    model_results = train_all_models(features, y_train, y_test)
    
    print("\nSTEP 6: Creating ensemble")
    print("-" * 50)
    
    # Try ensemble approach
    ensemble_results = create_ensemble(model_results, features, y_train, y_test)
    
    # Combine all results
    all_results = {**model_results, **ensemble_results}
    
    print("\nSTEP 7: Results analysis")
    print("-" * 50)
    
    # Analyze all results
    results_df = analyze_results(all_results)
    
    return {
        'results': all_results,
        'features': features,
        'results_df': results_df,
        'y_test': y_test
    }

In [25]:
# Run the project
if __name__ == "__main__":
    print("Starting Group Byte Me's automobile recognition project...")
    print("Make sure image folders are set up correctly!")
    
    # Execute main project
    project_data = main_project()

Starting Group Byte Me's automobile recognition project...
Make sure image folders are set up correctly!
AUTOMOBILE BRAND RECOGNITION SYSTEM
TPR6223 Pattern Recognition Group Project

Group: Byte Me (Lab Section 1B)
Leader: Do Wai Lung
Members: Doris Heng, Eldeena Lim Huey Yinn, Kong Yi Xuan

STEP 1: Loading automobile logo dataset
--------------------------------------------------
Loading our automobile logo dataset...
These are photos we collected ourselves around Melaka

Loading TOYOTA logos from Project/Manual_Cropped_Logo/Toyota...
Found 145 toyota images
Successfully loaded 145 toyota images

Loading HONDA logos from Project/Manual_Cropped_Logo/Honda...
Found 156 honda images
Successfully loaded 156 honda images

Loading MAZDA logos from Project/Manual_Cropped_Logo/Mazda...
Found 130 mazda images
Successfully loaded 130 mazda images

Loading PERODUA logos from Project/Manual_Cropped_Logo/Perodua...
Found 100 perodua images
Successfully loaded 100 perodua images

Dataset summary:


In [29]:
import automobile_code as core

if 'project_data' in locals():
    core.save_trained_models(project_data)
    print("Models saved!")

Saving trained models to saved_models/...
✅ All models saved successfully!
Best model: svm_rbf_pca (66.4% accuracy)
Models saved!
