In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import os
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# PyTorch imports
import torch
import torch.nn as nn
from torchvision import models, transforms

# Scikit-learn imports
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder
import joblib
import ast

print("BUSINESS LOGO DOMAIN CLASSIFICATION")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}\n")

BUSINESS LOGO DOMAIN CLASSIFICATION
PyTorch version: 2.6.0+cu124
CUDA available: True
Using device: cuda



In [31]:
csv_path = r"C:\BCIT\data_science\project\LogoPredictor\top10k_logos.csv"
df = pd.read_csv(csv_path)
logo_dir = r"C:\BCIT\data_science\project\logo_images"

print(f" Loaded {len(df)} samples")
print(f" Columns: {df.columns.tolist()}")

# Get list of available logo files
logo_files = os.listdir(logo_dir)
logo_files_set = set(logo_files)
print(f" Found {len(logo_files)} logo files")

def clean_name(name):
    if pd.isna(name):
        return ""
    name = str(name).lower().replace(' ', '_').replace('-', '_')
    return ''.join(c for c in name if c.isalnum() or c == '_')

df['clean_name'] = df['name'].apply(clean_name)


matched_rows = []
for logo_file in logo_files:
    logo_name_raw = os.path.splitext(logo_file)[0]

    # Many files may be like "12345_company_name", so drop leading ID part
    if '_' in logo_name_raw:
        logo_name = '_'.join(logo_name_raw.split('_', 1)[1:])
    else:
        logo_name = logo_name_raw
    
    logo_cleaned = clean_name(logo_name)
    
    # Find matching row in df
    matches = df[df['clean_name'] == logo_cleaned]
    if len(matches) > 0:
        row = matches.iloc[0].copy()
        row['logo_filename'] = logo_file
        matched_rows.append(row)

df_matched = pd.DataFrame(matched_rows)
print(f"✓ Matched {len(df_matched)} logos ({len(df_matched)/len(logo_files)*100:.1f}%)")

def parse_categories(x):
    """
    Convert the raw category_groups_list string into a Python list.
    Handles:
      - "['Software', 'AI']"   (Python-list style)
      - "Software, AI"        (comma-separated)
      - NaNs                  (returns [])
    """
    if pd.isna(x):
        return []
    
    x = str(x).strip()
    
    # Try to parse as Python list: "['A', 'B']"
    if x.startswith('[') and x.endswith(']'):
        try:
            parsed = ast.literal_eval(x)
            # Ensure list of strings
            return [str(c).strip() for c in parsed]
        except Exception:
            pass
    
    # Fallback: comma-separated string
    return [c.strip() for c in x.split(',') if c.strip()]

df_matched["parsed_categories"] = df_matched["category_groups_list"].apply(parse_categories)

df_filtered = df_matched[df_matched["parsed_categories"].map(len) > 0].copy()
df_filtered = df_filtered.reset_index(drop=True)
print(f" Logos with at least 1 category: {len(df_filtered)}")

cat_counter = Counter()
for cats in df_matched["parsed_categories"]:
    cat_counter.update(cats)

# Convert to Series for convenience
category_counts = pd.Series(cat_counter).sort_values(ascending=False)

print(f"✓ Total unique categories (multi-label): {len(category_counts)}")
print(f"✓ Total samples (logos with at least 1 matched file): {len(df_matched)}")

# Distribution of category frequencies
print("\nCategory frequency distribution (based on individual categories):")
freq_values = df_matched['category_groups_list'].value_counts()
print(f"  Categories with 1 sample:      {(freq_values == 1).sum()}")
print(f"  Categories with 2-4 samples:   {((freq_values >= 2) & (freq_values <= 4)).sum()}")
print(f"  Categories with 5-9 samples:   {((freq_values >= 5) & (freq_values <= 9)).sum()}")
print(f"  Categories with 10-19 samples: {((freq_values >= 10) & (freq_values < 20)).sum()}")
print(f"  Categories with 20+ samples:   {(freq_values >= 20).sum()}")

print("\nTop 20 most common categories:")
for i, (cat, count) in enumerate(category_counts.head(20).items(), 1):
    short_cat = cat[:70] + "..." if len(cat) > 70 else cat
    print(f"{i:3d}. [{count:4d}] {short_cat}")


 Loaded 10000 samples
 Columns: ['uuid', 'name', 'type', 'permalink', 'cb_url', 'rank', 'created_at', 'updated_at', 'legal_name', 'roles', 'domain', 'homepage_url', 'region', 'city', 'address', 'postal_code', 'status', 'short_description', 'num_funding_rounds', 'total_funding_usd', 'total_funding', 'total_funding_currency_code', 'founded_on', 'last_funding_on', 'closed_on', 'employee_count', 'email', 'phone', 'facebook_url', 'linkedin_url', 'twitter_url', 'state_code', 'logo_url', 'country_code', 'category_groups_list', 'category_list', 'new_logo_url']
 Found 9998 logo files
✓ Matched 9990 logos (99.9%)
 Logos with at least 1 category: 9990
✓ Total unique categories (multi-label): 47
✓ Total samples (logos with at least 1 matched file): 9990

Category frequency distribution (based on individual categories):
  Categories with 1 sample:      2861
  Categories with 2-4 samples:   819
  Categories with 5-9 samples:   200
  Categories with 10-19 samples: 75
  Categories with 20+ samples:   

In [32]:
resnet = models.resnet18(pretrained=True)
resnet = nn.Sequential(*list(resnet.children())[:-1])  # drop final FC layer
resnet = resnet.to(device)
resnet.eval()

transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]
    )
])

features_list = []
success_indices = []   # indices in df_filtered that succeeded
failed = 0

print("\nExtracting ResNet18 features...")
with torch.no_grad():
    for idx, row in df_filtered.iterrows():
        logo_path = os.path.join(logo_dir, row["logo_filename"])
        try:
            img = Image.open(logo_path).convert("RGB")
            img_tensor = transform(img).unsqueeze(0).to(device)  # [1,3,224,224]

            # [1,512,1,1] -> [512]
            feat = resnet(img_tensor).view(-1).cpu().numpy()
            features_list.append(feat)
            success_indices.append(idx)

            if len(features_list) % 200 == 0:
                print(f"  {len(features_list)}/{len(df_filtered)}")
        except Exception:
            failed += 1

X = np.stack(features_list)   # [num_successful_logos, 512]
df_features = df_filtered.loc[success_indices].reset_index(drop=True)

print(f"\nDone. Extracted {len(X)} features (failed: {failed})")
print(" Feature matrix shape X:", X.shape)
print(" df_features rows:", len(df_features))


Extracting ResNet18 features...
  200/9990
  400/9990
  600/9990
  800/9990
  1000/9990
  1200/9990
  1400/9990
  1600/9990
  1800/9990
  2000/9990
  2200/9990
  2400/9990
  2600/9990
  2800/9990
  3000/9990
  3200/9990
  3400/9990
  3600/9990
  3800/9990
  4000/9990
  4200/9990
  4400/9990
  4600/9990
  4800/9990
  5000/9990
  5200/9990
  5400/9990
  5600/9990
  5800/9990
  6000/9990
  6200/9990
  6400/9990
  6600/9990
  6800/9990
  7000/9990
  7200/9990
  7400/9990
  7600/9990
  7800/9990
  8000/9990
  8200/9990
  8400/9990
  8600/9990
  8800/9990
  9000/9990
  9200/9990
  9400/9990
  9600/9990
  9800/9990

Done. Extracted 9943 features (failed: 47)
 Feature matrix shape X: (9943, 512)
 df_features rows: 9943


In [33]:
tmp = df_features[["parsed_categories"]].copy()
tmp = tmp.explode("parsed_categories")   # one row per (logo, category)
tmp = tmp.rename(columns={"parsed_categories": "category"})
tmp = tmp.reset_index().rename(columns={"index": "logo_idx"})
# columns: ['logo_idx', 'category']
# logo_idx now runs from 0 .. len(df_features)-1 and matches X

logo_indices = tmp["logo_idx"].values          # which row in X to use
X_expanded = X[logo_indices]                   # [num_pairs, 512]
y = tmp["category"].values                     # [num_pairs]

print("\nAfter exploding into (logo, category) pairs:")
print(" X_expanded shape:", X_expanded.shape)
print(" y length:", len(y))
print(" Example pairs:")
for i in range(5):
    print(f"  logo_idx={logo_indices[i]}, category={y[i]}")


After exploding into (logo, category) pairs:
 X_expanded shape: (38012, 512)
 y length: 38012
 Example pairs:
  logo_idx=0, category=Financial Services
  logo_idx=0, category=Hardware
  logo_idx=0, category=Internet Services
  logo_idx=0, category=Lending and Investments
  logo_idx=0, category=Mobile


In [34]:
le = LabelEncoder()
y_encoded = le.fit_transform(y)

print(f"\n {len(le.classes_)} categories retained (single-label).")
binc = np.bincount(y_encoded)
print(f"  Samples per category: min={np.min(binc)}, max={np.max(binc)}, mean={np.mean(binc):.1f}")

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_expanded)

# Split by logo, so each logo is only in train OR test
unique_logos = np.unique(logo_indices)
print(f"\nUnique logos (after feature extraction): {len(unique_logos)}")

logo_train_ids, logo_test_ids = train_test_split(
    unique_logos,
    test_size=0.2,
    random_state=42
)

train_mask = np.isin(logo_indices, logo_train_ids)
test_mask  = np.isin(logo_indices, logo_test_ids)

X_train = X_scaled[train_mask]
y_train = y_encoded[train_mask]
logo_train_split = logo_indices[train_mask]

X_test  = X_scaled[test_mask]
y_test  = y_encoded[test_mask]
logo_test_split = logo_indices[test_mask]

print(f"\nTrain rows: {len(X_train)}, Test rows: {len(X_test)}")

print("\nTrain label distribution (first 10 classes):")
print(np.bincount(y_train)[:10])
print("Test label distribution (first 10 classes):")
print(np.bincount(y_test)[:10])



 47 categories retained (single-label).
  Samples per category: min=80, max=5052, mean=808.8

Unique logos (after feature extraction): 9943

Train rows: 30286, Test rows: 7726

Train label distribution (first 10 classes):
[ 382  226  125  592  864  632  136 1085  371  460]
Test label distribution (first 10 classes):
[104  53  34 156 218 142  27 280  93 130]


In [35]:
model = RandomForestClassifier(
    n_estimators=300,
    max_depth=25,
    min_samples_split=8,
    min_samples_leaf=3,
    max_features='sqrt',
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)

print("\nTraining RandomForestClassifier...")
model.fit(X_train, y_train)
print("Training complete.")



Training RandomForestClassifier...
Training complete.


In [36]:
y_pred_train = model.predict(X_train)
y_pred_test  = model.predict(X_test)

train_acc = accuracy_score(y_train, y_pred_train)
test_acc  = accuracy_score(y_test, y_pred_test)
test_f1   = f1_score(y_test, y_pred_test, average='weighted')

print(f"\n{'='*80}")
print("RESULTS (Standard single-label accuracy on (logo, category) pairs)")
print(f"{'='*80}")
print(f"Training Accuracy: {train_acc:.4f} ({train_acc*100:.2f}%)")
print(f"Testing Accuracy:  {test_acc:.4f} ({test_acc*100:.2f}%)")
print(f"F1-Score (weighted): {test_f1:.4f}")
print(f"Overfitting Gap:     {(train_acc-test_acc)*100:.1f}%")

random_guess = 1.0 / len(le.classes_)
print(f"\nBaseline:")
print(f"  Random: {random_guess:.4f} ({random_guess*100:.2f}%)")
print(f"  Model:  {test_acc:.4f} ({test_acc*100:.2f}%)")
print(f"  Improvement: {test_acc/random_guess:.1f}x")

# Classification report only on labels present in y_test
print(f"\n{'='*80}")
print("CLASSIFICATION REPORT")
print(f"{'='*80}")
classes_in_test = np.unique(y_test)
target_names = le.inverse_transform(classes_in_test)
print(
    classification_report(
        y_test,
        y_pred_test,
        labels=classes_in_test,
        target_names=target_names,
        digits=3
    )
)



RESULTS (Standard single-label accuracy on (logo, category) pairs)
Training Accuracy: 0.2584 (25.84%)
Testing Accuracy:  0.0325 (3.25%)
F1-Score (weighted): 0.0260
Overfitting Gap:     22.6%

Baseline:
  Random: 0.0213 (2.13%)
  Model:  0.0325 (3.25%)
  Improvement: 1.5x

CLASSIFICATION REPORT
                                  precision    recall  f1-score   support

         Administrative Services      0.016     0.048     0.024       104
                     Advertising      0.000     0.000     0.000        53
         Agriculture and Farming      0.033     0.176     0.056        34
                            Apps      0.033     0.096     0.049       156
         Artificial Intelligence      0.017     0.014     0.015       218
                   Biotechnology      0.037     0.437     0.068       142
            Clothing and Apparel      0.032     0.185     0.055        27
           Commerce and Shopping      0.030     0.007     0.012       280
         Community and Lifestyle     

In [45]:
print("\n=== MEMBERSHIP ACCURACY (PER LOGO, Train SET) ===")

train_unique_logos = np.unique(logo_train_split)
correct_logos = 0

for logo_idx in train_unique_logos:
    rows = np.where(logo_train_split == logo_idx)[0]
    preds = le.inverse_transform(y_pred_train[rows])
    true_cats = df_features.loc[logo_idx, "parsed_categories"]

    if any(pred in true_cats for pred in preds):
        correct_logos += 1

print(f"Train logo-level membership accuracy: {correct_logos}/{len(train_unique_logos)} = {correct_logos/len(train_unique_logos):.4f}")


print("\n=== MEMBERSHIP ACCURACY (PER LOGO, TEST SET) ===")

test_unique_logos = np.unique(logo_test_split)
correct_logos = 0

for logo_idx in test_unique_logos:
    rows = np.where(logo_test_split == logo_idx)[0]
    preds = le.inverse_transform(y_pred_test[rows])
    true_cats = df_features.loc[logo_idx, "parsed_categories"]

    if any(pred in true_cats for pred in preds):
        correct_logos += 1

print(f"Test logo-level membership accuracy: {correct_logos}/{len(test_unique_logos)} = {correct_logos/len(test_unique_logos):.4f}")



=== MEMBERSHIP ACCURACY (PER LOGO, Train SET) ===
Train logo-level membership accuracy: 7827/7954 = 0.9840

=== MEMBERSHIP ACCURACY (PER LOGO, TEST SET) ===
Test logo-level membership accuracy: 251/1989 = 0.1262
