In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import os
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# PyTorch imports
import torch
import torch.nn as nn
from torchvision import models, transforms

# Scikit-learn imports
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder
import joblib
import ast

print("BUSINESS LOGO DOMAIN CLASSIFICATION")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}\n")

BUSINESS LOGO DOMAIN CLASSIFICATION
PyTorch version: 2.6.0+cu124
CUDA available: True
Using device: cuda



In [13]:
csv_path = r"C:\BCIT\data_science\project\LogoPredictor\top10k_logos.csv"
df = pd.read_csv(csv_path)
logo_dir = r"C:\BCIT\data_science\project\logo_images"

print(f" Loaded {len(df)} samples")
print(f" Columns: {df.columns.tolist()}")

# Get list of available logo files
logo_files = os.listdir(logo_dir)
logo_files_set = set(logo_files)
print(f" Found {len(logo_files)} logo files")

def clean_name(name):
    if pd.isna(name):
        return ""
    name = str(name).lower().replace(' ', '_').replace('-', '_')
    return ''.join(c for c in name if c.isalnum() or c == '_')

df['clean_name'] = df['name'].apply(clean_name)


matched_rows = []
for logo_file in logo_files:
    logo_name_raw = os.path.splitext(logo_file)[0]

    # Many files may be like "12345_company_name", so drop leading ID part
    if '_' in logo_name_raw:
        logo_name = '_'.join(logo_name_raw.split('_', 1)[1:])
    else:
        logo_name = logo_name_raw
    
    logo_cleaned = clean_name(logo_name)
    
    # Find matching row in df
    matches = df[df['clean_name'] == logo_cleaned]
    if len(matches) > 0:
        row = matches.iloc[0].copy()
        row['logo_filename'] = logo_file
        matched_rows.append(row)

df_matched = pd.DataFrame(matched_rows)
print(f"✓ Matched {len(df_matched)} logos ({len(df_matched)/len(logo_files)*100:.1f}%)")

def parse_categories(x):
    """
    Convert the raw category_groups_list string into a Python list.
    Handles:
      - "['Software', 'AI']"   (Python-list style)
      - "Software, AI"        (comma-separated)
      - NaNs                  (returns [])
    """
    if pd.isna(x):
        return []
    
    x = str(x).strip()
    
    # Try to parse as Python list: "['A', 'B']"
    if x.startswith('[') and x.endswith(']'):
        try:
            parsed = ast.literal_eval(x)
            # Ensure list of strings
            return [str(c).strip() for c in parsed]
        except Exception:
            pass
    
    # Fallback: comma-separated string
    return [c.strip() for c in x.split(',') if c.strip()]

df_matched["parsed_categories"] = df_matched["category_groups_list"].apply(parse_categories)

cat_counter = Counter()
for cats in df_matched["parsed_categories"]:
    cat_counter.update(cats)

# Convert to Series for convenience
category_counts = pd.Series(cat_counter).sort_values(ascending=False)

print(f"✓ Total unique categories (multi-label): {len(category_counts)}")
print(f"✓ Total samples (logos with at least 1 matched file): {len(df_matched)}")

# Distribution of category frequencies
print("\nCategory frequency distribution (based on individual categories):")
freq_values = df_matched['category_groups_list'].value_counts()
print(f"  Categories with 1 sample:      {(freq_values == 1).sum()}")
print(f"  Categories with 2-4 samples:   {((freq_values >= 2) & (freq_values <= 4)).sum()}")
print(f"  Categories with 5-9 samples:   {((freq_values >= 5) & (freq_values <= 9)).sum()}")
print(f"  Categories with 10-19 samples: {((freq_values >= 10) & (freq_values < 20)).sum()}")
print(f"  Categories with 20+ samples:   {(freq_values >= 20).sum()}")

print("\nTop 20 most common categories:")
for i, (cat, count) in enumerate(category_counts.head(20).items(), 1):
    short_cat = cat[:70] + "..." if len(cat) > 70 else cat
    print(f"{i:3d}. [{count:4d}] {short_cat}")


 Loaded 10000 samples
 Columns: ['uuid', 'name', 'type', 'permalink', 'cb_url', 'rank', 'created_at', 'updated_at', 'legal_name', 'roles', 'domain', 'homepage_url', 'region', 'city', 'address', 'postal_code', 'status', 'short_description', 'num_funding_rounds', 'total_funding_usd', 'total_funding', 'total_funding_currency_code', 'founded_on', 'last_funding_on', 'closed_on', 'employee_count', 'email', 'phone', 'facebook_url', 'linkedin_url', 'twitter_url', 'state_code', 'logo_url', 'country_code', 'category_groups_list', 'category_list', 'new_logo_url']
 Found 9998 logo files
✓ Matched 9990 logos (99.9%)
✓ Total unique categories (multi-label): 47
✓ Total samples (logos with at least 1 matched file): 9990

Category frequency distribution (based on individual categories):
  Categories with 1 sample:      2861
  Categories with 2-4 samples:   819
  Categories with 5-9 samples:   200
  Categories with 10-19 samples: 75
  Categories with 20+ samples:   51

Top 20 most common categories:
  1

In [15]:
resnet = models.resnet18(pretrained=True)
resnet = nn.Sequential(*list(resnet.children())[:-1])
resnet = resnet.to(device)
resnet.eval()

transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

features_list = []
labels_list = []
failed = 0

print("Processing filtered logos...")
with torch.no_grad():
    for idx, row in df_filtered.iterrows():
        logo_path = os.path.join(logo_dir, row['logo_filename'])
        try:
            img = Image.open(logo_path).convert('RGB')
            img_tensor = transform(img).unsqueeze(0).to(device)
            feat = resnet(img_tensor).squeeze().cpu().numpy()
            
            features_list.append(feat)
            labels_list.append(row['category_groups_list'])
            
            if (len(features_list)) % 200 == 0:
                print(f"  {len(features_list)}/{len(df_filtered)}")
        except:
            failed += 1

X = np.array(features_list)
y = np.array(labels_list)

print(f" Extracted: {len(X)} samples (failed: {failed})")

Processing filtered logos...
  200/9990
  400/9990
  600/9990
  800/9990
  1000/9990
  1200/9990
  1400/9990
  1600/9990
  1800/9990
  2000/9990
  2200/9990
  2400/9990
  2600/9990
  2800/9990
  3000/9990
  3200/9990
  3400/9990
  3600/9990
  3800/9990
  4000/9990
  4200/9990
  4400/9990
  4600/9990
  4800/9990
  5000/9990
  5200/9990
  5400/9990
  5600/9990
  5800/9990
  6000/9990
  6200/9990
  6400/9990
  6600/9990
  6800/9990
  7000/9990
  7200/9990
  7400/9990
  7600/9990
  7800/9990
  8000/9990
  8200/9990
  8400/9990
  8600/9990
  8800/9990
  9000/9990
  9200/9990
  9400/9990
  9600/9990
  9800/9990
 Extracted: 9943 samples (failed: 47)


In [4]:
#Preparing data

le = LabelEncoder()
y_encoded = le.fit_transform(y)

print(f" {len(le.classes_)} category combinations retained")
print(f"  Samples per category: min={np.min(np.bincount(y_encoded))}, max={np.max(np.bincount(y_encoded))}, mean={np.mean(np.bincount(y_encoded)):.1f}")

# Scale
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Handle single-sample categories
category_counts_encoded = pd.Series(y_encoded).value_counts()
print(f"\nCategory distribution in encoded data:")
print(f"  Categories with 1 sample: {(category_counts_encoded == 1).sum()}")
print(f"  Categories with 2+ samples: {(category_counts_encoded >= 2).sum()}")

# Separate single-sample and multi-sample categories
single_sample_cats = category_counts_encoded[category_counts_encoded == 1].index
multi_sample_cats = category_counts_encoded[category_counts_encoded >= 2].index

single_mask = np.isin(y_encoded, single_sample_cats)
multi_mask = ~single_mask

print(f"\nSplit strategy:")
print(f"  Single-sample categories → all to training set: {single_mask.sum()} samples")
print(f"  Multi-sample categories → stratified split: {multi_mask.sum()} samples")

# Split only multi-sample data with stratification
if multi_mask.sum() > 0:
    X_multi = X_scaled[multi_mask]
    y_multi = y_encoded[multi_mask]
    
    X_train_multi, X_test, y_train_multi, y_test = train_test_split(
        X_multi, y_multi,
        test_size=0.20,
        random_state=42,
        stratify=y_multi
    )
    
    # Add single-sample data to training set
    if single_mask.sum() > 0:
        X_single = X_scaled[single_mask]
        y_single = y_encoded[single_mask]
        
        X_train = np.vstack([X_train_multi, X_single])
        y_train = np.concatenate([y_train_multi, y_single])
    else:
        X_train = X_train_multi
        y_train = y_train_multi
else:
    # All categories have single samples - cannot stratify
    print("    Cannot perform stratified split - using random split")
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y_encoded,
        test_size=0.20,
        random_state=42,
        stratify=None
    )

print(f"\n Train: {len(X_train)}, Test: {len(X_test)}")

 3994 category combinations retained
  Samples per category: min=1, max=444, mean=2.5

Category distribution in encoded data:
  Categories with 1 sample: 2856
  Categories with 2+ samples: 1138

Split strategy:
  Single-sample categories → all to training set: 2856 samples
  Multi-sample categories → stratified split: 7087 samples

 Train: 8525, Test: 1418


In [5]:
#Train Model

model = RandomForestClassifier(
    n_estimators=300,
    max_depth=25,
    min_samples_split=8,
    min_samples_leaf=3,
    max_features='sqrt',
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)

model.fit(X_train, y_train)
print("Training complete")

Training complete


In [6]:
#Evaluate

y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

train_acc = accuracy_score(y_train, y_pred_train)
test_acc = accuracy_score(y_test, y_pred_test)
test_f1 = f1_score(y_test, y_pred_test, average='weighted')

print(f"\n{'='*80}")
print("RESULTS")
print(f"{'='*80}")
print(f"Training Accuracy: {train_acc:.4f} ({train_acc*100:.2f}%)")
print(f"Testing Accuracy:  {test_acc:.4f} ({test_acc*100:.2f}%)")
print(f"F1-Score:          {test_f1:.4f}")
print(f"Overfitting Gap:   {(train_acc-test_acc)*100:.1f}%")

random_guess = 1.0 / len(le.classes_)
print(f"\nBaseline:")
print(f"  Random: {random_guess:.4f} ({random_guess*100:.2f}%)")
print(f"  Model:  {test_acc:.4f} ({test_acc*100:.2f}%)")
print(f"  Improvement: {test_acc/random_guess:.1f}x")

# Classification report
print(f"\n{'='*80}")
print("CLASSIFICATION REPORT")
print(f"{'='*80}")
print(classification_report(y_test, y_pred_test, target_names=le.classes_, digits=3))


RESULTS
Training Accuracy: 0.2256 (22.56%)
Testing Accuracy:  0.0007 (0.07%)
F1-Score:          0.0000
Overfitting Gap:   22.5%

Baseline:
  Random: 0.0003 (0.03%)
  Model:  0.0007 (0.07%)
  Improvement: 2.8x

CLASSIFICATION REPORT


ValueError: Number of classes, 1314, does not match size of target_names, 3994. Try specifying the labels parameter