In [59]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import os
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# PyTorch imports
import torch
import torch.nn as nn
from torchvision import models, transforms

# Scikit-learn imports
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder
import joblib
import ast

print("BUSINESS LOGO DOMAIN CLASSIFICATION")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}\n")

BUSINESS LOGO DOMAIN CLASSIFICATION
PyTorch version: 2.6.0+cu124
CUDA available: True
Using device: cuda



In [52]:
csv_path = r"C:\BCIT\data_science\project\LogoPredictor\top10k_logos.csv"
df = pd.read_csv(csv_path)
logo_dir = r"C:\BCIT\data_science\project\logo_images"

print(f" Loaded {len(df)} samples")
print(f" Columns: {df.columns.tolist()}")

# Get list of available logo files
logo_files = os.listdir(logo_dir)
logo_files_set = set(logo_files)
print(f" Found {len(logo_files)} logo files")

def clean_name(name):
    if pd.isna(name):
        return ""
    name = str(name).lower().replace(' ', '_').replace('-', '_')
    return ''.join(c for c in name if c.isalnum() or c == '_')

df['clean_name'] = df['name'].apply(clean_name)


matched_rows = []
for logo_file in logo_files:
    logo_name_raw = os.path.splitext(logo_file)[0]

    # Many files may be like "12345_company_name", so drop leading ID part
    if '_' in logo_name_raw:
        logo_name = '_'.join(logo_name_raw.split('_', 1)[1:])
    else:
        logo_name = logo_name_raw
    
    logo_cleaned = clean_name(logo_name)
    
    # Find matching row in df
    matches = df[df['clean_name'] == logo_cleaned]
    if len(matches) > 0:
        row = matches.iloc[0].copy()
        row['logo_filename'] = logo_file
        matched_rows.append(row)

df_matched = pd.DataFrame(matched_rows)
print(f"✓ Matched {len(df_matched)} logos ({len(df_matched)/len(logo_files)*100:.1f}%)")

def parse_categories(x):
    """
    Convert the raw category_groups_list string into a Python list.
    Handles:
      - "['Software', 'AI']"   (Python-list style)
      - "Software, AI"        (comma-separated)
      - NaNs                  (returns [])
    """
    if pd.isna(x):
        return []
    
    x = str(x).strip()
    
    # Try to parse as Python list: "['A', 'B']"
    if x.startswith('[') and x.endswith(']'):
        try:
            parsed = ast.literal_eval(x)
            # Ensure list of strings
            return [str(c).strip() for c in parsed]
        except Exception:
            pass
    
    # Fallback: comma-separated string
    return [c.strip() for c in x.split(',') if c.strip()]

df_matched["parsed_categories"] = df_matched["category_groups_list"].apply(parse_categories)

df_filtered = df_matched[df_matched["parsed_categories"].map(len) > 0].copy()
df_filtered = df_filtered.reset_index(drop=True)
print(f" Logos with at least 1 category: {len(df_filtered)}")

cat_counter = Counter()
for cats in df_matched["parsed_categories"]:
    cat_counter.update(cats)

# Convert to Series for convenience
category_counts = pd.Series(cat_counter).sort_values(ascending=False)

print(f"✓ Total unique categories (multi-label): {len(category_counts)}")
print(f"✓ Total samples (logos with at least 1 matched file): {len(df_matched)}")

# Distribution of category frequencies
print("\nCategory frequency distribution (based on individual categories):")
freq_values = df_matched['category_groups_list'].value_counts()
print(f"  Categories with 1 sample:      {(freq_values == 1).sum()}")
print(f"  Categories with 2-4 samples:   {((freq_values >= 2) & (freq_values <= 4)).sum()}")
print(f"  Categories with 5-9 samples:   {((freq_values >= 5) & (freq_values <= 9)).sum()}")
print(f"  Categories with 10-19 samples: {((freq_values >= 10) & (freq_values < 20)).sum()}")
print(f"  Categories with 20+ samples:   {(freq_values >= 20).sum()}")

print("\nTop 20 most common categories:")
for i, (cat, count) in enumerate(category_counts.head(20).items(), 1):
    short_cat = cat[:70] + "..." if len(cat) > 70 else cat
    print(f"{i:3d}. [{count:4d}] {short_cat}")


 Loaded 10000 samples
 Columns: ['uuid', 'name', 'type', 'permalink', 'cb_url', 'rank', 'created_at', 'updated_at', 'legal_name', 'roles', 'domain', 'homepage_url', 'region', 'city', 'address', 'postal_code', 'status', 'short_description', 'num_funding_rounds', 'total_funding_usd', 'total_funding', 'total_funding_currency_code', 'founded_on', 'last_funding_on', 'closed_on', 'employee_count', 'email', 'phone', 'facebook_url', 'linkedin_url', 'twitter_url', 'state_code', 'logo_url', 'country_code', 'category_groups_list', 'category_list', 'new_logo_url']
 Found 9998 logo files
✓ Matched 9990 logos (99.9%)
 Logos with at least 1 category: 9990
✓ Total unique categories (multi-label): 47
✓ Total samples (logos with at least 1 matched file): 9990

Category frequency distribution (based on individual categories):
  Categories with 1 sample:      2861
  Categories with 2-4 samples:   819
  Categories with 5-9 samples:   200
  Categories with 10-19 samples: 75
  Categories with 20+ samples:   

In [53]:
resnet = models.resnet18(pretrained=True)
resnet = nn.Sequential(*list(resnet.children())[:-1])  # drop final FC layer
resnet = resnet.to(device)
resnet.eval()

transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]
    )
])

features_list = []
success_indices = []   # indices in df_filtered that succeeded
failed = 0

print("\nExtracting ResNet18 features...")
with torch.no_grad():
    for idx, row in df_filtered.iterrows():
        logo_path = os.path.join(logo_dir, row["logo_filename"])
        try:
            img = Image.open(logo_path).convert("RGB")
            img_tensor = transform(img).unsqueeze(0).to(device)  # [1,3,224,224]

            # [1,512,1,1] -> [512]
            feat = resnet(img_tensor).view(-1).cpu().numpy()
            features_list.append(feat)
            success_indices.append(idx)

            if len(features_list) % 200 == 0:
                print(f"  {len(features_list)}/{len(df_filtered)}")
        except Exception:
            failed += 1

X = np.stack(features_list)   # [num_successful_logos, 512]
df_features = df_filtered.loc[success_indices].reset_index(drop=True)

print(f"\nDone. Extracted {len(X)} features (failed: {failed})")
print(" Feature matrix shape X:", X.shape)
print(" df_features rows:", len(df_features))


Extracting ResNet18 features...
  200/9990
  400/9990
  600/9990
  800/9990
  1000/9990
  1200/9990
  1400/9990
  1600/9990
  1800/9990
  2000/9990
  2200/9990
  2400/9990
  2600/9990
  2800/9990
  3000/9990
  3200/9990
  3400/9990
  3600/9990
  3800/9990
  4000/9990
  4200/9990
  4400/9990
  4600/9990
  4800/9990
  5000/9990
  5200/9990
  5400/9990
  5600/9990
  5800/9990
  6000/9990
  6200/9990
  6400/9990
  6600/9990
  6800/9990
  7000/9990
  7200/9990
  7400/9990
  7600/9990
  7800/9990
  8000/9990
  8200/9990
  8400/9990
  8600/9990
  8800/9990
  9000/9990
  9200/9990
  9400/9990
  9600/9990
  9800/9990

Done. Extracted 9943 features (failed: 47)
 Feature matrix shape X: (9943, 512)
 df_features rows: 9943


In [54]:
tmp = df_features[["parsed_categories"]].copy()
tmp = tmp.explode("parsed_categories")   # one row per (logo, category)
tmp = tmp.rename(columns={"parsed_categories": "category"})
tmp = tmp.reset_index().rename(columns={"index": "logo_idx"})
# columns: ['logo_idx', 'category']
# logo_idx now runs from 0 .. len(df_features)-1 and matches X

logo_indices = tmp["logo_idx"].values          # which row in X to use
X_expanded = X[logo_indices]                   # [num_pairs, 512]
y = tmp["category"].values                     # [num_pairs]

print("\nAfter exploding into (logo, category) pairs:")
print(" X_expanded shape:", X_expanded.shape)
print(" y length:", len(y))
print(" Example pairs:")
for i in range(5):
    print(f"  logo_idx={logo_indices[i]}, category={y[i]}")


After exploding into (logo, category) pairs:
 X_expanded shape: (38012, 512)
 y length: 38012
 Example pairs:
  logo_idx=0, category=Financial Services
  logo_idx=0, category=Hardware
  logo_idx=0, category=Internet Services
  logo_idx=0, category=Lending and Investments
  logo_idx=0, category=Mobile


In [55]:
le = LabelEncoder()
y_encoded = le.fit_transform(y)

print(f"\n {len(le.classes_)} categories retained (single-label).")
binc = np.bincount(y_encoded)
print(f"  Samples per category: min={np.min(binc)}, max={np.max(binc)}, mean={np.mean(binc):.1f}")

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_expanded)

# Split by logo, so each logo is only in train OR test
unique_logos = np.unique(logo_indices)
print(f"\nUnique logos (after feature extraction): {len(unique_logos)}")

logo_train_ids, logo_test_ids = train_test_split(
    unique_logos,
    test_size=0.2,
    random_state=42
)

train_mask = np.isin(logo_indices, logo_train_ids)
test_mask  = np.isin(logo_indices, logo_test_ids)

X_train = X_scaled[train_mask]
y_train = y_encoded[train_mask]
logo_train_split = logo_indices[train_mask]

X_test  = X_scaled[test_mask]
y_test  = y_encoded[test_mask]
logo_test_split = logo_indices[test_mask]

print(f"\nTrain rows: {len(X_train)}, Test rows: {len(X_test)}")

print("\nTrain label distribution (first 10 classes):")
print(np.bincount(y_train)[:10])
print("Test label distribution (first 10 classes):")
print(np.bincount(y_test)[:10])



 47 categories retained (single-label).
  Samples per category: min=80, max=5052, mean=808.8

Unique logos (after feature extraction): 9943

Train rows: 30286, Test rows: 7726

Train label distribution (first 10 classes):
[ 382  226  125  592  864  632  136 1085  371  460]
Test label distribution (first 10 classes):
[104  53  34 156 218 142  27 280  93 130]


# Helper function

In [63]:
def evaluate_membership_accuracy(model, X_train, y_train, X_test, y_test,
                                 logo_train_split, logo_test_split, le, df_features,
                                 name="MODEL"):
    print(f"\n\n=== {name} — MEMBERSHIP ACCURACY ONLY ===")

    # ---- TRAIN SET ----
    y_train_pred = model.predict(X_train)
    y_train_pred_labels = le.inverse_transform(y_train_pred)

    train_unique_logos = np.unique(logo_train_split)
    correct_train = 0

    for logo_idx in train_unique_logos:
        rows = np.where(logo_train_split == logo_idx)[0]
        preds = y_train_pred_labels[rows]
        true_cats = df_features.loc[logo_idx, "parsed_categories"]
        
        if any(p in true_cats for p in preds):
            correct_train += 1

    train_membership_acc = correct_train / len(train_unique_logos)


    # ---- TEST SET ----
    y_test_pred = model.predict(X_test)
    y_test_pred_labels = le.inverse_transform(y_test_pred)

    test_unique_logos = np.unique(logo_test_split)
    correct_test = 0

    for logo_idx in test_unique_logos:
        rows = np.where(logo_test_split == logo_idx)[0]
        preds = y_test_pred_labels[rows]
        true_cats = df_features.loc[logo_idx, "parsed_categories"]

        if any(p in true_cats for p in preds):
            correct_test += 1

    test_membership_acc = correct_test / len(test_unique_logos)


    # ---- Output ----
    print(f"\n{name} — TRAIN membership accuracy:")
    print(f"  {correct_train}/{len(train_unique_logos)} = {train_membership_acc:.4f}")

    print(f"\n{name} — TEST membership accuracy:")
    print(f"  {correct_test}/{len(test_unique_logos)} = {test_membership_acc:.4f}")

    return train_membership_acc, test_membership_acc


# Random Forest Classifier

In [65]:
model = RandomForestClassifier(
    n_estimators=300,
    max_depth=25,
    min_samples_split=8,
    min_samples_leaf=3,
    max_features='sqrt',
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)

print("\nTraining RandomForestClassifier...")
model.fit(X_train, y_train)

evaluate_membership_accuracy(
    model=model,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    logo_train_split=logo_train_split,
    logo_test_split=logo_test_split,
    le=le,
    df_features=df_features,
    name="RandomForest"
)



Training RandomForestClassifier...


=== RandomForest — MEMBERSHIP ACCURACY ONLY ===

RandomForest — TRAIN membership accuracy:
  7827/7954 = 0.9840

RandomForest — TEST membership accuracy:
  251/1989 = 0.1262


(0.9840331908473724, 0.12619406737053795)

# Logistic Regression

In [61]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(
    multi_class="ovr",
    max_iter=2000,
    C=1.0,
    class_weight="balanced",
    n_jobs=-1
)

print("\nTraining LogisticRegression (OvR) on X_train...")
logreg.fit(X_train, y_train)
print("LogisticRegression training complete.")

evaluate_membership_accuracy(
    model=logreg,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    logo_train_split=logo_train_split,
    logo_test_split=logo_test_split,
    le=le,
    df_features=df_features,
    name="LogisticRegression"
)




Training LogisticRegression (OvR) on X_train...
LogisticRegression training complete.


=== LogisticRegression — MEMBERSHIP ACCURACY ONLY ===

LogisticRegression — TRAIN membership accuracy:
  1767/7954 = 0.2222

LogisticRegression — TEST membership accuracy:
  121/1989 = 0.0608


(0.2221523761629369, 0.06083459024635495)

# ResNet Classifier

In [None]:
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

class EmbeddingDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.long)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]


train_ds = EmbeddingDataset(X_train, y_train)
test_ds  = EmbeddingDataset(X_test, y_test)

train_loader = DataLoader(train_ds, batch_size=128, shuffle=True)
test_loader  = DataLoader(test_ds, batch_size=256, shuffle=False)


class ResNetStyleHead(nn.Module):
    """
    A small residual MLP head on top of 512-d ResNet embeddings:
      x -> FC(512) -> ReLU -> FC(512) -> +x -> ReLU -> FC(num_classes)
    """
    def __init__(self, in_dim=512, num_classes=47):  # 47 if that's your class count
        super().__init__()
        self.fc1 = nn.Linear(in_dim, 512)
        self.fc2 = nn.Linear(512, 512)
        self.fc_out = nn.Linear(512, num_classes)

    def forward(self, x):
        # x: [B, in_dim]
        residual = x
        out = F.relu(self.fc1(x))
        out = self.fc2(out)
        out = F.relu(out + residual)  # residual connection
        out = self.fc_out(out)
        return out


num_classes = len(le.classes_)
resnet_head = ResNetStyleHead(in_dim=X_train.shape[1], num_classes=num_classes).to(device)

optimizer = torch.optim.Adam(resnet_head.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()


def train_resnet_head_one_epoch():
    resnet_head.train()
    total_loss = 0
    total_correct = 0
    total = 0

    for xb, yb in train_loader:
        xb = xb.to(device)
        yb = yb.to(device)

        optimizer.zero_grad()
        logits = resnet_head(xb)
        loss = criterion(logits, yb)
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * xb.size(0)
        preds = logits.argmax(dim=1)
        total_correct += (preds == yb).sum().item()
        total += xb.size(0)

    return total_loss / total, total_correct / total


def eval_resnet_head(loader):
    resnet_head.eval()
    total_correct = 0
    total = 0

    with torch.no_grad():
        for xb, yb in loader:
            xb = xb.to(device)
            yb = yb.to(device)
            logits = resnet_head(xb)
            preds = logits.argmax(dim=1)
            total_correct += (preds == yb).sum().item()
            total += xb.size(0)

    acc = total_correct / total
    return acc


print("\nTraining ResNet-style MLP head on embeddings...")
num_epochs = 10
for epoch in range(num_epochs):
    train_loss, train_acc_resnet = train_resnet_head_one_epoch()
    test_acc_resnet = eval_resnet_head(test_loader)
    print(f"Epoch {epoch+1:2d} | Train loss: {train_loss:.4f} | "
          f"Train acc: {train_acc_resnet:.4f} | Test acc: {test_acc_resnet:.4f}")


class ResNetHeadWrapper:
    def __init__(self, net, device, batch_size=256):
        self.net = net
        self.device = device
        self.batch_size = batch_size

    def predict(self, X):
        """
        X: numpy array of shape [N, D] (same as X_train / X_test)
        returns: numpy array of predicted class indices [N]
        """
        self.net.eval()
        X_np = X.astype(np.float32)
        preds_all = []

        with torch.no_grad():
            for i in range(0, len(X_np), self.batch_size):
                xb = torch.from_numpy(X_np[i:i + self.batch_size]).to(self.device)
                logits = self.net(xb)
                preds = logits.argmax(dim=1).cpu().numpy()
                preds_all.append(preds)

        return np.concatenate(preds_all)


resnet_head_sklearn = ResNetHeadWrapper(resnet_head, device)

evaluate_membership_accuracy(
    model=resnet_head_sklearn,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    logo_train_split=logo_train_split,
    logo_test_split=logo_test_split,
    le=le,
    df_features=df_features,
    name="ResNet-MLP"
)



Training ResNet-style MLP head on embeddings...
Epoch  1 | Train loss: 3.4487 | Train acc: 0.1217 | Test acc: 0.1307
Epoch  2 | Train loss: 3.3140 | Train acc: 0.1315 | Test acc: 0.1311
Epoch  3 | Train loss: 3.2149 | Train acc: 0.1341 | Test acc: 0.1252
Epoch  4 | Train loss: 3.1011 | Train acc: 0.1363 | Test acc: 0.1222
Epoch  5 | Train loss: 2.9794 | Train acc: 0.1391 | Test acc: 0.1239
Epoch  6 | Train loss: 2.8564 | Train acc: 0.1433 | Test acc: 0.1032
Epoch  7 | Train loss: 2.7512 | Train acc: 0.1496 | Test acc: 0.1134
Epoch  8 | Train loss: 2.6584 | Train acc: 0.1507 | Test acc: 0.1117
Epoch  9 | Train loss: 2.5749 | Train acc: 0.1530 | Test acc: 0.0982
Epoch 10 | Train loss: 2.5096 | Train acc: 0.1539 | Test acc: 0.1020


=== ResNet-MLP — MEMBERSHIP ACCURACY ONLY ===

ResNet-MLP — TRAIN membership accuracy:
  6777/7954 = 0.8520

ResNet-MLP — TEST membership accuracy:
  788/1989 = 0.3962


(0.852024138798089, 0.3961789844142785)