In [5]:
import pandas as pd
import json

In [6]:
df = pd.read_csv('data/mini-ImageNet/train_labels.csv')

df.head()

Unnamed: 0,filename,class_id,class_name
0,00000_class045.jpg,45,n04251144
1,00001_class025.jpg,25,n02966193
2,00002_class013.jpg,13,n02108551
3,00003_class063.jpg,63,n13133613
4,00004_class043.jpg,43,n04067472


In [7]:
idx_to_class = json.load(open("data/mini-ImageNet/imagenet_class_index.json"))
code_to_human_readable = {}
for _, (code, readable) in idx_to_class.items():
    code_to_human_readable[code] = readable

class_codes = list(set(df["class_name"]))

available_classes = [code_to_human_readable[code] for code in class_codes]

print(len(available_classes))
available_classes


64


['worm_fence',
 'unicycle',
 'frying_pan',
 'beer_bottle',
 'barrel',
 'Newfoundland',
 'clog',
 'ashcan',
 'French_bulldog',
 'three-toed_sloth',
 'toucan',
 'organ',
 'photocopier',
 'tobacco_shop',
 'ladybug',
 'tank',
 'pencil_box',
 'hair_slide',
 'house_finch',
 'dome',
 'hotdog',
 'Tibetan_mastiff',
 'lipstick',
 'holster',
 'yawl',
 'Arctic_fox',
 'jellyfish',
 'solar_dish',
 'orange',
 'fire_screen',
 'file',
 'reel',
 'wok',
 'aircraft_carrier',
 'dishrag',
 'prayer_rug',
 'miniature_poodle',
 'komondor',
 'slot',
 'parallel_bars',
 'rock_beauty',
 'boxer',
 'triceratops',
 'tile_roof',
 'oboe',
 'cliff',
 'chime',
 'dugong',
 'spider_web',
 'carousel',
 'stage',
 'cocktail_shaker',
 'bolete',
 'snorkel',
 'Walker_hound',
 'ear',
 'consomme',
 'street_sign',
 'green_mamba',
 'upright',
 'harvestman',
 'Gordon_setter',
 'Saluki',
 'robin']

In [8]:
# Load Open Images class descriptions
oi_df = pd.read_csv('data/Open_Images/oidv7-class-descriptions.csv')
print(f"Total Open Images classes: {len(oi_df)}")
oi_df.head()


Total Open Images classes: 20931


Unnamed: 0,LabelName,DisplayName
0,/m/0c4936,'Nduja
1,/m/06w6y06,10 cane
2,/m/079zcf,100 metres hurdles
3,/m/02pv3hz,100plus
4,/m/0bb154,110 metres hurdles


In [9]:
excluding_labels = {
    "Animated cartoon",
    "Animation",
    "Clay animation",
    "Clip art",
    "Comics",
    "Comic book",
    "Manga",
    "Illustration",
    "Fashion illustration",
    "Screenshot",
    "Web page",
    "Website",
    "Computer wallpaper",
    "Wallpaper",
    "Wallpaper paste",
    "Diagram",
    "Circuit diagram",
    "Chart",
    "Atlas",
    "Map",
    "Texture",
    "Pattern",
    "Paisley (Pattern)",
    "Still life",
    "Still life photography",
    "Computer program",
    "Antivirus software",
    "Educational software",
    "Enterprise software",
    "Graphics software",
    "Multimedia software",
    "Network software",
    "Office application software",
    "Software",
    "Software Developer",
    "Software engineering",
    "Tax software",
    "Utility software",
    "Video editing software",
    "Video game software",
    "Text",
    "Text messaging",
}

# Filter out rows where DisplayName is in excluding_labels
clean_oi_df = oi_df[~oi_df['DisplayName'].isin(excluding_labels)]

clean_oi_df.shape

(20889, 2)

In [10]:
# Normalize class names for matching
def normalize_name(name):
    """Normalize class names: lowercase, replace underscores with spaces"""
    return name.lower().replace('_', ' ').replace('-', ' ').strip()

# Normalize available classes
normalized_available = {normalize_name(c): c for c in available_classes}
print("Sample normalized mini-ImageNet classes:")
list(normalized_available.keys())[:10]


Sample normalized mini-ImageNet classes:


['worm fence',
 'unicycle',
 'frying pan',
 'beer bottle',
 'barrel',
 'newfoundland',
 'clog',
 'ashcan',
 'french bulldog',
 'three toed sloth']

In [11]:
# Method 1: Exact and substring matching
def find_matches_substring(oi_classes, target_classes):
    """
    Find Open Images classes that contain any of the target class names.
    Returns dict mapping target class -> list of matching OI classes
    """
    matches = {tc: [] for tc in target_classes}

    for _, row in oi_classes.iterrows():
        oi_name = normalize_name(row['DisplayName'])
        oi_label = row['LabelName']

        for target in target_classes:
            norm_target = normalize_name(target)
            # Check if target is in OI name or OI name is in target
            if norm_target in oi_name or oi_name in norm_target:
                matches[target].append({
                    'LabelName': oi_label,
                    'DisplayName': row['DisplayName'],
                    'match_type': 'exact' if norm_target == oi_name else 'substring'
                })

    return matches

# Find substring matches
substring_matches = find_matches_substring(clean_oi_df, available_classes)

# Count matches
match_counts = {k: len(v) for k, v in substring_matches.items()}
print(f"Classes with matches: {sum(1 for v in match_counts.values() if v > 0)}")
print(f"Total matches found: {sum(match_counts.values())}")

# Show some examples
for cls, matches in list(substring_matches.items())[:5]:
    if matches:
        print(f"\n{cls}: {matches[:3]}")


Classes with matches: 63
Total matches found: 404

worm_fence: [{'LabelName': '/m/0sx_x', 'DisplayName': 'Fen', 'match_type': 'substring'}, {'LabelName': '/m/0blz9', 'DisplayName': 'Fence', 'match_type': 'substring'}, {'LabelName': '/m/084hf', 'DisplayName': 'Worm', 'match_type': 'substring'}]

unicycle: [{'LabelName': '/m/0f6nr', 'DisplayName': 'Unicycle', 'match_type': 'exact'}, {'LabelName': '/m/0gtt2kt', 'DisplayName': 'Unicycle hockey', 'match_type': 'substring'}, {'LabelName': '/m/017k3t', 'DisplayName': 'Unicycle trials', 'match_type': 'substring'}]

frying_pan: [{'LabelName': '/m/0dxrf', 'DisplayName': 'Frying', 'match_type': 'substring'}, {'LabelName': '/m/04v6l4', 'DisplayName': 'Frying pan', 'match_type': 'exact'}]

beer_bottle: [{'LabelName': '/m/01h3n', 'DisplayName': 'Bee', 'match_type': 'substring'}, {'LabelName': '/m/01599', 'DisplayName': 'Beer', 'match_type': 'substring'}, {'LabelName': '/m/044gvx', 'DisplayName': 'Beer bottle', 'match_type': 'exact'}]

barrel: [{'Lab

In [12]:
# Method 2: Semantic similarity using sentence-transformers (RECOMMENDED)

from sentence_transformers import SentenceTransformer
import numpy as np
from tqdm import tqdm

# Load a lightweight model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Prepare class names (normalize for better matching)
mini_imagenet_names = [normalize_name(c) for c in available_classes]
oi_names = [normalize_name(name) for name in clean_oi_df['DisplayName'].tolist()]

print(f"Encoding {len(mini_imagenet_names)} mini-ImageNet classes...")
mini_embeddings = model.encode(mini_imagenet_names, show_progress_bar=True)

print(f"Encoding {len(oi_names)} Open Images classes...")
oi_embeddings = model.encode(oi_names, show_progress_bar=True, batch_size=256)


Encoding 64 mini-ImageNet classes...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Encoding 20889 Open Images classes...


Batches:   0%|          | 0/82 [00:00<?, ?it/s]

In [13]:
# Compute cosine similarity and find matches
from sklearn.metrics.pairwise import cosine_similarity

# Compute similarity matrix
similarity_matrix = cosine_similarity(mini_embeddings, oi_embeddings)

def find_semantic_matches(similarity_matrix, threshold=0.7, top_k=10):
    """
    Find Open Images classes that are semantically similar to mini-ImageNet classes.

    Args:
        similarity_matrix: (num_mini, num_oi) similarity scores
        threshold: minimum similarity score to consider a match
        top_k: maximum number of matches per class
    """
    semantic_matches = {}

    for i, mini_class in enumerate(available_classes):
        similarities = similarity_matrix[i]

        # Get indices sorted by similarity (descending)
        sorted_indices = np.argsort(similarities)[::-1]

        matches = []
        for idx in sorted_indices[:top_k]:
            if similarities[idx] >= threshold:
                matches.append({
                    'LabelName': clean_oi_df.iloc[idx]['LabelName'],
                    'DisplayName': clean_oi_df.iloc[idx]['DisplayName'],
                    'similarity': float(similarities[idx])
                })

        semantic_matches[mini_class] = matches

    return semantic_matches

# Find matches with different thresholds
semantic_matches = find_semantic_matches(similarity_matrix, threshold=0.6, top_k=15)

# Summary
matched_classes = sum(1 for v in semantic_matches.values() if len(v) > 0)
total_matches = sum(len(v) for v in semantic_matches.values())
print(f"Classes with semantic matches (threshold=0.6): {matched_classes}/{len(available_classes)}")
print(f"Total semantic matches found: {total_matches}")


Classes with semantic matches (threshold=0.6): 64/64
Total semantic matches found: 513


In [14]:
# Show examples of semantic matches
print("Examples of semantic matches:\n")

# Use actual classes from available_classes (first 10 that have matches)
sample_classes = [cls for cls in available_classes if len(semantic_matches.get(cls, [])) > 0][:10]
print(f"Sample classes from your dataset: {sample_classes}\n")

for cls in sample_classes:
    if cls in semantic_matches and semantic_matches[cls]:
        print(f"\n{cls}:")
        for match in semantic_matches[cls][:5]:
            print(f"  - {match['DisplayName']} (sim: {match['similarity']:.3f})")


Examples of semantic matches:

Sample classes from your dataset: ['worm_fence', 'unicycle', 'frying_pan', 'beer_bottle', 'barrel', 'Newfoundland', 'clog', 'ashcan', 'French_bulldog', 'three-toed_sloth']


worm_fence:
  - Fence (sim: 0.734)
  - Worm (sim: 0.687)
  - Wireworms (sim: 0.656)
  - Fence Post (sim: 0.629)
  - Ringed-worm (sim: 0.622)

unicycle:
  - Unicycle (sim: 1.000)
  - Unicycle hockey (sim: 0.768)
  - Unicycle trials (sim: 0.762)
  - Monocle (sim: 0.620)

frying_pan:
  - Frying pan (sim: 1.000)
  - Pan frying (sim: 0.921)
  - Frying (sim: 0.708)
  - Roasting pan (sim: 0.699)
  - Cake pan (sim: 0.667)

beer_bottle:
  - Beer bottle (sim: 1.000)
  - Wine bottle (sim: 0.789)
  - Bottle (sim: 0.781)
  - Glass bottle (sim: 0.781)
  - Beer glass (sim: 0.766)

barrel:
  - Barrel (sim: 1.000)
  - Gun barrel (sim: 0.927)
  - Rain barrel (sim: 0.766)
  - Muzzle (sim: 0.731)
  - Barrel drum (sim: 0.723)

Newfoundland:
  - Newfoundland (sim: 1.000)
  - Canada goose (sim: 0.686)
  - G

In [15]:
# Combine all matching methods and create final filtered dataset
def get_all_relevant_oi_classes(semantic_matches, substring_matches, similarity_threshold=0.6):
    """
    Combine semantic and substring matches to get all relevant Open Images classes.
    """
    all_matches = {}
    unique_oi_labels = set()

    for mini_class in available_classes:
        combined = []

        # Add semantic matches
        for match in semantic_matches.get(mini_class, []):
            if match['similarity'] >= similarity_threshold:
                combined.append({
                    **match,
                    'source': 'semantic'
                })
                unique_oi_labels.add(match['LabelName'])

        # Add substring matches
        for match in substring_matches.get(mini_class, []):
            if match['LabelName'] not in unique_oi_labels:
                combined.append({
                    **match,
                    'source': 'substring'
                })
                unique_oi_labels.add(match['LabelName'])

        all_matches[mini_class] = combined

    return all_matches, unique_oi_labels

# Get all relevant classes
all_matches, unique_oi_labels = get_all_relevant_oi_classes(
    semantic_matches,
    substring_matches,
    similarity_threshold=0.6
)

print(f"Total unique Open Images classes relevant to mini-ImageNet: {len(unique_oi_labels)}")

# Create filtered dataframe
mini_imagenet_like_oi_df = clean_oi_df[clean_oi_df['LabelName'].isin(unique_oi_labels)].copy()
print(f"Filtered dataframe shape: {mini_imagenet_like_oi_df.shape}")
mini_imagenet_like_oi_df.head(10)


Total unique Open Images classes relevant to mini-ImageNet: 726
Filtered dataframe shape: (726, 2)


Unnamed: 0,LabelName,DisplayName
36,/m/08y7p8,Abdomen
106,/m/0hgntgw,Active tank
129,/m/0h8p55r,Address sign
177,/m/011l1,Afghan hound
206,/m/0brytpf,Agent orange
251,/m/0k5j,Aircraft
253,/m/0xsc,Aircraft carrier
254,/m/02w078f,Aircraft cruiser
258,/m/0vg8,Airline
259,/m/0dhz0,Airliner


In [16]:
non_mini_imagenet_oi_df = clean_oi_df[~clean_oi_df['LabelName'].isin(unique_oi_labels)].copy()
print(f"Non mini-ImageNet OI classes: {non_mini_imagenet_oi_df.shape}")
non_mini_imagenet_oi_df.head(10)


Non mini-ImageNet OI classes: (20163, 2)


Unnamed: 0,LabelName,DisplayName
0,/m/0c4936,'Nduja
1,/m/06w6y06,10 cane
2,/m/079zcf,100 metres hurdles
3,/m/02pv3hz,100plus
4,/m/0bb154,110 metres hurdles
5,/m/03d11pb,1800 tequila
6,/m/0gcy3w,1937 ford
7,/m/0glxvb,1941 ford
8,/m/0glyd8,1949 ford
9,/m/0gl_d9,1952 ford


In [17]:
# Save the filtered classes and mappings
import json

# Save filtered Open Images classes
mini_imagenet_like_oi_df.to_csv('data/Open_Images/mini_imagenet_oi_classes.csv', index=False)
print(f"Saved filtered classes to data/Open_Images/mini_imagenet_oi_classes.csv")

non_mini_imagenet_oi_df.to_csv('data/Open_Images/non_mini_imagenet_oi_classes.csv', index=False)
print(f"Saved non mini-ImageNet OI classes to data/Open_Images/non_mini_imagenet_oi_classes.csv")

# Save the mapping from mini-ImageNet to Open Images
with open('data/Open_Images/mini_imagenet_to_oi_mapping.json', 'w') as f:
    json.dump(all_matches, f, indent=2)
print(f"Saved class mapping to data/Open_Images/mini_imagenet_to_oi_mapping.json")

# Print summary per mini-ImageNet class
print("\n=== Matches per mini-ImageNet class ===")
for cls in sorted(available_classes)[:20]:  # First 20
    n_matches = len(all_matches.get(cls, []))
    print(f"{cls}: {n_matches} matches")


Saved filtered classes to data/Open_Images/mini_imagenet_oi_classes.csv
Saved non mini-ImageNet OI classes to data/Open_Images/non_mini_imagenet_oi_classes.csv
Saved class mapping to data/Open_Images/mini_imagenet_to_oi_mapping.json

=== Matches per mini-ImageNet class ===
Arctic_fox: 15 matches
French_bulldog: 17 matches
Gordon_setter: 2 matches
Newfoundland: 3 matches
Saluki: 7 matches
Tibetan_mastiff: 12 matches
Walker_hound: 15 matches
aircraft_carrier: 18 matches
ashcan: 8 matches
barrel: 16 matches
beer_bottle: 16 matches
bolete: 15 matches
boxer: 14 matches
carousel: 2 matches
chime: 15 matches
cliff: 4 matches
clog: 7 matches
cocktail_shaker: 14 matches
consomme: 1 matches
dishrag: 1 matches
