In [None]:
import os
import argparse
import datetime
import json
import pandas as pd
import torch
import torchvision.models as models
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
import urllib.request
import zipfile

In [None]:
!pip install ftfy

Collecting ftfy
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Downloading ftfy-6.3.1-py3-none-any.whl (44 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ftfy
Successfully installed ftfy-6.3.1


In [None]:
# Check if GPU is available, otherwise use CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Step 1: Install required packages
print("Installing packages...")
!pip install clip-dissect torch torchvision pandas matplotlib numpy

# Step 2: Clone the CLIP-Dissect repository
print("Cloning CLIP-Dissect repository...")
!git clone https://github.com/Trustworthy-ML-Lab/CLIP-dissect.git
%cd CLIP-dissect

# Import necessary modules from CLIP-Dissect
import utils
import similarity
import data_utils

Using device: cuda
Installing packages...
[31mERROR: Could not find a version that satisfies the requirement clip-dissect (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for clip-dissect[0m[31m
[0mCloning CLIP-Dissect repository...
Cloning into 'CLIP-dissect'...
remote: Enumerating objects: 104, done.[K
remote: Counting objects: 100% (24/24), done.[K
remote: Compressing objects: 100% (16/16), done.[K
remote: Total 104 (delta 12), reused 14 (delta 8), pack-reused 80 (from 1)[K
Receiving objects: 100% (104/104), 15.94 MiB | 38.06 MiB/s, done.
Resolving deltas: 100% (33/33), done.
/content/CLIP-dissect


In [None]:
# Step 3: Fetch the Broden probing dataset (224x224 variant) unless it is already on disk
base_url = "http://netdissect.csail.mit.edu/data/"
dataset_name = "broden1_224"
zip_filename = f"{dataset_name}.zip"
download_dir = "dataset"
os.makedirs(download_dir, exist_ok=True)
full_zip_path = os.path.join(download_dir, zip_filename)

# The presence of index.csv inside the extracted folder is used as the "already done" marker.
if os.path.isfile(os.path.join(download_dir, dataset_name, "index.csv")):
    print(f"{dataset_name} already downloaded and extracted.")
else:
    print(f"Downloading {dataset_name}...")
    urllib.request.urlretrieve(f"{base_url}{zip_filename}", full_zip_path)
    print(f"Unzipping {zip_filename}...")
    with zipfile.ZipFile(full_zip_path, "r") as zip_ref:
        zip_ref.extractall(download_dir)
    # The archive is no longer needed once its contents are extracted.
    os.remove(full_zip_path)
    print(f"{dataset_name} downloaded and extracted to {download_dir}/{dataset_name}")

# Point CLIP-Dissect's dataset registry at the extracted Broden folder
data_utils.DATASET_ROOTS["broden"] = os.path.join(download_dir, dataset_name)

Downloading broden1_224...
Unzipping broden1_224.zip...
broden1_224 downloaded and extracted to dataset/broden1_224


In [None]:
# Sanity check: the extracted Broden folder must contain its index.csv
broden_path = os.path.join(download_dir, dataset_name)
if os.path.isfile(os.path.join(broden_path, "index.csv")):
    print(f"Broden dataset verified at {broden_path}")
else:
    print(f"Error: Broden dataset at {broden_path} is missing index.csv")
    raise FileNotFoundError("Broden dataset is not properly set up.")

Broden dataset verified at dataset/broden1_224


In [None]:
# List the sub-directories found directly under the Broden root
image_dirs = list(
    filter(
        lambda name: os.path.isdir(os.path.join(broden_path, name)),
        os.listdir(broden_path),
    )
)
print(f"Broden dataset verified at {broden_path}. Found image directories: {image_dirs}")

# Register the Broden root with CLIP-Dissect's dataset table
data_utils.DATASET_ROOTS["broden"] = broden_path

Broden dataset verified at dataset/broden1_224. Found image directories: ['images']


In [None]:
# Step 4: Write the candidate concept vocabulary to data/20k.txt, one concept per line
os.makedirs("data", exist_ok=True)
# Small list for demo; expand with full Broden concepts for full analysis
concepts = (
    "dog cat car tree mountain sky water "
    "building grass road house forest river "
    "person bird cloud chair table window door"
).split()
concept_set_path = "data/20k.txt"
with open(concept_set_path, "w") as f:
    # Newline-separated, with no trailing newline after the last concept.
    print(*concepts, sep="\n", end="", file=f)
print(f"Concept set created at {concept_set_path} with {len(concepts)} concepts.")

Concept set created at data/20k.txt with 20 concepts.


In [None]:
# Step 5: Experiment configuration read by the later cells

# What is dissected and with which probing data
target_layers = ["layer2", "layer3", "layer4"]  # ResNet18 stages whose units get described
d_probe = "broden"  # name of the probing dataset
batch_size = 50  # kept small to avoid memory issues

# How units are described
clip_model = "ViT-B/16"  # CLIP variant that produces the descriptions
pool_mode = "avg"  # spatial pooling applied to activations
similarity_fn_name = "soft_wpmi"  # attribute looked up on the `similarity` module

# Output locations
activation_dir = "saved_activations"  # activation tensors
result_dir = "results"  # CSVs, plots and the summary

In [None]:
# Step 6: Load ResNet18 models
# ImageNet-trained ResNet18 (torchvision's IMAGENET1K_V1 weights), moved to `device`
print("Loading ResNet18 for ImageNet...")
model_imagenet = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1).to(device)
# Switch to inference mode. eval() returns the module itself, so the trailing
# semicolon keeps the notebook from echoing the entire network repr as cell output.
model_imagenet.eval();

Loading ResNet18 for ImageNet...


ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [None]:
# Places365-trained ResNet18
print("Loading ResNet18 for Places365...")
# Start from an un-initialised ResNet18; the weights are loaded from the checkpoint below.
model_places = models.resnet18(weights=None)
# Replace the classifier head with a 365-way layer for the Places365 classes.
# `torch.nn` is referenced through the already-imported `torch` package: no cell
# shown in this notebook imports a bare `nn` name, so `nn.Linear` would raise
# NameError on a fresh kernel.
model_places.fc = torch.nn.Linear(model_places.fc.in_features, 365)
places_url = "http://places2.csail.mit.edu/models_places365/resnet18_places365.pth.tar"

Loading ResNet18 for Places365...


In [None]:
# Download the Places365 checkpoint and load it into `model_places`.
try:
    # Deserialise onto the CPU first so loading does not depend on the device the
    # checkpoint was saved from; the model is moved to `device` afterwards. This
    # keeps the notebook's CPU fallback usable.
    checkpoint = torch.hub.load_state_dict_from_url(places_url, map_location="cpu", progress=True)
    # Drop a leading 'module.' from parameter names so they match the plain
    # torchvision ResNet18. Only the prefix is stripped; the rest of each key is
    # left untouched.
    state_dict = {
        (k[len("module."):] if k.startswith("module.") else k): v
        for k, v in checkpoint['state_dict'].items()
    }
    model_places.load_state_dict(state_dict)
    model_places = model_places.to(device)
    model_places.eval()
    print("Places365 model loaded successfully.")
except Exception as e:
    # Report the failure in the cell output, then re-raise so execution stops here.
    print(f"Error loading Places365 model: {e}")
    raise

Places365 model loaded successfully.


In [None]:
import os
import torch
import torchvision.transforms as transforms
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
import datetime
import json
from collections import Counter
import datetime
import json
import clip
from PIL import Image

In [None]:
# Step 7: Verify dataset loading and save activations for both models
os.makedirs(activation_dir, exist_ok=True)

# Preprocessing applied to every Broden image: resize to 224x224, convert to a
# tensor, then standardise each channel with a fixed mean/std.
preprocess = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)

# Smoke test: build the dataset and loader, then look at the first batch only
try:
    print(f"Testing dataset loading for {d_probe}...")
    dataset = data_utils.get_data(d_probe, preprocess=preprocess)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=False)
    print(f"Dataset loaded successfully. Number of samples: {len(dataset)}")
    batch = next(iter(dataloader), None)
    if batch is not None:
        images, _ = batch
        print(f"Sample batch shape: {images.shape}")
except Exception as e:
    # Surface the problem in the output and stop the notebook here.
    print(f"Error loading dataset {d_probe}: {e}")
    raise

# Custom function to extract and save activations
def custom_save_activations(model, model_name, clip_model_name, target_layers, dataloader, concept_set_path, device, save_dir, pool_mode="avg",
                            norm_mean=(0.485, 0.456, 0.406), norm_std=(0.229, 0.224, 0.225)):
    """Save pooled target-model activations and CLIP image/text features to disk.

    The probing data is iterated once. For every batch the function records the
    spatially pooled output of each layer in ``target_layers`` (via forward hooks
    on ``model``) and the CLIP image embedding of the same images. The concepts
    read from ``concept_set_path`` are embedded with CLIP's text encoder.

    Files written to ``save_dir`` (names unchanged from the original version):
      * ``broden_{model_name}_{layer}.pt``      - pooled activations, one per layer
      * ``broden_{model_name}_{layer}_clip.pt`` - CLIP image features (same tensor
        for every layer; kept per layer because the caller checks for these names)
      * ``broden_{model_name}_text.pt``         - CLIP text features

    Args:
        model: Network to probe; each name in ``target_layers`` must be an
            attribute of it that supports ``register_forward_hook``.
        model_name: Label used in the output file names.
        clip_model_name: Model name passed to ``clip.load``.
        target_layers: Attribute names of the layers to record.
        dataloader: Iterable of ``(images, _)`` batches. The hooks pool over
            dims 2 and 3, so ``images`` is assumed to be a (batch, channel,
            height, width) float tensor on the CPU.
        concept_set_path: Text file with one concept per line.
        device: Device for both ``model`` inputs and the CLIP model.
        save_dir: Output directory (created if missing).
        pool_mode: ``"avg"`` for mean pooling over the spatial dims; any other
            value selects max pooling.
        norm_mean: Per-channel mean that the dataloader's preprocessing
            subtracted. Defaults to the values used by this notebook's
            ``preprocess`` transform.
        norm_std: Per-channel std that the dataloader's preprocessing divided by.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    # Load CLIP model
    clip_model, clip_preprocess = clip.load(clip_model_name, device=device)
    clip_model.eval()

    # Read concepts
    with open(concept_set_path, 'r') as f:
        concepts = f.read().splitlines()

    # Constants used to undo the dataloader's normalisation. The batches are
    # standardised tensors, not [0, 1] pixel values, so scaling them by 255 and
    # casting to uint8 directly would wrap negative values around and hand CLIP
    # corrupted images.
    mean = torch.tensor(norm_mean, dtype=torch.float32).view(-1, 1, 1)
    std = torch.tensor(norm_std, dtype=torch.float32).view(-1, 1, 1)

    def to_pil(img):
        """Convert one normalised (C, H, W) tensor back to an 8-bit PIL image."""
        pixels = (img.detach().cpu().float() * std + mean).clamp(0.0, 1.0)
        array = (pixels * 255.0).round().to(torch.uint8).permute(1, 2, 0).contiguous().numpy()
        return Image.fromarray(array)

    # Hook to capture activations
    activations = {}

    def make_hook(layer_name):
        """Build a forward hook that stores the pooled output under ``layer_name``."""
        def hook(module, inputs, output):
            if pool_mode == "avg":
                activations[layer_name] = output.mean(dim=(2, 3))  # Average pooling over spatial dimensions
            else:
                activations[layer_name] = output.max(dim=2)[0].max(dim=2)[0]  # Max pooling
        return hook

    target_acts = {layer: [] for layer in target_layers}
    clip_acts = []
    hooks = []
    try:
        # Register hooks for target layers
        for layer_name in target_layers:
            hooks.append(getattr(model, layer_name).register_forward_hook(make_hook(layer_name)))

        # Single pass over the data: CLIP image features and target activations
        # are computed from the same batch, so the dataset is read only once.
        with torch.no_grad():
            for images, _ in dataloader:
                clip_images = torch.stack([clip_preprocess(to_pil(img)) for img in images]).to(device)
                clip_acts.append(clip_model.encode_image(clip_images).cpu())

                activations.clear()
                model(images.to(device))  # Forward pass to trigger hooks
                for layer in target_layers:
                    target_acts[layer].append(activations[layer].cpu())
    finally:
        # Always detach the hooks, even if a forward pass failed, so the model
        # is not left with stale hooks on the next call.
        for hook in hooks:
            hook.remove()

    if not clip_acts:
        raise ValueError("dataloader yielded no batches; there are no activations to save")

    # Concatenate per-batch results along the sample dimension
    for layer in target_layers:
        target_acts[layer] = torch.cat(target_acts[layer], dim=0)
    clip_acts = torch.cat(clip_acts, dim=0)

    # Process text (concepts) through CLIP
    text_inputs = clip.tokenize(concepts).to(device)
    with torch.no_grad():
        text_acts = clip_model.encode_text(text_inputs).cpu()

    # Save activations
    os.makedirs(save_dir, exist_ok=True)
    # The text features do not depend on the layer, so they are written once.
    torch.save(text_acts, os.path.join(save_dir, f"broden_{model_name}_text.pt"))
    for layer in target_layers:
        torch.save(target_acts[layer], os.path.join(save_dir, f"broden_{model_name}_{layer}.pt"))
        torch.save(clip_acts, os.path.join(save_dir, f"broden_{model_name}_{layer}_clip.pt"))

# Run the custom extractor for each model, then confirm its output files exist
for model_name, model in (("resnet18", model_imagenet), ("resnet18_places", model_places)):
    print(f"Saving activations for {model_name}...")
    try:
        print(f"Target layers for {model_name}: {target_layers}")
        custom_save_activations(
            model,
            model_name,
            clip_model_name=clip_model,
            target_layers=target_layers,
            dataloader=dataloader,
            concept_set_path=concept_set_path,
            device=device,
            save_dir=activation_dir,
            pool_mode=pool_mode,
        )
        # Each layer must have its activation, CLIP-image and text file on disk
        for layer in target_layers:
            activation_file = os.path.join(activation_dir, f"broden_{model_name}_{layer}.pt")
            clip_file = os.path.join(activation_dir, f"broden_{model_name}_{layer}_clip.pt")
            text_file = os.path.join(activation_dir, f"broden_{model_name}_text.pt")
            if not all(os.path.isfile(path) for path in (activation_file, clip_file, text_file)):
                print(f"Error: Activation files missing for {model_name}, layer {layer}")
                raise FileNotFoundError(f"Failed to save activations for {model_name}, layer {layer}")
            print(f"Activation files created: {activation_file}, {clip_file}, {text_file}")
        print(f"Activations saved successfully for {model_name}.")
    except Exception as e:
        # Log which model failed, then let the exception stop the notebook.
        print(f"Error saving activations for {model_name}: {e}")
        raise


Testing dataset loading for broden...
Dataset loaded successfully. Number of samples: 210365
Sample batch shape: torch.Size([50, 3, 224, 224])
Saving activations for resnet18...
Target layers for resnet18: ['layer2', 'layer3', 'layer4']
Activation files created: saved_activations/broden_resnet18_layer2.pt, saved_activations/broden_resnet18_layer2_clip.pt, saved_activations/broden_resnet18_text.pt
Activation files created: saved_activations/broden_resnet18_layer3.pt, saved_activations/broden_resnet18_layer3_clip.pt, saved_activations/broden_resnet18_text.pt
Activation files created: saved_activations/broden_resnet18_layer4.pt, saved_activations/broden_resnet18_layer4_clip.pt, saved_activations/broden_resnet18_text.pt
Activations saved successfully for resnet18.
Saving activations for resnet18_places...
Target layers for resnet18_places: ['layer2', 'layer3', 'layer4']
Activation files created: saved_activations/broden_resnet18_places_layer2.pt, saved_activations/broden_resnet18_places_la

In [None]:
# Step 8: Collect results for both models
# For every unit of every target layer, pick the concept with the highest
# similarity score and store it in one descriptions.csv per model.
similarity_fn = getattr(similarity, similarity_fn_name)
with open(concept_set_path, 'r') as f:
    words = f.read().splitlines()

for model_name in ["resnet18", "resnet18_places"]:
    print(f"Collecting results for {model_name}...")
    outputs = {"layer": [], "unit": [], "description": [], "similarity": []}
    for target_layer in target_layers:
        # Read exactly the files that Step 7 wrote and verified. Names derived
        # from utils.get_save_names depend on the CLIP model name and the
        # concept-set path, neither of which appears in the Step 7 file names,
        # so they are not guaranteed to point at those files and could pick up
        # tensors left behind by another run.
        target_save_name = os.path.join(activation_dir, f"broden_{model_name}_{target_layer}.pt")
        clip_save_name = os.path.join(activation_dir, f"broden_{model_name}_{target_layer}_clip.pt")
        text_save_name = os.path.join(activation_dir, f"broden_{model_name}_text.pt")

        missing = [
            path for path in (target_save_name, clip_save_name, text_save_name)
            if not os.path.isfile(path)
        ]
        if missing:
            print(f"Error: Activation file {missing[0]} not found for {model_name}.")
            continue

        try:
            similarities = utils.get_similarity_from_activations(
                target_save_name, clip_save_name, text_save_name, similarity_fn,
                return_target_feats=False, device=device
            )
            # Best-matching concept (and its score) for every unit of this layer
            vals, ids = torch.max(similarities, dim=1)
            descriptions = [words[int(idx)] for idx in ids]

            outputs["unit"].extend(range(len(vals)))
            outputs["layer"].extend([target_layer] * len(vals))
            outputs["description"].extend(descriptions)
            outputs["similarity"].extend(vals.cpu().numpy())
        except Exception as e:
            # Best effort: report the failing layer and carry on with the others.
            print(f"Error processing similarities for {model_name}, layer {target_layer}: {e}")
            continue

    # An empty table would be picked up by the analysis cells as if it were a
    # valid result, so stop here when no layer could be processed.
    if not outputs["unit"]:
        raise RuntimeError(
            f"No similarities could be computed for {model_name}; see the errors printed above."
        )

    # Save results to a CSV file
    df = pd.DataFrame(outputs)
    timestamp = datetime.datetime.now().strftime('%y_%m_%d_%H_%M')
    save_path = os.path.join(result_dir, f"{model_name}_{timestamp}")
    os.makedirs(save_path, exist_ok=True)
    csv_path = os.path.join(save_path, "descriptions.csv")
    df.to_csv(csv_path, index=False)
    print(f"Results saved for {model_name} at {csv_path}")


Collecting results for resnet18...


100%|██████████| 128/128 [00:00<00:00, 1439.29it/s]


torch.Size([128, 20])


100%|██████████| 256/256 [00:00<00:00, 6636.02it/s]


torch.Size([256, 20])


100%|██████████| 512/512 [00:00<00:00, 6752.35it/s]


torch.Size([512, 20])
Results saved for resnet18 at results/resnet18_25_07_21_20_32/descriptions.csv
Collecting results for resnet18_places...


100%|██████████| 128/128 [00:00<00:00, 6404.51it/s]


torch.Size([128, 20])


100%|██████████| 256/256 [00:00<00:00, 6757.73it/s]


torch.Size([256, 20])


100%|██████████| 512/512 [00:00<00:00, 6693.73it/s]

torch.Size([512, 20])
Results saved for resnet18_places at results/resnet18_places_25_07_21_20_32/descriptions.csv





In [None]:
# Step 9: Load results for analysis
def _latest_result_dir(root, prefix, exclude_prefix=None):
    """Return the name of the most recently changed result folder under ``root``.

    Only entries whose name starts with ``prefix`` (and not with
    ``exclude_prefix``, when given) and that contain a ``descriptions.csv`` are
    considered, so plots, the summary file and incomplete folders in ``root``
    are never selected.

    Raises:
        FileNotFoundError: If no matching folder exists.
    """
    candidates = []
    for name in os.listdir(root):
        if not name.startswith(prefix):
            continue
        if exclude_prefix is not None and name.startswith(exclude_prefix):
            continue
        if not os.path.isfile(os.path.join(root, name, "descriptions.csv")):
            continue
        candidates.append(name)
    if not candidates:
        raise FileNotFoundError(
            f"No result folder starting with '{prefix}' and containing descriptions.csv found in '{root}'"
        )
    return max(candidates, key=lambda name: os.path.getctime(os.path.join(root, name)))


try:
    # Result folders are named "<model_name>_<timestamp>". The ImageNet prefix
    # "resnet18_" also matches the Places365 folders, so those are excluded
    # explicitly.
    latest_imagenet_dir = _latest_result_dir(result_dir, "resnet18_", exclude_prefix="resnet18_places_")
    latest_places_dir = _latest_result_dir(result_dir, "resnet18_places_")

    df_imagenet = pd.read_csv(os.path.join(result_dir, latest_imagenet_dir, "descriptions.csv"))
    df_places = pd.read_csv(os.path.join(result_dir, latest_places_dir, "descriptions.csv"))
    print("Result CSV files loaded successfully.")
except Exception as e:
    # Report the failure, then re-raise so the analysis cells do not run on stale data.
    print(f"Error loading result CSV files: {e}")
    raise


Result CSV files loaded successfully.


In [None]:
# Step 10: Analyze results
# How many units were assigned each concept, per model
print("\nTop 5 concepts for ResNet18 (ImageNet):")
imagenet_counts = Counter(df_imagenet['description'])
for concept, count in imagenet_counts.most_common(5):
    print(f"{concept}: {count} neurons")

print("\nTop 5 concepts for ResNet18 (Places365):")
places_counts = Counter(df_places['description'])
for concept, count in places_counts.most_common(5):
    print(f"{concept}: {count} neurons")

# Overlap of the concept vocabularies actually used by the two models
imagenet_set = set(imagenet_counts)
places_set = set(places_counts)
common_concepts = imagenet_set & places_set
unique_imagenet = imagenet_set.difference(places_set)
unique_places = places_set.difference(imagenet_set)
print(f"\nCommon concepts between models: {len(common_concepts)}")
print(f"Concepts unique to ImageNet: {len(unique_imagenet)}")
print(f"Concepts unique to Places365: {len(unique_places)}")

# Number of distinct concepts left after removing the scene-like ones
non_object_concepts = ['sky', 'grass', 'road', 'mountain', 'water', 'cloud', 'forest', 'river']
imagenet_objects = len(imagenet_set.difference(non_object_concepts))
places_objects = len(places_set.difference(non_object_concepts))
print(f"\nUnique objects in ImageNet: {imagenet_objects}")
print(f"Unique objects in Places365: {places_objects}")

# Overlaid histograms of the best-match similarity scores, written to a PNG
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(df_imagenet['similarity'], bins=30, alpha=0.5, label='ImageNet', color='blue')
ax.hist(df_places['similarity'], bins=30, alpha=0.5, label='Places365', color='green')
ax.set_xlabel('Similarity Score')
ax.set_ylabel('Frequency')
ax.set_title('Similarity Score Distribution')
ax.legend()
fig.savefig(os.path.join(result_dir, 'similarity_distribution.png'))
plt.close(fig)
print("Similarity distribution plot saved.")


Top 5 concepts for ResNet18 (ImageNet):
cloud: 246 neurons
chair: 90 neurons
person: 86 neurons
tree: 80 neurons
house: 67 neurons

Top 5 concepts for ResNet18 (Places365):
person: 180 neurons
sky: 151 neurons
cloud: 116 neurons
chair: 91 neurons
river: 62 neurons

Common concepts between models: 20
Concepts unique to ImageNet: 0
Concepts unique to Places365: 0

Unique objects in ImageNet: 12
Unique objects in Places365: 12
Similarity distribution plot saved.


In [None]:
# One stacked bar chart per model: units per concept, broken down by layer
for df, model_name in [(df_imagenet, "ResNet18_ImageNet"), (df_places, "ResNet18_Places365")]:
    # Rows are layers, columns are concepts, cells are unit counts (0 where absent)
    per_layer = df.groupby('layer')['description'].value_counts()
    layer_counts = per_layer.unstack(fill_value=0)
    ax = layer_counts.plot.bar(stacked=True, figsize=(10, 6))
    ax.set_title(f'Concept Distribution by Layer - {model_name}')
    ax.set_xlabel('Layer')
    ax.set_ylabel('Number of Neurons')
    # Legend placed outside the axes, to the right
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    fig = ax.get_figure()
    fig.tight_layout()
    fig.savefig(os.path.join(result_dir, f'{model_name}_layer_concepts.png'))
    plt.close(fig)
    print(f"Layer-wise concept plot saved for {model_name}.")


Layer-wise concept plot saved for ResNet18_ImageNet.
Layer-wise concept plot saved for ResNet18_Places365.


In [None]:
# Write the headline numbers from the analysis to a plain-text summary file
os.makedirs(result_dir, exist_ok=True)
with open(os.path.join(result_dir, 'analysis_summary.txt'), 'w') as f:
    # The lines are assembled first and written in one call; each already ends in "\n".
    f.writelines([
        "Network Dissection Analysis Summary\n",
        "==============================\n",
        f"Top 5 concepts (ImageNet):\n{json.dumps(dict(imagenet_counts.most_common(5)), indent=2)}\n",
        f"Top 5 concepts (Places365):\n{json.dumps(dict(places_counts.most_common(5)), indent=2)}\n",
        f"Common concepts: {len(common_concepts)}\n",
        f"Unique to ImageNet: {len(unique_imagenet)}\n",
        f"Unique to Places365: {len(unique_places)}\n",
        f"Unique objects (ImageNet): {imagenet_objects}\n",
        f"Unique objects (Places365): {places_objects}\n",
    ])
print("Analysis summary saved at results/analysis_summary.txt")

print("Analysis complete. Check the 'results' directory for outputs.")

Analysis summary saved at results/analysis_summary.txt
Analysis complete. Check the 'results' directory for outputs.


In [None]:
from google.colab import drive
import shutil
import os

# Make Google Drive available under /content/drive
drive.mount('/content/drive')

# Where the results live locally and where they should end up on Drive
source_dir = 'results'
dest_dir = '/content/drive/MyDrive/CLIP_Dissect_Results'

# Mirror the results folder to Drive; an existing destination is merged into
if not os.path.exists(source_dir):
    print(f"Error: {source_dir} does not exist")
else:
    shutil.copytree(source_dir, dest_dir, dirs_exist_ok=True)
    print(f"Results directory copied to {dest_dir}")

Mounted at /content/drive
Results directory copied to /content/drive/MyDrive/CLIP_Dissect_Results
