全部重新理一遍
### 我们需要什么CLIP根据原有的提示词生成的嵌入 我们用attr标签生成了对应的text_imbeddings,利用图片本身生成了image_imbeddings 其中还分割了图片本身
在preprocess_clip_embeddings.ipynb中第一个就是（我觉得正确着呢）

### 利用上述生成的图像嵌入和文本嵌入训练出我们的映射网络
这时候就可以评测映射网络了，在文件text_to_image_embedder.ipynb中
唯一用到的（是第二个）

### tsne-后来的对比

In [3]:
# 导入所需库
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import os
from tqdm import tqdm
from torch.nn import functional as F

# 定义映射网络模型（确保与训练时使用的结构相同）
class TextToImageEmbedder(nn.Module):
    def __init__(self, clip_dim=512, embed_dim=512):
        super(TextToImageEmbedder, self).__init__()
        self.mapping = nn.Sequential(
            nn.Linear(clip_dim, 2048),
            nn.BatchNorm1d(2048),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(2048, 1024),
            nn.BatchNorm1d(1024),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(1024, embed_dim),
            nn.BatchNorm1d(embed_dim),
            nn.ReLU(),
            nn.Linear(embed_dim, embed_dim)
        )
    
    def forward(self, text_embeddings):
        image_embeddings = self.mapping(text_embeddings)
        return image_embeddings

# 设置设备
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# 定义文件路径
MODEL_PATH = 'text_to_image_embedder.pth'
TEXT_EMBEDDINGS_PATH = './clip_embeddings/text_embeddings.pt'
IMAGE_EMBEDDINGS_PATH = './clip_embeddings/image_embeddings.pt'
OUTPUT_DIR = './comparison_results'
os.makedirs(OUTPUT_DIR, exist_ok=True)

# 加载嵌入
text_embeddings = torch.load(TEXT_EMBEDDINGS_PATH)
true_image_embeddings = torch.load(IMAGE_EMBEDDINGS_PATH)

# 检查和转换数据类型
print(f"Text embeddings data type: {text_embeddings.dtype}")
print(f"Image embeddings data type: {true_image_embeddings.dtype}")

# 确保两者都是 float32 类型
if text_embeddings.dtype != torch.float32:
    text_embeddings = text_embeddings.float()
    print("Converted text embeddings to float32")
if true_image_embeddings.dtype != torch.float32:
    true_image_embeddings = true_image_embeddings.float()
    print("Converted image embeddings to float32")

# 加载模型
embedder = TextToImageEmbedder().to(device)
embedder.load_state_dict(torch.load(MODEL_PATH, map_location=device))
embedder.eval()

# 使用训练好的映射网络生成图像嵌入
print("Generating predicted image embeddings...")
with torch.no_grad():
    # 分批处理以减少内存使用
    batch_size = 256
    pred_image_embeddings = []
    
    for i in tqdm(range(0, len(text_embeddings), batch_size)):
        batch = text_embeddings[i:i+batch_size].to(device)
        preds = embedder(batch).cpu()
        pred_image_embeddings.append(preds)
    
    pred_image_embeddings = torch.cat(pred_image_embeddings, dim=0)

# 确保嵌入向量已归一化（对于余弦相似度计算）
pred_image_embeddings_norm = F.normalize(pred_image_embeddings, p=2, dim=1)
true_image_embeddings_norm = F.normalize(true_image_embeddings, p=2, dim=1)

# 计算余弦相似度
print("Computing cosine similarity...")
cos_sim = torch.sum(pred_image_embeddings_norm * true_image_embeddings_norm, dim=1)
mean_cos_sim = cos_sim.mean().item()
std_cos_sim = cos_sim.std().item()

print(f"Average cosine similarity: {mean_cos_sim:.4f} ± {std_cos_sim:.4f}")

# 保存相似度分布直方图
plt.figure(figsize=(10, 6))
plt.hist(cos_sim.numpy(), bins=50, alpha=0.7)
plt.axvline(mean_cos_sim, color='r', linestyle='--', label=f'Mean: {mean_cos_sim:.4f}')
plt.title('Cosine Similarity Distribution Between Predicted and Real Image Embeddings')
plt.xlabel('Cosine Similarity')
plt.ylabel('Frequency')
plt.grid(True, alpha=0.3)
plt.legend()
plt.savefig(os.path.join(OUTPUT_DIR, 'cosine_similarity_histogram.png'))
plt.close()

# 使用t-SNE可视化嵌入
print("Performing t-SNE dimensionality reduction...")

# 为了提高效率，随机采样部分数据点（如果数据集很大）
sample_size = min(5000, len(pred_image_embeddings))
indices = np.random.choice(len(pred_image_embeddings), sample_size, replace=False)

# 准备t-SNE的输入数据
combined_embeddings = np.vstack([
    pred_image_embeddings[indices].numpy(),
    true_image_embeddings[indices].numpy()
])

# 创建标签（0表示预测嵌入，1表示真实嵌入）
labels = np.array([0] * sample_size + [1] * sample_size)

# 执行t-SNE降维
tsne = TSNE(n_components=2, random_state=42, perplexity=30, max_iter=1000)  # 使用max_iter而不是n_iter
embeddings_2d = tsne.fit_transform(combined_embeddings)

# 分离预测嵌入和真实嵌入的t-SNE结果
pred_2d = embeddings_2d[:sample_size]
true_2d = embeddings_2d[sample_size:]

# 绘制t-SNE散点图
plt.figure(figsize=(12, 10))
plt.scatter(pred_2d[:, 0], pred_2d[:, 1], alpha=0.5, label='Predicted Image Embeddings', s=10)
plt.scatter(true_2d[:, 0], true_2d[:, 1], alpha=0.5, label='Real Image Embeddings', s=10)
plt.title('t-SNE Visualization of Image Embeddings')
plt.legend()
plt.grid(True, alpha=0.3)
plt.savefig(os.path.join(OUTPUT_DIR, 'tsne_visualization.png'))
plt.close()

# 绘制连线图，显示相同索引的预测和真实嵌入之间的距离
plt.figure(figsize=(12, 10))
plt.scatter(pred_2d[:, 0], pred_2d[:, 1], alpha=0.5, label='Predicted Image Embeddings', s=10)
plt.scatter(true_2d[:, 0], true_2d[:, 1], alpha=0.5, label='Real Image Embeddings', s=10)

# 随机选择一些点（避免图形过于拥挤）
line_indices = np.random.choice(sample_size, 100, replace=False)
for idx in line_indices:
    plt.plot([pred_2d[idx, 0], true_2d[idx, 0]], 
             [pred_2d[idx, 1], true_2d[idx, 1]], 
             'k-', alpha=0.1)

plt.title('t-SNE Visualization of Image Embeddings (with Connection Lines)')
plt.legend()
plt.grid(True, alpha=0.3)
plt.savefig(os.path.join(OUTPUT_DIR, 'tsne_visualization_with_lines.png'))
plt.close()

print(f"Analysis complete! Results saved to {OUTPUT_DIR} directory")

Using device: cuda
Text embeddings data type: torch.float16
Image embeddings data type: torch.float16
Converted text embeddings to float32
Converted image embeddings to float32
Generating predicted image embeddings...


100%|██████████| 636/636 [00:00<00:00, 1473.61it/s]


Computing cosine similarity...
Average cosine similarity: 0.8324 ± 0.0507
Performing t-SNE dimensionality reduction...
Analysis complete! Results saved to ./comparison_results directory


### t-sne原本的内容

In [4]:
# 导入所需库
import torch
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import os
from tqdm import tqdm
from torch.nn import functional as F

# 设置设备
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# 定义文件路径
TEXT_EMBEDDINGS_PATH = './clip_embeddings/text_embeddings.pt'
IMAGE_EMBEDDINGS_PATH = './clip_embeddings/image_embeddings.pt'
OUTPUT_DIR = './original_embeddings_comparison'
os.makedirs(OUTPUT_DIR, exist_ok=True)

# 加载嵌入
text_embeddings = torch.load(TEXT_EMBEDDINGS_PATH)
image_embeddings = torch.load(IMAGE_EMBEDDINGS_PATH)

# 检查和转换数据类型
print(f"Text embeddings data type: {text_embeddings.dtype}")
print(f"Image embeddings data type: {image_embeddings.dtype}")
print(f"Text embeddings shape: {text_embeddings.shape}")
print(f"Image embeddings shape: {image_embeddings.shape}")

# 确保两者都是 float32 类型
if text_embeddings.dtype != torch.float32:
    text_embeddings = text_embeddings.float()
    print("Converted text embeddings to float32")
if image_embeddings.dtype != torch.float32:
    image_embeddings = image_embeddings.float()
    print("Converted image embeddings to float32")

# 确保嵌入向量已归一化（对于余弦相似度计算）
text_embeddings_norm = F.normalize(text_embeddings, p=2, dim=1)
image_embeddings_norm = F.normalize(image_embeddings, p=2, dim=1)

# 计算余弦相似度
print("Computing cosine similarity between original text and image embeddings...")
cos_sim = torch.sum(text_embeddings_norm * image_embeddings_norm, dim=1)
mean_cos_sim = cos_sim.mean().item()
std_cos_sim = cos_sim.std().item()

print(f"Average cosine similarity: {mean_cos_sim:.4f} ± {std_cos_sim:.4f}")

# 保存相似度分布直方图
plt.figure(figsize=(10, 6))
plt.hist(cos_sim.numpy(), bins=50, alpha=0.7)
plt.axvline(mean_cos_sim, color='r', linestyle='--', label=f'Mean: {mean_cos_sim:.4f}')
plt.title('Cosine Similarity Distribution Between Original Text and Image Embeddings')
plt.xlabel('Cosine Similarity')
plt.ylabel('Frequency')
plt.grid(True, alpha=0.3)
plt.legend()
plt.savefig(os.path.join(OUTPUT_DIR, 'original_embeddings_cosine_similarity.png'))
plt.close()

# 使用t-SNE可视化嵌入
print("Performing t-SNE dimensionality reduction...")

# 为了提高效率，随机采样部分数据点（如果数据集很大）
sample_size = min(5000, len(text_embeddings))
indices = np.random.choice(len(text_embeddings), sample_size, replace=False)

# 准备t-SNE的输入数据
combined_embeddings = np.vstack([
    text_embeddings[indices].numpy(),
    image_embeddings[indices].numpy()
])

# 创建标签（0表示文本嵌入，1表示图像嵌入）
labels = np.array([0] * sample_size + [1] * sample_size)

# 执行t-SNE降维
tsne = TSNE(n_components=2, random_state=42, perplexity=30, max_iter=1000)
embeddings_2d = tsne.fit_transform(combined_embeddings)

# 分离文本嵌入和图像嵌入的t-SNE结果
text_2d = embeddings_2d[:sample_size]
image_2d = embeddings_2d[sample_size:]

# 绘制t-SNE散点图
plt.figure(figsize=(12, 10))
plt.scatter(text_2d[:, 0], text_2d[:, 1], alpha=0.5, label='Original Text Embeddings', s=10)
plt.scatter(image_2d[:, 0], image_2d[:, 1], alpha=0.5, label='Original Image Embeddings', s=10)
plt.title('t-SNE Visualization of Original Text and Image Embeddings')
plt.legend()
plt.grid(True, alpha=0.3)
plt.savefig(os.path.join(OUTPUT_DIR, 'original_embeddings_tsne.png'))
plt.close()

# 绘制连线图，显示相同索引的文本和图像嵌入之间的距离
plt.figure(figsize=(12, 10))
plt.scatter(text_2d[:, 0], text_2d[:, 1], alpha=0.5, label='Original Text Embeddings', s=10)
plt.scatter(image_2d[:, 0], image_2d[:, 1], alpha=0.5, label='Original Image Embeddings', s=10)

# 随机选择一些点（避免图形过于拥挤）
line_indices = np.random.choice(sample_size, 100, replace=False)
for idx in line_indices:
    plt.plot([text_2d[idx, 0], image_2d[idx, 0]], 
             [text_2d[idx, 1], image_2d[idx, 1]], 
             'k-', alpha=0.1)

plt.title('t-SNE Visualization of Original Embeddings (with Connection Lines)')
plt.legend()
plt.grid(True, alpha=0.3)
plt.savefig(os.path.join(OUTPUT_DIR, 'original_embeddings_tsne_with_lines.png'))
plt.close()

# 计算每对嵌入之间的欧氏距离
euclidean_distances = torch.norm(text_embeddings - image_embeddings, dim=1)
mean_distance = euclidean_distances.mean().item()
std_distance = euclidean_distances.std().item()

print(f"Average Euclidean distance: {mean_distance:.4f} ± {std_distance:.4f}")

# 保存欧氏距离分布直方图
plt.figure(figsize=(10, 6))
plt.hist(euclidean_distances.numpy(), bins=50, alpha=0.7)
plt.axvline(mean_distance, color='r', linestyle='--', label=f'Mean: {mean_distance:.4f}')
plt.title('Euclidean Distance Distribution Between Original Text and Image Embeddings')
plt.xlabel('Euclidean Distance')
plt.ylabel('Frequency')
plt.grid(True, alpha=0.3)
plt.legend()
plt.savefig(os.path.join(OUTPUT_DIR, 'original_embeddings_euclidean_distance.png'))
plt.close()

print(f"Analysis complete! Results saved to {OUTPUT_DIR} directory")

# 为了比较，计算之前分析中预测图像嵌入与真实图像嵌入之间的相似度与原始文本-图像嵌入相似度
print("\nComparing similarities:")
print(f"Original text-to-image cosine similarity: {mean_cos_sim:.4f}")
print(f"Model-predicted text-to-image cosine similarity: 0.8324 (from previous analysis)")
improvement = 0.8324 - mean_cos_sim
print(f"Improvement: {improvement:.4f} ({improvement/mean_cos_sim*100:.2f}%)")

Using device: cuda
Text embeddings data type: torch.float16
Image embeddings data type: torch.float16
Text embeddings shape: torch.Size([162770, 512])
Image embeddings shape: torch.Size([162770, 512])
Converted text embeddings to float32
Converted image embeddings to float32
Computing cosine similarity between original text and image embeddings...
Average cosine similarity: 0.2546 ± 0.0209
Performing t-SNE dimensionality reduction...
Average Euclidean distance: 11.8694 ± 0.6074
Analysis complete! Results saved to ./original_embeddings_comparison directory

Comparing similarities:
Original text-to-image cosine similarity: 0.2546
Model-predicted text-to-image cosine similarity: 0.8324 (from previous analysis)
Improvement: 0.5778 (226.95%)


### PCA的对比

In [8]:
# 导入所需库
import torch
import numpy as np
import matplotlib.pyplot as plt
import os
from tqdm import tqdm
from torch.nn import functional as F
from sklearn.decomposition import PCA
import torch.nn as nn

# 设置设备
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# 定义文件路径
TEXT_EMBEDDINGS_PATH = './clip_embeddings/text_embeddings.pt'
IMAGE_EMBEDDINGS_PATH = './clip_embeddings/image_embeddings.pt'
MODEL_PATH = 'text_to_image_embedder.pth'
OUTPUT_DIR = './pca_visualization_results'
os.makedirs(OUTPUT_DIR, exist_ok=True)

# 定义映射网络模型
class TextToImageEmbedder(nn.Module):
    def __init__(self, clip_dim=512, embed_dim=512):
        super(TextToImageEmbedder, self).__init__()
        self.mapping = nn.Sequential(
            nn.Linear(clip_dim, 2048),
            nn.BatchNorm1d(2048),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(2048, 1024),
            nn.BatchNorm1d(1024),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(1024, embed_dim),
            nn.BatchNorm1d(embed_dim),
            nn.ReLU(),
            nn.Linear(embed_dim, embed_dim)
        )
    
    def forward(self, text_embeddings):
        image_embeddings = self.mapping(text_embeddings)
        return image_embeddings

# 加载嵌入
print("Loading embeddings...")
text_embeddings = torch.load(TEXT_EMBEDDINGS_PATH)
image_embeddings = torch.load(IMAGE_EMBEDDINGS_PATH)

# 检查和转换数据类型
print(f"Text embeddings data type: {text_embeddings.dtype}")
print(f"Image embeddings data type: {image_embeddings.dtype}")
print(f"Text embeddings shape: {text_embeddings.shape}")
print(f"Image embeddings shape: {image_embeddings.shape}")

# 确保两者都是 float32 类型
if text_embeddings.dtype != torch.float32:
    text_embeddings = text_embeddings.float()
    print("Converted text embeddings to float32")
if image_embeddings.dtype != torch.float32:
    image_embeddings = image_embeddings.float()
    print("Converted image embeddings to float32")

# ------------ 第一部分：比较原始文本和图像嵌入 ------------

# 确保嵌入向量已归一化（对于余弦相似度计算）
text_embeddings_norm = F.normalize(text_embeddings, p=2, dim=1)
image_embeddings_norm = F.normalize(image_embeddings, p=2, dim=1)

# 计算余弦相似度
print("Computing cosine similarity between original text and image embeddings...")
cos_sim = torch.sum(text_embeddings_norm * image_embeddings_norm, dim=1)
mean_cos_sim = cos_sim.mean().item()
std_cos_sim = cos_sim.std().item()

print(f"Average cosine similarity: {mean_cos_sim:.4f} ± {std_cos_sim:.4f}")

# 为了提高效率，随机采样部分数据点
sample_size = min(5000, len(text_embeddings))
indices = np.random.choice(len(text_embeddings), sample_size, replace=False)

# 准备PCA的输入数据
print("Preparing data for PCA visualization...")
original_combined_embeddings = np.vstack([
    text_embeddings[indices].numpy(),
    image_embeddings[indices].numpy()
])

# 执行PCA降维
print("Performing PCA dimensionality reduction for original embeddings...")
pca = PCA(n_components=2, random_state=42)
original_embeddings_2d = pca.fit_transform(original_combined_embeddings)
print(f"PCA explained variance ratio: {pca.explained_variance_ratio_}")

# 分离文本嵌入和图像嵌入的PCA结果
text_2d = original_embeddings_2d[:sample_size]
image_2d = original_embeddings_2d[sample_size:]

# 绘制PCA散点图
plt.figure(figsize=(12, 10))
plt.scatter(text_2d[:, 0], text_2d[:, 1], alpha=0.5, label='Original Text Embeddings', s=10)
plt.scatter(image_2d[:, 0], image_2d[:, 1], alpha=0.5, label='Original Image Embeddings', s=10)
plt.title('PCA Visualization of Original Text and Image Embeddings')
plt.xlabel(f'Principal Component 1 ({pca.explained_variance_ratio_[0]:.2%})')
plt.ylabel(f'Principal Component 2 ({pca.explained_variance_ratio_[1]:.2%})')
plt.legend()
plt.grid(True, alpha=0.3)
plt.savefig(os.path.join(OUTPUT_DIR, 'original_embeddings_pca.png'))
plt.close()

# 绘制连线图，显示相同索引的文本和图像嵌入之间的距离
plt.figure(figsize=(12, 10))
plt.scatter(text_2d[:, 0], text_2d[:, 1], alpha=0.5, label='Original Text Embeddings', s=10)
plt.scatter(image_2d[:, 0], image_2d[:, 1], alpha=0.5, label='Original Image Embeddings', s=10)

# 随机选择一些点（避免图形过于拥挤）
line_indices = np.random.choice(sample_size, 100, replace=False)
for idx in line_indices:
    plt.plot([text_2d[idx, 0], image_2d[idx, 0]], 
             [text_2d[idx, 1], image_2d[idx, 1]], 
             'k-', alpha=0.1)

plt.title('PCA Visualization of Original Embeddings (with Connection Lines)')
plt.xlabel(f'Principal Component 1 ({pca.explained_variance_ratio_[0]:.2%})')
plt.ylabel(f'Principal Component 2 ({pca.explained_variance_ratio_[1]:.2%})')
plt.legend()
plt.grid(True, alpha=0.3)
plt.savefig(os.path.join(OUTPUT_DIR, 'original_embeddings_pca_with_lines.png'))
plt.close()

# ------------ 第二部分：使用映射网络生成预测图像嵌入并比较 ------------

# 加载模型
print("Loading model...")
embedder = TextToImageEmbedder().to(device)
embedder.load_state_dict(torch.load(MODEL_PATH, map_location=device))
embedder.eval()

# 使用训练好的映射网络生成图像嵌入
print("Generating predicted image embeddings...")
with torch.no_grad():
    # 分批处理以减少内存使用
    batch_size = 256
    pred_image_embeddings = []
    
    for i in tqdm(range(0, len(text_embeddings), batch_size)):
        batch = text_embeddings[i:i+batch_size].to(device)
        preds = embedder(batch).cpu()
        pred_image_embeddings.append(preds)
    
    pred_image_embeddings = torch.cat(pred_image_embeddings, dim=0)

# 确保嵌入向量已归一化
pred_image_embeddings_norm = F.normalize(pred_image_embeddings, p=2, dim=1)

# 计算预测图像嵌入与真实图像嵌入的余弦相似度
print("Computing cosine similarity between predicted and real image embeddings...")
pred_cos_sim = torch.sum(pred_image_embeddings_norm[indices] * image_embeddings_norm[indices], dim=1)
pred_mean_cos_sim = pred_cos_sim.mean().item()
pred_std_cos_sim = pred_cos_sim.std().item()

print(f"Average cosine similarity (predicted vs real): {pred_mean_cos_sim:.4f} ± {pred_std_cos_sim:.4f}")

# 准备PCA的输入数据
print("Preparing data for PCA visualization...")
predicted_combined_embeddings = np.vstack([
    pred_image_embeddings[indices].numpy(),
    image_embeddings[indices].numpy()
])

# 执行PCA降维
print("Performing PCA dimensionality reduction for predicted vs real embeddings...")
pca_pred = PCA(n_components=2, random_state=42)
predicted_embeddings_2d = pca_pred.fit_transform(predicted_combined_embeddings)
print(f"PCA explained variance ratio: {pca_pred.explained_variance_ratio_}")

# 分离预测图像嵌入和真实图像嵌入的PCA结果
pred_2d = predicted_embeddings_2d[:sample_size]
true_2d = predicted_embeddings_2d[sample_size:]

# 绘制PCA散点图
plt.figure(figsize=(12, 10))
plt.scatter(pred_2d[:, 0], pred_2d[:, 1], alpha=0.5, label='Predicted Image Embeddings', s=10)
plt.scatter(true_2d[:, 0], true_2d[:, 1], alpha=0.5, label='Real Image Embeddings', s=10)
plt.title('PCA Visualization of Predicted vs Real Image Embeddings')
plt.xlabel(f'Principal Component 1 ({pca_pred.explained_variance_ratio_[0]:.2%})')
plt.ylabel(f'Principal Component 2 ({pca_pred.explained_variance_ratio_[1]:.2%})')
plt.legend()
plt.grid(True, alpha=0.3)
plt.savefig(os.path.join(OUTPUT_DIR, 'predicted_vs_real_pca.png'))
plt.close()

# 绘制连线图
plt.figure(figsize=(12, 10))
plt.scatter(pred_2d[:, 0], pred_2d[:, 1], alpha=0.5, label='Predicted Image Embeddings', s=10)
plt.scatter(true_2d[:, 0], true_2d[:, 1], alpha=0.5, label='Real Image Embeddings', s=10)

# 使用相同的line_indices保持一致性
for idx in line_indices:
    plt.plot([pred_2d[idx, 0], true_2d[idx, 0]], 
             [pred_2d[idx, 1], true_2d[idx, 1]], 
             'k-', alpha=0.1)

plt.title('PCA Visualization of Predicted vs Real Image Embeddings (with Connection Lines)')
plt.xlabel(f'Principal Component 1 ({pca_pred.explained_variance_ratio_[0]:.2%})')
plt.ylabel(f'Principal Component 2 ({pca_pred.explained_variance_ratio_[1]:.2%})')
plt.legend()
plt.grid(True, alpha=0.3)
plt.savefig(os.path.join(OUTPUT_DIR, 'predicted_vs_real_pca_with_lines.png'))
plt.close()

# 比较改进情况
improvement = pred_mean_cos_sim - mean_cos_sim
print("\nComparison of similarities:")
print(f"Original text-to-image cosine similarity: {mean_cos_sim:.4f}")
print(f"Predicted-to-real image cosine similarity: {pred_mean_cos_sim:.4f}")
print(f"Improvement: {improvement:.4f} ({improvement/mean_cos_sim*100:.2f}%)")

print(f"\nVisualization complete! Results saved to {OUTPUT_DIR} directory")

Using device: cuda
Loading embeddings...


Text embeddings data type: torch.float16
Image embeddings data type: torch.float16
Text embeddings shape: torch.Size([162770, 512])
Image embeddings shape: torch.Size([162770, 512])
Converted text embeddings to float32
Converted image embeddings to float32
Computing cosine similarity between original text and image embeddings...
Average cosine similarity: 0.2546 ± 0.0209
Preparing data for PCA visualization...
Performing PCA dimensionality reduction for original embeddings...
PCA explained variance ratio: [0.44088757 0.04314086]
Loading model...
Generating predicted image embeddings...


100%|██████████| 636/636 [00:00<00:00, 1740.40it/s]


Computing cosine similarity between predicted and real image embeddings...
Average cosine similarity (predicted vs real): 0.8313 ± 0.0518
Preparing data for PCA visualization...
Performing PCA dimensionality reduction for predicted vs real embeddings...
PCA explained variance ratio: [0.7328643  0.05078154]

Comparison of similarities:
Original text-to-image cosine similarity: 0.2546
Predicted-to-real image cosine similarity: 0.8313
Improvement: 0.5768 (226.54%)

Visualization complete! Results saved to ./pca_visualization_results directory


### MDS的对比

In [10]:
# 导入所需库
import torch
import numpy as np
import matplotlib.pyplot as plt
import os
from tqdm import tqdm
from torch.nn import functional as F
from sklearn.manifold import MDS
import torch.nn as nn
from sklearn.metrics.pairwise import cosine_distances

# 设置设备
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# 定义文件路径
TEXT_EMBEDDINGS_PATH = './clip_embeddings/text_embeddings.pt'
IMAGE_EMBEDDINGS_PATH = './clip_embeddings/image_embeddings.pt'
MODEL_PATH = 'text_to_image_embedder.pth'
OUTPUT_DIR = './improved_mds_visualization'
os.makedirs(OUTPUT_DIR, exist_ok=True)

# 定义映射网络模型
class TextToImageEmbedder(nn.Module):
    def __init__(self, clip_dim=512, embed_dim=512):
        super(TextToImageEmbedder, self).__init__()
        self.mapping = nn.Sequential(
            nn.Linear(clip_dim, 2048),
            nn.BatchNorm1d(2048),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(2048, 1024),
            nn.BatchNorm1d(1024),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(1024, embed_dim),
            nn.BatchNorm1d(embed_dim),
            nn.ReLU(),
            nn.Linear(embed_dim, embed_dim)
        )
    
    def forward(self, text_embeddings):
        image_embeddings = self.mapping(text_embeddings)
        return image_embeddings

# 加载嵌入
print("Loading embeddings...")
text_embeddings = torch.load(TEXT_EMBEDDINGS_PATH)
image_embeddings = torch.load(IMAGE_EMBEDDINGS_PATH)

# 检查和转换数据类型
print(f"Text embeddings data type: {text_embeddings.dtype}")
print(f"Image embeddings data type: {image_embeddings.dtype}")
print(f"Text embeddings shape: {text_embeddings.shape}")
print(f"Image embeddings shape: {image_embeddings.shape}")

# 确保两者都是 float32 类型
if text_embeddings.dtype != torch.float32:
    text_embeddings = text_embeddings.float()
    print("Converted text embeddings to float32")
if image_embeddings.dtype != torch.float32:
    image_embeddings = image_embeddings.float()
    print("Converted image embeddings to float32")

# 确保嵌入向量已归一化（对于余弦相似度计算）
text_embeddings_norm = F.normalize(text_embeddings, p=2, dim=1)
image_embeddings_norm = F.normalize(image_embeddings, p=2, dim=1)

# 加载模型并生成预测嵌入
print("Loading model...")
embedder = TextToImageEmbedder().to(device)
embedder.load_state_dict(torch.load(MODEL_PATH, map_location=device))
embedder.eval()

# 使用训练好的映射网络生成图像嵌入
print("Generating predicted image embeddings...")
with torch.no_grad():
    # 分批处理以减少内存使用
    batch_size = 256
    pred_image_embeddings = []
    
    for i in tqdm(range(0, len(text_embeddings), batch_size)):
        batch = text_embeddings[i:i+batch_size].to(device)
        preds = embedder(batch).cpu()
        pred_image_embeddings.append(preds)
    
    pred_image_embeddings = torch.cat(pred_image_embeddings, dim=0)

# 确保预测嵌入向量已归一化
pred_image_embeddings_norm = F.normalize(pred_image_embeddings, p=2, dim=1)

# 计算余弦相似度
print("Computing cosine similarities...")
orig_cos_sim = torch.sum(text_embeddings_norm * image_embeddings_norm, dim=1)
pred_cos_sim = torch.sum(pred_image_embeddings_norm * image_embeddings_norm, dim=1)

orig_mean_cos_sim = orig_cos_sim.mean().item()
pred_mean_cos_sim = pred_cos_sim.mean().item()

print(f"Original text-to-image cosine similarity: {orig_mean_cos_sim:.4f}")
print(f"Predicted-to-real image cosine similarity: {pred_mean_cos_sim:.4f}")
improvement = pred_mean_cos_sim - orig_mean_cos_sim
print(f"Improvement: {improvement:.4f} ({improvement/orig_mean_cos_sim*100:.2f}%)")

# 为了更好的可视化效果，增加采样点数
sample_size = 800  # 使用800个点来平衡计算速度和视觉效果
indices = np.random.choice(len(text_embeddings), sample_size, replace=False)

# 提取采样的嵌入
sampled_text_embeds = text_embeddings[indices]
sampled_image_embeds = image_embeddings[indices]
sampled_pred_embeds = pred_image_embeddings[indices]

# 设置MDS参数 - 调整以更关注整体分布重叠
# 使用较小的eps值和更多迭代次数来提高精度
mds_params = {
    'n_components': 2,
    'random_state': 42,
    'n_init': 1,
    'max_iter': 500,  # 增加迭代次数
    'n_jobs': 1,
    'dissimilarity': 'precomputed',
    'normalized_stress': 'auto',
    'eps': 1e-6  # 更严格的收敛标准
}

# ------------ 第一部分：所有三种嵌入的综合可视化 ------------
print("Creating combined visualization of all embeddings...")

# 准备包含所有三种嵌入的数据
all_combined_embeddings = np.vstack([
    sampled_text_embeds.numpy(),
    sampled_image_embeds.numpy(),
    sampled_pred_embeds.numpy()
])

# 计算余弦距离矩阵
cosine_dist_matrix = cosine_distances(all_combined_embeddings)

# 执行MDS降维
print("Performing MDS for all three embedding types...")
mds = MDS(**mds_params)
all_embeddings_2d = mds.fit_transform(cosine_dist_matrix)
print(f"MDS stress: {mds.stress_:.4f}")

# 分离三种嵌入的MDS结果
text_2d = all_embeddings_2d[:sample_size]
image_2d = all_embeddings_2d[sample_size:2*sample_size]
pred_2d = all_embeddings_2d[2*sample_size:]

# 绘制包含三种嵌入的散点图
plt.figure(figsize=(14, 12))
plt.scatter(text_2d[:, 0], text_2d[:, 1], alpha=0.6, label='Original Text Embeddings', s=15, color='blue')
plt.scatter(image_2d[:, 0], image_2d[:, 1], alpha=0.6, label='Real Image Embeddings', s=15, color='orange')
plt.scatter(pred_2d[:, 0], pred_2d[:, 1], alpha=0.6, label='Predicted Image Embeddings', s=15, color='green')

# 添加标题和标签
plt.title('MDS Visualization of All Three Embedding Types', fontsize=16)
plt.legend(fontsize=12)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, 'three_embeddings_mds.png'), dpi=300)
plt.close()

# ------------ 第二部分：原始文本和图像嵌入与预测图像嵌入的距离连线 ------------
print("Creating visualizations with connection lines...")

# 绘制文本到预测图像的连线 - 显示映射效果
plt.figure(figsize=(14, 12))
plt.scatter(text_2d[:, 0], text_2d[:, 1], alpha=0.5, label='Original Text Embeddings', s=10, color='blue')
plt.scatter(pred_2d[:, 0], pred_2d[:, 1], alpha=0.5, label='Predicted Image Embeddings', s=10, color='green')

# 选择50个点显示连线
line_indices = np.random.choice(sample_size, 50, replace=False)
for idx in line_indices:
    plt.plot([text_2d[idx, 0], pred_2d[idx, 0]], 
             [text_2d[idx, 1], pred_2d[idx, 1]], 
             'k-', alpha=0.2)

plt.title('Text to Predicted Image Embedding Mapping', fontsize=16)
plt.legend(fontsize=12)
plt.grid(True, alpha=0.3)
plt.savefig(os.path.join(OUTPUT_DIR, 'text_to_predicted_connections.png'), dpi=300)
plt.close()

# 绘制预测图像到真实图像的连线 - 显示预测与真实的差距
plt.figure(figsize=(14, 12))
plt.scatter(pred_2d[:, 0], pred_2d[:, 1], alpha=0.5, label='Predicted Image Embeddings', s=10, color='green')
plt.scatter(image_2d[:, 0], image_2d[:, 1], alpha=0.5, label='Real Image Embeddings', s=10, color='orange')

# 使用相同的索引绘制连线
for idx in line_indices:
    plt.plot([pred_2d[idx, 0], image_2d[idx, 0]], 
             [pred_2d[idx, 1], image_2d[idx, 1]], 
             'k-', alpha=0.2)

plt.title('Predicted vs Real Image Embeddings', fontsize=16)
plt.legend(fontsize=12)
plt.grid(True, alpha=0.3)
plt.savefig(os.path.join(OUTPUT_DIR, 'predicted_to_real_connections.png'), dpi=300)
plt.close()

# ------------ 第三部分：距离分布直方图 ------------
print("Creating distance distribution histograms...")

# 计算欧氏距离
text_to_image_dist = np.sqrt(np.sum((sampled_text_embeds.numpy() - sampled_image_embeds.numpy())**2, axis=1))
pred_to_image_dist = np.sqrt(np.sum((sampled_pred_embeds.numpy() - sampled_image_embeds.numpy())**2, axis=1))

# 绘制欧氏距离分布直方图
plt.figure(figsize=(12, 8))
plt.hist(text_to_image_dist, bins=50, alpha=0.5, label='Text to Real Image', color='blue')
plt.hist(pred_to_image_dist, bins=50, alpha=0.5, label='Predicted to Real Image', color='green')
plt.axvline(np.mean(text_to_image_dist), color='blue', linestyle='--', 
           label=f'Mean Text-Image: {np.mean(text_to_image_dist):.2f}')
plt.axvline(np.mean(pred_to_image_dist), color='green', linestyle='--', 
           label=f'Mean Pred-Image: {np.mean(pred_to_image_dist):.2f}')
plt.title('Euclidean Distance Distribution', fontsize=16)
plt.xlabel('Distance', fontsize=14)
plt.ylabel('Frequency', fontsize=14)
plt.legend(fontsize=12)
plt.grid(True, alpha=0.3)
plt.savefig(os.path.join(OUTPUT_DIR, 'distance_distribution.png'), dpi=300)
plt.close()

print(f"Visualization complete! Results saved to {OUTPUT_DIR} directory")

Using device: cuda
Loading embeddings...
Text embeddings data type: torch.float16
Image embeddings data type: torch.float16
Text embeddings shape: torch.Size([162770, 512])
Image embeddings shape: torch.Size([162770, 512])
Converted text embeddings to float32
Converted image embeddings to float32
Loading model...
Generating predicted image embeddings...


100%|██████████| 636/636 [00:00<00:00, 1716.21it/s]


Computing cosine similarities...
Original text-to-image cosine similarity: 0.2546
Predicted-to-real image cosine similarity: 0.8324
Improvement: 0.5778 (226.94%)
Creating combined visualization of all embeddings...
Performing MDS for all three embedding types...
MDS stress: 25435.1744
Creating visualizations with connection lines...
Creating distance distribution histograms...
Visualization complete! Results saved to ./improved_mds_visualization directory


## 映射结果的计算 我们告一段落，主要用MDS即可，然后辅助性的图可以用t-sne图，还有余弦相似度的图，还有欧几里得距离的图

### 利用以上消息，我们接着干干嘛呢？ 接着我们就需要训练我们的模型本身内容 训练模型本身

其中代码中明确注释标记了：

0: train (训练集)

1: val (验证集)

2: test (测试集)

文件对应关系

因此，生成的文件对应关系如下：

训练用的嵌入文件:

文本嵌入: text_embeddings_partition_0.pt

图像嵌入: image_embeddings_partition_0.pt

验证用的嵌入文件:

文本嵌入: text_embeddings_partition_1.pt

图像嵌入: image_embeddings_partition_1.pt

测试用的嵌入文件:

文本嵌入: text_embeddings_partition_2.pt

图像嵌入: image_embeddings_partition_2.pt


所以我们本就应该用生成的图像嵌入和原本的图像作为训练的内容进行训练，也只能这样

训练的代码取自realtest3_clipcvae.ipynb的倒数第二个，最后一个是在这个基础上改的。


### 我们在CVAE训练的基础上 ， 用 Renyi divergence 重新训练我们的模型 训练成功了 就在Renyi_test2.ipynb中 Alpha=0.9

In [3]:
import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import clip
from PIL import Image
import numpy as np
import pandas as pd
from tqdm import tqdm
import lpips
from scipy import stats

# FID计算需要的库
try:
    from pytorch_fid import fid_score
except ImportError:
    print("pytorch_fid未安装，请使用 pip install pytorch-fid 安装")

# Inception Score计算需要的库
try:
    from torchvision.models import inception_v3
except ImportError:
    print("torchvision未安装或版本过低，请使用 pip install -U torchvision 更新")

# ------------------------ #
# 1. 设备设置
# ------------------------ #
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"使用设备: {device}")

# ------------------------ #
# 2. 加载CLIP模型
# ------------------------ #
clip_model, preprocess = clip.load("ViT-B/32", device=device)
clip_model.eval()
print("CLIP模型已加载。")

# ------------------------ #
# 3. 定义CelebA数据集类（用于测试集）
# ------------------------ #
class CelebADataset(Dataset):
    def __init__(self, img_dir, attr_path, bbox_path, partition_path, 
                 image_embeddings_path, transform=None, partition=2):  # 默认加载测试集
        self.img_dir = img_dir
        self.transform = transform

        # 读取属性文件
        attr_df = pd.read_csv(attr_path, sep=',', header=0)
        
        # 读取分区文件
        partition_df = pd.read_csv(partition_path, sep=',', header=0, names=['image_id', 'partition'])
        
        # 读取边界框文件
        bbox_df = pd.read_csv(bbox_path, sep=',', header=0, names=['image_id', 'x_1', 'y_1', 'width', 'height'])
        
        # 合并数据
        attr_df = attr_df.merge(partition_df, on='image_id')
        attr_df = attr_df.merge(bbox_df, on='image_id')
        
        # 筛选分区
        print(f"Total samples before filtering: {len(attr_df)}")
        self.attr_df = attr_df[attr_df['partition'] == partition].reset_index(drop=True)
        print(f"Total samples after filtering partition {partition}: {len(self.attr_df)}")

        # 获取属性名称
        self.attr_names = [col for col in attr_df.columns if col not in ['image_id', 'partition', 'x_1', 'y_1', 'width', 'height']]
        
        # 加载预计算的图像嵌入
        self.image_embeddings = torch.load(image_embeddings_path)
        if len(self.image_embeddings) != len(self.attr_df):
            raise ValueError("图像嵌入的数量与数据集中的图像数量不一致。")
        self.image_embeddings = self.image_embeddings.float()

    def __len__(self):
        return len(self.attr_df)

    def __getitem__(self, idx):
        # 获取图像路径
        img_name = self.attr_df.iloc[idx, self.attr_df.columns.get_loc('image_id')]
        img_path = os.path.join(self.img_dir, img_name)
        image = Image.open(img_path).convert('RGB')

        if self.transform:
            image = self.transform(image)

        # 获取属性标签 
        attrs = self.attr_df.iloc[idx][self.attr_names].values
        attrs = (attrs + 1) // 2  # 将-1转为0，1保持1
        attrs = attrs.astype(np.float32)

        # 获取对应的图像嵌入
        clip_embedding = self.image_embeddings[idx]

        return image, attrs, clip_embedding, img_name

# ------------------------ #
# 4. 定义CVAE模型（KL和Rényi）
# ------------------------ #
class ClipCVAE(nn.Module):
    def __init__(self, img_channels=3, img_size=64, latent_dim=128, 
                 cond_dim=40, clip_dim=512):
        super(ClipCVAE, self).__init__()
        self.img_size = img_size
        self.latent_dim = latent_dim
        self.cond_dim = cond_dim
        self.clip_dim = clip_dim

        # 编码器部分
        self.encoder = nn.Sequential(
            nn.Conv2d(img_channels + cond_dim + clip_dim, 64, kernel_size=4, stride=2, padding=1),
            nn.ReLU(),
            nn.Conv2d(64, 128, kernel_size=4, stride=2, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.Conv2d(128, 256, kernel_size=4, stride=2, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.Conv2d(256, 512, kernel_size=4, stride=2, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(),
            nn.Flatten()
        )
        self.fc_mu = nn.Linear(512*4*4, latent_dim)
        self.fc_logvar = nn.Linear(512*4*4, latent_dim)

        # 解码器部分
        self.decoder_input = nn.Linear(latent_dim + cond_dim + clip_dim, 512*4*4)
        self.decoder = nn.Sequential(
            nn.ConvTranspose2d(512, 256, kernel_size=4, stride=2, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.ConvTranspose2d(256, 128, kernel_size=4, stride=2, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.ConvTranspose2d(128, 64, kernel_size=4, stride=2, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.ConvTranspose2d(64, img_channels, kernel_size=4, stride=2, padding=1),
            nn.Tanh()
        )

    def encode(self, x, c, clip_embedding):
        c = c.view(c.size(0), self.cond_dim, 1, 1).repeat(1, 1, self.img_size, self.img_size)
        clip_embedding = clip_embedding.view(clip_embedding.size(0), self.clip_dim, 1, 1).repeat(1, 1, self.img_size, self.img_size)
        x = torch.cat([x, c, clip_embedding], dim=1)
        x = self.encoder(x)
        mu = self.fc_mu(x)
        logvar = self.fc_logvar(x)
        return mu, logvar

    def reparameterize(self, mu, logvar):
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std

    def decode(self, z, c, clip_embedding):
        z = torch.cat([z, c, clip_embedding], dim=1)
        x = self.decoder_input(z)
        x = x.view(-1, 512, 4, 4)
        x = self.decoder(x)
        return x

    def forward(self, x, c, clip_embedding):
        mu, logvar = self.encode(x, c, clip_embedding)
        z = self.reparameterize(mu, logvar)
        recon_x = self.decode(z, c, clip_embedding)
        return recon_x, mu, logvar

# 对于Rényi模型，使用同样的架构
ClipCVAE_Renyi = ClipCVAE

# ------------------------ #
# 5. 评估指标函数
# ------------------------ #

# 5.1 FID评估函数
def calculate_fid(real_images, generated_images):
    """
    计算FID分数
    :param real_images: 真实图像张量
    :param generated_images: 生成图像张量
    :return: FID分数
    """
    # 创建临时目录
    os.makedirs("temp_real", exist_ok=True)
    os.makedirs("temp_gen", exist_ok=True)
    
    # 保存真实图像
    for i, img in enumerate(real_images):
        img = (img.cpu().numpy().transpose(1, 2, 0) * 0.5 + 0.5) * 255
        img = img.astype(np.uint8)
        Image.fromarray(img).save(f"temp_real/img_{i}.png")
    
    # 保存生成图像
    for i, img in enumerate(generated_images):
        img = (img.cpu().numpy().transpose(1, 2, 0) * 0.5 + 0.5) * 255
        img = img.astype(np.uint8)
        Image.fromarray(img).save(f"temp_gen/img_{i}.png")
    
    # 计算FID
    fid_value = fid_score.calculate_fid_given_paths(["temp_real", "temp_gen"], 50, device, 2048)
    
    # 清理临时文件
    import shutil
    shutil.rmtree("temp_real")
    shutil.rmtree("temp_gen")
    
    return fid_value

# 5.2 Inception Score计算
def calculate_inception_score(imgs, batch_size=32, splits=10):
    """
    计算Inception Score
    :param imgs: 图像张量 [N, 3, H, W], 范围[-1, 1]
    :return: (mean, std) of IS
    """
    # 加载预训练InceptionV3
    inception_model = inception_v3(pretrained=True, transform_input=False).to(device)
    inception_model.eval()
    
    # 调整图像大小为299x299（Inception V3的标准输入大小）
    resize = torch.nn.Upsample(size=(299, 299), mode='bilinear', align_corners=True)
    
    # 获取预测
    N = len(imgs)
    preds = []
    
    for i in range(0, N, batch_size):
        batch = imgs[i:i+batch_size].to(device)
        batch = (batch + 1) / 2  # 从[-1,1]转换到[0,1]
        batch = resize(batch)  # 调整大小为299x299
        
        with torch.no_grad():
            pred = inception_model(batch)
            pred = torch.nn.functional.softmax(pred, dim=1)
            preds.append(pred.cpu().numpy())
    
    preds = np.concatenate(preds, axis=0)
    
    # 计算IS
    scores = []
    for k in range(splits):
        part = preds[k * (N // splits): (k+1) * (N // splits), :]
        kl = part * (np.log(part) - np.log(np.expand_dims(np.mean(part, 0), 0)))
        kl = np.mean(np.sum(kl, 1))
        scores.append(np.exp(kl))
    
    return np.mean(scores), np.std(scores)

# 5.3 LPIPS感知相似度计算
def calculate_lpips(real_images, generated_images):
    """
    计算LPIPS感知相似度
    :param real_images: 真实图像张量 [N, 3, H, W]
    :param generated_images: 生成图像张量 [N, 3, H, W]
    :return: 平均LPIPS分数
    """
    loss_fn_alex = lpips.LPIPS(net='alex').to(device)
    total_distance = 0.0
    batch_size = 32
    n_batches = len(real_images) // batch_size + (1 if len(real_images) % batch_size != 0 else 0)
    
    with torch.no_grad():
        for i in range(n_batches):
            start_idx = i * batch_size
            end_idx = min(start_idx + batch_size, len(real_images))
            
            real_batch = real_images[start_idx:end_idx].to(device)
            gen_batch = generated_images[start_idx:end_idx].to(device)
            
            distance = loss_fn_alex(real_batch, gen_batch)
            total_distance += distance.sum().item()
    
    return total_distance / len(real_images)

# 6. 执行评估的主函数
def evaluate_models():
    # 数据路径
    img_dir = '/root/autodl-tmp/celeba_datasets/img_align_celeba/img_align_celeba'
    attr_path = '/root/autodl-tmp/celeba_datasets/list_attr_celeba.txt'
    bbox_path = '/root/autodl-tmp/celeba_datasets/list_bbox_celeba.txt'
    partition_path = '/root/autodl-tmp/celeba_datasets/list_eval_partition.txt'
    image_embeddings_test_path = '/root/autodl-tmp/clip_embeddings/image_embeddings_partition_2.pt'
    
    # 模型路径
    kl_model_path = 'clip_cvae_celeba.pth'
    renyi_model_path = 'model_checkpoints/cvae_renyi_alpha-2.0_best.pth'
    
    # 检查模型文件是否存在
    if not os.path.exists(kl_model_path):
        print(f"KL模型文件不存在: {kl_model_path}")
        return
    
    if not os.path.exists(renyi_model_path):
        print(f"Rényi模型文件不存在: {renyi_model_path}")
        return
    
    # 数据预处理
    transform = transforms.Compose([
        transforms.Resize((64, 64)),
        transforms.ToTensor(),
        transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])
    ])
    
    # 加载测试数据集
    print("加载测试数据集...")
    test_dataset = CelebADataset(img_dir, attr_path, bbox_path, partition_path, 
                                image_embeddings_test_path, transform=transform, partition=2)
    test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, num_workers=4)
    
    # 模型参数
    latent_dim = 128
    cond_dim = 40
    clip_dim = 512
    
    # 加载KL模型
    print("加载KL模型...")
    kl_model = ClipCVAE(img_channels=3, img_size=64, latent_dim=latent_dim, 
                         cond_dim=cond_dim, clip_dim=clip_dim).to(device)
    kl_model.load_state_dict(torch.load(kl_model_path))
    kl_model.eval()
    
    # 加载Rényi模型
    print("加载Rényi模型...")
    renyi_model = ClipCVAE_Renyi(img_channels=3, img_size=64, latent_dim=latent_dim, 
                                 cond_dim=cond_dim, clip_dim=clip_dim).to(device)
    try:
        # 尝试直接加载状态字典
        renyi_model.load_state_dict(torch.load(renyi_model_path))
    except:
        # 如果失败，尝试从checkpoint中加载
        checkpoint = torch.load(renyi_model_path)
        if 'model_state_dict' in checkpoint:
            renyi_model.load_state_dict(checkpoint['model_state_dict'])
        else:
            print("无法加载Rényi模型，检查文件格式")
            return
    renyi_model.eval()
    
    # 采样图像用于评估（限制样本数量以加快计算）
    n_samples = 1000  # 使用1000张图像进行评估
    print(f"从测试集采样{n_samples}张图像进行评估...")
    
    # 收集真实图像和两个模型生成的图像
    real_images = []
    kl_generated_images = []
    renyi_generated_images = []
    
    with torch.no_grad():
        for data, attrs, clip_emb, _ in tqdm(test_loader, desc="生成评估图像"):
            if len(real_images) >= n_samples:
                break
                
            # 移动数据到设备
            data = data.to(device)
            attrs = attrs.to(device)
            clip_emb = clip_emb.to(device)
            
            # 生成重构图像
            kl_recon, _, _ = kl_model(data, attrs, clip_emb)
            renyi_recon, _, _ = renyi_model(data, attrs, clip_emb)
            
            # 添加到列表
            real_images.append(data.cpu())
            kl_generated_images.append(kl_recon.cpu())
            renyi_generated_images.append(renyi_recon.cpu())
            
            if len(real_images) * data.size(0) >= n_samples:
                break
    
    # 合并batch
    real_images = torch.cat(real_images, dim=0)[:n_samples]
    kl_generated_images = torch.cat(kl_generated_images, dim=0)[:n_samples]
    renyi_generated_images = torch.cat(renyi_generated_images, dim=0)[:n_samples]
    
    print(f"采样完成，每个类别有{len(real_images)}张图像")
    
    # 计算FID
    print("计算FID...")
    kl_fid = calculate_fid(real_images, kl_generated_images)
    renyi_fid = calculate_fid(real_images, renyi_generated_images)
    
    # 计算IS
    print("计算Inception Score...")
    kl_is, kl_is_std = calculate_inception_score(kl_generated_images)
    renyi_is, renyi_is_std = calculate_inception_score(renyi_generated_images)
    
    # 计算LPIPS
    print("计算LPIPS...")
    kl_lpips = calculate_lpips(real_images, kl_generated_images)
    renyi_lpips = calculate_lpips(real_images, renyi_generated_images)
    
    # 计算改进百分比
    fid_improvement = ((kl_fid - renyi_fid) / kl_fid) * 100
    is_improvement = ((renyi_is - kl_is) / kl_is) * 100
    lpips_improvement = ((renyi_lpips - kl_lpips) / kl_lpips) * 100
    
    # 输出结果
    print("\n=========== 评估结果 ===========")
    print(f"CVAE-KL FID: {kl_fid:.4f}")
    print(f"CVAE-Rényi FID: {renyi_fid:.4f} (改进: {fid_improvement:.2f}%)")
    print(f"CVAE-KL IS: {kl_is:.4f} ± {kl_is_std:.4f}")
    print(f"CVAE-Rényi IS: {renyi_is:.4f} ± {renyi_is_std:.4f} (改进: {is_improvement:.2f}%)")
    print(f"CVAE-KL LPIPS: {kl_lpips:.4f}")
    print(f"CVAE-Rényi LPIPS: {renyi_lpips:.4f} (改进: {lpips_improvement:.2f}%)")
    
    # 创建表格数据
    table_data = {
        'Model': ['CVAE-KL', 'CVAE-Rényi (α=0.9)'],
        'FID': [f"{kl_fid:.4f}", f"{renyi_fid:.4f}"],
        'IS': [f"{kl_is:.4f}", f"{renyi_is:.4f}"],
        'LPIPS': [f"{kl_lpips:.4f}", f"{renyi_lpips:.4f}"],
        'p-value': ['N/A', 'N/A']  # 由于只有一次运行，无法计算p值
    }
    
    df = pd.DataFrame(table_data)
    print("\nLatex表格数据:")
    print(df.to_latex(index=False))
    
    # 保存结果
    df.to_csv("divergence_comparison_results.csv", index=False)
    print("结果已保存到 divergence_comparison_results.csv")
    
    # 返回结果
    return {
        'kl_fid': kl_fid,
        'renyi_fid': renyi_fid,
        'kl_is': kl_is,
        'renyi_is': renyi_is,
        'kl_lpips': kl_lpips,
        'renyi_lpips': renyi_lpips
    }

if __name__ == "__main__":
    # 执行评估
    results = evaluate_models()

使用设备: cuda
CLIP模型已加载。
加载测试数据集...
Total samples before filtering: 202599
Total samples after filtering partition 2: 19962
加载KL模型...
加载Rényi模型...
从测试集采样1000张图像进行评估...


生成评估图像:   5%|▍         | 15/312 [00:01<00:21, 14.07it/s]


采样完成，每个类别有1000张图像
计算FID...


100%|██████████| 20/20 [00:00<00:00, 28.71it/s]
100%|██████████| 20/20 [00:00<00:00, 29.39it/s]
100%|██████████| 20/20 [00:00<00:00, 29.35it/s]
100%|██████████| 20/20 [00:00<00:00, 29.64it/s]


计算Inception Score...




计算LPIPS...
Setting up [LPIPS] perceptual loss: trunk [alex], v[0.1], spatial [off]




Loading model from: /root/miniconda3/envs/pytorc_test1/lib/python3.12/site-packages/lpips/weights/v0.1/alex.pth
Setting up [LPIPS] perceptual loss: trunk [alex], v[0.1], spatial [off]
Loading model from: /root/miniconda3/envs/pytorc_test1/lib/python3.12/site-packages/lpips/weights/v0.1/alex.pth

CVAE-KL FID: 62.7659
CVAE-Rényi FID: 328.9333 (改进: -424.06%)
CVAE-KL IS: 2.2409 ± 0.1771
CVAE-Rényi IS: 1.6552 ± 0.0376 (改进: -26.14%)
CVAE-KL LPIPS: 0.1379
CVAE-Rényi LPIPS: 0.3872 (改进: 180.87%)

Latex表格数据:
\begin{tabular}{lllll}
\toprule
Model & FID & IS & LPIPS & p-value \\
\midrule
CVAE-KL & 62.7659 & 2.2409 & 0.1379 & N/A \\
CVAE-Rényi (α=0.9) & 328.9333 & 1.6552 & 0.3872 & N/A \\
\bottomrule
\end{tabular}

结果已保存到 divergence_comparison_results.csv


### 上述结果表明renyi alpha=0.9 效果不如KL divergence 诶
所以我们现在有了0.5 0.7 0.9的FID的值，去评价不同指标，这样也比较好写。

### 如何做插值分析？
我们需要什么，一个训练好的模型函数，一个生成函数，pdf。训练的收敛曲线，