In [1]:
import os
import numpy as np
import pandas as pd
from typing import Dict, List, Tuple
import matplotlib.pyplot as plt
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
import multiprocessing as mp
from functools import partial
import time

# Check whether the optional neo / quantities / elephant stack is installed.
# On any import failure the four names are bound to None so that later code
# can test for availability instead of hitting a NameError.
# NOTE(review): the cells visible in this section only use the binned
# firing-rate path and never reference these names — confirm they are still needed.
try:
    import neo
    import quantities as pq
    from elephant.statistics import instantaneous_rate
    from elephant.kernels import GaussianKernel
    print("✅ neo和elephant库已安装")
except Exception as e:
    # Broad catch: the failure is reported and the notebook keeps running.
    print(f"⚠️ 警告: 无法导入neo和elephant库: {e}")
    neo = None
    pq = None
    instantaneous_rate = None
    GaussianKernel = None

# Report how many CPU cores multiprocessing sees.
print(f"可用CPU核心数: {mp.cpu_count()}")


✅ neo和elephant库已安装
可用CPU核心数: 32


In [4]:
# Configuration parameters
DATE_STR = "20240112"
SAMPLE_RATE = 30000.0  # Hz
# 0.01 s = 10 ms; passed as the bin width when building the firing-rate matrices.
# NOTE(review): the previous comment said "20 ms", which does not match the value — confirm which is intended.
SAMPLING_PERIOD_SEC = 0.01
# 0.02 s = 20 ms Gaussian kernel standard deviation.
# NOTE(review): the previous comment said "40 ms"; also not used by the binned path in the visible cells.
KERNEL_SIGMA_SEC = 0.02

# File paths
BASE_DIR = "/media/ubuntu/sda/Monkey/sorted_result_combined"
TRIGGER_DIR = "/media/ubuntu/sda/Monkey/trigger"
# NOTE(review): the constant is named TRAIN_IMAGE_CSV but points at test_image.csv — confirm the intended file.
TRAIN_IMAGE_CSV = "/media/ubuntu/sda/Monkey/scripts/test_image.csv"

# Arrays whose sorted data are merged
ARRAYS = [
    'Hub1-instance1_V1',
    'Hub2-instance1_V1'
]

# Echo the configuration so it is recorded in the notebook output
print(f"处理日期: {DATE_STR}")
print(f"合并的arrays: {ARRAYS}")
print(f"采样率: {SAMPLE_RATE} Hz")
print(f"采样周期: {SAMPLING_PERIOD_SEC} 秒")
print(f"高斯核标准差: {KERNEL_SIGMA_SEC} 秒")


处理日期: 20240112
合并的arrays: ['Hub1-instance1_V1', 'Hub2-instance1_V1']
采样率: 30000.0 Hz
采样周期: 0.01 秒
高斯核标准差: 0.02 秒


In [5]:
def load_image_class_mapping(csv_path: str = None) -> Dict[int, str]:
    """Load the image-ID -> class-name mapping from an image CSV.

    The CSV must contain the columns ``things_path`` and ``class``. The image
    ID of a row is taken to be the LAST run of digits found in ``things_path``.

    Args:
        csv_path: CSV file to read. Defaults to the notebook-level
            ``TRAIN_IMAGE_CSV`` when omitted, which is the original behaviour.

    Returns:
        Dict mapping image ID (int) to class name. Rows whose path is missing
        or contains no digits are skipped. When several rows yield the same
        ID the last row wins (as before), but conflicting classes are now
        reported instead of being overwritten silently.
    """
    import re  # kept function-local, as in the original cell, but no longer re-run per row

    print("加载图像类别映射...")

    if csv_path is None:
        csv_path = TRAIN_IMAGE_CSV
    image_df = pd.read_csv(csv_path)

    digit_runs = re.compile(r'\d+')
    image_id_to_class: Dict[int, str] = {}
    n_conflicts = 0

    for things_path, class_name in zip(image_df['things_path'], image_df['class']):
        # An empty CSV cell is read as NaN (a float); re.findall would raise TypeError on it.
        if not isinstance(things_path, str):
            continue

        numbers = digit_runs.findall(things_path)
        if not numbers:
            continue

        # Use the last number in the path as the candidate image ID.
        image_id = int(numbers[-1])
        previous = image_id_to_class.get(image_id)
        if previous is not None and previous != class_name:
            n_conflicts += 1
        image_id_to_class[image_id] = class_name

    if n_conflicts:
        # Many rows collapsing onto few IDs usually means the "last number"
        # heuristic does not match this CSV's path layout.
        print(f"⚠️ 警告: {n_conflicts} 行的图像ID与已有映射冲突（不同类别映射到同一ID），后出现的行覆盖了之前的映射")

    print(f"加载了 {len(image_id_to_class)} 个图像类别映射")
    print(f"类别示例: {list(set(list(image_id_to_class.values())[:10]))}")

    return image_id_to_class

# Load the image-ID -> class mapping once; later cells pass it to the matrix builder
image_class_mapping = load_image_class_mapping()


加载图像类别映射...
加载了 11 个图像类别映射
类别示例: ['bulldozer', 'helicopter', 'wig', 'monkey', 'iguana', 'whip', 'tamale', 'dough', 'wasp', 'key']


In [None]:
# Simplified fast path: plain binning instead of a kernel-smoothed instantaneous rate
def compute_firing_rate_matrix_binned(
    start_sec: float,
    stop_sec: float,
    neuron_order: pd.DataFrame,
    spike_times_by_neuron: Dict[Tuple[str, int], np.ndarray],
    bin_width_sec: float
) -> Tuple[np.ndarray, np.ndarray]:
    """Compute a (neurons x time bins) firing-rate matrix by histogram binning.

    Args:
        start_sec: Window start, in seconds.
        stop_sec: Window end, in seconds. Spikes with ``start_sec <= t < stop_sec``
            are counted.
        neuron_order: DataFrame with columns ``array``, ``cluster_id`` and
            ``neuron_index``; ``neuron_index`` selects the output row.
        spike_times_by_neuron: Maps ``(array, cluster_id)`` to spike times in
            seconds. Neurons without an entry keep an all-zero row.
        bin_width_sec: Nominal bin width in seconds; must be positive.

    Returns:
        ``(fr, edges)``: ``fr`` has shape ``(len(neuron_order), n_bins)`` and holds
        spike counts divided by ``bin_width_sec`` (Hz); ``edges`` holds the
        ``n_bins + 1`` absolute bin edges in seconds. A window with
        ``stop_sec <= start_sec`` yields a single all-zero bin.

    Raises:
        ValueError: If ``bin_width_sec`` is not positive.

    Notes:
        * The bin count is computed with a small tolerance so that float
          rounding in ``stop_sec - start_sec`` (e.g. 0.4 - 0.1) cannot add a
          spurious extra bin; windows of the same nominal length therefore
          always produce the same number of columns.
        * If rounding leaves the last edge marginally below ``stop_sec`` the
          last edge is moved up to ``stop_sec`` instead of appending a
          near-zero-width bin.
        * When the window is not a multiple of the bin width, the last bin
          extends past ``stop_sec`` but only spikes before ``stop_sec`` are
          counted, so its rate is a lower bound.
    """
    if not bin_width_sec > 0:
        raise ValueError(f"bin_width_sec 必须为正数，当前值: {bin_width_sec!r}")

    start_sec = float(start_sec)
    stop_sec = float(stop_sec)
    duration = max(0.0, stop_sec - start_sec)

    # Tolerance, in units of bins, absorbed when rounding the bin count up.
    bin_count_tol = 1e-6

    if duration <= 0:
        n_bins = 1
        edges = np.array([start_sec, start_sec + bin_width_sec], dtype=float)
    else:
        n_bins = max(1, int(np.ceil(duration / bin_width_sec - bin_count_tol)))
        edges = start_sec + np.arange(n_bins + 1, dtype=float) * bin_width_sec
        if edges[-1] < stop_sec:
            # Only reachable through float rounding / the tolerance above.
            edges[-1] = stop_sec

    n_neurons = len(neuron_order)
    fr = np.zeros((n_neurons, n_bins), dtype=float)
    if n_neurons == 0:
        return fr, edges

    rel_edges = edges - start_sec

    # Column-wise iteration avoids building a Series per row (iterrows).
    for array_name, cluster_id, neuron_idx in zip(
        neuron_order['array'], neuron_order['cluster_id'], neuron_order['neuron_index']
    ):
        t = spike_times_by_neuron.get((array_name, int(cluster_id)))
        if t is None or t.size == 0:
            continue

        # Keep only spikes inside the half-open window [start_sec, stop_sec).
        in_window = t[(t >= start_sec) & (t < stop_sec)]
        if in_window.size == 0:
            continue

        counts, _ = np.histogram(in_window - start_sec, bins=rel_edges)
        fr[int(neuron_idx), :] = counts / bin_width_sec  # counts -> Hz

    return fr, edges

def build_firing_rate_matrices_fast(
    triggers_df: pd.DataFrame,
    neuron_order: pd.DataFrame,
    spikes_by_neuron: Dict[Tuple[str, int], np.ndarray],
    bin_width_sec: float,
    image_class_mapping: Dict[int, str]
) -> Dict[str, Dict[str, object]]:
    """Build one binned firing-rate matrix per trigger window.

    Args:
        triggers_df: One row per trial with columns ``start_time``, ``stop_time``,
            ``train_image``, ``test_image``, ``image_rep_num`` and
            ``single_train_rep``. NaN in the four integer columns is read as 0.
        neuron_order: Neuron index table forwarded to
            ``compute_firing_rate_matrix_binned``.
        spikes_by_neuron: Spike times per ``(array, cluster_id)``, in seconds.
        bin_width_sec: Bin width in seconds.
        image_class_mapping: Image ID -> class name; unmapped IDs get ``'unknown'``.

    Returns:
        Dict keyed by ``"{phase}_{image_id}_{rep_num}_{single_rep}"``. A trial is
        ``'train'`` when ``train_image`` is non-zero, otherwise ``'test'``. Each
        value holds the trial metadata, ``edges_sec``, ``firing_rate`` and
        ``neuron_order``.

    Notes:
        * Trials that raise during binning are reported and skipped (best effort).
        * Trials that map to an already-used key still overwrite the earlier
          entry, but the number of overwrites is now reported.
        * All entries share ONE snapshot copy of ``neuron_order`` instead of a
          fresh copy per trial; treat it as read-only.
        * An empty ``triggers_df`` returns ``{}`` instead of raising
          ZeroDivisionError in the timing summary.
    """
    start_time = time.time()
    n_trials = len(triggers_df)

    print(f"开始快速构建发放率矩阵，共 {n_trials} 个试次...")

    def _int_or_zero(value) -> int:
        # NaN marks "not applicable" in the trigger table; it is encoded as 0.
        return 0 if pd.isna(value) else int(value)

    fr_dict: Dict[str, Dict[str, object]] = {}
    n_overwritten = 0
    # Copied once so later changes to the caller's frame cannot alter stored results.
    neuron_order_snapshot = neuron_order.copy()

    for idx, (_, row) in enumerate(triggers_df.iterrows()):
        if idx % 100 == 0:
            print(f"处理进度: {idx}/{n_trials} ({idx/n_trials*100:.1f}%)")

        start_sec = float(row['start_time'])
        stop_sec = float(row['stop_time'])

        # Build the unique key for this trial.
        train_val = _int_or_zero(row['train_image'])
        test_val = _int_or_zero(row['test_image'])

        if train_val != 0:
            image_id = train_val
            phase = 'train'
        else:
            image_id = test_val
            phase = 'test'

        rep_num = _int_or_zero(row['image_rep_num'])
        single_rep = _int_or_zero(row['single_train_rep'])
        key = f"{phase}_{image_id}_{rep_num}_{single_rep}"

        try:
            fr_mat, edges = compute_firing_rate_matrix_binned(
                start_sec=start_sec,
                stop_sec=stop_sec,
                neuron_order=neuron_order,
                spike_times_by_neuron=spikes_by_neuron,
                bin_width_sec=bin_width_sec,
            )

            image_class = image_class_mapping.get(image_id, 'unknown')

            if key in fr_dict:
                n_overwritten += 1

            fr_dict[key] = {
                'phase': phase,
                'image_id': image_id,
                'image_class': image_class,
                'image_rep_num': rep_num,
                'single_train_rep': single_rep,
                'edges_sec': edges,
                'bin_width_sec': bin_width_sec,
                'neuron_order': neuron_order_snapshot,
                'firing_rate': fr_mat,
                'start_time': start_sec,
                'stop_time': stop_sec,
            }

        except Exception as e:
            # Deliberate best effort: one bad trial must not abort the whole run.
            print(f"警告: 处理试次 {key} 时出错: {e}")
            continue

    elapsed = time.time() - start_time
    print(f"\n完成！成功构建了 {len(fr_dict)} 个发放率矩阵")
    if n_overwritten:
        print(f"警告: {n_overwritten} 个试次的键与之前的试次重复，之前的结果已被覆盖")
    print(f"总耗时: {elapsed:.2f} 秒")
    if n_trials:
        print(f"平均每个试次: {elapsed / n_trials:.3f} 秒")

    return fr_dict


In [None]:
# Fast loading of the merged per-array data
def load_combined_data_fast(date_str: str, arrays: List[str]) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Read and concatenate the cluster and spike CSVs of several arrays.

    For every name in ``arrays`` the files ``cluster_inf_<name>.csv`` and
    ``spike_inf_<name>.csv`` are looked up under ``BASE_DIR/<date_str>``. An
    array is loaded only when both files exist; otherwise a warning is printed
    and the array is skipped.

    Returns:
        ``(combined_cluster, combined_spike)``, each concatenated with a fresh index.

    Raises:
        ValueError: If no array could be loaded.
    """
    t0 = time.time()

    per_array_clusters = []
    per_array_spikes = []

    for array_name in arrays:
        cluster_path, spike_path = (
            os.path.join(BASE_DIR, date_str, f"{prefix}_{array_name}.csv")
            for prefix in ("cluster_inf", "spike_inf")
        )

        # Skip the array unless both of its files are present.
        if not os.path.exists(cluster_path) or not os.path.exists(spike_path):
            print(f"警告: {array_name} 数据文件不存在")
            continue

        print(f"加载 {array_name} 数据...")

        clusters = pd.read_csv(cluster_path)
        spikes = pd.read_csv(spike_path)
        per_array_clusters.append(clusters)
        per_array_spikes.append(spikes)

        print(f"  - cluster数据: {len(clusters)} 个神经元")
        print(f"  - spike数据: {len(spikes)} 个spikes")

    if not (per_array_clusters and per_array_spikes):
        raise ValueError("没有找到有效的数据文件")

    # Stack the per-array tables into single frames.
    combined_cluster = pd.concat(per_array_clusters, ignore_index=True)
    combined_spike = pd.concat(per_array_spikes, ignore_index=True)

    elapsed = time.time() - t0
    print(f"\n合并后数据:")
    print(f"- 总神经元数: {len(combined_cluster)}")
    print(f"- 总spikes数: {len(combined_spike)}")
    print(f"- 加载耗时: {elapsed:.2f} 秒")

    return combined_cluster, combined_spike

# Load and merge the cluster / spike tables of all configured arrays
cluster_df, spike_df = load_combined_data_fast(DATE_STR, ARRAYS)


In [None]:
# Fast loading of the trigger tables (Block1 excluded by default)
def load_triggers_fast(
    date_str: str,
    blocks: Tuple[int, ...] = (2, 3, 4),
    instances: Tuple[int, ...] = (1, 2),
    trigger_dir: str = None,
) -> pd.DataFrame:
    """Load and concatenate the trigger CSVs of one recording day.

    Files are named ``trigger_df_monkyF_<date>_B<block>_instance<instance>.csv``.
    Missing files are reported and skipped.

    Args:
        date_str: Recording date as used in the file names.
        blocks: Block numbers to load. The default ``(2, 3, 4)`` reproduces the
            original hard-coded behaviour of excluding Block1.
        instances: Instance numbers to load for every block.
        trigger_dir: Directory holding the trigger files. Defaults to the
            notebook-level ``TRIGGER_DIR`` when omitted.

    Returns:
        Concatenated trigger table. When a ``valid_image`` column exists only
        rows with ``valid_image == 1`` are kept. ``train_image``, ``test_image``,
        ``image_rep_num`` and ``single_train_rep`` are NaN-filled with 0 and
        cast to int.

    Raises:
        ValueError: If no trigger file is found or a required column is missing.
    """
    start_time = time.time()

    if trigger_dir is None:
        trigger_dir = TRIGGER_DIR

    # Collect the trigger files that actually exist.
    trigger_files = []
    for block_num in blocks:
        for instance_num in instances:
            trigger_file = os.path.join(trigger_dir, f"trigger_df_monkyF_{date_str}_B{block_num}_instance{instance_num}.csv")
            if os.path.exists(trigger_file):
                trigger_files.append(trigger_file)
                print(f"找到trigger文件: Block{block_num}_instance{instance_num}")
            else:
                print(f"警告: trigger文件不存在: {trigger_file}")

    if not trigger_files:
        raise ValueError("没有找到有效的trigger文件")

    combined_triggers = pd.concat(
        [pd.read_csv(trigger_file) for trigger_file in trigger_files],
        ignore_index=True,
    )

    # Validate the schema before touching any column.
    int_cols = ['train_image', 'test_image', 'image_rep_num', 'single_train_rep']
    required_cols = ['start_time', 'stop_time'] + int_cols
    for col in required_cols:
        if col not in combined_triggers.columns:
            raise ValueError(f"Trigger文件缺少列: {col}")

    # Keep only trials flagged as valid images, when the flag is available.
    if 'valid_image' in combined_triggers.columns:
        combined_triggers = combined_triggers[combined_triggers['valid_image'] == 1].copy()

    # NaN means "not applicable" for these columns and is encoded as 0.
    for col in int_cols:
        combined_triggers[col] = combined_triggers[col].fillna(0).astype(int)

    print(f"\n加载的trigger数据:")
    print(f"- 总试次数: {len(combined_triggers)}")
    print(f"- 时间范围: {combined_triggers['start_time'].min():.2f} - {combined_triggers['stop_time'].max():.2f} 秒")
    print(f"- 加载耗时: {time.time() - start_time:.2f} 秒")

    return combined_triggers

# Load the trigger table for the configured date (Block1 is excluded inside the loader)
triggers_df = load_triggers_fast(DATE_STR)


In [None]:
# Build the neuron index and group spike times per neuron
def build_neuron_index_and_group_spikes(cluster_df: pd.DataFrame, spike_df: pd.DataFrame, sample_rate: float) -> Tuple[pd.DataFrame, Dict[Tuple[str, int], np.ndarray]]:
    """Build the neuron index table and the per-neuron spike-time lookup.

    Args:
        cluster_df: Cluster table with columns ``array`` and ``cluster_id``. When a
            ``group`` column exists only rows with ``group == 'good'`` are indexed.
        spike_df: Spike table with columns ``array``, ``cluster_id`` and ``time``.
        sample_rate: Divisor applied to ``time`` to obtain seconds.

    Returns:
        ``(neurons, spikes_by_neuron)``:
        ``neurons`` has the unique ``(array, cluster_id)`` pairs in first-seen order
        plus a consecutive ``neuron_index`` column starting at 0;
        ``spikes_by_neuron`` maps every ``(array, cluster_id)`` present in
        ``spike_df`` (including clusters that are not indexed) to its spike
        times in seconds.
    """
    start_time = time.time()

    # Restrict the index to well-isolated units when the label is available.
    if 'group' in cluster_df.columns:
        cluster_df = cluster_df[cluster_df['group'] == 'good'].copy()

    neurons = cluster_df[['array', 'cluster_id']].copy()
    neurons['array'] = neurons['array'].astype(str)
    neurons['cluster_id'] = neurons['cluster_id'].astype(int)
    neurons = neurons.drop_duplicates().reset_index(drop=True)
    neurons['neuron_index'] = np.arange(len(neurons), dtype=int)

    # (The original also built an (array, cluster_id) -> index dict here that was
    # never read; it has been removed.)

    # Normalise key dtypes so the dict keys match those derived from `neurons`.
    s = spike_df[['array', 'cluster_id', 'time']].copy()
    s['array'] = s['array'].astype(str)
    s['cluster_id'] = s['cluster_id'].astype(int)
    s['t_sec'] = s['time'].astype(float) / float(sample_rate)

    spikes_by_neuron: Dict[Tuple[str, int], np.ndarray] = {
        (arr, int(clu)): times.to_numpy()
        for (arr, clu), times in s.groupby(['array', 'cluster_id'])['t_sec']
    }

    print(f"构建神经元索引和分组spike时间:")
    print(f"- 总神经元数: {len(neurons)}")
    print(f"- 总spike数: {sum(len(spikes) for spikes in spikes_by_neuron.values())}")
    print(f"- 各array的神经元数:")
    for array_name in neurons['array'].unique():
        count = len(neurons[neurons['array'] == array_name])
        print(f"  - {array_name}: {count}")
    print(f"- 处理耗时: {time.time() - start_time:.2f} 秒")

    return neurons, spikes_by_neuron

# Build the neuron index and the per-neuron spike-time lookup (sample indices -> seconds via SAMPLE_RATE)
neuron_order, spikes_by_neuron = build_neuron_index_and_group_spikes(cluster_df, spike_df, SAMPLE_RATE)


In [None]:
# Build the firing-rate matrices (fast binned version).
# SAMPLING_PERIOD_SEC is passed as the bin width.
firing_rate_dict = build_firing_rate_matrices_fast(
    triggers_df,
    neuron_order,
    spikes_by_neuron,
    SAMPLING_PERIOD_SEC,
    image_class_mapping
)


In [None]:
# Summarise the firing-rate matrices, including image-class information
print("=== 发放率矩阵分析（含类别信息）===")
print(f"总试次数: {len(firing_rate_dict)}")

# Trial counts per phase, per image class and per image ID.
# class_counts is read again by later cells, so the names are kept.
phase_counts = {}
class_counts = {}
image_counts = {}

for key, data in firing_rate_dict.items():
    phase = data['phase']
    image_id = data['image_id']
    image_class = data['image_class']

    for tally, label in (
        (phase_counts, phase),
        (class_counts, image_class),
        (image_counts, image_id),
    ):
        tally[label] = tally.get(label, 0) + 1

print(f"\n各phase的试次数:")
for phase, count in sorted(phase_counts.items()):
    print(f"  {phase}: {count}")

print(f"\n各图像类别的试次数（前20个）:")
sorted_classes = sorted(class_counts.items(), key=lambda item: item[1], reverse=True)
for class_name, count in sorted_classes[:20]:
    print(f"  {class_name}: {count}")

print(f"\n总类别数: {len(class_counts)}")
print(f"唯一图像数: {len(image_counts)}")

# Inspect the matrix dimensions using the first stored trial
if firing_rate_dict:
    sample_key = next(iter(firing_rate_dict))
    sample_data = firing_rate_dict[sample_key]
    fr_shape = sample_data['firing_rate'].shape
    print("\n".join([
        f"\n发放率矩阵维度: {fr_shape}",
        f"  - 神经元数: {fr_shape[0]}",
        f"  - 时间bins: {fr_shape[1]}",
        f"  - 时间窗口长度: {sample_data['stop_time'] - sample_data['start_time']:.3f} 秒",
        f"  - 每个bin长度: {sample_data['bin_width_sec']:.3f} 秒",
        f"  - 示例类别: {sample_data['image_class']}",
    ]))


In [None]:
# Save the results (including class information)
def save_firing_rate_dict_with_classes(fr_dict: Dict[str, Dict[str, object]], output_dir: str, date_str: str):
    """Write the firing-rate dictionary to disk as one NPZ archive and two CSVs.

    Files written into ``output_dir`` (created if missing):

    * ``firing_rate_matrices_<date>_combined_with_classes.npz`` with, per trial
      key ``k``, the arrays ``k__firing_rate``, ``k__edges_sec`` and ``k__meta``
      (phase, image_id, image_class, image_rep_num, single_train_rep,
      bin_width_sec, start_time, stop_time), plus ``neuron_order_csv`` holding
      the first trial's neuron table as UTF-8 CSV bytes.
    * ``firing_rate_summary_<date>_combined_with_classes.csv``, one row per trial.
    * ``class_statistics_<date>_combined.csv``, trial count and percentage per class.

    Args:
        fr_dict: Output of ``build_firing_rate_matrices_fast``.
        output_dir: Destination directory.
        date_str: Date string embedded in the file names.

    Returns:
        ``(npz_path, summary_path, class_stats_path)``.

    Notes:
        The class statistics are computed from ``fr_dict`` itself. The original
        read a notebook-level ``class_counts`` variable created by another cell,
        which raised NameError on a fresh kernel and could describe a different
        dictionary than the one being saved.
    """
    os.makedirs(output_dir, exist_ok=True)

    # --- NPZ archive -------------------------------------------------------
    npz_path = os.path.join(output_dir, f"firing_rate_matrices_{date_str}_combined_with_classes.npz")
    pack = {}

    for k, v in fr_dict.items():
        pack[f'{k}__firing_rate'] = v['firing_rate']
        pack[f'{k}__edges_sec'] = v['edges_sec']
        # Object dtype is kept for compatibility with existing readers;
        # it means the archive must be opened with allow_pickle=True.
        pack[f'{k}__meta'] = np.array([
            v['phase'],
            str(v['image_id']),
            v['image_class'],
            str(v['image_rep_num']),
            str(v['single_train_rep']),
            str(v['bin_width_sec']),
            str(v['start_time']),
            str(v['stop_time'])
        ], dtype=object)

    # The neuron table is taken from the first entry and stored once.
    if fr_dict:
        first_entry = next(iter(fr_dict.values()))
        pack['neuron_order_csv'] = first_entry['neuron_order'].to_csv(index=False).encode('utf-8')

    np.savez_compressed(npz_path, **pack)
    print(f"发放率矩阵已保存到: {npz_path}")

    # --- Per-trial summary CSV --------------------------------------------
    summary_path = os.path.join(output_dir, f"firing_rate_summary_{date_str}_combined_with_classes.csv")
    summary_data = [
        {
            'key': key,
            'phase': data['phase'],
            'image_id': data['image_id'],
            'image_class': data['image_class'],
            'image_rep_num': data['image_rep_num'],
            'single_train_rep': data['single_train_rep'],
            'start_time': data['start_time'],
            'stop_time': data['stop_time'],
            'duration': data['stop_time'] - data['start_time'],
            'n_neurons': data['firing_rate'].shape[0],
            'n_bins': data['firing_rate'].shape[1]
        }
        for key, data in fr_dict.items()
    ]

    summary_df = pd.DataFrame(summary_data)
    summary_df.to_csv(summary_path, index=False)
    print(f"汇总信息已保存到: {summary_path}")

    # --- Class statistics CSV ---------------------------------------------
    class_stats_path = os.path.join(output_dir, f"class_statistics_{date_str}_combined.csv")

    trial_counts_by_class = {}
    for data in fr_dict.values():
        label = data['image_class']
        trial_counts_by_class[label] = trial_counts_by_class.get(label, 0) + 1

    n_trials = len(fr_dict)  # > 0 whenever the loop below has anything to iterate
    class_stats = [
        {
            'class_name': class_name,
            'trial_count': count,
            'percentage': count / n_trials * 100
        }
        for class_name, count in sorted(trial_counts_by_class.items(), key=lambda x: x[1], reverse=True)
    ]

    class_stats_df = pd.DataFrame(class_stats)
    class_stats_df.to_csv(class_stats_path, index=False)
    print(f"类别统计信息已保存到: {class_stats_path}")

    return npz_path, summary_path, class_stats_path

# Save the results.
# NOTE(review): the output directory is a hard-coded absolute path under the same tree as BASE_DIR — consider deriving it from BASE_DIR.
output_dir = "/media/ubuntu/sda/Monkey/sorted_result_combined/firing_rate_matrices"
npz_path, summary_path, class_stats_path = save_firing_rate_dict_with_classes(firing_rate_dict, output_dir, DATE_STR)


In [None]:
# Example visualization (including class information).
# Draws a 2x2 overview figure for the first stored trial, then a pie chart of
# the most frequent image classes. Reads `class_counts`, which is created by
# the analysis cell, so that cell must have been run first.
# NOTE(review): titles/labels are Chinese — presumably a CJK-capable matplotlib font is configured; confirm.
if firing_rate_dict:
    # Pick the first trial as the example to visualize
    sample_key = list(firing_rate_dict.keys())[0]
    sample_data = firing_rate_dict[sample_key]
    fr_matrix = sample_data['firing_rate']
    
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    
    # 1. Heat map of the whole firing-rate matrix (rows: neurons, columns: time bins)
    ax1 = axes[0, 0]
    im1 = ax1.imshow(fr_matrix, aspect='auto', cmap='viridis', interpolation='nearest')
    ax1.set_title(f'发放率矩阵热图\n{sample_key}\n类别: {sample_data["image_class"]}')
    ax1.set_xlabel('时间bins')
    ax1.set_ylabel('神经元索引')
    plt.colorbar(im1, ax=ax1, label='发放率 (Hz)')
    
    # 2. Population-mean firing rate over time
    ax2 = axes[0, 1]
    mean_fr = np.mean(fr_matrix, axis=0)
    # Time axis is relative to the trial start and built as index * bin_width_sec
    time_bins = np.arange(len(mean_fr)) * sample_data['bin_width_sec']
    ax2.plot(time_bins, mean_fr, 'b-', linewidth=2)
    ax2.set_title('平均发放率随时间变化')
    ax2.set_xlabel('时间 (秒)')
    ax2.set_ylabel('平均发放率 (Hz)')
    ax2.grid(True, alpha=0.3)
    
    # 3. Distribution of per-neuron mean firing rates
    ax3 = axes[1, 0]
    neuron_mean_fr = np.mean(fr_matrix, axis=1)
    ax3.hist(neuron_mean_fr, bins=50, alpha=0.7, color='skyblue', edgecolor='black')
    ax3.set_title('神经元平均发放率分布')
    ax3.set_xlabel('平均发放率 (Hz)')
    ax3.set_ylabel('神经元数量')
    ax3.grid(True, alpha=0.3)
    
    # 4. Text panel with firing-rate statistics and trial/class information
    ax4 = axes[1, 1]
    ax4.axis('off')
    stats_text = f"""
    统计信息:
    神经元数: {fr_matrix.shape[0]}
    时间bins: {fr_matrix.shape[1]}
    时间窗口: {sample_data['stop_time'] - sample_data['start_time']:.3f} 秒
    平均发放率: {np.mean(fr_matrix):.2f} Hz
    最大发放率: {np.max(fr_matrix):.2f} Hz
    最小发放率: {np.min(fr_matrix):.2f} Hz
    
    试次信息:
    Phase: {sample_data['phase']}
    Image ID: {sample_data['image_id']}
    图像类别: {sample_data['image_class']}
    Rep Num: {sample_data['image_rep_num']}
    """
    ax4.text(0.1, 0.9, stats_text, transform=ax4.transAxes, fontsize=11,
             verticalalignment='top', fontfamily='monospace')
    
    plt.tight_layout()
    plt.show()
    
    print(f"\n可视化完成！示例数据来自试次: {sample_key}")
    print(f"图像类别: {sample_data['image_class']}")
    
    # Pie chart of the class distribution (15 most frequent classes by trial count)
    plt.figure(figsize=(12, 8))
    top_classes = sorted(class_counts.items(), key=lambda x: x[1], reverse=True)[:15]
    class_names = [item[0] for item in top_classes]
    class_values = [item[1] for item in top_classes]
    
    plt.pie(class_values, labels=class_names, autopct='%1.1f%%', startangle=90)
    plt.title('前15个图像类别的试次分布')
    plt.axis('equal')
    plt.show()
    
    print(f"\n类别分布统计完成！")
