In [None]:
import os
import numpy as np
import pandas as pd
import wfdb
from tqdm import tqdm
from sklearn.preprocessing import MultiLabelBinarizer

In [ ]:

# Default root directory of the WFDB record tree scanned by load_chapman_metadata.
BASE_DIR = 'WFDBRecords'
# Default mapping table (SNOMED CT code -> condition name) read by create_dataset.
MAPPING_FILE = 'ConditionNames_SNOMED-CT.csv'

def load_chapman_metadata(base_path=BASE_DIR):
    """Collect header metadata for every WFDB record found under ``base_path``.

    The directory tree is walked recursively and every ``*.hea`` file is read
    with ``wfdb.rdheader`` (header only, no signal data). No assumption is
    made about how many sub-directories exist or how they are named.

    Args:
        base_path: Root directory of the WFDB record tree.

    Returns:
        pandas.DataFrame with one row per record, ordered by path, and the
        columns ``record_id``, ``path`` (record path without extension),
        ``fs``, ``n_samples``, ``n_leads``, ``leads`` (``;``-joined signal
        names) and ``comments`` (``|``-joined header comment lines). The
        columns are present even when no record is found.

    Raises:
        FileNotFoundError: If ``base_path`` is not an existing directory.
    """
    # os.walk silently yields nothing for a missing root; fail loudly instead.
    if not os.path.isdir(base_path):
        raise FileNotFoundError(f"WFDB base directory not found: {base_path!r}")

    columns = [
        'record_id', 'path', 'fs', 'n_samples', 'n_leads', 'leads', 'comments',
    ]
    records = []

    for dirpath, dirnames, filenames in os.walk(base_path):
        # Sorting in place makes the traversal (and thus the row order)
        # reproducible across file systems.
        dirnames.sort()
        for fname in sorted(filenames):
            if not fname.endswith('.hea'):
                continue

            # splitext removes only the final extension, so record names that
            # themselves contain dots are kept intact.
            record_id = os.path.splitext(fname)[0]
            record_path = os.path.join(dirpath, record_id)

            header = wfdb.rdheader(record_path)

            records.append({
                'record_id': record_id,
                'path': record_path,
                'fs': header.fs,
                'n_samples': header.sig_len,
                'n_leads': header.n_sig,
                # Guard against headers that carry no signal names/comments.
                'leads': ';'.join(header.sig_name or []),
                'comments': '|'.join(header.comments or []),
            })

    return pd.DataFrame(records, columns=columns)

def extract_labels(comments):
    """Extract the diagnosis codes from a ``|``-joined header comment string.

    Every segment that starts with ``Dx:`` is treated as a comma-separated
    list of codes.

    Args:
        comments: Header comments joined with ``|``. Non-string values
            (``None``, or NaN as pandas uses for missing text) are treated as
            "no labels".

    Returns:
        List of unique, non-empty code strings in order of first appearance.
    """
    if not isinstance(comments, str):
        return []

    # A dict keeps insertion order, giving de-duplication with a
    # deterministic result (a set would reorder codes between runs).
    seen = {}
    for comment in comments.split('|'):
        comment = comment.strip()
        if not comment.startswith('Dx:'):
            continue
        for code in comment[3:].split(','):
            code = code.strip()
            # An empty "Dx:" field would otherwise yield the label ''.
            if code:
                seen[code] = None
    return list(seen)

def _normalize_snomed_code(code):
    """Return ``code`` as a canonical string so int, float and str forms match."""
    try:
        return str(int(code))
    except (TypeError, ValueError, OverflowError):
        return str(code).strip()


def create_dataset(df, mapping_file=MAPPING_FILE):
    """Append one binary indicator column per diagnosis label to ``df``.

    Labels are parsed from ``df['comments']`` with ``extract_labels``. Each
    label column is named after the matching ``'Condition Name'`` in the
    mapping table, or ``SNOMED_<code>`` when the code is not in the table.

    Args:
        df: Metadata frame with a ``comments`` column. It is not modified.
        mapping_file: Path to a table with the columns ``'SNOMED CT Code'``
            and ``'Condition Name'``. Excel/OpenDocument extensions are read
            with ``pandas.read_excel``; anything else is read as CSV.

    Returns:
        A new DataFrame: the input columns, a ``labels`` column holding the
        list of codes per record, and the indicator columns.
    """
    # The default mapping file is a .csv, which read_excel cannot parse;
    # pick the reader from the file extension.
    excel_suffixes = ('.xls', '.xlsx', '.xlsm', '.xlsb', '.ods', '.odf', '.odt')
    if str(mapping_file).lower().endswith(excel_suffixes):
        snomed_map = pd.read_excel(mapping_file)
    else:
        snomed_map = pd.read_csv(mapping_file)

    # Normalise the keys so the lookup works whether the code column was
    # parsed as int, float or text.
    condition_map = {
        _normalize_snomed_code(code): name
        for code, name in zip(snomed_map['SNOMED CT Code'],
                              snomed_map['Condition Name'])
    }

    # Work on a copy so the caller's frame does not silently gain a column.
    df = df.copy()
    df['labels'] = df['comments'].apply(extract_labels)

    all_labels = sorted({label for labels in df['labels'] for label in labels})
    mlb = MultiLabelBinarizer(classes=all_labels)
    label_matrix = mlb.fit_transform(df['labels'])

    # Unknown or non-numeric codes fall back to a SNOMED_<code> column name.
    column_names = [
        condition_map.get(_normalize_snomed_code(code), f'SNOMED_{code}')
        for code in mlb.classes_
    ]
    label_df = pd.DataFrame(label_matrix, columns=column_names, index=df.index)

    return pd.concat([df, label_df], axis=1)

class ChapmanDataset:
    """Index-addressable access to ECG records and their label vectors.

    Each item is loaded lazily: the signal is read from disk with
    ``wfdb.rdsamp`` when the item is requested.
    """

    # Columns produced by load_chapman_metadata / create_dataset that describe
    # the record itself. Every other column is treated as a label column,
    # because create_dataset names mapped labels after the condition and only
    # unmapped ones get a "SNOMED_" prefix.
    METADATA_COLUMNS = frozenset({
        'record_id', 'path', 'fs', 'n_samples', 'n_leads', 'leads',
        'comments', 'labels',
    })

    def __init__(self, metadata_df, target_classes=None):
        """Store the metadata table and the label columns to return.

        Args:
            metadata_df: DataFrame with at least ``path`` and ``record_id``
                columns plus one column per label.
            target_classes: Label column name(s) to return per item. A single
                string is accepted as a one-element list. ``None`` or an
                empty sequence selects every label column.

        Raises:
            KeyError: If a requested class is not a column of ``metadata_df``.
        """
        self.metadata = metadata_df

        if target_classes is None:
            requested = []
        elif isinstance(target_classes, str):
            requested = [target_classes]
        else:
            # list() also accepts arrays / Index objects, whose truth value
            # is ambiguous and would break an ``or`` fallback.
            requested = list(target_classes)

        if requested:
            missing = [c for c in requested if c not in self.metadata.columns]
            if missing:
                # Fail at construction rather than on the first item access.
                raise KeyError(f"Unknown target classes: {missing}")
            self.target_classes = requested
        else:
            self.target_classes = self.get_all_classes()

    def get_all_classes(self):
        """Return every column of the metadata table that is not record metadata."""
        return [
            col for col in self.metadata.columns
            if col not in self.METADATA_COLUMNS
        ]

    def __len__(self):
        """Return the number of records."""
        return len(self.metadata)

    def __getitem__(self, idx):
        """Load the record at position ``idx``.

        Returns:
            dict with ``'ecg'`` (float32 array, transposed so that signals
            come first), ``'labels'`` (float32 vector ordered like
            ``target_classes``, or ``None`` when there are no target classes)
            and ``'record_id'``.
        """
        record = self.metadata.iloc[idx]

        # rdsamp returns (signals, fields); only the signal array is needed.
        signals, _ = wfdb.rdsamp(record['path'])

        if self.target_classes:
            labels = record[self.target_classes].values.astype(np.float32)
        else:
            labels = None

        return {
            # Transposed to leads-first, e.g. [12, 5000].
            'ecg': signals.T.astype(np.float32),
            'labels': labels,
            'record_id': record['record_id']
        }

In [ ]:
if __name__ == "__main__":
    # Step 1: read the header of every record into a metadata DataFrame
    # (the message reads "Loading metadata...").
    print("正在加载元数据...")
    meta_df = load_chapman_metadata()

    # Step 2: parse the Dx codes and append one indicator column per label
    # (the message reads "Processing labels...").
    print("正在处理标签...")
    dataset_df = create_dataset(meta_df)

    # Persist the combined metadata/label table without the row index and
    # report how many records it contains.
    dataset_df.to_csv('chapman_metadata.csv', index=False)
    print(f"元数据已保存，包含{len(dataset_df)}条记录")

    # Label columns to return for each sample; these names must exist as
    # columns of dataset_df.
    target_diseases = ['Atrial fibrillation', 'Sinus tachycardia']

    chapman_ds = ChapmanDataset(
        metadata_df=dataset_df,
        target_classes=target_diseases
    )

    # Smoke test: load the first record and print the ECG array shape and
    # its label vector.
    sample = chapman_ds[0]
    print(f"ECG形状: {sample['ecg'].shape}")
    print(f"标签: {sample['labels']}")