In [48]:
import pandas as pd

# Load the TCGA-LUAD GISTIC table (tab-separated; first column is 'Gene Symbol').
# A forward slash is used as the path separator: the previous 'datasets\T...'
# literal depended on '\T' being an unrecognised escape sequence (which emits a
# warning on recent Python versions) and only resolved to a real path on Windows.
file_path = 'datasets/TCGA-LUAD.gistic.tsv'
data = pd.read_csv(file_path, sep='\t')

print(data)



              Gene Symbol  TCGA-69-8253-01A  TCGA-69-8255-01A  \
0      ENSG00000008128.21                 0                 0   
1      ENSG00000008130.14                 0                 0   
2      ENSG00000067606.14                 0                 0   
3      ENSG00000078369.16                 0                 0   
4      ENSG00000078808.15                 0                 0   
...                   ...               ...               ...   
19724   ENSG00000277745.1                 0                 0   
19725   ENSG00000277858.1                 0                 0   
19726  ENSG00000124333.13                 0                 0   
19727  ENSG00000124334.15                 0                 0   
19728   ENSG00000168939.9                 0                 0   

       TCGA-86-8278-01A  TCGA-97-8179-01A  TCGA-83-5908-01A  TCGA-69-8254-01A  \
0                     1                 0                 0                 0   
1                     1                 0                

In [49]:
import os
import requests
import gzip
import shutil
import pandas as pd
import numpy as np

# Sample-type code mapping: two-digit sample-type code -> binary label
# (1 for the tumor/cancer sample types, 0 for everything else listed here).
# Each record is (code, label, description); the description is documentation only.
type_mapping = {
    code: label
    for code, label, _description in (
        ('01', 1, 'Primary Solid Tumor'),
        ('02', 1, 'Recurrent Solid Tumor'),
        ('03', 1, 'Primary Blood Derived Cancer - Peripheral Blood'),
        ('04', 1, 'Recurrent Blood Derived Cancer - Bone Marrow'),
        ('05', 1, 'Additional - New Primary'),
        ('06', 1, 'Metastatic'),
        ('07', 1, 'Additional Metastatic'),
        ('08', 1, 'Human Tumor Original Cells'),
        ('09', 1, 'Primary Blood Derived Cancer - Bone Marrow'),
        ('10', 0, 'Blood Derived Normal'),
        ('11', 0, 'Solid Tissue Normal'),
        ('12', 0, 'Buccal Cell Normal'),
        ('13', 0, 'EBV Immortalized Normal'),
        ('14', 0, 'Bone Marrow Normal'),
        ('15', 0, 'sample type 15'),
        ('16', 0, 'sample type 16'),
        ('20', 0, 'Control Analyte'),
        ('40', 1, 'Recurrent Blood Derived Cancer - Peripheral Blood'),
        ('50', 0, 'Cell Lines'),
        ('60', 0, 'Primary Xenograft Tissue'),
        ('61', 0, 'Cell Line Derived Xenograft Tissue'),
        ('99', 0, 'sample type 99'),
    )
}

def process_data(file_path):
    """Read a tab-separated GISTIC table and attach a status label per sample.

    Every column after the first ('Gene Symbol') is treated as a sample, and
    the column header is the sample ID (e.g. 'TCGA-69-8253-01A'). The first
    two characters of the sample field of that ID are looked up in the
    module-level ``type_mapping``.

    Args:
        file_path: Path or file-like object handed to ``pd.read_csv`` (tab-separated).

    Returns:
        pandas.DataFrame: the rows read from the file, followed by one extra
        row per sample column. Those extra rows are indexed by sample ID and
        carry their label in a new 'Status' column (a ``type_mapping`` value,
        or 'Unknown' when the ID is too short or its code is not in the
        mapping); all other cells introduced by the concatenation are NaN.

    NOTE(review): the labels end up as extra rows plus a 'Status' column rather
    than a single 'Status' row. That layout is kept unchanged because
    ``find_healthy_samples`` filters on the 'Status' column.
    """
    df = pd.read_csv(file_path, sep='\t')

    def get_status(sample_id):
        """Map one sample ID to its ``type_mapping`` label, or 'Unknown'."""
        sample_id_str = str(sample_id).strip()
        if len(sample_id_str) < 5:  # too short to be a usable sample ID
            return 'Unknown'
        fields = sample_id_str.split('-')
        # The sample field is the fourth dash-separated field ('01A' in
        # 'TCGA-69-8253-01A'). Prefer it when present so longer barcodes are
        # not classified from a trailing field; otherwise fall back to the
        # last field, as before.
        sample_field = fields[3] if len(fields) >= 4 else fields[-1]
        return type_mapping.get(sample_field[:2], 'Unknown')

    # The sample ID is the column header itself, not a value stored in the
    # column (the cells hold per-gene values such as 0 / 1).
    status_row = {column: get_status(column) for column in df.columns[1:]}

    # One-row frame transposed: index = sample IDs, single column 'Status'.
    status_df = pd.DataFrame(status_row, index=['Status']).T
    df_with_status = pd.concat([df, status_df], axis=0)

    return df_with_status

def save_processed_data(df, output_path):
    """Serialize the processed DataFrame to a pickle file.

    Args:
        df: DataFrame to store.
        output_path: Destination path of the pickle file.

    Returns:
        None.
    """
    pd.to_pickle(df, output_path)

def find_healthy_samples(df):
    """Return the rows of ``df`` whose 'Status' value equals 0.

    Args:
        df: DataFrame containing a 'Status' column.

    Returns:
        pandas.DataFrame: the subset of rows with Status == 0, original index kept.
    """
    is_healthy = df['Status'].eq(0)
    return df.loc[is_healthy]

# Example usage
file_path = 'datasets/TCGA-OV.gistic.tsv'
output_pkl_path = './datasets/TCGA-OV.gistic_processed.pkl'

# Process the data: read the TSV and attach the per-sample status labels
processed_df = process_data(file_path)

# Save the processed data as a pickle file
save_processed_data(processed_df, output_pkl_path)
