In [5]:
import os
import gzip
import shutil
import pandas as pd
import numpy as np

def manual_parse_data(dataset_name, remove_source=False):
    """Parse a manually downloaded GISTIC2 thresholded by-gene archive.

    The gzip-compressed, tab-separated file is read directly (no temporary
    ``.tsv`` copy), transposed so that the values of the first input column
    become the column labels, converted to numeric dtypes, and cached as a
    pickle under ``./datasets/``.

    Args:
        dataset_name: Cohort code used both in the source archive name
            (``TCGA.<dataset_name>.sampleMap_...``) and in the output pickle
            name (``TCGA-<dataset_name>.gistic.pkl``).
        remove_source: When True, delete the downloaded ``.gz`` archive after
            the pickle has been written. Defaults to False so the function
            can be re-run without downloading the archive again.

    Returns:
        pandas.DataFrame: the transposed, numeric table (the same content that
        is written to the pickle).

    Raises:
        FileNotFoundError: if the expected ``.gz`` archive is not present.
        ValueError: if a cell cannot be converted by ``pd.to_numeric``.
    """
    pkl_file_path = './datasets/' + 'TCGA-' + dataset_name + '.gistic.pkl'
    # The archive name keeps the browser's " (1)" suffix; adjust if your
    # download is named differently.
    gz_file_path = (
        './datasets/TCGA.' + dataset_name
        + '.sampleMap_Gistic2_CopyNumber_Gistic2_all_thresholded.by_genes (1).gz'
    )

    # Fail early with an actionable message when the download is missing.
    if not os.path.exists(gz_file_path):
        raise FileNotFoundError(f"File {gz_file_path} not found. Please download manually.")

    # pandas decompresses gzip on the fly, so no intermediate file is needed.
    df = pd.read_csv(gz_file_path, sep='\t', compression='gzip')

    # Transpose, promote the first row (labels from the first input column)
    # to the header, and drop that row positionally.
    df_t = df.transpose()
    df_t.columns = df_t.iloc[0]
    df_t = df_t.iloc[1:]

    # Convert BEFORE pickling so the cache stores numeric dtypes rather than
    # the object dtype produced by transposing a mixed-type frame.
    df_t = df_t.apply(pd.to_numeric)

    # Cache for fast loading by later cells.
    df_t.to_pickle(pkl_file_path)

    if remove_source:
        os.remove(gz_file_path)

    return df_t



# Run the manual parser defined above for the chosen cohort.
dataset_name = "OV"
df = manual_parse_data(dataset_name=dataset_name)



In [22]:
import pandas as pd

# Load the cached table from the pickle written by the parsing step.
pkl_file_path = './datasets/TCGA-OV.gistic.pkl'
df = pd.read_pickle(pkl_file_path)

# Column-wise mean, then rank from highest to lowest.
mean_values = df.mean()
sorted_mean_values = mean_values.sort_values(ascending=False)

# Flatten the ranked Series into a two-column table (label, mean).
sorted_df = (
    sorted_mean_values
    .rename_axis('Column')
    .reset_index(name='Mean Value')
)

# Labels of the 20 columns with the highest mean.
Top20_gene = sorted_df['Column'].iloc[:20].tolist()
print("Top 20 列名：", Top20_gene)

# Persist the full ranking as CSV.
csv_output_path = './datasets/sorted_mean_values.csv'
sorted_df.to_csv(csv_output_path, index=False)
print(f"结果已保存到 {csv_output_path}")

Top 20 列名： ['MECOM', 'PVT1', 'ACTRT3', 'TERC', 'LRRC34', 'MYNN', 'LRRIQ4', 'MYC', 'LRRC31', 'MIR1208', 'MIR1207', 'SAMD7', 'SEC62', 'RN7SKP226', 'PHC3', 'MIR1205', 'GPR160', 'TMEM75', 'EGFEM1P', 'CASC8']
结果已保存到 ./datasets/sorted_mean_values.csv


In [45]:
import pandas as pd


# Load the probemap file (tab-separated; the 'gene' and 'id' columns are used).
probemap_file = 'datasets/gencode.v36.annotation.gtf.gene.probemap'  # replace with your probemap path
probemap_df = pd.read_csv(probemap_file, sep='\t')

# Build one gene -> id lookup instead of scanning the whole table per gene.
# keep='first' reproduces "take the first matching row" when a gene name
# occurs more than once.
gene_to_id = (
    probemap_df
    .drop_duplicates(subset='gene', keep='first')
    .set_index('gene')['id']
)

# Resolve each top-20 gene; genes absent from the probemap yield None.
# (The lookup avoids a loop variable called `id`, which would shadow the
# builtin for the rest of the notebook session.)
Top20_id = [gene_to_id.get(gene) for gene in Top20_gene]

# Assemble the result table: gene name alongside its resolved ID.
result_df = pd.DataFrame({
    'Gene': Top20_gene,  # first column: gene name
    'ID': Top20_id       # second column: matching ID (None if not found)
})

# Show the result table.
print(result_df)


         Gene                  ID
0       MECOM  ENSG00000085276.19
1        PVT1  ENSG00000249859.12
2      ACTRT3   ENSG00000184378.3
3        TERC   ENSG00000270141.3
4      LRRC34  ENSG00000171757.17
5        MYNN  ENSG00000085274.16
6      LRRIQ4   ENSG00000188306.6
7         MYC  ENSG00000136997.21
8      LRRC31  ENSG00000114248.10
9     MIR1208   ENSG00000221261.1
10    MIR1207   ENSG00000221176.1
11      SAMD7   ENSG00000187033.9
12      SEC62  ENSG00000008952.17
13  RN7SKP226   ENSG00000201782.1
14       PHC3  ENSG00000173889.16
15    MIR1205   ENSG00000221771.1
16     GPR160  ENSG00000173890.17
17     TMEM75   ENSG00000280055.2
18    EGFEM1P  ENSG00000206120.11
19      CASC8   ENSG00000246228.6


In [30]:
#下载gdc  ABSOLUTE拷贝
import os
import gzip
import shutil
import pandas as pd
import numpy as np

def manual_parse_data(dataset_name):
    """Parse a manually downloaded ``gene-level_absolute`` archive.

    Reads ``./datasets/TCGA-<dataset_name>.gene-level_absolute.tsv.gz``
    directly (no temporary ``.tsv`` copy), transposes it so that the values of
    the first input column become the column labels, converts every column to
    a numeric dtype, and caches the result as
    ``./datasets/TCGA-<dataset_name>.gdc.pkl``. The source archive is kept.

    Args:
        dataset_name: Cohort code inserted into the input and output paths.

    Returns:
        pandas.DataFrame: the transposed, numeric table (the same content that
        is written to the pickle).

    Raises:
        FileNotFoundError: if the expected ``.gz`` archive is not present.
        ValueError: if a cell cannot be converted by ``pd.to_numeric``.
    """
    pkl_file_path = './datasets/' + 'TCGA-' + dataset_name + '.gdc.pkl'
    gz_file_path = './datasets/TCGA-' + dataset_name + '.gene-level_absolute.tsv.gz'

    # Fail early with an actionable message when the download is missing.
    if not os.path.exists(gz_file_path):
        raise FileNotFoundError(f"File {gz_file_path} not found. Please download manually.")

    # pandas decompresses gzip on the fly, so no intermediate file is needed.
    df = pd.read_csv(gz_file_path, sep='\t', compression='gzip')

    # Transpose, promote the first row (labels from the first input column)
    # to the header, and drop that row positionally.
    df_t = df.transpose()
    df_t.columns = df_t.iloc[0]
    df_t = df_t.iloc[1:]

    # Convert BEFORE pickling so the cache stores numeric dtypes rather than
    # the object dtype produced by transposing a mixed-type frame.
    df_t = df_t.apply(pd.to_numeric)

    # Cache for fast loading by later cells.
    df_t.to_pickle(pkl_file_path)
    return df_t



# Run the manual parser defined above for the chosen cohort.
dataset_name = "OV"  # change to process a different cohort
df = manual_parse_data(dataset_name=dataset_name)

In [47]:
import pandas as pd

# Load the cached table.
data_file = 'datasets/TCGA-OV.gdc.pkl'  # replace with your data file path
data = pd.read_pickle(data_file)

# Mean of each requested ID's column. IDs that could not be resolved (None)
# or that this table does not contain are skipped, so they show up as NaN in
# the result instead of aborting the cell with a KeyError.
average_values = {}
missing_ids = []
for gene_id in Top20_id:
    if gene_id is None:
        continue
    if gene_id not in data.columns:
        missing_ids.append(gene_id)
        continue
    # Coerce in case the cached frame holds object-dtype columns.
    average_values[gene_id] = pd.to_numeric(data[gene_id]).mean()

if missing_ids:
    print("IDs not found in", data_file, ":", missing_ids)

# Attach the averages to the result table as the 'ABSOLUTE' column.
result_df['ABSOLUTE'] = result_df['ID'].map(average_values)

# Show the result table.
print(result_df)

         Gene                  ID  ABSOLUTE
0       MECOM  ENSG00000085276.19  4.388193
1        PVT1  ENSG00000249859.12  4.576029
2      ACTRT3   ENSG00000184378.3  4.395349
3        TERC   ENSG00000270141.3  4.395349
4      LRRC34  ENSG00000171757.17  4.384615
5        MYNN  ENSG00000085274.16  4.389587
6      LRRIQ4   ENSG00000188306.6  4.377460
7         MYC  ENSG00000136997.21  4.590340
8      LRRC31  ENSG00000114248.10  4.377460
9     MIR1208   ENSG00000221261.1  4.584973
10    MIR1207   ENSG00000221176.1  4.588551
11      SAMD7   ENSG00000187033.9  4.366726
12      SEC62  ENSG00000008952.17  4.366726
13  RN7SKP226   ENSG00000201782.1  4.582437
14       PHC3  ENSG00000173889.16  4.350626
15    MIR1205   ENSG00000221771.1  4.581395
16     GPR160  ENSG00000173890.17  4.368515
17     TMEM75   ENSG00000280055.2  4.577061
18    EGFEM1P  ENSG00000206120.11  4.284436
19      CASC8   ENSG00000246228.6  4.524150


In [37]:
#下载gdc  ASCAT3拷贝
import os
import gzip
import shutil
import pandas as pd
import numpy as np

def manual_parse_data(dataset_name):
    """Parse a manually downloaded ``gene-level_ascat3`` archive.

    Reads ``./datasets/TCGA-<dataset_name>.gene-level_ascat3.tsv.gz``
    directly (no temporary ``.tsv`` copy), transposes it so that the values of
    the first input column become the column labels, converts every column to
    a numeric dtype, and caches the result as
    ``./datasets/TCGA-<dataset_name>.gdc1.pkl``. The source archive is kept.

    Args:
        dataset_name: Cohort code inserted into the input and output paths.

    Returns:
        pandas.DataFrame: the transposed, numeric table (the same content that
        is written to the pickle).

    Raises:
        FileNotFoundError: if the expected ``.gz`` archive is not present.
        ValueError: if a cell cannot be converted by ``pd.to_numeric``.
    """
    pkl_file_path = './datasets/' + 'TCGA-' + dataset_name + '.gdc1.pkl'
    gz_file_path = './datasets/TCGA-' + dataset_name + '.gene-level_ascat3.tsv.gz'

    # Fail early with an actionable message when the download is missing.
    if not os.path.exists(gz_file_path):
        raise FileNotFoundError(f"File {gz_file_path} not found. Please download manually.")

    # pandas decompresses gzip on the fly, so no intermediate file is needed.
    df = pd.read_csv(gz_file_path, sep='\t', compression='gzip')

    # Transpose, promote the first row (labels from the first input column)
    # to the header, and drop that row positionally.
    df_t = df.transpose()
    df_t.columns = df_t.iloc[0]
    df_t = df_t.iloc[1:]

    # Convert BEFORE pickling so the cache stores numeric dtypes rather than
    # the object dtype produced by transposing a mixed-type frame.
    df_t = df_t.apply(pd.to_numeric)

    # Cache for fast loading by later cells.
    df_t.to_pickle(pkl_file_path)
    return df_t



# Run the manual parser defined above for the chosen cohort.
dataset_name = "OV"  # change to process a different cohort
df = manual_parse_data(dataset_name=dataset_name)

In [48]:
import pandas as pd

# Load the cached table.
data_file = 'datasets/TCGA-OV.gdc1.pkl'  # replace with your data file path
data = pd.read_pickle(data_file)

# Mean of each requested ID's column. IDs that could not be resolved (None)
# or that this table does not contain are skipped, so they show up as NaN in
# the result instead of aborting the cell with a KeyError.
average_values = {}
missing_ids = []
for gene_id in Top20_id:
    if gene_id is None:
        continue
    if gene_id not in data.columns:
        missing_ids.append(gene_id)
        continue
    # Coerce in case the cached frame holds object-dtype columns.
    average_values[gene_id] = pd.to_numeric(data[gene_id]).mean()

if missing_ids:
    print("IDs not found in", data_file, ":", missing_ids)

# Attach the averages to the result table as the 'ASCAT3' column.
result_df['ASCAT3'] = result_df['ID'].map(average_values)

# Show the result table.
print(result_df)


         Gene                  ID  ABSOLUTE    ASCAT3
0       MECOM  ENSG00000085276.19  4.388193  4.994595
1        PVT1  ENSG00000249859.12  4.576029  5.459459
2      ACTRT3   ENSG00000184378.3  4.395349  4.963964
3        TERC   ENSG00000270141.3  4.395349  4.963964
4      LRRC34  ENSG00000171757.17  4.384615  4.963964
5        MYNN  ENSG00000085274.16  4.389587  4.963964
6      LRRIQ4   ENSG00000188306.6  4.377460  4.954955
7         MYC  ENSG00000136997.21  4.590340  5.448649
8      LRRC31  ENSG00000114248.10  4.377460  4.945946
9     MIR1208   ENSG00000221261.1  4.584973  5.457658
10    MIR1207   ENSG00000221176.1  4.588551  5.455856
11      SAMD7   ENSG00000187033.9  4.366726  4.911712
12      SEC62  ENSG00000008952.17  4.366726  4.911712
13  RN7SKP226   ENSG00000201782.1  4.582437  5.454054
14       PHC3  ENSG00000173889.16  4.350626  4.872072
15    MIR1205   ENSG00000221771.1  4.581395  5.455856
16     GPR160  ENSG00000173890.17  4.368515  4.900901
17     TMEM75   ENSG00000280

In [41]:
#下载表达量
import os
import gzip
import shutil
import pandas as pd
import numpy as np

def manual_parse_data(dataset_name):
    """Parse a manually downloaded ``star_counts`` archive.

    Reads ``./datasets/TCGA-<dataset_name>.star_counts.tsv.gz`` directly (no
    temporary ``.tsv`` copy), transposes it so that the values of the first
    input column become the column labels, converts every column to a numeric
    dtype, and caches the result as
    ``./datasets/TCGA-<dataset_name>.count.pkl``. The source archive is kept.

    Args:
        dataset_name: Cohort code inserted into the input and output paths.

    Returns:
        pandas.DataFrame: the transposed, numeric table (the same content that
        is written to the pickle).

    Raises:
        FileNotFoundError: if the expected ``.gz`` archive is not present.
        ValueError: if a cell cannot be converted by ``pd.to_numeric``.
    """
    pkl_file_path = './datasets/' + 'TCGA-' + dataset_name + '.count.pkl'
    gz_file_path = './datasets/TCGA-' + dataset_name + '.star_counts.tsv.gz'

    # Fail early with an actionable message when the download is missing.
    if not os.path.exists(gz_file_path):
        raise FileNotFoundError(f"File {gz_file_path} not found. Please download manually.")

    # pandas decompresses gzip on the fly, so no intermediate file is needed.
    df = pd.read_csv(gz_file_path, sep='\t', compression='gzip')

    # Transpose, promote the first row (labels from the first input column)
    # to the header, and drop that row positionally.
    df_t = df.transpose()
    df_t.columns = df_t.iloc[0]
    df_t = df_t.iloc[1:]

    # Convert BEFORE pickling so the cache stores numeric dtypes rather than
    # the object dtype produced by transposing a mixed-type frame.
    df_t = df_t.apply(pd.to_numeric)

    # Cache for fast loading by later cells.
    df_t.to_pickle(pkl_file_path)
    return df_t



# Run the manual parser defined above for the chosen cohort.
dataset_name = "OV"  # change to process a different cohort
df = manual_parse_data(dataset_name=dataset_name)

In [49]:
import pandas as pd

# Load the cached table.
data_file = 'datasets/TCGA-OV.count.pkl'  # replace with your data file path
data = pd.read_pickle(data_file)

# Mean of each requested ID's column. IDs that could not be resolved (None)
# or that this table does not contain are skipped, so they show up as NaN in
# the result instead of aborting the cell with a KeyError.
average_values = {}
missing_ids = []
for gene_id in Top20_id:
    if gene_id is None:
        continue
    if gene_id not in data.columns:
        missing_ids.append(gene_id)
        continue
    # Coerce in case the cached frame holds object-dtype columns.
    average_values[gene_id] = pd.to_numeric(data[gene_id]).mean()

if missing_ids:
    print("IDs not found in", data_file, ":", missing_ids)

# Attach the averages to the result table as the 'expression' column.
result_df['expression'] = result_df['ID'].map(average_values)

# Show the result table.
print(result_df)

# Save the combined table to CSV.
result_df.to_csv('top20_genes_with_ids_and_expression_average.csv', index=False)


         Gene                  ID  ABSOLUTE    ASCAT3  expression
0       MECOM  ENSG00000085276.19  4.388193  4.994595   12.712127
1        PVT1  ENSG00000249859.12  4.576029  5.459459    9.961079
2      ACTRT3   ENSG00000184378.3  4.395349  4.963964    7.524944
3        TERC   ENSG00000270141.3  4.395349  4.963964    0.759914
4      LRRC34  ENSG00000171757.17  4.384615  4.963964    6.711959
5        MYNN  ENSG00000085274.16  4.389587  4.963964   10.512125
6      LRRIQ4   ENSG00000188306.6  4.377460  4.954955    3.418098
7         MYC  ENSG00000136997.21  4.590340  5.448649   12.599711
8      LRRC31  ENSG00000114248.10  4.377460  4.945946    2.447415
9     MIR1208   ENSG00000221261.1  4.584973  5.457658    0.136640
10    MIR1207   ENSG00000221176.1  4.588551  5.455856    0.000000
11      SAMD7   ENSG00000187033.9  4.366726  4.911712    0.618588
12      SEC62  ENSG00000008952.17  4.366726  4.911712   13.368621
13  RN7SKP226   ENSG00000201782.1  4.582437  5.454054    0.087039
14       P

## non-cell