In [None]:
import pandas as pd
from scipy.stats import pearsonr, spearmanr, kendalltau
import numpy as np

def calculate_correlations(df):
    """Correlate every column except the last with the last column.

    For each of the first ``n - 1`` columns of ``df``, the Pearson, Spearman
    and Kendall correlation with the last column is computed together with
    its p-value. Only rows where BOTH the feature value and the value in the
    last column are present (non-null) take part in the calculation.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose last column is the variable every other column is
        compared against.

    Returns
    -------
    pandas.DataFrame
        One row per feature column, in the original column order, with the
        columns ``Column``, ``Pearson Correlation``, ``Pearson P-value``,
        ``Spearman Correlation``, ``Spearman P-value``,
        ``Kendall Correlation`` and ``Kendall P-value``. When a feature has
        fewer than two complete pairs, all six statistics are NaN.
    """
    result_columns = [
        'Column',
        'Pearson Correlation',
        'Pearson P-value',
        'Spearman Correlation',
        'Spearman P-value',
        'Kendall Correlation',
        'Kendall P-value',
    ]

    # The last column is shared by every comparison, so its missing-value
    # mask is computed once outside the loop.
    target = df.iloc[:, -1]
    target_values = target.to_numpy()
    target_present = target.notna().to_numpy()

    results = []
    for i in range(df.shape[1] - 1):
        feature = df.iloc[:, i]

        # Keep a row only if both sides of the pair are present. Filtering on
        # the feature alone would let NaN from the last column reach scipy.
        keep = feature.notna().to_numpy() & target_present
        col1 = feature.to_numpy()[keep]
        col2 = target_values[keep]

        if len(col1) < 2:
            # A correlation is undefined for fewer than two pairs; report NaN
            # instead of letting scipy raise and abort the whole table.
            pearson_corr = pearson_p_value = np.nan
            spearman_corr = spearman_p_value = np.nan
            kendall_corr = kendall_p_value = np.nan
        else:
            pearson_corr, pearson_p_value = pearsonr(col1, col2)
            spearman_corr, spearman_p_value = spearmanr(col1, col2)
            kendall_corr, kendall_p_value = kendalltau(col1, col2)

        results.append({
            'Column': df.columns[i],
            'Pearson Correlation': pearson_corr,
            'Pearson P-value': pearson_p_value,
            'Spearman Correlation': spearman_corr,
            'Spearman P-value': spearman_p_value,
            'Kendall Correlation': kendall_corr,
            'Kendall P-value': kendall_p_value
        })

    # Passing the column list keeps the schema stable even when there are no
    # feature columns to report.
    result_df = pd.DataFrame(results, columns=result_columns)
    return result_df

In [None]:
# For each tissue: correlate the bootstrap table's columns with its last
# column, attach the matching rows of the position table, and write one CSV.
tissue_list = ['Blood', 'Brain', 'Lung', 'Skin']
for tissue in tissue_list:
    print(tissue)

    # Bootstrap table; the first CSV column becomes the index.
    df = pd.read_csv(
        f'../../train_data/bootstrap_{tissue}_0.2_add.csv',
        index_col=0,
    )

    # One row of correlation statistics per column of df except the last.
    result_df = calculate_correlations(df)

    # Chromosome position information, indexed by its first CSV column.
    data = pd.read_csv(
        f'../../train_data/merge_{tissue}_withchr.csv',
        index_col=0,
    )
    # Index labels are turned into strings so they can be looked up by the
    # column names of df.
    data.index = list(map(str, data.index.tolist()))
    # Pick the position rows in the same order as the columns of df that were
    # correlated, then switch to a positional index so they line up with
    # result_df.
    data = data.loc[list(df.columns[:-1])]
    data = data.reset_index(drop=True)

    # First three position columns (Chr, Start, End per the original note)
    # side by side with the correlations and p-values; the 'Column' label of
    # result_df is left out.
    data_merge = pd.concat(
        [data.iloc[:, :3], result_df.iloc[:, 1:]],
        axis=1,
    )
    data_merge.to_csv(f'Manhattan_data_{tissue}_add.csv')