In [1]:
import os, itertools, pickle
import numpy as np
import pandas as pd

from sklearn import feature_selection

RAW_PATH = "/data2/zhoujb/project/cowpea_project/rawData/"
SNP_PATH = "/data2/zhoujb/project/cowpea_project/basedXPXLR/snpDir/"
CLUSTER_PATH = "/data2/zhoujb/project/cowpea_project/basedXPXLR/SNPMMSeqCluster/"
RES_PATH = "/data2/zhoujb/project/cowpea_project/basedXPXLR/Result/"
ML_RAW_PATH = "/data2/zhoujb/project/cowpea_project/basedXPXLR/ML/rawData/"

def rmLowVarAndHighCor(data, feature_prefix='feature_', var_threshold=0.01, cor_threshods=0.8):
    
    data_feat = data.filter(regex='^{}'.format(feature_prefix), axis=1)
    
    # remove quasi-constant features
    # 0.01 indicates 99% of observations approximately
    var_sel = feature_selection.VarianceThreshold(threshold=var_threshold)
    var_sel.fit(data_feat)
    
    feat_keep = data_feat.columns[var_sel.get_support()]
    data_feat = data_feat[feat_keep]
    
    # drop highly correlated features
    corr_mat = data_feat.corr().abs()
    upper = corr_mat.where(np.triu(np.ones(corr_mat.shape), k=1).astype(np.bool_))
    to_drop = [column for column in upper.columns if any(upper[column] > cor_threshods)]
    
    feat_keep = [column for column in data_feat.columns if column not in to_drop]
    return feat_keep

In [2]:
# 表型数据
raw_phenos_1 = pd.read_excel(os.path.join(RAW_PATH, "Phenotypes for GWAS-upload public database.xlsx"), index_col="Accessions")
raw_phenos_2 = pd.read_excel(os.path.join(RAW_PATH, "BMK21067-AK808-data-2.xlsx"), index_col="BMK-ID")
raw_phenos_2 = raw_phenos_2[['HZ Pod Cellulose (mg/g)', 'GZ Pod Cellulose (mg/g)', 'HZ Seed Cellulose (mg/g)', 'GZ Seed Cellulose (mg/g)']].copy()
raw_phenos = raw_phenos_1.merge(raw_phenos_2, how='left', left_index=True, right_index=True)

raw_phenos = raw_phenos.drop(columns="ID")
raw_phenos = raw_phenos.replace({"-":np.nan})
raw_phenos = raw_phenos.dropna(axis=0, how="all")
raw_phenos = raw_phenos.rename(columns={'HZ-Pod length':'HZ-PL', 'GZ-Pod length':'GZ-PL', 
                                        'HZ Pod Sugar content (mg/g)':'HZ-PSugar','GZ Pod Sugar content  (mg/g)':'GZ-PSugar', 
                                        'HZ Pod Starch content(mg/g)':'HZ-PStarch','GZ Pod Strach Content (mg/g)':'GZ-PStarch', 
                                        'HZ Pod Protein content(g/Kg)':'HZ-PProtein','GZ Pod protein  (g/Kg)':'GZ-PProtein', 
                                        'HZ Seed Sugar (mg/g)':'HZ-SSuger','GZ Seed Sugar (mg/g)':'GZ-SSuger', 
                                        'HZ Seed Starch (mg/g)':'HZ-SStarch','GZ Seed Starch (mg/g)':'GZ-SStarch', 
                                        'HZ Seed Protein (g/kg)':'HZ-SProtein','GZ Seed Protein (g/kg)':'GZ-SProtein', 
                                        'Pod shattering':'PS', 'HZ Pod Cellulose (mg/g)':"HZ-PC", 'GZ Pod Cellulose (mg/g)':"GZ-PC",
                                        'HZ Seed Cellulose (mg/g)':"HZ-SC", 'GZ Seed Cellulose (mg/g)':"GZ-SC"})

target_df = raw_phenos[["GZ-PC"]].dropna()
keep_col = [x for x in raw_phenos.columns if (not x.startswith("HZ")) and (not x.endswith("PC"))]
pheno_data = raw_phenos[keep_col]
pheno_data = pheno_data.rename(columns={x:x.split("-")[-1] for x in pheno_data.columns})
pheno_data = pheno_data.rename(columns={x:"fea_{}".format(x) for x in pheno_data.columns})

# 基因组分数
genotype_hap_score = pd.read_table(os.path.join(RES_PATH, "genotype_hap_score.txt"), index_col=0)
genotype_hap_score = genotype_hap_score.rename(columns={x:"fea_{}".format(x) for x in genotype_hap_score.columns})

# 代谢组数据
deg_sample_val = pd.read_table(os.path.join(RAW_PATH, "metabolome_sample_value.txt"), index_col=0)
deg_sample_val = deg_sample_val.rename(columns={x:"fea_{}".format(x) for x in deg_sample_val.columns})

inte_sample = list(set(target_df.index).intersection(genotype_hap_score.index).intersection(deg_sample_val.index))

raw_data = pd.concat([genotype_hap_score.loc[inte_sample], deg_sample_val.loc[inte_sample], pheno_data.loc[inte_sample]], axis=1)
#raw_data = pd.concat([genotype_hap_score.loc[inte_sample], deg_sample_val.loc[inte_sample]], axis=1)
feat_keep = rmLowVarAndHighCor(raw_data, feature_prefix='fea_', var_threshold=0.01, cor_threshods=0.8)
raw_data = raw_data[feat_keep].copy()
raw_data = pd.concat([raw_data, target_df.loc[inte_sample]], axis=1)

# Write raw data
raw_data.to_csv(os.path.join(ML_RAW_PATH, "raw_data_PC.txt"), sep="\t")

In [3]:
raw_data

Unnamed: 0,fea_Chr01_51400001_51530000,fea_Chr01_4550001_4690000,fea_Chr01_44170001_44300000,fea_Chr01_5960001_6170000,fea_Chr01_34570001_34760000,fea_Chr01_2330001_2630000,fea_Chr01_43270001_43430000,fea_Chr02_680001_1040000,fea_Chr01_49970001_50400000,fea_Chr01_46910001_47010000,...,fea_PSugar,fea_PStarch,fea_PProtein,fea_SSuger,fea_SStarch,fea_SProtein,fea_PS,fea_TSW,fea_SC,GZ-PC
D476,0.820745,0.468997,0.652068,0.390666,0.800345,0.859828,0.760746,0.609217,2.032302,,...,16.96890,17.542300,20.103461,105.603650,193.225972,260.230,0.0,159.5926,590.729880,44.542493
D548,0.012978,0.468997,0.652068,0.390666,0.800345,0.151889,0.096657,0.609217,2.032302,0.618527,...,14.56650,13.106150,24.410009,78.676704,203.738339,284.495,0.0,185.9453,594.182140,54.958960
D603,0.820745,0.468997,0.652068,0.390666,0.008652,0.859828,0.760746,0.609217,2.032302,0.618527,...,14.72640,14.425917,22.860231,74.550300,237.777629,258.720,0.0,88.8846,727.499649,58.568627
D674,0.012978,0.468997,0.082849,0.390666,0.800345,0.859828,0.096657,0.609217,0.028427,0.008652,...,10.41105,21.000667,21.398974,84.652778,218.969288,253.500,0.0,146.7826,696.049199,66.573360
D383,0.820745,0.206476,0.652068,0.004944,0.800345,0.859828,0.760746,0.609217,0.028427,0.618527,...,11.62200,17.258117,20.086232,98.379304,237.391265,289.670,0.0,122.5714,615.448743,45.407227
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
D521,0.012978,0.206476,0.652068,0.004944,0.800345,0.859828,0.760746,0.609217,2.032302,0.618527,...,13.53105,14.161000,25.015035,75.265876,233.545932,288.920,0.0,174.3934,649.268673,39.719027
D517,0.820745,0.206476,0.652068,0.004944,0.800345,0.859828,0.760746,0.609217,2.032302,0.618527,...,13.90935,16.853517,21.277441,76.573629,237.010476,278.275,0.0,127.7028,681.412827,38.037160
D479,0.820745,0.468997,0.652068,0.390666,0.800345,0.859828,0.760746,0.609217,2.032302,0.618527,...,17.09955,14.380158,21.659949,86.596294,179.577908,284.980,0.0,166.1453,562.603777,49.183493
D480,0.820745,0.468997,0.652068,0.390666,0.800345,0.859828,0.760746,0.609217,2.032302,0.618527,...,19.07880,16.169550,19.108218,91.434992,173.234572,278.315,0.0,164.3820,688.344122,53.245360
