In [1]:
import os, itertools, pickle
import numpy as np
import pandas as pd

from sklearn import feature_selection

# Shared directory prefixes; every derived constant keeps its trailing slash.
_PROJECT_DIR = "/data2/zhoujb/project/cowpea_project/"
_XPXLR_DIR = _PROJECT_DIR + "basedXPXLR/"

# Directory directly under the project root.
RAW_PATH = _PROJECT_DIR + "rawData/"

# Directories under the basedXPXLR sub-tree.
SNP_PATH = _XPXLR_DIR + "snpDir/"
CLUSTER_PATH = _XPXLR_DIR + "SNPMMSeqCluster/"
RES_PATH = _XPXLR_DIR + "Result/"
ML_RAW_PATH = _XPXLR_DIR + "ML/rawData/"

def rmLowVarAndHighCor(data, feature_prefix='feature_', var_threshold=0.01, cor_threshods=0.8):
    """Return the feature columns that survive variance and correlation filtering.

    Only columns of ``data`` whose name starts with ``feature_prefix`` are
    considered. They are filtered in two steps:

    1. columns rejected by ``VarianceThreshold(threshold=var_threshold)``
       are discarded;
    2. among the remaining columns, a column is discarded when its absolute
       correlation with any column positioned before it exceeds
       ``cor_threshods``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding the feature columns (other columns are ignored).
    feature_prefix : str
        Literal prefix identifying feature columns. Regex metacharacters in
        the prefix are matched literally.
    var_threshold : float
        Threshold passed to ``VarianceThreshold``.
    cor_threshods : float
        Absolute-correlation cut-off (parameter spelling kept so existing
        keyword callers continue to work).

    Returns
    -------
    list
        Names of the retained feature columns, in their original order.
    """
    import re  # local import so the function does not rely on a file-level one

    # Escape the prefix: without it a prefix such as "fea." would also
    # select a column named "feaXy".
    data_feat = data.filter(regex='^{}'.format(re.escape(feature_prefix)), axis=1)

    # Remove quasi-constant features. For a binary feature a variance of
    # 0.01 corresponds to roughly 99% of observations sharing one value.
    var_sel = feature_selection.VarianceThreshold(threshold=var_threshold)
    var_sel.fit(data_feat)
    data_feat = data_feat.loc[:, var_sel.get_support()]

    # Drop highly correlated features. Only the strict upper triangle is
    # inspected, so each pair is examined once and the earlier column of a
    # correlated pair is the one that stays.
    corr_mat = data_feat.corr().abs()
    upper_mask = np.triu(np.ones(corr_mat.shape, dtype=bool), k=1)
    upper = corr_mat.where(upper_mask)

    # Vectorised column-wise test; NaN cells compare as False.
    too_correlated = (upper > cor_threshods).any(axis=0)
    # A set keeps the membership test below O(1) per column.
    to_drop = set(too_correlated.index[too_correlated.to_numpy()])

    return [column for column in data_feat.columns if column not in to_drop]

In [2]:
# Phenotype data: two Excel sheets, left-joined on their accession index.
raw_phenos_1 = pd.read_excel(os.path.join(RAW_PATH, "Phenotypes for GWAS-upload public database.xlsx"), index_col="Accessions")
raw_phenos_2 = pd.read_excel(os.path.join(RAW_PATH, "BMK21067-AK808-data-2.xlsx"), index_col="BMK-ID")
raw_phenos_2 = raw_phenos_2[['HZ Pod Cellulose (mg/g)', 'GZ Pod Cellulose (mg/g)', 'HZ Seed Cellulose (mg/g)', 'GZ Seed Cellulose (mg/g)']].copy()
raw_phenos = raw_phenos_1.merge(raw_phenos_2, how='left', left_index=True, right_index=True)

# "-" cells are converted to NaN, then rows with no value at all are dropped.
raw_phenos = raw_phenos.drop(columns="ID")
raw_phenos = raw_phenos.replace({"-":np.nan})
raw_phenos = raw_phenos.dropna(axis=0, how="all")
# Shorten the verbose column headers. The source spellings (including the
# double spaces and "Strach") must match the spreadsheet headers exactly.
raw_phenos = raw_phenos.rename(columns={'HZ-Pod length':'HZ-PL', 'GZ-Pod length':'GZ-PL', 
                                        'HZ Pod Sugar content (mg/g)':'HZ-PSugar','GZ Pod Sugar content  (mg/g)':'GZ-PSugar', 
                                        'HZ Pod Starch content(mg/g)':'HZ-PStarch','GZ Pod Strach Content (mg/g)':'GZ-PStarch', 
                                        'HZ Pod Protein content(g/Kg)':'HZ-PProtein','GZ Pod protein  (g/Kg)':'GZ-PProtein', 
                                        'HZ Seed Sugar (mg/g)':'HZ-SSuger','GZ Seed Sugar (mg/g)':'GZ-SSuger', 
                                        'HZ Seed Starch (mg/g)':'HZ-SStarch','GZ Seed Starch (mg/g)':'GZ-SStarch', 
                                        'HZ Seed Protein (g/kg)':'HZ-SProtein','GZ Seed Protein (g/kg)':'GZ-SProtein', 
                                        'Pod shattering':'PS', 'HZ Pod Cellulose (mg/g)':"HZ-PC", 'GZ Pod Cellulose (mg/g)':"GZ-PC",
                                        'HZ Seed Cellulose (mg/g)':"HZ-SC", 'GZ Seed Cellulose (mg/g)':"GZ-SC"})

# Prediction target: HZ-PStarch, restricted to samples that have a value.
target_df = raw_phenos[["HZ-PStarch"]].dropna()
# Phenotype features: every column that neither starts with "GZ" nor ends
# with "PStarch" (which removes the target itself).
keep_col = [x for x in raw_phenos.columns if (not x.startswith("GZ")) and (not x.endswith("PStarch"))]
pheno_data = raw_phenos[keep_col]
# Keep only the part after the last "-" (e.g. "HZ-PL" -> "PL"), then add the
# "fea_" prefix that rmLowVarAndHighCor uses to recognise feature columns.
pheno_data = pheno_data.rename(columns={x:x.split("-")[-1] for x in pheno_data.columns})
pheno_data = pheno_data.rename(columns={x:"fea_{}".format(x) for x in pheno_data.columns})

# Genome (haplotype) scores
genotype_hap_score = pd.read_table(os.path.join(RES_PATH, "genotype_hap_score.txt"), index_col=0)
genotype_hap_score = genotype_hap_score.rename(columns={x:"fea_{}".format(x) for x in genotype_hap_score.columns})

# Metabolome data
deg_sample_val = pd.read_table(os.path.join(RAW_PATH, "metabolome_sample_value.txt"), index_col=0)
deg_sample_val = deg_sample_val.rename(columns={x:"fea_{}".format(x) for x in deg_sample_val.columns})

# Samples present in the target, genotype and metabolome tables.
# Index.intersection keeps a deterministic order; a plain set() of string
# labels is ordered by randomized hashes, so the row order of the written
# file would otherwise change between kernel sessions.
inte_sample = (
    target_df.index
    .intersection(genotype_hap_score.index)
    .intersection(deg_sample_val.index)
    .tolist()
)

raw_data = pd.concat([genotype_hap_score.loc[inte_sample], deg_sample_val.loc[inte_sample], pheno_data.loc[inte_sample]], axis=1)
feat_keep = rmLowVarAndHighCor(raw_data, feature_prefix='fea_', var_threshold=0.01, cor_threshods=0.8)
raw_data = raw_data[feat_keep].copy()
# Append the target as the last column.
raw_data = pd.concat([raw_data, target_df.loc[inte_sample]], axis=1)

# Write raw data
raw_data.to_csv(os.path.join(ML_RAW_PATH, "raw_data_PStarch.txt"), sep="\t")

In [3]:
# Display the assembled table: the retained fea_* columns followed by the HZ-PStarch target column.
raw_data

Unnamed: 0,fea_Chr01_51400001_51530000,fea_Chr01_4550001_4690000,fea_Chr01_44170001_44300000,fea_Chr01_5960001_6170000,fea_Chr01_34570001_34760000,fea_Chr01_2330001_2630000,fea_Chr01_43270001_43430000,fea_Chr02_680001_1040000,fea_Chr01_49970001_50400000,fea_Chr01_46910001_47010000,...,fea_PProtein,fea_SSuger,fea_SStarch,fea_SProtein,fea_PS,fea_TSW,fea_GNP,fea_PC,fea_SC,HZ-PStarch
D382,0.012978,0.206476,0.082849,0.390666,0.800345,0.013596,0.760746,0.609217,2.032302,0.096657,...,22.143891,77.967227,265.637143,175.100000,1.0,82.94440,16.2,61.065281,203.1568,11.464921
D465,0.820745,0.206476,0.652068,0.004944,0.800345,0.013596,0.760746,0.609217,2.032302,0.618527,...,23.005435,90.798757,350.424588,244.990625,0.0,115.83060,19.4,42.919265,258.1348,11.387701
D483,0.820745,0.468997,0.652068,0.390666,0.800345,0.859828,0.760746,0.609217,2.032302,0.618527,...,21.903415,98.392500,425.608077,248.268750,0.0,134.61060,16.5,42.760054,205.7272,13.329656
D523,0.012978,0.468997,0.652068,0.004944,0.800345,0.859828,0.760746,0.609217,0.028427,0.618527,...,27.942429,90.133333,350.813889,282.368750,1.0,109.75265,14.8,43.745087,228.1944,9.078669
D604,0.012978,0.206476,0.652068,0.004944,0.008652,0.859828,0.760746,0.609217,2.032302,0.096657,...,22.465471,70.420333,336.003691,275.909375,0.0,78.42470,16.7,35.781460,237.9048,11.748399
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
D624,0.820745,0.468997,0.652068,0.390666,0.008652,0.013596,0.760746,0.609217,2.032302,0.618527,...,23.539030,81.625946,296.550450,265.662500,0.0,143.81610,16.3,40.833282,240.2848,12.756516
D477,0.820745,0.468997,0.652068,0.390666,0.800345,0.859828,0.760746,0.609217,2.032302,0.618527,...,23.795076,84.632342,342.901104,264.340625,0.0,146.20470,17.9,38.676350,273.9380,10.742952
D395,0.820745,0.206476,0.652068,0.004944,0.800345,0.151889,0.760746,0.609217,2.032302,0.618527,...,20.494294,83.343000,330.664167,294.743750,0.0,187.26545,16.6,44.423999,219.1504,11.698922
D568,0.820745,0.468997,0.652068,0.390666,0.800345,0.859828,0.760746,0.609217,2.032302,0.618527,...,23.116171,85.691082,261.733029,275.918750,0.0,139.63430,14.9,45.895487,260.2292,12.251152
