In [1]:
import os, itertools, pickle
import numpy as np
import pandas as pd

from sklearn import feature_selection

# Root of the project on the analysis machine. Every directory used by the
# notebook hangs off this one, so relocating the project means editing one line.
PROJECT_PATH = "/data2/zhoujb/project/cowpea_project/"
# Working area shared by the SNP, cluster, result and ML directories below.
XPXLR_PATH = PROJECT_PATH + "basedXPXLR/"

# Raw inputs (phenotype workbooks, metabolome table).
RAW_PATH = PROJECT_PATH + "rawData/"
SNP_PATH = XPXLR_PATH + "snpDir/"
CLUSTER_PATH = XPXLR_PATH + "SNPMMSeqCluster/"
# Holds the genotype haplotype score table read by the data-assembly cell.
RES_PATH = XPXLR_PATH + "Result/"
# Destination of the assembled machine-learning input table.
ML_RAW_PATH = XPXLR_PATH + "ML/rawData/"

def rmLowVarAndHighCor(data, feature_prefix='feature_', var_threshold=0.01, cor_threshods=0.8):
    """Pick the feature columns that are neither quasi-constant nor redundant.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing the candidate feature columns (other columns are ignored).
    feature_prefix : str
        Literal prefix identifying feature columns. A column is considered only
        if its label, converted to ``str``, starts with this text.
    var_threshold : float
        Forwarded to ``VarianceThreshold(threshold=...)``; columns that do not
        pass that selector are discarded first.
    cor_threshods : float
        Absolute pairwise correlation limit. A column is discarded when its
        absolute correlation with any column positioned before it exceeds this
        value. (The parameter spelling is kept for backward compatibility.)

    Returns
    -------
    list
        Labels of the retained feature columns, in their original order.
    """
    # Match the prefix as plain text. Interpolating it into a regex would let
    # characters such as '.', '+' or '(' change which columns are selected.
    has_prefix = np.array(
        [str(label).startswith(feature_prefix) for label in data.columns], dtype=bool
    )
    data_feat = data.loc[:, has_prefix]

    # remove quasi-constant features
    # 0.01 indicates 99% of observations approximately
    var_sel = feature_selection.VarianceThreshold(threshold=var_threshold)
    var_sel.fit(data_feat)
    data_feat = data_feat.loc[:, var_sel.get_support()]

    # drop highly correlated features
    # Only the strict upper triangle is kept, so each pair is inspected once and
    # a column is compared solely against the columns that precede it.
    corr_mat = data_feat.corr().abs()
    upper = corr_mat.where(np.triu(np.ones(corr_mat.shape, dtype=bool), k=1))

    # Column-wise vectorised test; NaN cells (masked triangle, undefined
    # correlations) compare as False, exactly like the element-wise check did.
    exceeds = (upper > cor_threshods).any(axis=0).to_numpy()
    to_drop = set(upper.columns[exceeds])

    return [column for column in data_feat.columns if column not in to_drop]

In [3]:
# Phenotype data: the GWAS phenotype sheet, left-joined with the four cellulose
# columns taken from the second workbook (both indexed by accession id).
raw_phenos_1 = pd.read_excel(os.path.join(RAW_PATH, "Phenotypes for GWAS-upload public database.xlsx"), index_col="Accessions")
raw_phenos_2 = pd.read_excel(os.path.join(RAW_PATH, "BMK21067-AK808-data-2.xlsx"), index_col="BMK-ID")
raw_phenos_2 = raw_phenos_2[['HZ Pod Cellulose (mg/g)', 'GZ Pod Cellulose (mg/g)', 'HZ Seed Cellulose (mg/g)', 'GZ Seed Cellulose (mg/g)']].copy()
raw_phenos = raw_phenos_1.merge(raw_phenos_2, how='left', left_index=True, right_index=True)

raw_phenos = raw_phenos.drop(columns="ID")
# "-" is used as a missing-value placeholder in the sheets.
raw_phenos = raw_phenos.replace({"-":np.nan})
# Discard accessions that have no phenotype measurement at all.
raw_phenos = raw_phenos.dropna(axis=0, how="all")
# Shorten the verbose spreadsheet headers to compact trait codes.
raw_phenos = raw_phenos.rename(columns={'HZ-Pod length':'HZ-PL', 'GZ-Pod length':'GZ-PL', 
                                        'HZ Pod Sugar content (mg/g)':'HZ-PSugar','GZ Pod Sugar content  (mg/g)':'GZ-PSugar', 
                                        'HZ Pod Starch content(mg/g)':'HZ-PStarch','GZ Pod Strach Content (mg/g)':'GZ-PStarch', 
                                        'HZ Pod Protein content(g/Kg)':'HZ-PProtein','GZ Pod protein  (g/Kg)':'GZ-PProtein', 
                                        'HZ Seed Sugar (mg/g)':'HZ-SSuger','GZ Seed Sugar (mg/g)':'GZ-SSuger', 
                                        'HZ Seed Starch (mg/g)':'HZ-SStarch','GZ Seed Starch (mg/g)':'GZ-SStarch', 
                                        'HZ Seed Protein (g/kg)':'HZ-SProtein','GZ Seed Protein (g/kg)':'GZ-SProtein', 
                                        'Pod shattering':'PS', 'HZ Pod Cellulose (mg/g)':"HZ-PC", 'GZ Pod Cellulose (mg/g)':"GZ-PC",
                                        'HZ Seed Cellulose (mg/g)':"HZ-SC", 'GZ Seed Cellulose (mg/g)':"GZ-SC"})

# Prediction target: HZ pod length, restricted to accessions where it was measured.
target_df = raw_phenos[["HZ-PL"]].dropna()

# Phenotype covariates: every column that neither starts with "GZ" nor ends with
# "PL" (the latter keeps the target itself out of the feature set).
keep_col = [x for x in raw_phenos.columns if (not x.startswith("GZ")) and (not x.endswith("PL"))]
pheno_data = raw_phenos[keep_col]
# Keep only the text after the last "-" (e.g. "HZ-PSugar" -> "PSugar"), then tag
# the column as a feature with the "fea_" prefix.
pheno_data = pheno_data.rename(columns={x:x.split("-")[-1] for x in pheno_data.columns})
pheno_data = pheno_data.rename(columns={x:"fea_{}".format(x) for x in pheno_data.columns})

# Genomic scores
genotype_hap_score = pd.read_table(os.path.join(RES_PATH, "genotype_hap_score.txt"), index_col=0)
genotype_hap_score = genotype_hap_score.rename(columns={x:"fea_{}".format(x) for x in genotype_hap_score.columns})

# Metabolome data
deg_sample_val = pd.read_table(os.path.join(RAW_PATH, "metabolome_sample_value.txt"), index_col=0)
deg_sample_val = deg_sample_val.rename(columns={x:"fea_{}".format(x) for x in deg_sample_val.columns})

# Samples present in all three sources. The order follows the phenotype table
# instead of relying on set iteration order, which changes between interpreter
# sessions for string labels and would reshuffle the rows of the written file.
shared_samples = set(genotype_hap_score.index).intersection(deg_sample_val.index)
inte_sample = [sample for sample in dict.fromkeys(target_df.index) if sample in shared_samples]

# Assemble the feature matrix, prune quasi-constant / highly correlated columns,
# then append the target as the last column.
raw_data = pd.concat([genotype_hap_score.loc[inte_sample], deg_sample_val.loc[inte_sample], pheno_data.loc[inte_sample]], axis=1)
feat_keep = rmLowVarAndHighCor(raw_data, feature_prefix='fea_', var_threshold=0.01, cor_threshods=0.8)
raw_data = raw_data[feat_keep].copy()
raw_data = pd.concat([raw_data, target_df.loc[inte_sample]], axis=1)

# Write raw data
raw_data.to_csv(os.path.join(ML_RAW_PATH, "raw_data_PL.txt"), sep="\t")

In [4]:
# Display the assembled table: the retained "fea_" columns followed by the HZ-PL target.
raw_data

Unnamed: 0,fea_Chr01_51400001_51530000,fea_Chr01_4550001_4690000,fea_Chr01_44170001_44300000,fea_Chr01_5960001_6170000,fea_Chr01_34570001_34760000,fea_Chr01_2330001_2630000,fea_Chr01_43270001_43430000,fea_Chr02_680001_1040000,fea_Chr01_49970001_50400000,fea_Chr01_46910001_47010000,...,fea_PProtein,fea_SSuger,fea_SStarch,fea_SProtein,fea_PS,fea_TSW,fea_GNP,fea_PC,fea_SC,HZ-PL
D517,0.820745,0.206476,0.652068,0.004944,0.800345,0.859828,0.760746,0.609217,2.032302,0.618527,...,23.422734,89.681800,351.255077,269.881250,0.0,110.88690,19.1,38.632907,236.9528,70.443750
D519,0.820745,0.468997,0.652068,0.390666,0.800345,0.013596,0.760746,0.609217,2.032302,0.618527,...,25.192389,85.786071,333.175714,274.700000,0.0,118.95830,11.5,34.852001,247.7104,36.000000
D462,0.012978,0.082849,0.082849,0.390666,0.800345,0.013596,0.760746,0.082849,0.028427,0.096657,...,26.940335,97.447500,345.132692,253.768750,1.0,136.80530,15.8,50.517514,208.2976,18.400000
D639,,,0.652068,,0.800345,,0.096657,0.609217,,,...,24.642885,92.625379,386.838201,259.968750,0.0,146.24720,12.2,42.062069,254.6600,73.050000
D566,0.012978,0.206476,0.652068,0.004944,0.800345,0.013596,0.760746,0.609217,2.032302,0.008652,...,23.937340,74.928380,326.003317,244.228125,0.0,91.29080,12.9,47.669295,274.5568,23.075000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
D378,0.012978,0.206476,0.652068,0.004944,0.800345,0.859828,0.760746,0.609217,2.032302,0.008652,...,23.135204,89.803499,356.819457,258.953125,0.0,102.58620,15.9,48.164146,206.1556,54.825000
D422,0.820745,0.468997,0.652068,0.390666,0.800345,0.859828,0.760746,0.609217,2.032302,0.618527,...,20.670335,84.895624,320.692464,281.034375,0.0,158.24560,18.6,53.282070,264.4180,61.446429
D535,0.012978,0.468997,0.652068,0.390666,0.800345,0.013596,0.760746,0.609217,0.028427,0.618527,...,24.426676,71.203329,225.474814,223.812500,0.0,178.46060,15.8,57.157014,225.8144,36.900000
D526,0.820745,0.468997,0.652068,0.390666,0.800345,0.859828,0.760746,0.609217,2.032302,0.618527,...,25.406619,110.620102,241.201684,270.753125,0.0,154.13855,19.8,44.401681,242.3316,46.400000


In [7]:
# Reload the table previously written under ML_RAW_PATH and tally how often each
# distinct value of the fea_PS column occurs.
pl_table_file = os.path.join(ML_RAW_PATH, "raw_data_PL.txt")
raw_data = pd.read_table(pl_table_file, index_col=0)
raw_data.loc[:, "fea_PS"].value_counts()

fea_PS
0.0    279
1.0     22
Name: count, dtype: int64