In [3]:
import pandas as pd
import numpy as np
from copy import deepcopy
from scipy.stats import spearmanr, pearsonr
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LassoCV, RidgeCV
from collections import Counter
import joblib

# Read features and HR phenotypes

In [10]:
# Feature table; its first CSV column becomes the row index.
df_X = pd.read_csv('../merge_features/feature_table_combined.all.csv', index_col=0)

# Phenotype table: cast HR to integer, then select/reorder its rows to follow
# the row order of the feature table.
df_y = (
    pd.read_csv("../classification/hr_cla.csv", index_col=0)
    .assign(HR=lambda tbl: tbl['HR'].astype(int))
    .loc[df_X.index]
)

In [11]:
# Number of rows in the feature table.
df_X.shape[0]

230

In [12]:
# Preview the first five rows of the feature table.
df_X.head(n=5)

Unnamed: 0,VAR_252,VAR_2,VAR_3927,VAR_153,VAR_9,VAR_76,VAR_125,VAR_1,VAR_191,VAR_7,...,CNV_quant_4563,CNV_quant_4564,CNV_quant_4565,CNV_quant_4566,CNV_quant_4567,CNV_quant_4568,CNV_quant_4569,CNV_quant_4570,CNV_quant_4571,CNV_quant_4572
CDC335,0,0,2,0,0,1,0,0,0,0,...,2,2,2,2,2,2,2,2,2,2
CDC336,0,0,2,0,0,0,0,0,0,0,...,2,2,2,2,2,2,2,2,2,2
CDC337,0,0,2,0,0,0,0,0,0,0,...,2,2,2,2,2,2,2,2,2,2
CDC338,0,0,2,0,0,0,0,0,0,0,...,2,2,2,2,2,2,2,2,2,2
CDC339,0,0,2,0,0,0,0,0,0,0,...,2,2,2,2,2,2,2,2,2,2


In [13]:
# Preview the first five rows of the phenotype table.
df_y.head(n=5)

Unnamed: 0,Institute,Location,HR
CDC335,CDC,USA,0
CDC336,CDC,USA,0
CDC337,CDC,USA,0
CDC338,CDC,USA,0
CDC339,CDC,USA,0


In [14]:
# Distinct values of the Location column.
set(df_y['Location'])

{'China', 'Europe', 'USA'}

# We will try two approaches for standardization:

## (1) Create dummy variables for categorical features and standardize only CNV quantitative features

In [15]:
# Split the columns by ID prefix: CNV_quant_* are treated as numeric,
# VAR_* and CNV_cat_* as categorical.
numeric_features = [col for col in df_X.columns if col.startswith('CNV_quant')]
categorical_features = [col for col in df_X.columns if col.startswith(('VAR', 'CNV_cat'))]
assert len(numeric_features) + len(categorical_features) == len(df_X.columns), \
    "every column must start with 'CNV_quant', 'CNV_cat' or 'VAR'"

# One-hot encode the categorical columns and z-score the numeric ones.
# sparse_threshold=0 forces a dense array: OneHotEncoder emits sparse output by
# default, and ColumnTransformer would otherwise return a sparse matrix whenever
# the stacked result is sparse enough, which breaks the DataFrame wrapper below.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ('num', StandardScaler(), numeric_features),
    ],
    sparse_threshold=0,
)

# Output columns are named "cat__<ID>_<category>" and "num__<ID>".
df_X1 = pd.DataFrame(
    preprocessor.fit_transform(df_X),
    columns=preprocessor.get_feature_names_out(),
    index=df_X.index,
)
df_X1.head()

Unnamed: 0,cat__VAR_252_0,cat__VAR_252_2,cat__VAR_2_0,cat__VAR_2_2,cat__VAR_3927_0,cat__VAR_3927_1,cat__VAR_3927_2,cat__VAR_153_0,cat__VAR_153_1,cat__VAR_9_0,...,num__CNV_quant_4563,num__CNV_quant_4564,num__CNV_quant_4565,num__CNV_quant_4566,num__CNV_quant_4567,num__CNV_quant_4568,num__CNV_quant_4569,num__CNV_quant_4570,num__CNV_quant_4571,num__CNV_quant_4572
CDC335,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,...,-0.12559,-0.177173,-0.189832,-0.108306,0.05975,-0.038097,-0.21973,-0.12559,-0.169182,-0.178702
CDC336,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,...,-0.12559,-0.177173,-0.189832,-0.108306,0.05975,-0.038097,-0.21973,-0.12559,-0.169182,-0.178702
CDC337,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,...,-0.12559,-0.177173,-0.189832,-0.108306,0.05975,-0.038097,-0.21973,-0.12559,-0.169182,-0.178702
CDC338,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,...,-0.12559,-0.177173,-0.189832,-0.108306,0.05975,-0.038097,-0.21973,-0.12559,-0.169182,-0.178702
CDC339,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,...,-0.12559,-0.177173,-0.189832,-0.108306,0.05975,-0.038097,-0.21973,-0.12559,-0.169182,-0.178702


## (2) Create dummy variables for categorical features and standardize all features

In [19]:
# Split the columns by ID prefix: CNV_quant_* are treated as numeric,
# VAR_* and CNV_cat_* as categorical.
numeric_features = [col for col in df_X.columns if col.startswith('CNV_quant')]
categorical_features = [col for col in df_X.columns if col.startswith(('VAR', 'CNV_cat'))]
assert len(numeric_features) + len(categorical_features) == len(df_X.columns), \
    "every column must start with 'CNV_quant', 'CNV_cat' or 'VAR'"

# Step 1: one-hot encode the categorical columns only.
# sparse_threshold=0 forces a dense array; this transformer contains nothing but
# a OneHotEncoder (sparse output by default), so without it the result type
# would depend on how sparse the encoded data happens to be.
preprocessor1 = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ],
    sparse_threshold=0,
)
df_onehot = pd.DataFrame(
    preprocessor1.fit_transform(df_X[categorical_features]),
    columns=preprocessor1.get_feature_names_out(),
    index=df_X[categorical_features].index,
)

# Step 2: put the raw numeric columns next to the dummy columns (joined on the row index).
df_X2 = pd.merge(
    df_onehot,
    df_X[numeric_features],
    left_index=True,
    right_index=True,
    how='inner',
)

# Step 3: z-score every column, dummy variables included.
# The transformer name 'num' is prefixed onto every column, so dummy columns end
# up as "num__cat__<ID>_<category>" and numeric ones as "num__<ID>"; the
# coefficient-table cells further down parse exactly these prefixes.
preprocessor2 = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), list(df_X2.columns)),
    ]
)
df_X2 = pd.DataFrame(
    preprocessor2.fit_transform(df_X2),
    columns=preprocessor2.get_feature_names_out(),
    index=df_X2.index,
)
df_X2.head()

Unnamed: 0,num__cat__VAR_252_0,num__cat__VAR_252_2,num__cat__VAR_2_0,num__cat__VAR_2_2,num__cat__VAR_3927_0,num__cat__VAR_3927_1,num__cat__VAR_3927_2,num__cat__VAR_153_0,num__cat__VAR_153_1,num__cat__VAR_9_0,...,num__CNV_quant_4563,num__CNV_quant_4564,num__CNV_quant_4565,num__CNV_quant_4566,num__CNV_quant_4567,num__CNV_quant_4568,num__CNV_quant_4569,num__CNV_quant_4570,num__CNV_quant_4571,num__CNV_quant_4572
CDC335,0.291386,-0.291386,0.133038,-0.133038,-0.066082,-0.066082,0.093659,0.066082,-0.066082,0.066082,...,-0.12559,-0.177173,-0.189832,-0.108306,0.05975,-0.038097,-0.21973,-0.12559,-0.169182,-0.178702
CDC336,0.291386,-0.291386,0.133038,-0.133038,-0.066082,-0.066082,0.093659,0.066082,-0.066082,0.066082,...,-0.12559,-0.177173,-0.189832,-0.108306,0.05975,-0.038097,-0.21973,-0.12559,-0.169182,-0.178702
CDC337,0.291386,-0.291386,0.133038,-0.133038,-0.066082,-0.066082,0.093659,0.066082,-0.066082,0.066082,...,-0.12559,-0.177173,-0.189832,-0.108306,0.05975,-0.038097,-0.21973,-0.12559,-0.169182,-0.178702
CDC338,0.291386,-0.291386,0.133038,-0.133038,-0.066082,-0.066082,0.093659,0.066082,-0.066082,0.066082,...,-0.12559,-0.177173,-0.189832,-0.108306,0.05975,-0.038097,-0.21973,-0.12559,-0.169182,-0.178702
CDC339,0.291386,-0.291386,0.133038,-0.133038,-0.066082,-0.066082,0.093659,0.066082,-0.066082,0.066082,...,-0.12559,-0.177173,-0.189832,-0.108306,0.05975,-0.038097,-0.21973,-0.12559,-0.169182,-0.178702


# Read statistical test results

In [23]:
# Per-feature correlation test results; the default integer row index is kept
# because later cells derive a 1-based rank from it.
df_stat = pd.read_csv("../hypothesis_testing/corr_feature_HR.csv")
df_stat.head(n=5)

Unnamed: 0,Feature,Feature2,EffectSize,P,Padj
0,CNV_quant_571,CNV_quant__CPAR2_103140,-0.667507,4.950051e-31,1.545653e-26
1,CNV_quant_4327,CNV_quant__CPAR2_805410,-0.658765,5.307672e-30,1.65732e-25
2,CNV_quant_4328,CNV_quant__CPAR2_805420,-0.658458,5.7603029999999994e-30,1.7986550000000002e-25
3,VAR_628_0,cpar_Chr_1:273032:SNP:C:T;cpar_Chr_1:273038:SN...,0.023166,6.340429e-30,1.9797990000000002e-25
4,CNV_quant_3678,CNV_quant__CPAR2_700380,0.652194,2.998978e-29,9.364308e-25


In [24]:
# Number of rows in the statistical test table.
df_stat.shape[0]

31225

# All isolates (not corrected for phylogeny)

## X1

In [22]:
# Cross-validated Lasso of HR on the X1 design matrix (no phylogenetic correction).
lasso_settings = dict(
    cv=5,
    random_state=42,
    n_alphas=1000,
    max_iter=100000,
    verbose=0,
    n_jobs=-1,
)
reg_all_notcorr_X1 = LassoCV(**lasso_settings)
# The response is the HR column as a flat 1-D array.
reg_all_notcorr_X1.fit(df_X1.to_numpy(), df_y['HR'].to_numpy())
print('best alpha = %2.4f' % reg_all_notcorr_X1.alpha_)

best alpha = 0.0456


In [25]:
# Tabulate the features with a non-zero Lasso coefficient, most negative first.
df_reg_all_notcorr_X1 = pd.DataFrame(
    reg_all_notcorr_X1.coef_, index=df_X1.columns, columns=['CoefNotCorrected']
)
df_reg_all_notcorr_X1 = (
    df_reg_all_notcorr_X1
    .loc[df_reg_all_notcorr_X1['CoefNotCorrected'] != 0]
    .sort_values('CoefNotCorrected')
)

# Lookup tables; the loop below reads them as {original feature name: short feature ID}.
cnv_quant_map = joblib.load('../merge_features/cnv_quant_id_mapping.all.joblib')
cnv_cat_map = joblib.load('../merge_features/cnv_cat_id_mapping.all.joblib')
var_map = joblib.load('../merge_features/variant_id_mapping.all.joblib')

features = []
rank = []
for col_name in df_reg_all_notcorr_X1.index:
    # Strip the ColumnTransformer prefix ("cat__" / "num__").
    feature_id = col_name.split('__')[1]
    # One-hot columns end in "_<category>"; the maps hold the ID without it.
    base_id = '_'.join(feature_id.split('_')[:-1])
    if feature_id.startswith('CNV_quant'):
        hits = [name for name, mapped_id in cnv_quant_map.items() if mapped_id == feature_id]
        features.append(';'.join(hits))
    elif feature_id.startswith('CNV_cat'):
        hits = [name for name, mapped_id in cnv_cat_map.items() if mapped_id == base_id]
        features.append(';'.join(hits))
    elif feature_id.startswith('VAR'):
        hits = [name for name, mapped_id in var_map.items() if mapped_id == base_id]
        features.append(';'.join(hits))
    # 1-based row label of the feature's first match in df_stat.
    rank.append(df_stat.loc[df_stat['Feature'] == feature_id].index.tolist()[0] + 1)

df_reg_all_notcorr_X1['FeatureName'] = features
df_reg_all_notcorr_X1['CorrelationRank'] = rank

# Index by the bare feature ID, then expose it as a regular 'FeatureID' column.
df_reg_all_notcorr_X1 = (
    df_reg_all_notcorr_X1
    .rename(index=lambda col_name: col_name.split('__')[1])
    .rename_axis('FeatureID')
    .loc[:, ['FeatureName', 'CorrelationRank', 'CoefNotCorrected']]
    .reset_index()
)
#df_reg_all_notcorr_X1.to_csv("lasso_feature_selection_phylogeny_notcorrected_all_isolates.csv")
df_reg_all_notcorr_X1

Unnamed: 0,FeatureID,FeatureName,CorrelationRank,CoefNotCorrected
0,CNV_quant_4327,CNV_quant__CPAR2_805410,2,-0.068939
1,CNV_quant_571,CNV_quant__CPAR2_103140,1,-0.048188
2,CNV_quant_3355,CNV_quant__CPAR2_600440,1713,-0.037704
3,CNV_quant_1314,CNV_quant__CPAR2_202990,10246,-0.034309
4,CNV_quant_3902,CNV_quant__CPAR2_703370,11,-0.015994
5,CNV_quant_2759,CNV_quant__CPAR2_404940,12248,-0.012169
6,CNV_quant_2101,CNV_quant__CPAR2_300760,1039,-0.009727
7,CNV_quant_3868,CNV_quant__CPAR2_702960,2167,-0.005649
8,CNV_quant_4243,CNV_quant__CPAR2_804220,686,-0.003271
9,CNV_quant_1641,CNV_quant__CPAR2_207810,10600,-0.002684


## X2

In [26]:
# Cross-validated Lasso of HR on the X2 design matrix (no phylogenetic correction).
lasso_settings = dict(
    cv=5,
    random_state=42,
    n_alphas=1000,
    max_iter=100000,
    verbose=0,
    n_jobs=-1,
)
reg_all_notcorr_X2 = LassoCV(**lasso_settings)
# The response is the HR column as a flat 1-D array.
reg_all_notcorr_X2.fit(df_X2.to_numpy(), df_y['HR'].to_numpy())
print('best alpha = %2.4f' % reg_all_notcorr_X2.alpha_)

best alpha = 0.0682


In [48]:
# Tabulate the features with a non-zero Lasso coefficient, most negative first.
df_reg_all_notcorr_X2 = pd.DataFrame(
    reg_all_notcorr_X2.coef_, index=df_X2.columns, columns=['CoefNotCorrected']
)
df_reg_all_notcorr_X2 = (
    df_reg_all_notcorr_X2
    .loc[df_reg_all_notcorr_X2['CoefNotCorrected'] != 0]
    .sort_values('CoefNotCorrected')
)

# Lookup tables; the loop below reads them as {original feature name: short feature ID}.
cnv_quant_map = joblib.load('../merge_features/cnv_quant_id_mapping.all.joblib')
cnv_cat_map = joblib.load('../merge_features/cnv_cat_id_mapping.all.joblib')
var_map = joblib.load('../merge_features/variant_id_mapping.all.joblib')

features = []
rank = []
for col_name in df_reg_all_notcorr_X2.index:
    # Dummy columns are named "num__cat__<ID>", numeric ones "num__<ID>".
    pieces = col_name.split('__')
    feature_id = pieces[2] if col_name.startswith('num__cat__') else pieces[1]
    # One-hot columns end in "_<category>"; the maps hold the ID without it.
    base_id = '_'.join(feature_id.split('_')[:-1])
    if feature_id.startswith('CNV_quant'):
        hits = [name for name, mapped_id in cnv_quant_map.items() if mapped_id == feature_id]
        features.append(';'.join(hits))
    elif feature_id.startswith('CNV_cat'):
        hits = [name for name, mapped_id in cnv_cat_map.items() if mapped_id == base_id]
        features.append(';'.join(hits))
    elif feature_id.startswith('VAR'):
        hits = [name for name, mapped_id in var_map.items() if mapped_id == base_id]
        features.append(';'.join(hits))
    # 1-based row label of the feature's first match in df_stat.
    rank.append(df_stat.loc[df_stat['Feature'] == feature_id].index.tolist()[0] + 1)

df_reg_all_notcorr_X2['FeatureName'] = features
df_reg_all_notcorr_X2['CorrelationRank'] = rank

# Index by the bare feature ID, then expose it as a regular 'FeatureID' column.
df_reg_all_notcorr_X2 = (
    df_reg_all_notcorr_X2
    .rename(index=lambda col_name: col_name.split('__')[2] if col_name.startswith('num__cat__') else col_name.split('__')[1])
    .rename_axis('FeatureID')
    .loc[:, ['FeatureName', 'CorrelationRank', 'CoefNotCorrected']]
    .reset_index()
)

# Keep only coefficients whose magnitude is at least 1e-3 (row labels are left as they are).
df_reg_all_notcorr_X2 = df_reg_all_notcorr_X2.loc[df_reg_all_notcorr_X2['CoefNotCorrected'].abs() >= 1e-3]
#df_reg_all_notcorr_X2.to_csv("lasso_feature_selection_phylogeny_notcorrected_all_isolates.csv")
df_reg_all_notcorr_X2

Unnamed: 0,FeatureID,FeatureName,CorrelationRank,CoefNotCorrected
0,VAR_628_0,cpar_Chr_1:273032:SNP:C:T;cpar_Chr_1:273038:SN...,4,-0.109436
1,VAR_2995_0,cpar_Chr_3:864536:INDEL:C:CAAAATAAAATAAAATAAAAT,200,-0.031112
2,VAR_1542_0,cpar_Chr_1:2187739:INDEL:T:TC,22664,-0.008538
3,CNV_cat_5266_0,CNV_partial__CPAR2_403460,13664,-0.003032
4,VAR_2285_0,cpar_Chr_2:1994973:SNP:A:T,9,-0.00297
5,CNV_quant_1314,CNV_quant__CPAR2_202990,10246,-0.002895
6,VAR_4424_0,cpar_Chr_8:1464271:INDEL:A:AG,1957,-0.0011
12,VAR_1990_2,cpar_Chr_2:1291351:SNP:C:T,10,0.001153
13,CNV_quant_667,CNV_quant__CPAR2_104600,402,0.002004
14,VAR_3647_2,cpar_Chr_5:688402:SNP:C:A,274,0.007707


# All isolates (corrected for phylogeny)

In [35]:
# Variance-covariance matrix, with rows and columns put in the same order as
# the rows of the feature table.
df_phy = pd.read_csv('variance_covariance_matrix.csv', index_col=0)
df_phy = df_phy.loc[list(df_X.index), list(df_X.index)]

# Cholesky factor: df_phy = L @ L.T with L lower-triangular.
L = np.linalg.cholesky(df_phy.values)
LT = np.transpose(L)

# Matrix applied to X and y before the "corrected" Lasso fits.
# The previous expression inv(LT @ L) @ LT is algebraically equal to inv(L)
# because L is square and invertible, but forming LT @ L squares the condition
# number. Inverting L directly gives the same matrix with less round-off.
transformer = np.linalg.inv(L)

## X1

In [36]:
# Apply the phylogenetic transformation matrix to both the X1 design matrix and the HR column.
X1_phy_corrected = transformer @ df_X1.to_numpy()
y_phy_corrected = transformer @ df_y[['HR']].to_numpy()

# Cross-validated Lasso on the transformed data.
lasso_settings = dict(
    cv=5,
    random_state=42,
    n_alphas=1000,
    max_iter=100000,
    verbose=0,
    n_jobs=-1,
)
reg_all_corrected_X1 = LassoCV(**lasso_settings)
reg_all_corrected_X1.fit(X1_phy_corrected, y_phy_corrected.ravel())
print('best alpha = %2.4f' % reg_all_corrected_X1.alpha_)

best alpha = 15.3186


In [37]:
# Tabulate the features with a non-zero Lasso coefficient, most negative first.
df_reg_all_corrected_X1 = pd.DataFrame(
    reg_all_corrected_X1.coef_, index=df_X1.columns, columns=['CoefCorrected']
)
df_reg_all_corrected_X1 = (
    df_reg_all_corrected_X1
    .loc[df_reg_all_corrected_X1['CoefCorrected'] != 0]
    .sort_values('CoefCorrected')
)

# Lookup tables; the loop below reads them as {original feature name: short feature ID}.
cnv_quant_map = joblib.load('../merge_features/cnv_quant_id_mapping.all.joblib')
cnv_cat_map = joblib.load('../merge_features/cnv_cat_id_mapping.all.joblib')
var_map = joblib.load('../merge_features/variant_id_mapping.all.joblib')

features = []
rank = []
for col_name in df_reg_all_corrected_X1.index:
    # Strip the ColumnTransformer prefix ("cat__" / "num__").
    feature_id = col_name.split('__')[1]
    # One-hot columns end in "_<category>"; the maps hold the ID without it.
    base_id = '_'.join(feature_id.split('_')[:-1])
    if feature_id.startswith('CNV_quant'):
        hits = [name for name, mapped_id in cnv_quant_map.items() if mapped_id == feature_id]
        features.append(';'.join(hits))
    elif feature_id.startswith('CNV_cat'):
        hits = [name for name, mapped_id in cnv_cat_map.items() if mapped_id == base_id]
        features.append(';'.join(hits))
    elif feature_id.startswith('VAR'):
        hits = [name for name, mapped_id in var_map.items() if mapped_id == base_id]
        features.append(';'.join(hits))
    # 1-based row label of the feature's first match in df_stat.
    rank.append(df_stat.loc[df_stat['Feature'] == feature_id].index.tolist()[0] + 1)

df_reg_all_corrected_X1['FeatureName'] = features
df_reg_all_corrected_X1['CorrelationRank'] = rank

# Index by the bare feature ID, then expose it as a regular 'FeatureID' column.
df_reg_all_corrected_X1 = (
    df_reg_all_corrected_X1
    .rename(index=lambda col_name: col_name.split('__')[1])
    .rename_axis('FeatureID')
    .loc[:, ['FeatureName', 'CorrelationRank', 'CoefCorrected']]
    .reset_index()
)
#df_reg_all_corrected_X1.to_csv("lasso_feature_selection_phylogeny_corrected_all_isolates.csv")
df_reg_all_corrected_X1

Unnamed: 0,FeatureID,FeatureName,CorrelationRank,CoefCorrected
0,CNV_quant_1314,CNV_quant__CPAR2_202990,10246,-0.022052
1,CNV_quant_2730,CNV_quant__CPAR2_404570,9736,-0.009165
2,CNV_quant_2172,CNV_quant__CPAR2_301850,9147,-0.005919
3,CNV_quant_2011,CNV_quant__CPAR2_213580,9489,0.002096
4,CNV_quant_1982,CNV_quant__CPAR2_213220,9577,0.003792
5,CNV_quant_1948,CNV_quant__CPAR2_212650,9408,0.004102
6,CNV_quant_3252,CNV_quant__CPAR2_503480,12670,0.007317
7,CNV_quant_4155,CNV_quant__CPAR2_802880,11772,0.008034
8,CNV_quant_2768,CNV_quant__CPAR2_405080,12297,0.01123


## X2

In [42]:
# Apply the phylogenetic transformation matrix to both the X2 design matrix and the HR column.
X2_phy_corrected = transformer @ df_X2.to_numpy()
y_phy_corrected = transformer @ df_y[['HR']].to_numpy()

# Cross-validated Lasso on the transformed data.
lasso_settings = dict(
    cv=5,
    random_state=42,
    n_alphas=1000,
    max_iter=100000,
    verbose=0,
    n_jobs=-1,
)
reg_all_corrected_X2 = LassoCV(**lasso_settings)
reg_all_corrected_X2.fit(X2_phy_corrected, y_phy_corrected.ravel())
print('best alpha = %2.4f' % reg_all_corrected_X2.alpha_)

best alpha = 14.2968


In [45]:
# Tabulate the features with a non-zero Lasso coefficient, most negative first.
df_reg_all_corrected_X2 = pd.DataFrame(
    reg_all_corrected_X2.coef_, index=df_X2.columns, columns=['CoefCorrected']
)
df_reg_all_corrected_X2 = (
    df_reg_all_corrected_X2
    .loc[df_reg_all_corrected_X2['CoefCorrected'] != 0]
    .sort_values('CoefCorrected')
)

# Lookup tables; the loop below reads them as {original feature name: short feature ID}.
cnv_quant_map = joblib.load('../merge_features/cnv_quant_id_mapping.all.joblib')
cnv_cat_map = joblib.load('../merge_features/cnv_cat_id_mapping.all.joblib')
var_map = joblib.load('../merge_features/variant_id_mapping.all.joblib')

features = []
rank = []
for col_name in df_reg_all_corrected_X2.index:
    # Dummy columns are named "num__cat__<ID>", numeric ones "num__<ID>".
    pieces = col_name.split('__')
    feature_id = pieces[2] if col_name.startswith('num__cat__') else pieces[1]
    # One-hot columns end in "_<category>"; the maps hold the ID without it.
    base_id = '_'.join(feature_id.split('_')[:-1])
    if feature_id.startswith('CNV_quant'):
        hits = [name for name, mapped_id in cnv_quant_map.items() if mapped_id == feature_id]
        features.append(';'.join(hits))
    elif feature_id.startswith('CNV_cat'):
        hits = [name for name, mapped_id in cnv_cat_map.items() if mapped_id == base_id]
        features.append(';'.join(hits))
    elif feature_id.startswith('VAR'):
        hits = [name for name, mapped_id in var_map.items() if mapped_id == base_id]
        features.append(';'.join(hits))
    # 1-based row label of the feature's first match in df_stat.
    rank.append(df_stat.loc[df_stat['Feature'] == feature_id].index.tolist()[0] + 1)

df_reg_all_corrected_X2['FeatureName'] = features
df_reg_all_corrected_X2['CorrelationRank'] = rank

# Index by the bare feature ID, then expose it as a regular 'FeatureID' column.
df_reg_all_corrected_X2 = (
    df_reg_all_corrected_X2
    .rename(index=lambda col_name: col_name.split('__')[2] if col_name.startswith('num__cat__') else col_name.split('__')[1])
    .rename_axis('FeatureID')
    .loc[:, ['FeatureName', 'CorrelationRank', 'CoefCorrected']]
    .reset_index()
)

# Keep only coefficients whose magnitude is at least 1e-3 (row labels are left as they are).
df_reg_all_corrected_X2 = df_reg_all_corrected_X2.loc[df_reg_all_corrected_X2['CoefCorrected'].abs() >= 1e-3]
#df_reg_all_corrected_X2.to_csv("lasso_feature_selection_phylogeny_corrected_all_isolates.csv")
df_reg_all_corrected_X2

Unnamed: 0,FeatureID,FeatureName,CorrelationRank,CoefCorrected
0,VAR_3927_2,cpar_Chr_6:547:SNP:T:G,21527,-0.04077
1,CNV_cat_5176_0,CNV_partial__CPAR2_400660,12929,-0.031601
2,CNV_quant_1314,CNV_quant__CPAR2_202990,10246,-0.023359
3,VAR_405_0,cpar_Chr_1:1204296:SNP:C:T;cpar_Chr_1:142036:S...,20354,-0.015267
4,CNV_cat_5266_0,CNV_partial__CPAR2_403460,13664,-0.007866
5,CNV_cat_1614_2,CNV_event__CPAR2_203220,26933,-0.002367
6,VAR_1260_0,cpar_Chr_6:598531:SNP:G:A;cpar_Chr_6:736022:SN...,21001,-0.001131
26,CNV_cat_3802_0,CNV_event__CPAR2_702970,3334,0.001627
27,CNV_cat_2112_0,CNV_event__CPAR2_212650,26282,0.001796
28,VAR_3156_0,cpar_Chr_4:134277:SNP:G:C,22766,0.002484


# Feature overlap

In [49]:
# Feature names selected by both the corrected and the uncorrected X2 models.
set(df_reg_all_corrected_X2['FeatureName']) & set(df_reg_all_notcorr_X2['FeatureName'])

{'CNV_partial__CPAR2_403460',
 'CNV_quant__CPAR2_202990',
 'cpar_Chr_2:1442085:SNP:C:T'}

## Generate table for YY: CNV values of the Lasso-selected features (phylogeny-corrected model)

In [70]:
# NOTE(review): this cell used to read `df_reg_all_corrected`, a name that is
# not defined anywhere in the visible notebook, so it fails on a fresh kernel.
# The X1 phylogeny-corrected table is assumed to be the intended one: the saved
# merged-table output shows X1 coefficients. TODO confirm.
# Feature names have the form "<type>__<ORF>"; names without "__" (e.g. variant
# names) carry no ORF and are skipped instead of raising IndexError.
selected_orfs = [
    name.split('__')[1]
    for name in df_reg_all_corrected_X1.FeatureName
    if '__' in name
]

# Long-format CNV table, restricted to the selected ORFs and excluding isolate GL37.
df_cnv_quant = pd.read_csv("../compute_cnv/cnv_features_stacked_04192023_peakdist0.9_mincontri0.2.csv")
df_cnv_quant = df_cnv_quant[df_cnv_quant.Orf.isin(selected_orfs)]
df_cnv_quant = df_cnv_quant[df_cnv_quant.Isolate != 'GL37']

# Reshape to one row per isolate and one column per ORF, cast to integers, and export.
df_cnv_quant = pd.pivot_table(df_cnv_quant, index='Isolate', columns='Orf', values='CNV_quant').astype(int)
df_cnv_quant.to_csv("for_yy_cnv_lasso_selected_phylo_corrected_features_all_isolates.csv")

# Combine the uncorrected and phylogeny-corrected coefficient tables

In [47]:
# NOTE(review): this cell used to read `df_reg_all_notcorr` and
# `df_reg_all_corrected`, names that are not defined anywhere in the visible
# notebook, so it fails on a fresh kernel. The X1 tables are used here because
# the saved output of this cell shows the X1 coefficients. TODO confirm.
# The outer merge keeps features selected by either model; a feature selected
# by only one of them gets NaN in the other model's coefficient column.
df_combined = pd.merge(
    df_reg_all_notcorr_X1,
    df_reg_all_corrected_X1,
    on=['FeatureID', 'FeatureName', 'CorrelationRank'],
    how='outer',
)
df_combined.to_csv("lasso_feature_selected_merged_all_isolates.csv")
df_combined

Unnamed: 0,FeatureID,FeatureName,CorrelationRank,CoefNotCorrected,CoefCorrected
0,CNV_quant_4327,CNV_quant__CPAR2_805410,2,-0.068939,
1,CNV_quant_571,CNV_quant__CPAR2_103140,1,-0.048188,
2,CNV_quant_3355,CNV_quant__CPAR2_600440,1713,-0.037704,
3,CNV_quant_1314,CNV_quant__CPAR2_202990,10246,-0.034309,-0.022052
4,CNV_quant_3902,CNV_quant__CPAR2_703370,11,-0.015994,
5,CNV_quant_2759,CNV_quant__CPAR2_404940,12248,-0.012169,
6,CNV_quant_2101,CNV_quant__CPAR2_300760,1039,-0.009727,
7,CNV_quant_3868,CNV_quant__CPAR2_702960,2167,-0.005649,
8,CNV_quant_4243,CNV_quant__CPAR2_804220,686,-0.003271,
9,CNV_quant_1641,CNV_quant__CPAR2_207810,10600,-0.002684,
