# Create feature tables for regression
Use corrected single mutant relative fitness and genetic interaction neutrality functions (Mani 2008 PNAS) to predict the relative fitness of the double mutant

- 9 features:

  Wa

  Wb

  additive (Wa + Wb - 1)

  difference (Wa - Wb)

  multiplicative (Wa * Wb)
  
  log_mani (log2[((2^Wa) − 1)*((2^Wb) − 1) + 1])

  log_additive (log(Wa * Wb))

  log_difference (log(Wa / Wb))

  mean ((Wa + Wb)/2)

Labels:
- Wab as total seed count (TSC)

Relative fitness (W): total seed count (corrected for batch effects)

Wa: relative fitness of gene mutant A

Wb: relative fitness of gene mutant B

Wab: relative fitness of the double mutant

In [1]:
# Load the corrected double mutant fitness measurements (tab-delimited file)
import pandas as pd
fitness_path = '../ara_data/double_mutant_fitness_data_05312024_all_corrected_linear_b.txt'
data = pd.read_csv(fitness_path, sep='\t')
data

  data = pd.read_csv('../ara_data/double_mutant_fitness_data_05312024_all_corrected_linear_b.txt', sep='\t')


Unnamed: 0,Set,Flat,Column,Row,Number,Genotype,Subline,MA,MB,GN,...,TSC_emmean,TSC_SE,TSC_df,TSC_lower.CL,TSC_upper.CL,SH_emmean,SH_SE,SH_df,SH_lower.CL,SH_upper.CL
0,1,1,4,1,4,MB,001-MB-2,WT,MUT,5.0,...,38.657010,3.526650,15.317155,31.153665,46.160354,-0.045469,1.924941,17.060851,-4.105636,4.014698
1,1,1,6,1,6,DM,001-DM-2,MUT,MUT,5.0,...,40.079134,3.531284,15.320574,32.566074,47.592193,2.960672,1.871493,15.679768,-1.013312,6.934657
2,1,1,8,1,8,MA,001-MA-2,MUT,WT,5.0,...,51.311661,3.673220,16.969088,43.560769,59.062553,4.511428,1.955215,18.003516,0.403730,8.619125
3,1,1,10,1,10,WT,001-WT-2,WT,WT,9.0,...,54.898058,3.625398,15.993182,47.212292,62.583824,,,,,
4,1,1,6,3,26,MB,001-MB-2,WT,MUT,5.0,...,38.657010,3.526650,15.317155,31.153665,46.160354,-0.045469,1.924941,17.060851,-4.105636,4.014698
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26581,845E,1,5,20,195,MA,845-MA-2,MUT,WT,5.0,...,27.042495,2.645280,21.444050,21.548262,32.536729,-0.008226,0.455733,34.191697,-0.934195,0.917743
26582,845E,1,6,20,196,MA,845-MA-4,MUT,WT,3.0,...,27.042495,2.645280,21.444050,21.548262,32.536729,-0.008226,0.455733,34.191697,-0.934195,0.917743
26583,845E,1,8,20,198,MB,845-MB-4,WT,MUT,3.0,...,26.715770,2.728223,23.742441,21.081760,32.349781,0.839738,0.482346,38.649974,-0.136182,1.815658
26584,845E,1,9,20,199,MB,845-MB-1,WT,MUT,3.0,...,26.715770,2.728223,23.742441,21.081760,32.349781,0.839738,0.482346,38.649974,-0.136182,1.815658


## Calculate the relative fitness of the single and double mutants using the corrected trait values

In [2]:
# Keep the grouping keys plus every corrected trait column (suffix "_emmean"),
# then average the rows within each Set / Flat / Genotype combination
group_keys = ['Set', 'Flat', 'Genotype']
is_group_key = data.columns.isin(group_keys)
is_emmean = data.columns.str.endswith('_emmean')
data_avg = data.loc[:, is_group_key | is_emmean].groupby(group_keys).mean()
data_avg

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,GN_emmean,PG_emmean,DTB_emmean,LN_emmean,DTF_emmean,SN_emmean,WO_emmean,FN_emmean,SPF_emmean,TSC_emmean,SH_emmean
Set,Flat,Genotype,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,1,DM,4.782268,95.702242,31.808797,8.853605,32.982221,2.917333,1.268865,1.347726e-01,13.135060,40.079134,2.960672e+00
1,1,MA,5.087400,101.765558,30.162385,8.239128,32.863066,2.572766,0.719832,4.142706e-02,18.094280,51.311661,4.511428e+00
1,1,MB,5.122931,101.819436,30.772401,8.631862,33.338776,2.944599,1.047182,4.605556e-02,13.149070,38.657010,-4.546878e-02
1,1,WT,5.197245,103.915498,28.391062,8.055482,31.392208,2.744861,0.581349,6.557994e-02,19.160268,54.898058,2.728458e+00
1,2,DM,4.782268,95.702242,31.808797,8.853605,32.982221,2.917333,1.268865,1.347726e-01,13.135060,40.079134,2.960672e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
845,1,MB,5.133986,102.679714,26.025641,11.203412,,1.958552,14.505501,3.773656e+00,9.721296,31.798327,3.070175e+00
845,1,WT,5.343502,106.870049,26.812500,12.301997,,1.891348,14.709453,4.474796e+00,7.656962,30.338463,-1.642242e-16
845E,1,MA,2.795727,55.914537,41.883470,7.972750,45.509007,1.830815,0.000000,2.104208e-18,14.159204,27.042495,-8.226227e-03
845E,1,MB,2.791089,55.821778,41.304371,7.747274,44.934294,1.748365,0.000000,3.333333e-02,15.063889,26.715770,8.397377e-01


In [3]:
# Calculate the relative fitness: every genotype's trait value divided by the
# wild-type (WT) value measured in the same Set and Flat
W = data_avg.index.to_frame()
for trait in data_avg.columns:
    # long -> wide: one row per (Set, Flat), one column per genotype
    wide = data_avg[trait].reset_index().pivot(
        index=['Set', 'Flat'], columns='Genotype', values=trait)
    # divide each row by its own WT value
    relative = wide.div(wide['WT'], axis=0)
    # wide -> long again, indexed the same way as data_avg
    relative_long = relative.reset_index().melt(
        id_vars=['Set', 'Flat'], var_name='Genotype', value_name=f'W_{trait}')
    relative_long = relative_long.set_index(['Set', 'Flat', 'Genotype'])
    W = pd.concat([W, relative_long], ignore_index=False, axis=1)
W

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Set,Flat,Genotype,W_GN_emmean,W_PG_emmean,W_DTB_emmean,W_LN_emmean,W_DTF_emmean,W_SN_emmean,W_WO_emmean,W_FN_emmean,W_SPF_emmean,W_TSC_emmean,W_SH_emmean
Set,Flat,Genotype,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,1,DM,1,1.0,DM,0.920155,0.920962,1.120381,1.099078,1.050650,1.062835,2.182621,2.055088,0.685536,0.730065,1.085108
1,1,MA,1,1.0,MA,0.978865,0.979311,1.062390,1.022798,1.046854,0.937303,1.238209,0.631703,0.944365,0.934672,1.653472
1,1,MB,1,1.0,MB,0.985701,0.979829,1.083876,1.071551,1.062008,1.072768,1.801296,0.702281,0.686268,0.704160,-0.016665
1,1,WT,1,1.0,WT,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
1,2,DM,1,2.0,DM,0.920155,0.920962,1.120381,1.099078,1.050650,1.062835,2.182621,2.055088,0.685536,0.730065,1.085108
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
845E,1,MA,845E,1.0,MA,1.016841,1.016841,0.991225,0.965635,0.989405,1.017233,0.000000,inf,0.947731,0.980661,-12.849417
845E,1,MB,845E,1.0,MB,1.015154,1.015154,0.977520,0.938326,0.976911,0.971422,0.000000,inf,1.008285,0.968813,1311.675510
845E,1,WT,845E,1.0,WT,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,,1.000000,1.000000,1.000000
845,1,DM,,,,,,,,,,,,,,


The NaNs appear because some sets do not have double mutant trait info. These sets will be removed.

Original number of sets: 137

After dropping, now there are: 135 (set 845 and 845E were dropped)

In [4]:
# Report every set that has no double mutant (DM) rows once all-NaN rows are ignored
for name, group in W.groupby(level='Set'):
    measured = group.dropna(axis=0, how='all')
    geno_levels = measured.index.get_level_values('Genotype').unique()
    if 'DM' in geno_levels.values:
        continue  # this set has double mutant data, nothing to report
    print(geno_levels)
    print(f'Drop set {name}')

Index(['MA', 'MB', 'WT'], dtype='object', name='Genotype')
Drop set 845
Index(['MA', 'MB', 'WT'], dtype='object', name='Genotype')
Drop set 845E


In [5]:
# Drop sets that do not have double mutant information.
# The row filter and the column drop are chained so that W is rebound to a new
# DataFrame; calling drop(..., inplace=True) on the filtered slice triggered a
# SettingWithCopyWarning.
sets_without_dm = ['845', '845E']
keep_rows = ~W.index.get_level_values('Set').isin(sets_without_dm)
# Set / Flat / Genotype are also index levels, so the duplicate columns are removed
W = W.loc[keep_rows, :].drop(columns=['Set', 'Flat', 'Genotype'])
W

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  W.drop(columns=['Set', 'Flat', 'Genotype'], inplace=True)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,W_GN_emmean,W_PG_emmean,W_DTB_emmean,W_LN_emmean,W_DTF_emmean,W_SN_emmean,W_WO_emmean,W_FN_emmean,W_SPF_emmean,W_TSC_emmean,W_SH_emmean
Set,Flat,Genotype,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,1,DM,0.920155,0.920962,1.120381,1.099078,1.050650,1.062835,2.182621,2.055088,0.685536,0.730065,1.085108
1,1,MA,0.978865,0.979311,1.062390,1.022798,1.046854,0.937303,1.238209,0.631703,0.944365,0.934672,1.653472
1,1,MB,0.985701,0.979829,1.083876,1.071551,1.062008,1.072768,1.801296,0.702281,0.686268,0.704160,-0.016665
1,1,WT,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
1,2,DM,0.920155,0.920962,1.120381,1.099078,1.050650,1.062835,2.182621,2.055088,0.685536,0.730065,1.085108
...,...,...,...,...,...,...,...,...,...,...,...,...,...
815,1,WT,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
823,1,DM,1.250274,1.250274,1.055933,0.942154,1.015529,0.940720,-15.854496,1.649833,0.905252,0.861198,0.798124
823,1,MA,1.164948,1.164948,1.018708,0.980750,0.995928,0.963460,-3.488429,1.145781,0.988766,0.933525,0.656723
823,1,MB,1.162424,1.162424,1.031139,0.949940,0.994315,1.025428,-28.600834,1.200050,0.924529,0.927485,1.034324


## Feature tables for total seed count (TSC) and the other traits
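1. Ensure the first single mutant column always holds the larger of the two single mutant values
2. Calculate the 7 neutrality functions (mean, multiplicative, additive, difference, log_mani, log_additive, log_difference) using the single mutant fitness (SMF) data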

In [6]:
import numpy as np

# Feature columns written to every feature table (the label column, named
# after the trait, is written in front of them)
feature_columns = ['MA_new', 'MB_new', 'mean', 'multiplicative',
                   'additive', 'difference', 'log_mani', 'log_additive', 'log_difference']

for trait in W.columns.values:
    print(trait)
    trait_df = W[trait].unstack()  # one column per genotype
    trait_df.drop(columns='WT', inplace=True) # drop wild type
    # drop flat, since values don't change for sets with flat included as the random effect
    trait_df.index = trait_df.index.get_level_values('Set').astype(str).str.strip()
    trait_df = trait_df.groupby('Set').mean()
    trait_df.dropna(axis=0, how='all', inplace=True) # drop rows with all NaN values (this particular trait was not collected for these sets)

    # reorder the values in MA and MB so that MA_new always holds the larger value
    ma_is_larger = trait_df['MA'] > trait_df['MB']
    trait_df['MA_new'] = trait_df['MA'].where(ma_is_larger, trait_df['MB'])
    trait_df['MB_new'] = trait_df['MB'].where(ma_is_larger, trait_df['MA'])
    wa = trait_df['MA_new']
    wb = trait_df['MB_new']

    # calculate the expected double mutant fitness using the neutrality functions
    trait_df['mean'] = (wa + wb) / 2
    trait_df['multiplicative'] = wa * wb
    trait_df['additive'] = wa + wb - 1
    trait_df['difference'] = wa - wb
    # Logarithms of zero or negative relative fitness are undefined; they yield
    # -inf/NaN, which are removed below, so the numpy warnings are silenced here.
    with np.errstate(divide='ignore', invalid='ignore'):
        # log2[((2^Wa) - 1) * ((2^Wb) - 1) + 1]; the log2 was previously missing
        trait_df['log_mani'] = np.log2(((2**wa) - 1) * ((2**wb) - 1) + 1)
        trait_df['log_additive'] = np.log(wa) + np.log(wb)
        trait_df['log_difference'] = np.log(wa) - np.log(wb)

    # create sample IDs ("Set_<set name>") and use them as the index
    trait_df.index = ('Set_' + trait_df.index.astype(str)).rename('ID')

    # save the feature table
    trait_df.rename(columns={'DM': trait}, inplace=True)
    # replace inf with NaN; the result must be assigned back because replace()
    # returns a new DataFrame (previously it was discarded, so inf values stayed)
    trait_df = trait_df.replace([np.inf, -np.inf], np.nan)
    trait_df.dropna(axis=0, how='any', inplace=True) # drop rows with any NaN value
    trait_df.loc[:, [trait] + feature_columns].\
        to_csv(f'../ara_data/{trait}_feature_table.txt', sep='\t')
    trait_df.corr(method='pearson').to_csv(f'../ara_data/{trait}_feature_correlation.txt', sep='\t')
    print(trait_df.shape)

W_GN_emmean
(133, 12)
W_PG_emmean
(133, 12)
W_DTB_emmean
(132, 12)
W_LN_emmean
(133, 12)
W_DTF_emmean
(133, 12)
W_SN_emmean
(133, 12)
W_WO_emmean
(128, 12)
W_FN_emmean
(133, 12)
W_SPF_emmean
(133, 12)
W_TSC_emmean
(133, 12)
W_SH_emmean
(125, 12)


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


## Stratified K-fold train-test split

In [7]:
from sklearn.model_selection import StratifiedKFold
# sklearn v1.2.2
np.random.seed(20240606)

for trait in W.columns.values:
    trait_df = pd.read_csv(f'../ara_data/{trait}_feature_table.txt', sep='\t', index_col=0)
    print(trait, 'trait_df', trait_df.shape)

    # Bin the label into four intervals so the split can be stratified on it
    bin_edges = [-np.inf, 0.25, 0.5, 0.75, np.inf]
    trait_df['label_bin'] = pd.cut(trait_df[trait], bins=bin_edges, labels=[0, 1, 3, 4])

    # Features are every column except the label and its bin
    X = trait_df.drop(columns=[trait, 'label_bin'], axis=1)
    y = trait_df['label_bin']
    skf = StratifiedKFold(n_splits=6, random_state=20240606, shuffle=True)

    # Only the first of the six folds is used: its test portion becomes the
    # held-out test set and the remaining folds are ignored
    cv_splits = pd.DataFrame(index=trait_df.index, columns=[f'cv_{i}' for i in range(1, 11)])
    train_idx, test_idx = next(skf.split(X, y))
    cv_splits = cv_splits.iloc[train_idx, :] # training instances only

    # Write the IDs of the test instances to file, one per line
    with open(f'../ara_data/{trait}_test_instances.txt', 'w') as f:
        for sample_id in trait_df.index[test_idx]:
            f.write("%s\n" % sample_id)

W_GN_emmean trait_df (133, 10)
W_PG_emmean trait_df (133, 10)
W_DTB_emmean trait_df (132, 10)
W_LN_emmean trait_df (133, 10)
W_DTF_emmean trait_df (133, 10)
W_SN_emmean trait_df (133, 10)
W_WO_emmean trait_df (128, 10)
W_FN_emmean trait_df (133, 10)
W_SPF_emmean trait_df (133, 10)
W_TSC_emmean trait_df (133, 10)
W_SH_emmean trait_df (125, 10)




## Run XGBoost Regression model on TSC (total seed count)

### Use shap conda environment and run on the command line
python 1b_xgb_regression.py \
    -X ../ara_data/W_TSC_emmean_feature_table.txt \
    -y_name W_TSC_emmean \
    -test ../ara_data/W_TSC_emmean_test_instances.txt \
    -save ../output/1_xgb_regression_ara/ \
    -prefix W_TSC_emmean \
    -tag use_SMF_and_Neutrality_Funcs \
    -fold 5 -n 10 -plot t

Note: the model tends to predict the same value for many samples.
From searching the topic, I found that some recommend increasing the min_child_weight
parameter, setting the objective and the eval_metric (as rmse) parameters, and/or
increasing the number of training samples.

https://stackoverflow.com/questions/33470477/xgboost-predict-method-returns-the-same-predicted-value-for-all-rows

### Gradient Boosting Model (Use ml-pipe1.5 conda Environment)

python 1b_gb_regression.py \
    -X ../ara_data/W_TSC_emmean_feature_table.txt \
    -y_name W_TSC_emmean \
    -test ../ara_data/W_TSC_emmean_test_instances.txt \
    -save ../output/1_gb_regression_ara/ \
    -prefix W_TSC_emmean_gb \
    -tag use_SMF_and_Neutrality_Funcs \
    -fold 5 -n 10 -plot t