# Create feature tables for regression
Use single mutant relative fitness and genetic interaction neutrality 
functions (Query_SMFni 2008 PNAS) to predict the relative fitness of the double 
mutant.

- 11 features:

  Wa

  Wb

  additive (Wa + Wb)

  difference (Wa - Wb)

  multiplicative (Wa * Wb)

  minimum (min(Wa, Wb))

  Query_SMFximum (Query_SMFx(Wa, Wb))
  
  log_Query_SMFni (log2[((2^Wa) − 1)*((2^Wb) − 1) + 1])

  log_additive (log(Wa * Wb))

  log_difference (log(Wa / Wb))

  mean ((Wa + Wb)/2)

Labels:
- Wab as colony size

Relative fitness (W): total seed count (corrected for batch effects)

Wa: relative fitness of gene mutant A

Wb: relative fitness of gene mutant B

Wab: relative fitness of the double mutant

In [1]:
import pandas as pd

chunks = []
path = '~/Shiu_Lab/Co-function/data/Costanzo_2016/S1/SGA_all.csv'
for chunk in pd.read_csv(path, chunksize=10 ** 6):
    chunks.append(chunk)
data_av = pd.concat(chunks, axis=0)
data

Unnamed: 0,Query Strain ID,Query allele name,Array Strain ID,Array allele name,Arraytype/Temp,Genetic interaction score (ε),P-value,Query single mutant fitness (SMF),Array SMF,Double mutant fitness,Double mutant fitness standard deviation,Type
0,YAL001C_tsq508,tfc3-g349e,YBL023C_tsa111,mcm2-1,TSA30,-0.0348,0.005042,0.8285,0.9254,0.7319,0.0102,ExE
1,YAL001C_tsq508,tfc3-g349e,YBL026W_tsa1065,lsm2-5001,TSA30,-0.3529,0.000004,0.8285,0.9408,0.4266,0.0790,ExE
2,YAL001C_tsq508,tfc3-g349e,YBL034C_tsa274,stu1-5,TSA30,0.0126,0.462500,0.8285,0.8925,0.7520,0.1338,ExE
3,YAL001C_tsq508,tfc3-g349e,YBL034C_tsa454,stu1-8,TSA30,0.0043,0.499800,0.8285,0.7988,0.6661,0.0831,ExE
4,YAL001C_tsq508,tfc3-g349e,YBL034C_tsa643,stu1-6,TSA30,-0.1601,0.051400,0.8285,0.7683,0.4764,0.1395,ExE
...,...,...,...,...,...,...,...,...,...,...,...,...
19313649,YPR201W_sn2211,arr3,YPL264C_dma5012,ypl264c,DMA30,0.0453,0.178600,0.9995,0.9768,1.0217,0.0405,NxN
19313650,YPR201W_sn2211,arr3,YPL265W_dma5011,dip5,DMA30,-0.0020,0.463700,0.9995,1.0230,1.0205,0.0143,NxN
19313651,YPR201W_sn2211,arr3,YPL267W_dma5010,acm1,DMA30,-0.0539,0.107200,0.9995,0.9990,0.9446,0.0331,NxN
19313652,YPR201W_sn2211,arr3,YPL272C_dma5007,pbi1,DMA30,-0.0137,0.385300,0.9995,1.0155,1.0013,0.0385,NxN


In [2]:
import swifter

# get gene systematic name and keep relevant columns
data.insert(0, 'Query', data['Query Strain ID'].swifter.apply(lambda x: x.split('_')[0]))
data.insert(1, 'Array', data['Array Strain ID'].swifter.apply(lambda x: x.split('_')[0]))
data = data[['Query', 'Array', 'Double mutant fitness', 
      'Query single mutant fitness (SMF)', 'Array SMF']]
data

  from .autonotebook import tqdm as notebook_tqdm
Pandas Apply: 100%|██████████| 19313654/19313654 [00:11<00:00, 1698079.08it/s]
Pandas Apply: 100%|██████████| 19313654/19313654 [00:11<00:00, 1625155.03it/s]


Unnamed: 0,Query,Array,Double mutant fitness,Query single mutant fitness (SMF),Array SMF
0,YAL001C,YBL023C,0.7319,0.8285,0.9254
1,YAL001C,YBL026W,0.4266,0.8285,0.9408
2,YAL001C,YBL034C,0.7520,0.8285,0.8925
3,YAL001C,YBL034C,0.6661,0.8285,0.7988
4,YAL001C,YBL034C,0.4764,0.8285,0.7683
...,...,...,...,...,...
19313649,YPR201W,YPL264C,1.0217,0.9995,0.9768
19313650,YPR201W,YPL265W,1.0205,0.9995,1.0230
19313651,YPR201W,YPL267W,0.9446,0.9995,0.9990
19313652,YPR201W,YPL272C,1.0013,0.9995,1.0155


## Calculate the genetic interaction neutrality functions (expected double mutant fitness)

In [3]:
import numpy as np

# reorder the values in Query_SMF and Array_SMF so that Query_SMF is always greater than Array_SMF
data['Query_new'] = data['Query'].where(data['Query single mutant fitness (SMF)'] > data['Array SMF'], data['Array'])
data['Array_new'] = data['Array'].where(data['Query single mutant fitness (SMF)'] > data['Array SMF'], data['Query'])
data['Query_SMF_new'] = data['Query single mutant fitness (SMF)'].where(data['Query single mutant fitness (SMF)'] > data['Array SMF'], data['Array SMF'])
data['Array_SMF_new'] = data['Array SMF'].where(data['Query single mutant fitness (SMF)'] > data['Array SMF'], data['Query single mutant fitness (SMF)'])

# calculate means for duplicate gene pairs
data.drop(columns=['Query', 'Array'], inplace=True)
data_av = data.groupby(['Query_new', 'Array_new']).mean()
print(data.shape)

(19313654, 7)


In [4]:
data_av.shape

(13178146, 5)


In [5]:
# re-check for duplicate gene pairs
data_av.reset_index().drop_duplicates(subset=['Query_new', 'Array_new']).shape

(13178146, 7)

In [7]:
# calculate the expected double mutant fitness using the neutrality functions
data_av['min'] = data_av[['Query_SMF_new', 'Array_SMF_new']].min(axis=1)
data_av['max'] = data_av[['Query_SMF_new', 'Array_SMF_new']].max(axis=1)
data_av['mean'] = (data_av['Query_SMF_new'] + data_av['Array_SMF_new']) / 2
data_av['multiplicative'] = data_av['Query_SMF_new'] * data_av['Array_SMF_new']
data_av['additive'] = data_av['Query_SMF_new'] + data_av['Array_SMF_new'] - 1
data_av['difference'] = data_av['Query_SMF_new'] - data_av['Array_SMF_new']
data_av['log_mani'] = ((2**data_av['Query_SMF_new']) - 1) * ((2**data_av['Array_SMF_new']) - 1) + 1
data_av['log_additive'] = np.log(data_av['Query_SMF_new']) + np.log(data_av['Array_SMF_new'])
data_av['log_difference'] = np.log(data_av['Query_SMF_new']) - np.log(data_av['Array_SMF_new'])

# create sample IDs
data_av.reset_index(inplace=True)
data_av.insert(0, 'ID', data_av['Query_new'] + '_' + data_av['Array_new'])
data_av.set_index('ID', inplace=True)

# clean up the feature table
data_av.drop(columns=['Query_new', 'Array_new', 'Query single mutant fitness (SMF)',
    'Array SMF'], inplace=True)
data_av.rename(columns={'Double mutant fitness': 'DMF'}, inplace=True)
data_av.replace([np.inf, -np.inf], np.nan, inplace=True) # replace inf with NaN
data_av.dropna(axis=0, how='any', inplace=True) # remove rows with NaN
print(data_av.shape)

# save the feature table
data_av.to_csv('../yeast_data/DMF_feature_table.txt', sep='\t')
data_av.corr(method='pearson').to_csv('../yeast_data/DMF_feature_correlation.txt', sep='\t')
data_av

(11777196, 12)


Unnamed: 0_level_0,DMF,Query_SMF_new,Array_SMF_new,min,max,mean,multiplicative,additive,difference,log_mani,log_additive,log_difference
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
YAL001C_YBL006C,0.648600,0.86580,0.793000,0.793000,0.86580,0.829400,0.686579,0.658800,0.072800,1.602514,-0.376033,0.087831
YAL001C_YBL007C,0.742600,0.86580,0.832200,0.832200,0.86580,0.849000,0.720519,0.698000,0.033600,1.641760,-0.327784,0.039581
YAL001C_YBL027W,0.646800,0.82850,0.788000,0.788000,0.82850,0.808250,0.652858,0.616500,0.040500,1.563785,-0.426396,0.050119
YAL001C_YBL034C,0.522467,0.82850,0.745367,0.745367,0.82850,0.786933,0.617536,0.573867,0.083133,1.524777,-0.482017,0.105741
YAL001C_YBL035C,0.558600,0.84715,0.704050,0.704050,0.84715,0.775600,0.596436,0.551200,0.143100,1.502593,-0.516783,0.185028
...,...,...,...,...,...,...,...,...,...,...,...,...
YPR202W_YPR019W,0.967900,1.00950,0.925500,0.925500,1.00950,0.967500,0.934292,0.935000,0.084000,1.911226,-0.067966,0.086876
YPR202W_YPR025C,0.964800,1.00950,0.954600,0.954600,1.00950,0.982050,0.963669,0.964100,0.054900,1.950437,-0.037008,0.055918
YPR202W_YPR033C,0.826000,1.00950,0.887500,0.887500,1.00950,0.948500,0.895931,0.897000,0.122000,1.861198,-0.109892,0.128802
YPR202W_YPR034W,0.468000,1.00950,0.611000,0.611000,1.00950,0.810250,0.616804,0.620500,0.398500,1.534285,-0.483203,0.502113


## Stratified K-fold train-test split

In [8]:
from sklearn.model_selection import StratifiedKFold
# sklearn v1.2.2
np.random.seed(20240611)

# Create bins
bins = np.arange(data_av.DMF.min()-.1, data_av.DMF.max()+.1, 0.1).tolist()
data_av['label_bin'] = pd.cut(data_av.DMF, bins=bins, labels=[i for i in range(len(bins)-1)])

# Apply stratified k-fold train-test split
X = data_av.drop(columns=['DMF', 'label_bin'], axis=1)
y = data_av['label_bin']
skf = StratifiedKFold(n_splits=11, shuffle=True, random_state=20240611)

cv_splits = pd.DataFrame(index=data_av.index, columns=[f'cv_{i}' for i in range(1, 11)])
i = 0
for train_idx, test_idx in skf.split(X, y):
    if i == 0:
        cv_splits = cv_splits.iloc[train_idx, :] # training instances only

        # Write test set to file
        with open(f'../yeast_data/DMF_test_instances.txt', 'w') as f:
            for ID in test_idx:
                f.write(f'{data_av.iloc[ID,:].name}\n')
    else:
        break
    i += 1




## Run XGBoost Regression model to predict DMF
### Use shap conda environment and submit job to cluster

! sbatch