# Create feature tables for regression
Use single mutant relative fitness and genetic interaction neutrality 
functions (Mani et al. 2008, PNAS) to predict the relative fitness of the double 
mutant.

- 9 features:

  Wa

  Wb

  additive (Wa + Wb - 1)

  difference (Wa - Wb)

  multiplicative (Wa * Wb)
  
  log2_mani (log2[((2^Wa) − 1)*((2^Wb) − 1) + 1])

  pslog10_additive (pseudolog10(Wa * Wb))

  pslog10_difference (pseudolog10(Wa / Wb))

  mean ((Wa + Wb)/2)

Labels:
- Wab as colony size

Relative fitness (W): total seed count (corrected for batch effects)

Wa: relative fitness of gene mutant A

Wb: relative fitness of gene mutant B

Wab: relative fitness of the double mutant

In [1]:
import pandas as pd

# Stream the raw SGA table in 1e6-row pieces, then stitch the pieces together
path = '../yeast_data/SGA_all.csv'
chunks = [piece for piece in pd.read_csv(path, chunksize=10 ** 6)]
data = pd.concat(chunks, axis=0)
data

Unnamed: 0,Query Strain ID,Query allele name,Array Strain ID,Array allele name,Arraytype/Temp,Genetic interaction score (ε),P-value,Query single mutant fitness (SMF),Array SMF,Double mutant fitness,Double mutant fitness standard deviation,Type
0,YAL001C_tsq508,tfc3-g349e,YBL023C_tsa111,mcm2-1,TSA30,-0.0348,0.005042,0.8285,0.9254,0.7319,0.0102,ExE
1,YAL001C_tsq508,tfc3-g349e,YBL026W_tsa1065,lsm2-5001,TSA30,-0.3529,0.000004,0.8285,0.9408,0.4266,0.0790,ExE
2,YAL001C_tsq508,tfc3-g349e,YBL034C_tsa274,stu1-5,TSA30,0.0126,0.462500,0.8285,0.8925,0.7520,0.1338,ExE
3,YAL001C_tsq508,tfc3-g349e,YBL034C_tsa454,stu1-8,TSA30,0.0043,0.499800,0.8285,0.7988,0.6661,0.0831,ExE
4,YAL001C_tsq508,tfc3-g349e,YBL034C_tsa643,stu1-6,TSA30,-0.1601,0.051400,0.8285,0.7683,0.4764,0.1395,ExE
...,...,...,...,...,...,...,...,...,...,...,...,...
19313649,YPR201W_sn2211,arr3,YPL264C_dma5012,ypl264c,DMA30,0.0453,0.178600,0.9995,0.9768,1.0217,0.0405,NxN
19313650,YPR201W_sn2211,arr3,YPL265W_dma5011,dip5,DMA30,-0.0020,0.463700,0.9995,1.0230,1.0205,0.0143,NxN
19313651,YPR201W_sn2211,arr3,YPL267W_dma5010,acm1,DMA30,-0.0539,0.107200,0.9995,0.9990,0.9446,0.0331,NxN
19313652,YPR201W_sn2211,arr3,YPL272C_dma5007,pbi1,DMA30,-0.0137,0.385300,0.9995,1.0155,1.0013,0.0385,NxN


In [2]:
import swifter

# Strain IDs have the form '<systematic gene name>_<strain tag>'; keep the part
# before the first underscore as the gene name
def _gene_from_strain_id(strain_id):
    """Return the text of a strain ID that precedes its first underscore."""
    return strain_id.split('_')[0]

query_genes = data['Query Strain ID'].swifter.apply(_gene_from_strain_id)
array_genes = data['Array Strain ID'].swifter.apply(_gene_from_strain_id)
data.insert(0, 'Query', query_genes)
data.insert(1, 'Array', array_genes)

# keep only the gene pair, the label, and the two single mutant fitness columns
kept_columns = ['Query', 'Array', 'Double mutant fitness',
                'Query single mutant fitness (SMF)', 'Array SMF']
data = data[kept_columns]
data

  from .autonotebook import tqdm as notebook_tqdm
Pandas Apply: 100%|██████████| 19313654/19313654 [00:04<00:00, 4156046.56it/s]
Pandas Apply: 100%|██████████| 19313654/19313654 [00:04<00:00, 4095815.41it/s]


Unnamed: 0,Query,Array,Double mutant fitness,Query single mutant fitness (SMF),Array SMF
0,YAL001C,YBL023C,0.7319,0.8285,0.9254
1,YAL001C,YBL026W,0.4266,0.8285,0.9408
2,YAL001C,YBL034C,0.7520,0.8285,0.8925
3,YAL001C,YBL034C,0.6661,0.8285,0.7988
4,YAL001C,YBL034C,0.4764,0.8285,0.7683
...,...,...,...,...,...
19313649,YPR201W,YPL264C,1.0217,0.9995,0.9768
19313650,YPR201W,YPL265W,1.0205,0.9995,1.0230
19313651,YPR201W,YPL267W,0.9446,0.9995,0.9990
19313652,YPR201W,YPL272C,1.0013,0.9995,1.0155


## Calculate the genetic interaction neutrality functions (expected double mutant fitness)

In [3]:
import numpy as np

print(data.shape)

# Orient every row so that the gene with the larger single mutant fitness sits
# in the "Query" slot. The comparison is evaluated once and reused.
query_smf = data['Query single mutant fitness (SMF)']
array_smf = data['Array SMF']
keep_order = query_smf > array_smf  # False (=> swap) on ties and on NaN comparisons

data['Query_new'] = data['Query'].where(keep_order, data['Array'])
data['Array_new'] = data['Array'].where(keep_order, data['Query'])
data['Query_SMF_new'] = query_smf.where(keep_order, array_smf)
data['Array_SMF_new'] = array_smf.where(keep_order, query_smf)

# Collapse repeated (Query_new, Array_new) pairs by averaging every numeric column
pair_keys = ['Query_new', 'Array_new']
data_av = data.drop(columns=['Query', 'Array']).groupby(pair_keys).mean()
print(data_av.shape)

# the per-measurement table is no longer needed; free its memory
del data

(19313654, 5)
(13178146, 5)


In [4]:
# Sanity check: dropping repeated (Query_new, Array_new) rows should leave the
# row count unchanged if the groupby already made every pair unique
pair_columns = ['Query_new', 'Array_new']
deduplicated = data_av.reset_index().drop_duplicates(subset=pair_columns)
deduplicated.shape

(13178146, 7)

In [5]:
def pseudolog10(x):
    """Pseudo-logarithm of base 10: asinh(x / 2) / ln(10).

    Defined for all real numbers, so it can be used instead of log10, which
    returns infinite/NaN values for x <= 0.

    Mathematically identical to log(x/2 + sqrt((x/2)**2 + 1)) / log(10), but
    evaluated with np.arcsinh: the explicit form cancels catastrophically for
    large negative x (x/2 + sqrt(...) rounds to 0, giving -inf).

    Parameters
    ----------
    x : scalar or array-like (NumPy array, pandas Series)
        Input value(s); the operation is element-wise.

    Returns
    -------
    Same kind of object as ``x`` (scalar in, scalar out; array/Series in,
    array/Series out).
    """
    return np.arcsinh(x / 2) / np.log(10)

In [9]:
# calculate the expected double mutant fitness using the neutrality functions
# Every feature is computed column-wise. pseudolog10 is built from NumPy
# functions, so it accepts whole Series; a row-wise .apply(axis=1) over millions
# of rows is unnecessary and far slower.
wa = data_av['Query_SMF_new']
wb = data_av['Array_SMF_new']
data_av['mean'] = (wa + wb) / 2
data_av['multiplicative'] = wa * wb
data_av['additive'] = wa + wb - 1
data_av['difference'] = wa - wb
data_av['log2_mani'] = np.log2(((2**wa) - 1) * ((2**wb) - 1) + 1)
data_av['pslog10_additive'] = pseudolog10(wa * wb)
data_av['pslog10_difference'] = pseudolog10(wa / wb)  # wb == 0 gives inf/NaN; such rows are removed below

# create sample IDs of the form '<Query_new>_<Array_new>'
data_av = data_av.reset_index()
data_av.insert(0, 'ID', data_av['Query_new'] + '_' + data_av['Array_new'])

# clean up the feature table
data_av = (
    data_av
    .set_index('ID')
    .drop(columns=['Query_new', 'Array_new', 'Query single mutant fitness (SMF)',
                   'Array SMF'])
    .rename(columns={'Double mutant fitness': 'DMF'})
    .replace([np.inf, -np.inf], np.nan)  # replace inf with NaN
    .dropna(axis=0, how='any')  # remove rows with NaN
)
print(data_av.shape)

# save the feature table and the pairwise Pearson correlations of its columns
data_av.to_csv('../yeast_data/DMF_feature_table_v20241015.tsv', sep='\t')
data_av.corr(method='pearson').to_csv('../yeast_data/DMF_feature_correlation_v20241015.tsv', sep='\t')
data_av

(11777196, 10)


Unnamed: 0_level_0,DMF,Query_SMF_new,Array_SMF_new,mean,multiplicative,additive,difference,log2_mani,pslog10_additive,pslog10_difference
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
YAL001C_YBL006C,0.648600,0.86580,0.793000,0.829400,0.686579,0.658800,0.072800,0.680337,0.146306,0.226652
YAL001C_YBL007C,0.742600,0.86580,0.832200,0.849000,0.720519,0.698000,0.033600,0.715244,0.153258,0.216798
YAL001C_YBL027W,0.646800,0.82850,0.788000,0.808250,0.652858,0.616500,0.040500,0.645042,0.139362,0.218918
YAL001C_YBL034C,0.522467,0.82850,0.745367,0.786933,0.617536,0.573867,0.083133,0.608598,0.132052,0.230405
YAL001C_YBL035C,0.558600,0.84715,0.704050,0.775600,0.596436,0.551200,0.143100,0.587454,0.127668,0.247643
...,...,...,...,...,...,...,...,...,...,...
YPR202W_YPR019W,0.967900,1.00950,0.925500,0.967500,0.934292,0.935000,0.084000,0.934498,0.196143,0.226454
YPR202W_YPR025C,0.964800,1.00950,0.954600,0.982050,0.963669,0.964100,0.054900,0.963797,0.201906,0.220093
YPR202W_YPR033C,0.826000,1.00950,0.887500,0.948500,0.895931,0.897000,0.122000,0.896232,0.188568,0.235313
YPR202W_YPR034W,0.468000,1.00950,0.611000,0.810250,0.616804,0.620500,0.398500,0.617567,0.131900,0.326990


#### Random train-test split
This is the "perfect" dataset. No gene pairs are overlapping, not even one member in a pair, between the training and test sets

In [1]:
import pandas as pd

# Stream the saved feature table (tab-separated) in 1e6-row pieces and stitch
# the pieces back into a single frame
path = '../yeast_data/DMF_feature_table_v20241015.tsv'
chunks = [piece for piece in pd.read_csv(path, chunksize=10 ** 6, sep='\t')]
data_av = pd.concat(chunks, axis=0)
data_av.shape

(11777196, 11)

In [2]:
# reshape the gene pairs into a gene-by-gene matrix
# IDs are '<Query_new>_<Array_new>': split once, then take each half
id_parts = data_av.ID.str.split('_')
data_av['Query_new'] = id_parts.str[0]
data_av['Array_new'] = id_parts.str[1]

# rows = Query_new genes, columns = Array_new genes, cells = DMF
# (pivot_table's default aggregation, the mean, is applied to repeated pairs)
data_av_square = pd.pivot_table(
    data_av,
    values='DMF',
    index='Query_new',
    columns='Array_new',
)
print(data_av_square.shape)

(5324, 5330)


In [3]:
# preview the first five rows of the gene-by-gene DMF matrix
data_av_square.iloc[:5]

Array_new,YAL001C,YAL002W,YAL004W,YAL005C,YAL007C,YAL008W,YAL009W,YAL010C,YAL011W,YAL012W,...,YPR193C,YPR194C,YPR195C,YPR196W,YPR197C,YPR198W,YPR199C,YPR200C,YPR201W,YPR202W
Query_new,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
YAL001C,,,,,,,,,,,...,,,,,,,,,,
YAL002W,,,,,,,,,,,...,,,,,,,,,,
YAL004W,,,,,,,,,,,...,,,,,,,,,0.9653,
YAL005C,,,,,,,,,,,...,1.0769,1.0549,,,,,1.0603,,1.0634,
YAL007C,,,,,,,,,,,...,,1.1115,,0.9637,,1.0339,1.05285,,0.96245,


In [5]:
# Do a random train-test split so that there are no overlapping Array_new and Query_new genes in the training and testing sets
import numpy as np
from sklearn.model_selection import train_test_split

# First, split the row (Query_new) genes into training and testing sets, use these to create two non-overlapping dataframes
row_idx_train, row_idx_test = train_test_split(data_av_square.index, test_size=1/11, random_state=20241016)

# A column (Array_new) gene belongs to the test block only if it was drawn as a test row gene.
# Not all columns are in the row indices, so all those extras go in the training set.
# The mask must come from data_av_square itself: data_av_square_train does not exist yet
# at this point, so referencing it here raises a NameError on a fresh kernel.
col_in_test = data_av_square.columns.isin(row_idx_test)
row_in_train = data_av_square.index.isin(row_idx_train)

data_av_square_train = data_av_square.loc[row_idx_train, ~col_in_test]
print(data_av_square_train.shape)

data_av_square_test = data_av_square.loc[~row_in_train, col_in_test]
print(data_av_square_test.shape)

# Ensure they are non-overlapping: no gene may appear on both sides, whether as a row or as a column
overlap_checks = [
    (data_av_square_train.index, data_av_square_test.index),
    (data_av_square_train.columns, data_av_square_test.columns),
    (data_av_square_train.index, data_av_square_test.columns),
    (data_av_square_train.columns, data_av_square_test.index),
]
for train_genes, test_genes in overlap_checks:
    shared_genes = np.intersect1d(train_genes, test_genes)
    print(shared_genes)  # expected to be empty
    assert shared_genes.size == 0, 'training and test sets share genes'

# Reshape the test set to get the sample IDs to save
data_av_test = data_av_square_test.stack().reset_index()
data_av_test['ID'] = data_av_test.Query_new + '_' + data_av_test.Array_new

# Save the test instances, one ID per line
with open('../yeast_data/DMF_test_instances_v20241016.txt', 'w') as f:
    f.writelines(f'{ID}\n' for ID in data_av_test.ID)

#### Stratified K-fold train-test split
The caveat is that there are probably some gene pairs in the training and test sets that share one gene.

In [11]:
import numpy as np  # used below; previously relied on an import made in another cell
import pandas as pd
from sklearn.model_selection import StratifiedKFold

# Read in the data (tab-separated, streamed in 1e6-row chunks)
path = '../yeast_data/DMF_feature_table_v20241015.tsv'
chunks = list(pd.read_csv(path, chunksize=10 ** 6, sep='\t'))
data_av = pd.concat(chunks, axis=0)
print(data_av.shape)

# sklearn v1.2.2
np.random.seed(20240611)

data_av = data_av.set_index('ID')

# Create bins: 0.1-wide DMF intervals, so the continuous label can be stratified
bins = np.arange(data_av.DMF.min()-.1, data_av.DMF.max()+.1, 0.1).tolist()
data_av['label_bin'] = pd.cut(data_av.DMF, bins=bins, labels=list(range(len(bins)-1)))

# Apply stratified k-fold train-test split
X = data_av.drop(columns=['DMF', 'label_bin'])
y = data_av['label_bin']
skf = StratifiedKFold(n_splits=11, shuffle=True, random_state=20240611)

# Only the first fold is used: its held-out part (1/11 of the instances) is the test set
train_idx, test_idx = next(skf.split(X, y))

# test_idx holds row positions; look all of them up in the index at once
# instead of building one row Series per instance
test_ids = data_av.index[test_idx]

# Write test set to file, one ID per line
with open('../yeast_data/DMF_test_instances_v20241015.txt', 'w') as f:
    f.writelines(f'{ID}\n' for ID in test_ids)




## Run XGBoost Regression model to predict DMF
### Use shap conda environment and submit job to cluster
See yeast_code/1b_submit_xgb_regression.sb

## Run Sophie's Neural Network to predict DMF
### Use ... conda environment and submit job to cluster