In [1]:
## This notebook builds the following per-gene features: rank (number of interactions), positive/negative
## weighted rank, single mutant fitness (SMF) and double mutant fitness (DMF).
## The input data was taken from TheCellMap website.

In [2]:
import pandas as pd
from os import chdir
import numpy as np

In [3]:
# Folder the later cells read their input files from (relative path).
directory = r'../inputs'

# chdir() with a relative path is not idempotent: running this cell a second
# time would resolve '../inputs' against the already-changed working directory.
# The flag makes the cell safe to re-run within one kernel session; on a fresh
# kernel the behaviour is exactly the same as a plain chdir(directory).
if not globals().get('_INPUT_DIR_ENTERED', False):
    chdir(directory)
    _INPUT_DIR_ENTERED = True

In [4]:
## Input: data in txt file
## Output: table which contains 6 columns: query strain ID, array strain ID, genetic interaction score ('weight'),
## query single mutant fitness (SMF), array SMF, double mutant fitness (DMF)
def extract_relevant_cols(file_name):
    """Load one tab-separated interaction table and keep the six columns used downstream.

    Parameters
    ----------
    file_name : str
        Path of a tab-separated text file readable by ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: 'Query Strain ID', 'Array Strain ID', 'weight',
        'Query single mutant fitness (SMF)', 'Array SMF', 'Double mutant fitness'.
        'weight' is the input column 'Genetic interaction score (ε)' renamed.
    """
    table = pd.read_csv(file_name, sep="\t").rename(
        columns={'Genetic interaction score (ε)': 'weight'}
    )

    # Strain IDs carry a suffix after an underscore; keep only the text
    # before the first '_' so both ID columns hold the bare name.
    for id_column in ('Query Strain ID', 'Array Strain ID'):
        table[id_column] = table[id_column].str.split('_', expand=True)[0]

    wanted = [
        'Query Strain ID',
        'Array Strain ID',
        'weight',
        'Query single mutant fitness (SMF)',
        'Array SMF',
        'Double mutant fitness',
    ]
    return table[wanted]


In [5]:
# Load the three interaction tables, keeping only the columns used downstream.
EXE, NXN, EXN_NXE = (
    extract_relevant_cols(table_file)
    for table_file in ('SGA_ExE.txt', 'SGA_NxN.txt', 'SGA_ExN_NxE.txt')
)

# Stack the tables and order the rows by (query, array) strain pair.
big_df = pd.concat([EXE, NXN, EXN_NXE]).sort_values(
    by=['Query Strain ID', 'Array Strain ID']
)

# Store the combined table on disk, then read it back as the frame the later cells use.
big_df.to_hdf('whole_data.h5', key='df')
unite_df = pd.read_hdf('whole_data.h5', key='df')

In [6]:
# Extracting single mutant fitness (SMF) feature from Array and Query columns
def smf_feature(relevant_cols_df):
    """Compute the mean single mutant fitness (SMF) of every gene.

    A gene can appear as a query strain and as an array strain; its SMF
    observations from both roles are pooled and averaged.

    Parameters
    ----------
    relevant_cols_df : pandas.DataFrame
        Must contain 'Query Strain ID', 'Array Strain ID',
        'Query single mutant fitness (SMF)' and 'Array SMF'.

    Returns
    -------
    pandas.DataFrame
        One row per gene (index named 'genes', sorted by groupby) with a
        single column 'smf'.
    """
    # Use the frame that was passed in (not a notebook-level global) and
    # pd.concat instead of Series.append, which no longer exists in pandas 2.x.
    genes = pd.concat(
        [relevant_cols_df['Query Strain ID'], relevant_cols_df['Array Strain ID']],
        ignore_index=True,
    )
    values = pd.concat(
        [relevant_cols_df['Query single mutant fitness (SMF)'], relevant_cols_df['Array SMF']],
        ignore_index=True,
    )
    pooled = pd.DataFrame({'genes': genes, 'smf': values})
    return pooled.groupby('genes').mean()


In [7]:
# Per-gene mean SMF table, computed from the combined interaction table.
smf_df = smf_feature(unite_df)


In [8]:
# This function builds an adjacency matrix from the source and dest columns, optionally filled with a value for each connection
def smart_crosstab(df, source, dest, values=None):
    """Build a square source-by-dest matrix and return it split into two triangles.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the `source` and `dest` columns (and `values`, if given).
    source, dest : str
        Column names used for the rows and columns of the cross-tabulation.
    values : str, optional
        Column to aggregate (mean) per (source, dest) pair. When omitted,
        duplicate pairs are dropped and each remaining pair is counted.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The upper triangle of the square matrix, and the lower triangle
        transposed. Both triangles include the diagonal. Pairs that do not
        occur are 0.
    """
    if values is not None:
        table = pd.crosstab(df[source], df[dest], values=df[values], aggfunc='mean')
    else:
        unique_pairs = df.drop_duplicates(subset=[source, dest])
        table = pd.crosstab(unique_pairs[source], unique_pairs[dest])

    # Make the matrix square over every label seen on either axis.
    labels = table.columns.union(table.index)
    square = table.reindex(index=labels, columns=labels, fill_value=0).fillna(0)

    cells = square.values
    upper_part = pd.DataFrame(np.triu(cells), index=square.index, columns=square.columns)
    lower_part = pd.DataFrame(np.tril(cells), index=square.index, columns=square.columns)
    return upper_part, lower_part.T

In [9]:
## Creating a crosstab to calculate rank - the number of interactions per gene - using smart_crosstab
def calc_rank(unite_df):
    """Cross-tabulate query strains against array strains, without weights.

    Returns whatever ``smart_crosstab`` returns for the two ID columns:
    the upper triangle and the transposed lower triangle of the pair matrix.
    """
    query_col, array_col = 'Query Strain ID', 'Array Strain ID'
    strain_pairs = unite_df[[query_col, array_col]]
    return smart_crosstab(strain_pairs, query_col, array_col)
    

In [10]:
# The value of the feature per gene is the sum of the gene's row and column (after splitting into triangular matrices and transposing the lower triangle)
def adding_feature_per_gene(feature_crosstab, name_of_feature, weighted_option=False, index_name='genes'):
    """Collapse a square gene-by-gene matrix into one value per gene.

    The value for the gene at position i is (sum of row i) + (sum of column i).

    Parameters
    ----------
    feature_crosstab : pandas.DataFrame
        Square matrix whose rows and columns list the same genes in the
        same order.
    name_of_feature : str
        Name of the single output column.
    weighted_option : bool, default False
        When true, divide every value by the largest absolute value so the
        result lies in [-1, 1].
    index_name : str, default 'genes'
        Name given to the output index.

    Returns
    -------
    pandas.DataFrame
        Float column `name_of_feature`, indexed by the matrix's own row labels.
    """
    # Positional (not label-based) addition of the two sums, done as arrays
    # instead of a Python loop over series[i].
    row_totals = feature_crosstab.sum(axis=1).to_numpy(dtype=float)
    col_totals = feature_crosstab.sum(axis=0).to_numpy(dtype=float)
    feature_array = row_totals + col_totals

    if weighted_option and feature_array.size:
        abs_max_value = np.abs(feature_array).max()
        # An all-zero feature stays all-zero instead of turning into NaN (0/0).
        if abs_max_value > 0:
            feature_array = feature_array / abs_max_value

    # Label rows with the matrix's own index rather than a notebook-level
    # global, so the labels always belong to the values they sit next to.
    gene_index = feature_crosstab.index.rename(index_name)
    return pd.DataFrame(data=feature_array, index=gene_index, columns=[name_of_feature])
    

In [11]:
# Calling the functions above.
## First compute rank_df: build the two triangles of the strain-pair matrix,
## flag a pair when it is non-zero in either triangle, then sum the flags per gene.
upper_triangle, lower_triangle = calc_rank(unite_df)
rank_cross = upper_triangle.astype(bool) | lower_triangle.astype(bool)
rank_df = adding_feature_per_gene(rank_cross,'rank', False)




KeyboardInterrupt



In [None]:
# The weighted features are divided element-wise by this matrix: the sum of the
# two rank triangles, with every 0 replaced by 1 so the division never hits zero.
normalization_matrix = (upper_triangle + lower_triangle).replace(0, 1)


In [None]:
# Calculating other features:
## DMF
def calc_dmf(unite_df, normalization_matrix):
    """Build the pairwise double-mutant-fitness matrix.

    The fitness is averaged per (query, array) pair, cross-tabulated with
    ``smart_crosstab``, and the sum of the two returned triangles is divided
    element-wise by `normalization_matrix`.
    """
    pair_cols = ['Query Strain ID', 'Array Strain ID']
    fitness_col = 'Double mutant fitness'

    mean_per_pair = (
        unite_df[pair_cols + [fitness_col]]
        .groupby(pair_cols)
        .mean()
        .reset_index()
    )
    upper_part, lower_part = smart_crosstab(mean_per_pair, pair_cols[0], pair_cols[1], fitness_col)
    return (upper_part + lower_part) / normalization_matrix
# Pairwise double-mutant-fitness matrix, already divided by normalization_matrix.
dmf = calc_dmf(unite_df, normalization_matrix)

In [None]:
## Creating cross-tab for positive and negative rank 
def _calc_signed_weight(unite_df, keep_positive, normalization=None):
    """Shared body of calc_positive / calc_negative: weight matrix for one sign."""
    if normalization is None:
        # Backward-compatible fallback for callers that pass only the data frame:
        # use the notebook-level matrix, looked up when the function is called.
        normalization = normalization_matrix

    weights = unite_df[['Query Strain ID', 'Array Strain ID', 'weight']]
    if keep_positive:
        selected = weights[weights['weight'] > 0]
    else:
        selected = weights[weights['weight'] < 0]
    selected = selected.reset_index(drop=True)

    # smart_crosstab averages the weight per (query, array) pair.
    triang_up, triang_down = smart_crosstab(selected, 'Query Strain ID', 'Array Strain ID', 'weight')
    return (triang_up + triang_down) / normalization


def calc_positive(unite_df, normalization=None):
    """Pairwise matrix built from interactions with weight > 0.

    Parameters
    ----------
    unite_df : pandas.DataFrame
        Must contain 'Query Strain ID', 'Array Strain ID' and 'weight'.
    normalization : pandas.DataFrame, optional
        Matrix the result is divided by element-wise. Defaults to the
        notebook-level ``normalization_matrix``.
    """
    return _calc_signed_weight(unite_df, True, normalization)


def calc_negative(unite_df, normalization=None):
    """Pairwise matrix built from interactions with weight < 0.

    Parameters
    ----------
    unite_df : pandas.DataFrame
        Must contain 'Query Strain ID', 'Array Strain ID' and 'weight'.
    normalization : pandas.DataFrame, optional
        Matrix the result is divided by element-wise. Defaults to the
        notebook-level ``normalization_matrix``.
    """
    return _calc_signed_weight(unite_df, False, normalization)
    
    


In [None]:
# Signed weight matrices: one built from rows with weight > 0, one from rows with weight < 0.
positive_rank = calc_positive(unite_df)
negative_rank = calc_negative(unite_df)

In [None]:
# Collapse each pairwise matrix into one value per gene (scaled by its largest absolute value).
final_dmf_df = adding_feature_per_gene(dmf, 'dmf', True)
final_positive_rank_df = adding_feature_per_gene(positive_rank, 'positive_rank', True)
final_negative_rank_df = adding_feature_per_gene(negative_rank, 'negative_rank', True)

# Put all per-gene features side by side.
feature_tables = [smf_df, final_dmf_df, rank_df, final_positive_rank_df, final_negative_rank_df]
final_df = pd.concat(feature_tables, axis=1)

# Normalize the weighted features by each gene's number of interactions, i.e. its rank.
interaction_count = final_df['rank']
final_df['dmf'] = final_df['dmf'].div(interaction_count)
final_df['positive_weighted_rank'] = final_df['positive_rank'].div(interaction_count)
final_df['negative_weighted_rank'] = final_df['negative_rank'].div(interaction_count)
final_df['absolute_weighted_rank'] = (
    final_df['positive_weighted_rank'] + final_df['negative_weighted_rank'].abs()
)

# Missing values become 0 in the final table.
final_df = final_df.fillna(0)

In [194]:
# Save the per-gene feature table as CSV; the path is relative to the current working directory.
final_df.to_csv('../outputs/not_normalized_smf_dmf_ranks_feature.csv')