This code performs the binary data processing for specific protein property data.
The protein property files that receive binary data processing are:
- acetylation_dict.json
- deamination_dict.json
- formylation_dict.json
- hydroxylation_dict.json
- myristoylation_dict.json
- oxidation_dict.json
- propionylation_dict.json

For each data type the below will be completed:
- Initial data processing: Calculating the number of genes in each pair that 
have a specific post-translational modification (PTM).

- Transformation processing: For each gene pair, the number of genes is transformed with: 
    log10, reciprocal, square, or no transformation.
    - If a transformation would result in inf or -inf, it is converted to NaN.

In [1]:
import pandas as pd
import numpy as np
import os
import json

In [2]:
def generate_feature_cols(data_type, data, gene_calcs_type, transformation_type):
    """
    Build one feature column name per transformation.

    Each name is the plain concatenation of ``data_type``, ``data``,
    ``gene_calcs_type`` and one entry of ``transformation_type``. Every piece
    is converted with ``str`` and no separator is inserted, so any
    underscores must already be part of the arguments.

    Parameters
    ----------

    data_type : str
        Leading part of every column name.
        Example: 'continuous_'

    data : str
        Name of the protein property being processed.
        Example: 'acetylation'

    gene_calcs_type : str
        Label of the gene pair calculation.
        Example: '_number_in_pair_'

    transformation_type : list
        Transformation tags, one column name is produced per tag.
        Example: ['log', 'noTF', 'reciprocal', 'squared']

    Returns
    -------
    list
        Column names, in the same order as ``transformation_type``.
    """
    # The first three pieces are shared by every column name
    stem = "".join(map(str, (data_type, data, gene_calcs_type)))
    columns = []
    for trans in transformation_type:
        columns.append(stem + str(trans))
    return columns

In [3]:
# add the per-gene values of a gene pair into a single column
def calc_features(df, calc_feature, col1, col2):
    """
    Store the row-wise sum of two columns in ``calc_feature``.

    Both columns are cast to float before they are added, so a row with a
    missing value in either column gets NaN. The DataFrame passed in is
    modified in place and is also returned.

    Parameters
    ----------
    df : DataFrame
        DataFrame containing the data
    calc_feature : str
        Name of the column that receives the sum
    col1 : str
        Column 1 - specific protein property values for gene1
    col2 : str
        Column 2 - specific protein property values for gene2

    Returns
    -------
    DataFrame
        The same DataFrame with the ``calc_feature`` column added

    """
    gene1_values = df[col1].astype(float)
    gene2_values = df[col2].astype(float)
    df[calc_feature] = gene1_values.add(gene2_values)
    return df

In [4]:
# Derive log10, reciprocal and squared versions of a feature column
def apply_transformations(df, column):
    """
    Add log10, reciprocal and squared versions of ``column`` to ``df``.

    The new columns are named ``<base>_log``, ``<base>_reciprocal`` and
    ``<base>_squared``, where ``<base>`` is ``column`` without its last
    ``_<tag>`` part (e.g. ``'x_noTF'`` gives ``'x_log'``). If that would leave
    an empty base (no underscore, or only a leading one), the whole column
    name is used as the base instead, so the outputs never get anonymous
    names such as ``'_log'``.

    Zeros are replaced by NaN before taking the log and the reciprocal, so
    those rows yield NaN rather than -inf / inf. The DataFrame passed in is
    modified in place and is also returned.

    Parameters
    ----------
    df : DataFrame
        DataFrame containing the data
    column : str
        Column to apply transformations to

    Returns
    -------
    DataFrame
        The same DataFrame with the three transformed columns added

    """
    # Strip the trailing "_<tag>"; fall back to the full name when nothing is left
    base_name = column.rpartition('_')[0] or column

    # log10(0) and 1/0 are undefined: mask zeros once and reuse the result
    nonzero_values = df[column].replace(0, np.nan)

    df[f'{base_name}_log'] = np.log10(nonzero_values)
    df[f'{base_name}_reciprocal'] = 1 / nonzero_values
    df[f'{base_name}_squared'] = df[column] ** 2
    return df

In [5]:
def create_data_feat_df(df, calc, col1_name, col2_name, gene1dat_name, gene2dat_name, feature_list):
    """
    Build the feature table for one protein property.

    The per-gene values are summed into the ``calc`` column with
    ``calc_features``, the transformed columns are derived from it with
    ``apply_transformations``, and the result is reduced to the two gene
    name columns followed by the columns listed in ``feature_list``.

    Parameters
    ----------
    df : DataFrame
        DataFrame containing the data
    calc : str
        Name of the column that receives the gene pair sum and from which
        the transformations are computed
    col1_name : str
        Column 1 name
    col2_name : str
        Column 2 name
    gene1dat_name : str
        Gene 1 data name
    gene2dat_name : str
        Gene 2 data name
    feature_list : list
        List of feature columns to keep; every entry must exist after the
        transformations have been applied

    Returns
    -------
    DataFrame
        DataFrame containing the gene columns and the feature columns

    """
    with_total = calc_features(df, calc, gene1dat_name, gene2dat_name)
    with_transforms = apply_transformations(with_total, calc)
    kept_columns = [col1_name, col2_name] + feature_list
    return with_transforms[kept_columns]

In [6]:
# JSON dictionaries to process, one file per protein property
doc_list = [f'{ptm}_dict.json' for ptm in (
    'acetylation',
    'deamination',
    'formylation',
    'hydroxylation',
    'myristoylation',
    'oxidation',
    'propionylation',
)]

In [7]:
# ---- input: folder holding the protein property dictionaries ----
# data_dir = "/home/brow1110/ara-kinase-prediction/protein_properties/"
data_dir = '/home/seguraab/ara-kinase-prediction/data/2021_cusack_data/21_arabidopsis_redundancy/03_protein_sequence_properties'

# ---- output: folder and file name prefix of the feature tables ----
# Earlier runs (kept for reference):
# save_dir = '/home/seguraab/ara-kinase-prediction/data/2021_cusack_data/Dataset_4_Features'
# prefix = 'Dataset_4'
# save_dir = '/home/seguraab/ara-kinase-prediction/data/Kinase_genes/features'
# prefix = 'TAIR10_kinases'
save_dir = '/home/seguraab/ara-kinase-prediction/data/20250403_melissa_ara_data/features'
prefix = '20250403_melissa_ara_features_for_binary_clf'

# ---- gene pairs to process ----
# Earlier instance files (kept for reference):
# gene_pairs = pd.read_csv(
#     "/home/seguraab/ara-kinase-prediction/data/instances_dataset_1.txt",
#     sep="\t")
# instances = '/home/seguraab/ara-kinase-prediction/data/2021_cusack_data/Dataset_4.txt'
# instances = '/home/seguraab/ara-kinase-prediction/data/Kinase_genes/instances_tair10_kinases.txt'
# gene_pairs = pd.read_csv(instances, delimiter='\t', header=0)
instances = '/home/seguraab/ara-kinase-prediction/data/20250403_melissa_ara_data/corrected_data/binary_labels_from_linear_model.csv'
gene_pairs = pd.read_csv(instances, header=0)
# Only needed when the table has a single "pair_ID" column instead of gene1/gene2:
# gene_pairs = gene_pairs["pair_ID"].str.split("_", expand=True)
# gene_pairs.columns = ['gene1', 'gene2']
gene_pairs.head()  # preview; only rendered when it is the last expression of a cell

# first and second gene of every pair, in row order
kinase_pairs_1, kinase_pairs_2 = (
    list(gene_pairs[gene_col]) for gene_col in ('gene1', 'gene2')
)

In [8]:
# Pieces of the output column names. They are the same for every file, so
# they are defined once instead of on every iteration.
# NOTE(review): the prefix says "continuous" although these properties are
# handled as binary data; it is left unchanged because it is part of the
# column names written to the output files.
dtype = 'continuous_'
genepair_calcs = '_number_in_pair_'
transformations = ['log', 'noTF', 'reciprocal', 'squared']

for doc_name in doc_list:
    # Property name = text before the first underscore, e.g. 'acetylation';
    # it is both the sub-folder name and part of the column/file names
    dat = doc_name.split("_", 1)[0]
    doc = os.path.join(data_dir, dat, doc_name)

    # One output column name per transformation tag
    features = generate_feature_cols(dtype, dat, genepair_calcs, transformations)

    # The untransformed pair total goes into the 'noTF' column; look it up by
    # tag so reordering `transformations` cannot silently pick another column
    untransformed_col = features[transformations.index('noTF')]

    # gene -> value dictionary for this property
    with open(doc, 'r', encoding='utf-8') as f:
        binary_values = json.load(f)

    # Genes absent from the dictionary yield None, which turns into NaN once
    # the values are cast to float in calc_features
    feat_1 = [binary_values.get(key) for key in kinase_pairs_1]
    feat_2 = [binary_values.get(key) for key in kinase_pairs_2]

    feat_df = pd.DataFrame({'gene1': kinase_pairs_1, 'gene2': kinase_pairs_2,
                            'feat_1': feat_1, 'feat_2': feat_2})

    feat_df = create_data_feat_df(feat_df, untransformed_col, 'gene1', 'gene2',
                                  'feat_1', 'feat_2', features)

    print(feat_df)

    # Tab-separated output, one file per property
    out_path = os.path.join(save_dir, f"{prefix}_features_{dat}.txt")
    feat_df.to_csv(out_path, sep='\t', index=False)

         gene1      gene2  continuous_acetylation_number_in_pair_log  \
0    AT2G03450  AT1G13900                                        NaN   
1    AT2G21380  AT4G39050                                        NaN   
2    AT5G07830  AT5G61250                                        NaN   
3    AT5G16480  AT3G02800                                        NaN   
4    AT3G22790  AT4G14760                                        NaN   
..         ...        ...                                        ...   
137  AT3G19280  AT1G49710                                        NaN   
138  AT1G15330  AT1G80090                                        NaN   
139  AT1G51880  AT3G21340                                        NaN   
140  AT2G47800  AT3G62700                                        NaN   
141  AT1G24150  AT1G70140                                        NaN   

     continuous_acetylation_number_in_pair_noTF  \
0                                           0.0   
1                                