This code processes continuous protein property data for gene pairs.
The input files holding the continuous protein property values are:
- aaLength.continuous.MLD_dictionary.json
- isoelectric_point_dictionary.json

In [1]:
import pandas as pd
import numpy as np
import os
import json

In [2]:
def generate_feature_cols(data_type, data, idx, gene_calcs_type, transformation_type):
    """
    Build the feature column names for one continuous protein property.

    Every name is the concatenation of ``data_type[0]``, ``data[idx]``, one
    gene pair calculation and one transformation (each converted with
    ``str``). Names are ordered calculation-major: all transformations of
    the first calculation, then all transformations of the second, and so on.

    Parameters
    ----------
    data_type : list
        List of data type prefixes; only the first element is used.
    data : list
        List of data (property) names, e.g. AA length and pI.
    idx : int
        Index into ``data`` of the property to build names for.
    gene_calcs_type : list
        List of gene pair calculation name parts.
    transformation_type : list
        List of transformation (TF) name parts combined with every gene
        pair calculation.

    Returns
    -------
    list
        List of feature column names, one per (calculation, transformation)
        combination.

    """
    feature_cols = []
    for calc in gene_calcs_type:
        for trans in transformation_type:
            name_parts = (data_type[0], data[idx], calc, trans)
            feature_cols.append("".join(map(str, name_parts)))
    return feature_cols

In [3]:
# gene pair calc columns (original cols: avg, abs(difference), max, min, sum)
def genepair_calc_lst(lst):
    """
    Pick the five gene pair calculation columns out of a feature list.

    The entries at positions 2, 7, 12, 17 and 22 are returned, i.e. every
    fifth entry starting at position 2.

    Parameters
    ----------
    lst : list
        List of feature column names; it needs at least 23 entries,
        otherwise an IndexError is raised.

    Returns
    -------
    list
        The five selected entries, in their original order.

    """
    return [lst[position] for position in range(2, 23, 5)]

In [4]:
# calc average, abs(difference), max, min, and sum of two columns
def calc_features(df, feature, col1, col2):
    """
    Add the five gene pair calculation columns to a DataFrame.

    The columns are written into ``df`` itself (in place), in this order:
    average, absolute difference, maximum, minimum and sum of ``col1`` and
    ``col2``.

    Parameters
    ----------
    df : DataFrame
        DataFrame containing the data; it is modified in place
    feature : list
        Names of the five output columns, ordered as average, absolute
        difference, max, min, sum
    col1 : str
        Column 1 - specific protein property values for gene1
    col2 : str
        Column 2 - specific protein property values for gene2

    Returns
    -------
    DataFrame
        The same DataFrame object, now holding the calculated features

    """
    # One entry per output column, in the same order as `feature`.
    pair_calculations = (
        lambda first, second: (first + second) / 2,     # average
        lambda first, second: np.abs(first - second),   # absolute difference
        np.maximum,                                     # element-wise max
        np.minimum,                                     # element-wise min
        lambda first, second: first + second,           # pair total
    )
    for position, calculate in enumerate(pair_calculations):
        df[feature[position]] = calculate(df[col1], df[col2])
    return df

In [5]:
# Define a function to apply transformations
def apply_transformations(df, column):
    """
    Add binned, log10, reciprocal and squared versions of one column.

    The new columns are written into ``df`` itself (in place). Their names
    are built from ``column`` with everything from its last underscore
    onwards removed, followed by ``_binned``, ``_log``, ``_reciprocal`` or
    ``_squared``.

    Parameters
    ----------
    df : DataFrame
        DataFrame containing the data; it is modified in place
    column : str
        Column to apply transformations to

    Returns
    -------
    DataFrame
        The same DataFrame object with the transformed columns added

    Notes
    -----
    Binning uses quartiles. When several quartile edges coincide (heavily
    tied data, e.g. many zeros), the repeated edges are dropped and fewer
    than four bins are produced instead of raising ``ValueError``.

    """
    base_name = column.rpartition('_')[0]
    values = df[column]
    # Zeros become NaN so that log10 and the reciprocal yield NaN rather
    # than -inf / inf.
    nonzero_values = values.replace(0, np.nan)
    # qcut is (] for each dataset (exclusive - left, inclusive - right);
    # duplicates='drop' keeps tied quartile edges from raising.
    df[f'{base_name}_binned'] = pd.qcut(values, 4, labels=False,
                                        duplicates='drop')
    df[f'{base_name}_log'] = np.log10(nonzero_values)
    df[f'{base_name}_reciprocal'] = 1 / nonzero_values
    df[f'{base_name}_squared'] = values ** 2
    return df

In [6]:
def create_data_feat_df(df, calc_list, col1_name, col2_name, gene1dat_name, gene2dat_name, feature_list):
    """
    Build the final feature table for one protein property.

    The gene pair calculations are computed first (``calc_features``), each
    calculation column is then transformed (``apply_transformations``), and
    finally only the two gene columns plus the requested feature columns
    are kept.

    Parameters
    ----------
    df : DataFrame
        DataFrame containing the gene IDs and per-gene property values
    calc_list : list
        Names of the gene pair calculation columns to create
    col1_name : str
        Name of the first column to keep (gene 1 identifier)
    col2_name : str
        Name of the second column to keep (gene 2 identifier)
    gene1dat_name : str
        Name of the column holding the property values for gene 1
    gene2dat_name : str
        Name of the column holding the property values for gene 2
    feature_list : list
        Names of the feature columns to keep in the result

    Returns
    -------
    DataFrame
        DataFrame with ``col1_name``, ``col2_name`` and the columns in
        ``feature_list``, in that order

    """
    features_df = calc_features(df, calc_list, gene1dat_name, gene2dat_name)
    for calc_col in calc_list:
        features_df = apply_transformations(features_df, calc_col)
    keep_cols = [col1_name, col2_name] + feature_list
    return features_df[keep_cols]

CONTINUOUS 1: Amino Acid Length
Average pair
Difference (Absolute Value)
Maximum of pair
Minimum of pair
Total of pair

Each of these data representations was transformed using:
Bin (Quartiles) 
Log (Base 10)
None
Reciprocal
Squared

CONTINUOUS 2: Isoelectric Point
Average pair
Difference (Absolute Value)
Maximum of pair
Minimum of pair
Total of pair

Each of these data representations was transformed using:
Bin (Quartiles) 
Log (Base 10)
None
Reciprocal
Squared

In [7]:
# Name parts used to assemble the continuous feature column names.
# data type prefix
dtype = ['continuous_']
# protein properties
dat = [
    'AA_length',
    'isoelectric_point',
]
# gene pair calculations
genepair_calcs = [
    '_average_',
    '_difference_',
    '_max_',
    '_min_',
    '_pair_total_',
]
# transformations (noTF = no transformation)
transformations = [
    'binned',
    'log',
    'noTF',
    'reciprocal',
    'squared',
]

In [8]:
# Continuous AA length features (dat[0]) - all the feature columns that will
# be produced from the aaLength.continuous.MLD_dictionary.json document
aa_len_feat = generate_feature_cols(
    data_type=dtype,
    data=dat,
    idx=0,
    gene_calcs_type=genepair_calcs,
    transformation_type=transformations,
)

# Continuous isoelectric point features (dat[1]) - all the feature columns
# that will be produced from the isoelectric_point_dictionary.json document
iso_feat = generate_feature_cols(
    data_type=dtype,
    data=dat,
    idx=1,
    gene_calcs_type=genepair_calcs,
    transformation_type=transformations,
)

In [9]:
# Pull the five gene pair calculation column names out of each feature list
aa_len_calcs, iso_calcs = map(genepair_calc_lst, (aa_len_feat, iso_feat))

In [10]:
# Show the selected AA length gene pair calculation column names
aa_len_calcs

['continuous_AA_length_average_noTF',
 'continuous_AA_length_difference_noTF',
 'continuous_AA_length_max_noTF',
 'continuous_AA_length_min_noTF',
 'continuous_AA_length_pair_total_noTF']

In [11]:
# folder with continuous protein properties
# (superseded location, kept for reference)
# data_dir = "/home/brow1110/ara-kinase-prediction/protein_properties/"
data_dir = "/home/seguraab/ara-kinase-prediction/data/2021_cusack_data/21_arabidopsis_redundancy/03_protein_sequence_properties"

# Gene pair instance files used in earlier runs (kept for reference).
# Kinase gene pairs:
# kinase_pairs = pd.read_csv(
#     "/home/seguraab/ara-kinase-prediction/data/instances_dataset_1.txt",
#     sep="\t")
# kinase_pairs
# Dataset_4.txt from Cusack 2021, it contains the kinase gene pairs as well:
# instances = '/home/seguraab/ara-kinase-prediction/data/2021_cusack_data/Dataset_4.txt'
# TAIR10 kinase instances:
# instances = '/home/seguraab/ara-kinase-prediction/data/Kinase_genes/instances_tair10_kinases.txt'
# kinase_pairs = pd.read_csv(instances, delimiter='\t', header=0)

# Gene pairs currently in use: binary labels table with gene1/gene2 columns.
# Only this file is read; the earlier reads above were overwritten before
# use, so they are left commented out.
instances = '/home/seguraab/ara-kinase-prediction/data/20250403_melissa_ara_data/corrected_data/binary_labels_from_linear_model.csv'
kinase_pairs = pd.read_csv(instances, header=0)
# kinase_pairs = kinase_pairs["pair_ID"].str.split("_", expand=True)
# kinase_pairs.columns = ['gene1', 'gene2']
kinase_pairs.head()

Unnamed: 0,Set,gene1,gene2,binary_DTB_p05,binary_DTB_log10_p05,binary_DTB_plog10_p05,binary_LN_p05,binary_LN_log10_p05,binary_LN_plog10_p05,binary_DTF_p05,...,binary_SN_plog10_p1,binary_SN_plus1_log10_p1,binary_SPF_p1,binary_SPF_log10_p1,binary_SPF_plog10_p1,binary_TSC_p1,binary_TSC_plus1_p1,binary_TSC_plog10_p1,binary_TSC_plus1_log10_p1,binary_combined_p1
0,1,AT2G03450,AT1G13900,0.0,0.0,0.0,0.0,0.0,0.0,,...,0,0,0,0,0,0,0,0,0,0
1,11,AT2G21380,AT4G39050,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,1,1,1,1,1,0,0,1
2,110,AT5G07830,AT5G61250,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,1,1
3,12,AT5G16480,AT3G02800,1.0,1.0,1.0,0.0,0.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,1
4,122,AT3G22790,AT4G14760,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Gene IDs of each pair as plain lists, in row order
kinase_pairs_1 = kinase_pairs['gene1'].tolist()
kinase_pairs_2 = kinase_pairs['gene2'].tolist()

In [13]:
# Show the number of gene1 and gene2 entries
len(kinase_pairs_1), len(kinase_pairs_2)

(142, 142)

In [14]:
# Construct the file path
file_aa_len_path = os.path.join(data_dir, 
                                "AA_length/aaLength.continuous.MLD_dictionary.json")

# read in as a dictionary of AA length; the encoding is given explicitly so
# the file is decoded as UTF-8 regardless of the platform's locale default
with open(file_aa_len_path, 'r', encoding='utf-8') as f:
    aa_len = json.load(f)

In [15]:
# Construct the file path
file_iso_path = os.path.join(data_dir, "isoelectric_point/isoelectric_point_dictionary.json")

# read in as a dictionary of pI; the encoding is given explicitly so the
# file is decoded as UTF-8 regardless of the platform's locale default
with open(file_iso_path, 'r', encoding='utf-8') as f:
    iso = json.load(f)

In [16]:
# Look up the AA length of every gene, keeping the order of kinase_pairs_1 /
# kinase_pairs_2; genes missing from the dictionary yield None
aa_len_1 = list(map(aa_len.get, kinase_pairs_1))
aa_len_2 = list(map(aa_len.get, kinase_pairs_2))

In [17]:
# Look up the pI of every gene, keeping the order of kinase_pairs_1 /
# kinase_pairs_2; genes missing from the dictionary yield None
iso_1 = list(map(iso.get, kinase_pairs_1))
iso_2 = list(map(iso.get, kinase_pairs_2))

In [18]:
# Combine the gene IDs and their AA lengths into one df (one row per pair)
aa_len_df = pd.DataFrame(dict(
    gene1=kinase_pairs_1,
    gene2=kinase_pairs_2,
    aa_len_1=aa_len_1,
    aa_len_2=aa_len_2,
))

In [19]:
# Combine the gene IDs and their pI values into one df (one row per pair)
iso_df = pd.DataFrame(dict(
    gene1=kinase_pairs_1,
    gene2=kinase_pairs_2,
    iso_1=iso_1,
    iso_2=iso_2,
))

In [20]:
# Compute the AA length gene pair calculations and their transformations
aa_len_df = create_data_feat_df(
    df=aa_len_df,
    calc_list=aa_len_calcs,
    col1_name='gene1',
    col2_name='gene2',
    gene1dat_name='aa_len_1',
    gene2dat_name='aa_len_2',
    feature_list=aa_len_feat,
)

In [21]:
# Compute the pI gene pair calculations and their transformations
iso_df = create_data_feat_df(
    df=iso_df,
    calc_list=iso_calcs,
    col1_name='gene1',
    col2_name='gene2',
    gene1dat_name='iso_1',
    gene2dat_name='iso_2',
    feature_list=iso_feat,
)

In [22]:
# Write the AA length features to a tab-separated text file (no index column).
# Output locations used in earlier runs:
# aa_len_df.to_csv(data_dir+"aa_len_features.csv", index=False)
# aa_len_df.to_csv("/home/seguraab/ara-kinase-prediction/data/2021_cusack_data/Dataset_4_Features/Dataset_4_features_aa_len.txt", sep="\t", index=False)
# aa_len_df.to_csv("/home/seguraab/ara-kinase-prediction/data/Kinase_genes/features/TAIR10_kinases_features_aa_len.txt", sep="\t", index=False)
aa_len_out_path = "/home/seguraab/ara-kinase-prediction/data/Kinase_genes/features/20250403_melissa_ara_features_for_binary_clf_aa_len.txt"
aa_len_df.to_csv(aa_len_out_path, sep='\t', index=False)

In [23]:
# Write the pI features to a tab-separated text file (no index column).
# Output locations used in earlier runs:
# iso_df.to_csv(data_dir+"iso_pt_features.csv", index=False)
# iso_df.to_csv("/home/seguraab/ara-kinase-prediction/data/2021_cusack_data/Dataset_4_Features/Dataset_4_features_iso_pt.txt", sep="\t", index=False)
# iso_df.to_csv("/home/seguraab/ara-kinase-prediction/data/Kinase_genes/features/TAIR10_kinases_features_iso_pt.txt", sep="\t", index=False)
iso_out_path = "/home/seguraab/ara-kinase-prediction/data/Kinase_genes/features/20250403_melissa_ara_features_for_binary_clf_iso_pt.txt"
iso_df.to_csv(iso_out_path, sep='\t', index=False)