This code is for the categorical data processing of specific protein properties data.
The protein properties with categorical data are:
- protein family

For the data processing, the following steps are performed:
- Initital data processing: 
    - Number of Overlapping Protein Families
    - Percent Overlapping Protein Families
    - Total Number of Protein Families for the Pair

- Transformation processing: For each type of processing the below will be 
  performed:
    - Binning (quartiles, (] )
    - Log10
    - No Transformation
    - Squared

In [1]:
import pandas as pd
import numpy as np
import os
import json

In [2]:
def create_gene_pfam_dict(doc_names, data_dir):
    """
    Create a dictionary that aggregates values from multiple JSON files for each gene.

    Parameters:
    doc_names (list): List of document names (JSON files).
    data_dir (str): Directory where the JSON files are located.

    Returns:
    dict: Dictionary with gene names as keys and lists of values as values.
    """
    gene_pfam_dict = {}

    for doc_name in doc_names:
        with open(data_dir + doc_name, "r") as f:
            print(f'{doc_name} opened!')
            data = json.load(f)
            gene_names = list(data.keys())
            for gene_name in gene_names:
                if gene_name not in gene_pfam_dict:
                    gene_pfam_dict[gene_name] = []
                gene_pfam_dict[gene_name].append(data[gene_name])
    
    return gene_pfam_dict

In [3]:
# Assuming df is the DataFrame and it has columns 'k_pfam_1' and 'k_pfam_2' with lists of values

# Define the function to count the number of 1s at the same index in both lists
def count_overlap(row, colnm_1, colnm_2):
    """ 
    Count the number of overlapping protein families between two genes 
    Parameters:
    row (pd.Series): Row of the DataFrame
    colnm_1 (str): Column name of the first gene
    colnm_2 (str): Column name of the second gene
    Returns:
    int: Number of overlapping protein families
    """
    k_pfam_1 = row[colnm_1]
    k_pfam_2 = row[colnm_2]
    return sum(1 for a, b in zip(k_pfam_1, k_pfam_2) if a == 1 and b == 1)

In [4]:
# count the number of 1s at different indexes in both lists
def count_diff(row, colnm_1, colnm_2):
    k_pfam_1 = row[colnm_1]
    k_pfam_2 = row[colnm_2]
    return sum(1 for a, b in zip(k_pfam_1, k_pfam_2) if a == 1 and b == 0)

In [5]:
def generate_feature_cols(data_type, data, idx_dt, idx, gene_calcs_type, transformation_type):
    """
    Generate feature columns for continuous data processing.
    
    Parameters
    ----------
    data_type : list
        List of data types. Protein properties: code addresses continuous data
    data : list
        List of data. Protein properties: code addresses AA length and pI data
    idx : int
        Index of data. Protein properties: index 
    gene_calcs_type : list
        List of gene pair calculations to be applied to the data
    transformation_type : list
        List of transformations to be applied to the data gene pair calculations
        TF = transformation
    
    Returns
    -------
    list
        List of feature columns
    
    """
    return [str(data_type[idx_dt]) + str(data[idx]) + str(calc) + str(trans) 
            for calc in gene_calcs_type 
            for trans in transformation_type]

In [6]:
def genepair_calc_lst(lst):
    """
    Generate gene pair calculations list.
    
    Parameters
    ----------
    lst : list
        List of gene pair calculations
    
    Returns
    -------
    list
        List of gene pair calculations
    
    """
    return [lst[2], lst[7], lst[12]]

In [7]:
# data_dir = "/home/brow1110/ara-kinase-prediction/protein_properties/pfam/"

data_dir = '/home/seguraab/ara-kinase-prediction/data/2021_cusack_data/21_arabidopsis_redundancy/03_protein_sequence_properties/protein_domain/pfam/'
# save_dir = '/home/seguraab/ara-kinase-prediction/data/2021_cusack_data/Dataset_4_Features'
# save_dir = '/home/seguraab/ara-kinase-prediction/data/Kinase_genes/features'

# prefix = 'Dataset_4'
# prefix = 'TAIR10_kinases'
prefix = '20250403_melissa_ara_features_for_binary_clf'

# gene pairs to process
# gene_pairs = pd.read_csv(
#     "/home/seguraab/ara-kinase-prediction/data/instances_dataset_1.txt", 
#     sep="\t")
# instances = '/home/seguraab/ara-kinase-prediction/data/2021_cusack_data/Dataset_4.txt'
# instances = '/home/seguraab/ara-kinase-prediction/data/Kinase_genes/instances_tair10_kinases.txt'
# gene_pairs = pd.read_csv(instances, delimiter='\t', header=0)
instances = '/home/seguraab/ara-kinase-prediction/data/20250403_melissa_ara_data/corrected_data/binary_labels_from_linear_model.csv'
gene_pairs = pd.read_csv(instances, header=0)
# gene_pairs = gene_pairs["pair_ID"].str.split("_", expand=True)
# gene_pairs.columns = ['gene1', 'gene2']
gene_pairs.head()

# separate into gene 1 and gene 2
kinase_pairs_1 = list(gene_pairs.loc[:, 'gene1'])
kinase_pairs_2 = list(gene_pairs.loc[:, 'gene2'])

In [8]:
# get list of doc names in data_dir
doc_names = os.listdir(data_dir)
doc_names

['H_PPase_dictionary.json',
 'ACPS_dictionary.json',
 'MBOAT_2_dictionary.json',
 'Macro_dictionary.json',
 'EnY2_dictionary.json',
 'FDX-ACB_dictionary.json',
 'Longin_dictionary.json',
 'Remorin_C_dictionary.json',
 'ArfGap_dictionary.json',
 'DUF1421_dictionary.json',
 'DUF3252_dictionary.json',
 'NIF_dictionary.json',
 'Methyltr_RsmB-F_dictionary.json',
 'RNA_POL_M_15KD_dictionary.json',
 'Stomagen_dictionary.json',
 'HAD_2_dictionary.json',
 'Ribosomal_S4e_dictionary.json',
 'VDE_dictionary.json',
 'Trigger_C_dictionary.json',
 'Aquarius_N_dictionary.json',
 'MyTH4_dictionary.json',
 'TFIIA_gamma_N_dictionary.json',
 'DUF775_dictionary.json',
 'Flavoprotein_dictionary.json',
 'Tim44_dictionary.json',
 'PPR_long_dictionary.json',
 'DUF321_dictionary.json',
 'FBA_1_dictionary.json',
 'NADH-G_4Fe-4S_3_dictionary.json',
 'SBDS_C_dictionary.json',
 'HhH-GPD_dictionary.json',
 'RrnaAD_dictionary.json',
 'Na_sulph_symp_dictionary.json',
 'Glucosamine_iso_dictionary.json',
 'Ldh_1_C_dicti

In [9]:
gene_pfam_dict = create_gene_pfam_dict(doc_names, data_dir)

H_PPase_dictionary.json opened!
ACPS_dictionary.json opened!
MBOAT_2_dictionary.json opened!
Macro_dictionary.json opened!
EnY2_dictionary.json opened!
FDX-ACB_dictionary.json opened!
Longin_dictionary.json opened!
Remorin_C_dictionary.json opened!
ArfGap_dictionary.json opened!
DUF1421_dictionary.json opened!
DUF3252_dictionary.json opened!
NIF_dictionary.json opened!
Methyltr_RsmB-F_dictionary.json opened!
RNA_POL_M_15KD_dictionary.json opened!
Stomagen_dictionary.json opened!
HAD_2_dictionary.json opened!
Ribosomal_S4e_dictionary.json opened!
VDE_dictionary.json opened!
Trigger_C_dictionary.json opened!
Aquarius_N_dictionary.json opened!
MyTH4_dictionary.json opened!
TFIIA_gamma_N_dictionary.json opened!
DUF775_dictionary.json opened!
Flavoprotein_dictionary.json opened!
Tim44_dictionary.json opened!
PPR_long_dictionary.json opened!
DUF321_dictionary.json opened!
FBA_1_dictionary.json opened!
NADH-G_4Fe-4S_3_dictionary.json opened!
SBDS_C_dictionary.json opened!
HhH-GPD_dictionary.j

In [10]:
# for gene1 in gene_pairs, create a column of the values from the kinase_pairs_1_dict[gene1]
# for gene2 in gene_pairs, create a column of the values from the kinase_pairs_2_dict[gene2]
# create a dataframe from the gene_pairs dataframe
# add the columns to the dataframe
# save the dataframe to a csv file

# create a dataframe from gene_pairs
gene_pairs_df = gene_pairs
k_pfam_1_vals = []
k_pfam_2_vals = []
# add columns to the dataframe
for gene1 in kinase_pairs_1:
    k_pfam_1_vals.append(gene_pfam_dict[gene1])

for gene2 in kinase_pairs_2:
    k_pfam_2_vals.append(gene_pfam_dict[gene2])

gene_pairs_df["k_pfam_1"] = k_pfam_1_vals
gene_pairs_df["k_pfam_2"] = k_pfam_2_vals

In [11]:
def calc_features(df, calc_feature_names, col1, col2):
    # col1 = k_pfam_1, col2 = k_pfam_2

    """ 
    Calculate the number of overlapping protein families, percent overlapping 
    protein families, and total number of protein families for the pair.
    Parameters:
    df (pd.DataFrame): DataFrame
    calc_feature_names (list): List of feature names
    col1 (str): Column name of the first gene
    col2 (str): Column name of the second gene
    Returns:
    pd.DataFrame: DataFrame with new columns:
     - 'continuous_protein_domain_number_overlapping_noTF', 
     - 'continuous_protein_domain_percent_overlapping_noTF', and 
     - 'continuous_protein_domain_total_noTF'
    """
    # count overlap
    overlap = df.apply(lambda row: count_overlap(row, col1, col2), axis=1)

    # Apply the function to each row and create a new column 'num_overlap_pfam'
    df[calc_feature_names[0]] = df.apply(lambda row: count_overlap(row, col1, col2), axis=1)

    # Apply the function to each row and create a new column 'num_diff_pfam'
    df['num_diff_pfam_1_2'] = df.apply(lambda row: count_diff(row, col1, col2), axis=1)
    df['num_diff_pfam_2_1'] = df.apply(lambda row: count_diff(row, col2, col1), axis=1)

    df[calc_feature_names[2]] = df[calc_feature_names[0]] + df['num_diff_pfam_1_2'] + df['num_diff_pfam_2_1']

    df[calc_feature_names[1]] = (df[calc_feature_names[0]] / df[calc_feature_names[2]])*100

    return df

In [12]:
# Define a function to apply transformations
def apply_transformations(df, column):
    """
    Apply transformations to the data.

    Parameters
    ----------
    df : DataFrame
        DataFrame containing the data
    column : str
        Column to apply transformations to

    Returns
    -------
    DataFrame
        DataFrame containing the data with transformations applied
    
    """
    base_name = column.rpartition('_')[0]
    
    try:
        # Attempt to bin the column without dropping duplicates
        df[f'{base_name}_binned'] = pd.qcut(df[column], 4, labels=False)
    except ValueError as e:
        print(f"Error in binning column '{column}': {e}. Retrying with duplicates dropped.")
        # Retry with duplicates dropped
        df[f'{base_name}_binned'] = pd.qcut(df[column], 4, labels=False, duplicates='drop')
    
    df[f'{base_name}_log'] = np.log10(df[column].replace(0, np.nan))
    df[f'{base_name}_reciprocal'] = 1 / df[column].replace(0, np.nan)
    df[f'{base_name}_squared'] = df[column] ** 2
    
    return df

In [13]:
# continuous data types
dtype = ['continuous_']
dat = ['protein_domain']
genepair_calcs = ['_number_overlapping_', '_percent_overlapping_', '_total_']
transformations = ['binned','log','noTF','reciprocal','squared']

In [14]:
feats = generate_feature_cols(dtype, dat, 0, 0, genepair_calcs, transformations)

In [15]:
feats

['continuous_protein_domain_number_overlapping_binned',
 'continuous_protein_domain_number_overlapping_log',
 'continuous_protein_domain_number_overlapping_noTF',
 'continuous_protein_domain_number_overlapping_reciprocal',
 'continuous_protein_domain_number_overlapping_squared',
 'continuous_protein_domain_percent_overlapping_binned',
 'continuous_protein_domain_percent_overlapping_log',
 'continuous_protein_domain_percent_overlapping_noTF',
 'continuous_protein_domain_percent_overlapping_reciprocal',
 'continuous_protein_domain_percent_overlapping_squared',
 'continuous_protein_domain_total_binned',
 'continuous_protein_domain_total_log',
 'continuous_protein_domain_total_noTF',
 'continuous_protein_domain_total_reciprocal',
 'continuous_protein_domain_total_squared']

In [16]:
pfam_calcs = genepair_calc_lst(feats)


In [17]:
pfam_calcs

['continuous_protein_domain_number_overlapping_noTF',
 'continuous_protein_domain_percent_overlapping_noTF',
 'continuous_protein_domain_total_noTF']

In [18]:
def create_data_feat_df (df, pfam_calcs, col1, col2, gene1, gene2, features):
    """
    Create a DataFrame with the features for the protein family data.

    Parameters
    ----------
    df : DataFrame
        DataFrame containing the data
    pfam_calcs : list
        List of gene pair calculations
    col1 : str
        Column name of the first gene
    col2 : str
        Column name of the second gene

    Returns
    -------
    DataFrame
        DataFrame containing the features for the protein family data
    
    """
    # Calculate the features
    df = calc_features(df, pfam_calcs, col1, col2)

    # Apply transformations to the features
    for calc in pfam_calcs:
        df = apply_transformations(df, calc)
    # drop k_pfam_1 and k_pfam_2 columns
    df = df.drop([col1, col2], axis=1)
    # write df to .txt file
    # df.to_csv(f'/home/brow1110/ara-kinase-prediction/protein_properties/pfam/_{gene1}_{gene2}_pfam_data.txt', sep='\t', index=False)
    
    df = df[[gene1, gene2]+features]
    # df.to_csv(f'/home/brow1110/ara-kinase-prediction/protein_properties/pfam/_{gene1}_{gene2}_pfam_data_features.txt', sep='\t', index=False)
    # df.to_csv('/home/seguraab/ara-kinase-prediction/data/2021_cusack_data/Dataset_4_Features/Dataset_4_features_pfam.txt', sep='\t', index=False)
    # df.to_csv('/home/seguraab/ara-kinase-prediction/data/Kinase_genes/features/TAIR10_kinases_features_pfam.txt', sep='\t', index=False)
    df.to_csv('/home/seguraab/ara-kinase-prediction/data/20250403_melissa_ara_data/features/20250403_melissa_ara_features_for_binary_clf_pfam.txt', sep='\t', index=False)
    return df

In [19]:
gene_pairs_df = create_data_feat_df(gene_pairs_df, pfam_calcs, 'k_pfam_1', 'k_pfam_2', 'gene1', 'gene2', feats)
# gene_pairs_df = pd.read_csv('/home/seguraab/ara-kinase-prediction/data/Kinase_genes/features/TAIR10_kinases_features_pfam.txt', sep='\t')
gene_pairs_df = pd.read_csv('/home/seguraab/ara-kinase-prediction/data/20250403_melissa_ara_data/features/20250403_melissa_ara_features_for_binary_clf_pfam.txt', sep='\t')
""" 
Note the below printout will appear if there are duplicate bin edges. Avoid this by dropping the following columns: 'continuous_protein_domain_number_overlapping_noTF', 'continuous_protein_domain_percent_overlapping_noTF'
 
Error in binning column 'continuous_protein_domain_number_overlapping_noTF': Bin edges must be unique: Index([0.0, 2.0, 2.0, 3.0, 8.0], dtype='float64', name='continuous_protein_domain_number_overlapping_noTF').
You can drop duplicate edges by setting the 'duplicates' kwarg. Retrying with duplicates dropped.
Error in binning column 'continuous_protein_domain_percent_overlapping_noTF': Bin edges must be unique: Index([0.0, 50.0, 80.0, 100.0, 100.0], dtype='float64', name='continuous_protein_domain_percent_overlapping_noTF').
You can drop duplicate edges by setting the 'duplicates' kwarg. Retrying with duplicates dropped.
"""

Error in binning column 'continuous_protein_domain_number_overlapping_noTF': Bin edges must be unique: Index([0.0, 0.0, 1.0, 2.0, 5.0], dtype='float64', name='continuous_protein_domain_number_overlapping_noTF').
You can drop duplicate edges by setting the 'duplicates' kwarg. Retrying with duplicates dropped.
Error in binning column 'continuous_protein_domain_percent_overlapping_noTF': Bin edges must be unique: Index([0.0, 100.0, 100.0, 100.0, 100.0], dtype='float64', name='continuous_protein_domain_percent_overlapping_noTF').
You can drop duplicate edges by setting the 'duplicates' kwarg. Retrying with duplicates dropped.
Error in binning column 'continuous_protein_domain_total_noTF': Bin edges must be unique: Index([0.0, 0.0, 1.0, 2.0, 6.0], dtype='float64', name='continuous_protein_domain_total_noTF').
You can drop duplicate edges by setting the 'duplicates' kwarg. Retrying with duplicates dropped.


" \nNote the below printout will appear if there are duplicate bin edges. Avoid this by dropping the following columns: 'continuous_protein_domain_number_overlapping_noTF', 'continuous_protein_domain_percent_overlapping_noTF'\n \nError in binning column 'continuous_protein_domain_number_overlapping_noTF': Bin edges must be unique: Index([0.0, 2.0, 2.0, 3.0, 8.0], dtype='float64', name='continuous_protein_domain_number_overlapping_noTF').\nYou can drop duplicate edges by setting the 'duplicates' kwarg. Retrying with duplicates dropped.\nError in binning column 'continuous_protein_domain_percent_overlapping_noTF': Bin edges must be unique: Index([0.0, 50.0, 80.0, 100.0, 100.0], dtype='float64', name='continuous_protein_domain_percent_overlapping_noTF').\nYou can drop duplicate edges by setting the 'duplicates' kwarg. Retrying with duplicates dropped.\n"

In [20]:
# keep this commented out
# gene_pairs_df = calc_features(gene_pairs_df, 
#                               pfam_calcs, 
#                               'k_pfam_1', 'k_pfam_2')
# columns to transform: num_overlap_pfam, total, percent_overlap_pfam

In [21]:
for col_nm in pfam_calcs:
    gene_pairs_df = apply_transformations(gene_pairs_df, col_nm)
    

Error in binning column 'continuous_protein_domain_number_overlapping_noTF': Bin edges must be unique: Index([0.0, 0.0, 1.0, 2.0, 5.0], dtype='float64', name='continuous_protein_domain_number_overlapping_noTF').
You can drop duplicate edges by setting the 'duplicates' kwarg. Retrying with duplicates dropped.
Error in binning column 'continuous_protein_domain_percent_overlapping_noTF': Bin edges must be unique: Index([0.0, 100.0, 100.0, 100.0, 100.0], dtype='float64', name='continuous_protein_domain_percent_overlapping_noTF').
You can drop duplicate edges by setting the 'duplicates' kwarg. Retrying with duplicates dropped.
Error in binning column 'continuous_protein_domain_total_noTF': Bin edges must be unique: Index([0.0, 0.0, 1.0, 2.0, 6.0], dtype='float64', name='continuous_protein_domain_total_noTF').
You can drop duplicate edges by setting the 'duplicates' kwarg. Retrying with duplicates dropped.


In [22]:
gene_pairs_df

Unnamed: 0,gene1,gene2,continuous_protein_domain_number_overlapping_binned,continuous_protein_domain_number_overlapping_log,continuous_protein_domain_number_overlapping_noTF,continuous_protein_domain_number_overlapping_reciprocal,continuous_protein_domain_number_overlapping_squared,continuous_protein_domain_percent_overlapping_binned,continuous_protein_domain_percent_overlapping_log,continuous_protein_domain_percent_overlapping_noTF,continuous_protein_domain_percent_overlapping_reciprocal,continuous_protein_domain_percent_overlapping_squared,continuous_protein_domain_total_binned,continuous_protein_domain_total_log,continuous_protein_domain_total_noTF,continuous_protein_domain_total_reciprocal,continuous_protein_domain_total_squared
0,AT2G03450,AT1G13900,0,,0,,0,,,,,,0,,0,,0
1,AT2G21380,AT4G39050,2,0.477121,3,0.333333,9,0.0,2.000000,100.0,0.010000,10000.0,2,0.477121,3,0.333333,9
2,AT5G07830,AT5G61250,0,0.000000,1,1.000000,1,0.0,2.000000,100.0,0.010000,10000.0,0,0.000000,1,1.000000,1
3,AT5G16480,AT3G02800,1,0.301030,2,0.500000,4,0.0,2.000000,100.0,0.010000,10000.0,1,0.301030,2,0.500000,4
4,AT3G22790,AT4G14760,0,0.000000,1,1.000000,1,0.0,2.000000,100.0,0.010000,10000.0,0,0.000000,1,1.000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
137,AT3G19280,AT1G49710,0,0.000000,1,1.000000,1,0.0,2.000000,100.0,0.010000,10000.0,0,0.000000,1,1.000000,1
138,AT1G15330,AT1G80090,0,,0,,0,,,,,,0,,0,,0
139,AT1G51880,AT3G21340,2,0.477121,3,0.333333,9,0.0,1.778151,60.0,0.016667,3600.0,2,0.698970,5,0.200000,25
140,AT2G47800,AT3G62700,1,0.301030,2,0.500000,4,0.0,2.000000,100.0,0.010000,10000.0,1,0.301030,2,0.500000,4


In [23]:
# output the dataframe to a csv file
# gene_pairs_df.to_csv(f"{data_dir}_pfam_properties_features.csv", index=False)
# gene_pairs_df.to_csv("/home/seguraab/ara-kinase-prediction/data/2021_cusack_data/Dataset_4_Features/Dataset_4_features_pfam_properties.txt", sep="\t", index=False)
# gene_pairs_df.to_csv("/home/seguraab/ara-kinase-prediction/data/Kinase_genes/features/TAIR10_kinases_features_pfam_properties.txt", sep="\t", index=False)
gene_pairs_df.to_csv("/home/seguraab/ara-kinase-prediction/data/20250403_melissa_ara_data/features/20250403_melissa_ara_features_for_binary_clf_pfam_properties.txt", sep="\t", index=False)