In [244]:
""" 
This notebook builds gene-pair features from continuous protein-property data.
The input files that hold the continuous protein properties are:
- aaLength.continuous.MLD_dictionary.json
- isoelectric_point_dictionary.json
"""

' \nThis code is for continuous data processing of specific protein properties data.\nThe protein properties with continuous data processing documents are: \n- aaLength.continuous.MLD_dictionary.json\n- isoelectric_point_dictionary.json\n\n\n'

In [281]:
import pandas as pd
import numpy as np
import os
import json

In [282]:
def generate_feature_cols(data_type, data, idx, gene_calcs_type, transformation_type):
    """
    Build the feature column names for one continuous protein property.

    Every name is the concatenation
    ``data_type[0] + data[idx] + calc + trans`` (each part passed through
    ``str``), produced for every calculation/transformation combination.
    The calculations form the outer loop and the transformations the inner
    loop, so all transformations of the first calculation come first.

    Parameters
    ----------
    data_type : list
        List of data types; only the first element is used as the prefix
        (e.g. ``'continuous_'``)
    data : list
        List of property names (e.g. AA length and pI)
    idx : int
        Position in ``data`` of the property to build the names for
    gene_calcs_type : list
        List of gene pair calculation labels
    transformation_type : list
        List of transformation (TF) labels applied to each gene pair
        calculation

    Returns
    -------
    list
        List of feature column names, of length
        ``len(gene_calcs_type) * len(transformation_type)``

    """
    columns = []
    for calc in gene_calcs_type:
        for trans in transformation_type:
            pieces = (data_type[0], data[idx], calc, trans)
            columns.append("".join(map(str, pieces)))
    return columns

In [283]:
# gene pair calc columns (original cols: avg, abs(difference), max, min, sum)
def genepair_calc_lst(lst):
    """
    Pick the entries at positions 2, 7, 12, 17 and 22 of a feature list.

    When ``lst`` comes from ``generate_feature_cols`` with five gene pair
    calculations and five transformations whose third entry is ``'noTF'``,
    these positions are the untransformed column of each calculation.

    Parameters
    ----------
    lst : list
        List of feature column names; must have at least 23 elements

    Returns
    -------
    list
        The five selected elements, in their original order

    """
    # Start at index 2 and step by 5: one pick per block of five names.
    return [lst[position] for position in range(2, 23, 5)]

In [284]:
# calc average, max, min, and difference of two columns
def calc_features(df, feature, col1, col2):
    """
    Add the five gene pair calculations of two columns to a DataFrame.

    The columns are written into ``df`` itself (the input is modified in
    place) in this order: average, absolute difference, maximum, minimum,
    and sum of ``col1`` and ``col2``.

    Parameters
    ----------
    df : DataFrame
        DataFrame containing the data
    feature : list
        Names of the five output columns, ordered as average, difference,
        max, min, total
    col1 : str
        Column 1 - specific protein property values for gene1
    col2 : str
        Column 2 - specific protein property values for gene2

    Returns
    -------
    DataFrame
        The same DataFrame object, now holding the calculated columns

    """
    # One operation per output column; the order matches ``feature``.
    pair_ops = (
        lambda first, second: (first + second) / 2,
        lambda first, second: np.abs(first - second),
        np.maximum,
        np.minimum,
        lambda first, second: first + second,
    )
    for position, combine in enumerate(pair_ops):
        df[feature[position]] = combine(df[col1], df[col2])
    return df

In [285]:
# Define a function to apply transformations
def apply_transformations(df, column):
    """
    Add binned, log, reciprocal and squared versions of one column.

    The new columns are written into ``df`` itself (the input is modified
    in place). Their names reuse everything before the last underscore of
    ``column`` and append ``_binned``, ``_log``, ``_reciprocal`` and
    ``_squared``.

    Parameters
    ----------
    df : DataFrame
        DataFrame containing the data
    column : str
        Column to apply transformations to

    Returns
    -------
    DataFrame
        The same DataFrame object, now holding the transformed columns

    """
    stem, _, _ = column.rpartition('_')
    transforms = {
        # quartile index; qcut bins are (] (exclusive - left, inclusive - right)
        'binned': lambda values: pd.qcut(values, 4, labels=False),
        # zeros become NaN so that log10(0) is never evaluated
        'log': lambda values: np.log10(values.replace(0, np.nan)),
        # zeros become NaN so that there is no division by zero
        'reciprocal': lambda values: 1 / values.replace(0, np.nan),
        'squared': lambda values: values ** 2,
    }
    for suffix, transform in transforms.items():
        df[f'{stem}_{suffix}'] = transform(df[column])
    return df

In [316]:
def create_data_feat_df(df, calc_list, col1_name, col2_name, gene1dat_name, gene2dat_name, feature_list):
    """
    Build the final feature table for one protein property.

    First the gene pair calculations are added with ``calc_features``, then
    every calculation column is expanded with ``apply_transformations``, and
    finally only the two identifier columns plus ``feature_list`` are kept.
    Both helpers write into ``df``, so the DataFrame passed in gains the
    intermediate columns as a side effect.

    Parameters
    ----------
    df : DataFrame
        DataFrame containing the data
    calc_list : list
        Names of the gene pair calculation columns to create and transform
    col1_name : str
        Column 1 name (kept in the result)
    col2_name : str
        Column 2 name (kept in the result)
    gene1dat_name : str
        Gene 1 data name (input column for the calculations)
    gene2dat_name : str
        Gene 2 data name (input column for the calculations)
    feature_list : list
        List of feature columns to keep, in output order

    Returns
    -------
    DataFrame
        DataFrame with ``col1_name``, ``col2_name`` and the feature columns

    """
    with_calcs = calc_features(df, calc_list, gene1dat_name, gene2dat_name)
    for calc_column in calc_list:
        with_calcs = apply_transformations(with_calcs, calc_column)
    kept_columns = [col1_name, col2_name] + feature_list
    return with_calcs[kept_columns]

In [187]:
""" 
CONTINUOUS 1: Amino Acid Length
Average pair
Difference (Absolute Value)
Maximum of pair
Minimum of pair
Total of pair

For each of these data representations they were transformed using: 
Bin (Quartiles) 
Log (Base 10)
None
Reciprocal
Squared

CONTINUOUS 2: Isoelectric Point
Average pair
Difference (Absolute Value)
Maximum of pair
Minimum of pair
Total of pair

For each of these data representations they were transformed using: 
Bin (Quartiles) 
Log (Base 10)
None
Reciprocal
Squared
"""

' \nCONTINUOUS 1: Amino Acid Length\nAverage pair\nDifference (Absolute Value)\nMaximum of pair\nMinimum of pair\nTotal of pair\n\nFor each of these data representations they were transformed using: \nBin (Quartiles) \nLog (Base 10)\nNone\nReciprocal\nSquared\n'

In [298]:
# Building blocks of the feature column names:
# <data type><property><gene pair calculation><transformation>
dtype = ['continuous_']
dat = [
    'AA_length',
    'isoelectric_point',
]
genepair_calcs = [
    '_average_',
    '_difference_',
    '_max_',
    '_min_',
    '_pair_total_',
]
# Order matters: genepair_calc_lst picks every fifth name starting at
# index 2, which is the position of 'noTF' in this list.
transformations = [
    'binned',
    'log',
    'noTF',
    'reciprocal',
    'squared',
]

In [299]:
# Continuous AA length features - every column name that will be produced
# from the aaLength.continuous.MLD_dictionary.json document (dat[0])
aa_len_feat = generate_feature_cols(
    data_type=dtype,
    data=dat,
    idx=0,
    gene_calcs_type=genepair_calcs,
    transformation_type=transformations,
)

# Continuous isoelectric point features - every column name that will be
# produced from the isoelectric_point_dictionary.json document (dat[1])
iso_feat = generate_feature_cols(
    data_type=dtype,
    data=dat,
    idx=1,
    gene_calcs_type=genepair_calcs,
    transformation_type=transformations,
)

In [300]:
# Names of the five untransformed (noTF) gene pair calculation columns
# for each property, taken from the full feature lists.
aa_len_calcs, iso_calcs = map(genepair_calc_lst, (aa_len_feat, iso_feat))

['continuous_AA_length_average_noTF',
 'continuous_AA_length_difference_noTF',
 'continuous_AA_length_max_noTF',
 'continuous_AA_length_min_noTF',
 'continuous_AA_length_pair_total_noTF']

In [301]:
# Folder with the continuous protein property files. The trailing slash is
# required by the cells that build output paths with data_dir + "<name>".
data_dir = "/home/brow1110/ara-kinase-prediction/protein_properties/"

# Kinase gene pairs: tab-separated table, read with its first row as header.
kinase_pairs = pd.read_csv(
    "/home/seguraab/ara-kinase-prediction/data/instances_dataset_1.txt",
    sep="\t",
)
kinase_pairs

In [326]:
# First and second column of the pair table as plain Python lists; row i of
# both lists describes the same gene pair.
kinase_pairs_1 = kinase_pairs.iloc[:, 0].tolist()
kinase_pairs_2 = kinase_pairs.iloc[:, 1].tolist()

['AT3G46420',
 'AT5G01820',
 'AT2G37050',
 'AT3G17840',
 'AT1G11410',
 'AT1G57700',
 'AT3G24660',
 'AT2G42290',
 'AT1G51800',
 'AT2G33170',
 'AT1G76360',
 'AT5G10930',
 'AT4G13260',
 'AT1G70460',
 'AT1G18160',
 'AT2G01210',
 'AT2G15300',
 'AT1G61490',
 'AT3G56760',
 'AT2G31500',
 'AT2G29250',
 'AT1G07570',
 'AT2G19210',
 'AT2G34650',
 'AT1G73670',
 'AT1G70410',
 'AT2G19470',
 'AT3G63260',
 'AT2G42290',
 'AT1G16270',
 'AT1G79250',
 'AT1G53700',
 'AT1G70110',
 'AT1G35670',
 'AT1G28440',
 'AT3G20190',
 'AT4G11480',
 'AT1G51880',
 'AT3G04530',
 'AT3G29160',
 'AT1G18150',
 'AT4G04700',
 'AT2G34180',
 'AT1G07560',
 'AT1G61370',
 'AT4G35600',
 'AT2G37050',
 'AT1G51880',
 'AT1G70530',
 'AT2G07020',
 'AT3G55450',
 'AT1G64210',
 'AT5G10530',
 'AT4G04960',
 'AT1G61440',
 'AT5G10020',
 'AT1G66930',
 'AT3G21340',
 'AT3G16030',
 'AT1G74330',
 'AT5G48940',
 'AT5G61570',
 'AT1G26150',
 'AT1G70740',
 'AT1G09970',
 'AT3G02880',
 'AT3G19100',
 'AT1G70740',
 'AT1G63500',
 'AT1G51800',
 'AT1G11350',
 'AT3G

In [304]:
# Location of the amino acid length dictionary inside data_dir
file_aa_len_path = os.path.join(
    data_dir, "aaLength.continuous.MLD_dictionary.json")

# Parse the JSON document; aa_len is later queried by gene ID with .get()
with open(file_aa_len_path) as f:
    aa_len = json.load(f)

In [321]:
# Location of the isoelectric point (pI) dictionary inside data_dir
file_iso_path = os.path.join(
    data_dir, "isoelectric_point_dictionary.json")

# Parse the JSON document; iso is later queried by gene ID with .get()
with open(file_iso_path) as f:
    iso = json.load(f)

In [305]:
# Look up the AA length of every gene in kinase_pairs_1 / kinase_pairs_2,
# keeping the pair order. dict.get yields None for a gene that is missing
# from the dictionary instead of raising KeyError.
aa_len_1 = list(map(aa_len.get, kinase_pairs_1))
aa_len_2 = list(map(aa_len.get, kinase_pairs_2))

In [322]:
# Look up the isoelectric point of every gene in kinase_pairs_1 /
# kinase_pairs_2, keeping the pair order. dict.get yields None for a gene
# that is missing from the dictionary instead of raising KeyError.
iso_1 = list(map(iso.get, kinase_pairs_1))
iso_2 = list(map(iso.get, kinase_pairs_2))

In [318]:
# Combine the gene IDs and their AA lengths into one table, one row per pair
aa_len_df = pd.DataFrame(
    data={
        'gene1': kinase_pairs_1,
        'gene2': kinase_pairs_2,
        'aa_len_1': aa_len_1,
        'aa_len_2': aa_len_2,
    }
)

Unnamed: 0,gene1,gene2,aa_len_1,aa_len_2
0,AT3G46420,AT4G20450,,898.0
1,AT5G01820,AT5G57630,442.0,416.0
2,AT2G37050,AT5G59660,934.0,852.0
3,AT3G17840,AT3G51740,647.0,836.0
4,AT1G11410,AT4G23190,845.0,667.0
...,...,...,...,...
10245,AT1G23380,AT1G70510,329.0,310.0
10246,AT1G26790,AT1G69570,396.0,399.0
10247,AT1G16060,AT1G79700,345.0,313.0
10248,AT1G21410,AT1G77000,360.0,360.0


In [323]:
# Combine the gene IDs and their isoelectric points into one table, one row
# per pair
iso_df = pd.DataFrame(
    data={
        'gene1': kinase_pairs_1,
        'gene2': kinase_pairs_2,
        'iso_1': iso_1,
        'iso_2': iso_2,
    }
)

Unnamed: 0,gene1,gene2,iso_1,iso_2
0,AT3G46420,AT4G20450,7.0608,5.4427
1,AT5G01820,AT5G57630,8.2134,8.7389
2,AT2G37050,AT5G59660,6.5619,8.4164
3,AT3G17840,AT3G51740,6.5263,7.7491
4,AT1G11410,AT4G23190,8.0376,7.6984
...,...,...,...,...
10245,AT1G23380,AT1G70510,4.6703,4.6519
10246,AT1G26790,AT1G69570,7.2811,8.8242
10247,AT1G16060,AT1G79700,5.7015,7.5952
10248,AT1G21410,AT1G77000,7.3251,7.5001


In [319]:
# Replace the raw AA length table with its feature table (gene IDs plus the
# aa_len_feat columns). The result no longer has 'aa_len_1'/'aa_len_2', so
# the cell that builds aa_len_df must be re-run before running this again.
aa_len_df = create_data_feat_df(
    df=aa_len_df,
    calc_list=aa_len_calcs,
    col1_name='gene1',
    col2_name='gene2',
    gene1dat_name='aa_len_1',
    gene2dat_name='aa_len_2',
    feature_list=aa_len_feat,
)

In [324]:
# Replace the raw isoelectric point table with its feature table (gene IDs
# plus the iso_feat columns). The result no longer has 'iso_1'/'iso_2', so
# the cell that builds iso_df must be re-run before running this again.
iso_df = create_data_feat_df(
    df=iso_df,
    calc_list=iso_calcs,
    col1_name='gene1',
    col2_name='gene2',
    gene1dat_name='iso_1',
    gene2dat_name='iso_2',
    feature_list=iso_feat,
)

In [217]:
# Write the AA length feature table to a CSV in data_dir (row index omitted).
# The path is built with os.path.join, as for the input files, so it does not
# depend on data_dir ending with a path separator.
aa_len_df.to_csv(os.path.join(data_dir, "aa_len_features.csv"), index=False)

In [125]:
# Write the isoelectric point feature table to a CSV in data_dir (row index
# omitted). The path is built with os.path.join, as for the input files, so
# it does not depend on data_dir ending with a path separator.
iso_df.to_csv(os.path.join(data_dir, "iso_pt_features.csv"), index=False)