# Prepare feature engineering checklists for the 6 feature categories:
- 01 Functional Annotations  
- 02 Evolutionary Properties  
- 03 Protein sequence properties  
- 04 Gene expression  
- 05 Epigenetics  
- 06 Network properties  
The feature data come from Cusack et al. (2021, MBE) and are located
at /home/seguraab/ara-kinase-prediction/data/2021_cusack_data/21_arabidopsis_redundancy

In [1]:
import os
import pandas as pd
import numpy as np

In [1]:
# Load Cusack's feature list ('Supplemental table 1') from the supplemental workbook
supp_workbook = '/home/seguraab/ara-kinase-prediction/data/2021_cusack_data/Supplemental_tables_revision.xlsx'
columns_to_keep = [
    'Feature category', 'Feature name', 'ML model feature name',
    'Feature type', 'Calculation for gene pair', 'Data processing method',
    'Transformation', 'Description', 'Source',
]
# skipfooter=22 leaves out the last 22 rows of the sheet
master_list = pd.read_excel(supp_workbook, sheet_name='Supplemental table 1',
                            skipfooter=22)[columns_to_keep]

# Add a whitespace-stripped, lowercase copy of the feature name right after 'Feature name'
lowercase_names = master_list['Feature name'].str.strip().str.lower()
master_list.insert(2, 'NEW Feature name', lowercase_names)
# master_list.to_csv('/home/seguraab/ara-kinase-prediction/data/2021_cusack_data/Supplemental_table_1.csv', index=False)
master_list

Unnamed: 0,Feature category,Feature name,NEW Feature name,ML model feature name,Feature type,Calculation for gene pair,Data processing method,Transformation,Description,Source
0,Epigenetic modifications,Gene body methylation,gene body methylation,continuous_bodyMethylation.MLD_number_in_pair_log,Continuous,Number in pair,Binary,Log,A gene was called as gene body methylated if t...,Lloyd et al. 2015
1,Epigenetic modifications,Gene body methylation,gene body methylation,continuous_bodyMethylation.MLD_number_in_pair_...,Continuous,Number in pair,Binary,,A gene was called as gene body methylated if t...,Lloyd et al. 2015
2,Epigenetic modifications,Gene body methylation,gene body methylation,continuous_bodyMethylation.MLD_number_in_pair_...,Continuous,Number in pair,Binary,Reciprocal,A gene was called as gene body methylated if t...,Lloyd et al. 2015
3,Epigenetic modifications,Gene body methylation,gene body methylation,continuous_bodyMethylation.MLD_number_in_pair_...,Continuous,Number in pair,Binary,Squared,A gene was called as gene body methylated if t...,Lloyd et al. 2015
4,Epigenetic modifications,Chromatin accessibility,chromatin accessibility,continuous_chromatin_accessibility_number_in_p...,Continuous,Number in pair,Binary,Log,Whether or not gene contained a Dnase peak site,Sullivan et al. 2014
...,...,...,...,...,...,...,...,...,...,...
4111,Protein properties,Protein domain annotations,protein domain annotations,continuous_protein_domain_total_binned,Continuous,Total #,Categorical,Binned,All protein domain annotations for a gene.,Finn et al. 2016
4112,Protein properties,Protein domain annotations,protein domain annotations,continuous_protein_domain_total_log,Continuous,Total #,Categorical,Log,All protein domain annotations for a gene.,Finn et al. 2016
4113,Protein properties,Protein domain annotations,protein domain annotations,continuous_protein_domain_total_noTF,Continuous,Total #,Categorical,,All protein domain annotations for a gene.,Finn et al. 2016
4114,Protein properties,Protein domain annotations,protein domain annotations,continuous_protein_domain_total_reciprocal,Continuous,Total #,Categorical,Reciprocal,All protein domain annotations for a gene.,Finn et al. 2016


## Read in the master list with corrected 'Feature name' values
July 29 - Functional Annotation feature names were corrected, see `01_functional_annotation features` section  
July 30 - 

### 01_functional_annotation features (July 29)

In [2]:
# Reload the feature list from the manually corrected CSV; the corrected
# names live in its 'NEW Feature name' column
corrected_master_csv = '/home/seguraab/ara-kinase-prediction/data/2021_cusack_data/Supplemental_table_1_corrected_KSA_20240729.csv'
master_list = pd.read_csv(corrected_master_csv)

In [3]:
# Get all the file paths under the AraCyc pathway directory
path = '/home/seguraab/ara-kinase-prediction/data/2021_cusack_data/21_arabidopsis_redundancy/01_functional_annotation/AraCyc_pathways/AraCyc'
files = [os.path.join(dirpath, filename)
         for dirpath, _dirnames, filenames in os.walk(path)
         for filename in filenames]
# len(files) was 608 when this cell was run

# Create a dataframe with the file paths and feature names.
# Each file path is repeated 4 times since there are 4 data transformations per feature.
file_paths = pd.DataFrame({'File path': np.repeat(files, 4)}).sort_values('File path')

# Derive the feature name from the JSON file name itself (basename) rather than
# from a fixed position in the split path, so the result does not depend on how
# deep the file sits in the directory tree. Names are lowercased to match the
# master list's 'NEW Feature name' column.
feat_names = file_paths['File path'].map(
    lambda full_path: os.path.basename(full_path)
    .split('_dictionary.json')[0]
    .replace('_', ' ')
    .lower())
file_paths['Feature name'] = feat_names

# Map the file paths to the functional-annotation rows of the master list.
# how='left' keeps files that have no match in the master list (their master
# list columns are NaN), so they can be inspected afterwards.
is_func_anno = master_list['Feature category'] == 'Functional annotation'
func_anno_paths = file_paths.merge(master_list.loc[is_func_anno],
    left_on='Feature name', right_on='NEW Feature name',
    how='left', suffixes=('', '_y')) # merge to master list
func_anno_paths = func_anno_paths.drop(columns=['Feature name_y'])

# Drop duplicates created by merge (repeated file rows x repeated master rows)
func_anno_paths = func_anno_paths.drop_duplicates('ML model feature name').reset_index(drop=True)
func_anno_paths

Unnamed: 0,File path,Feature name,Feature category,NEW Feature name,ML model feature name,Feature type,Calculation for gene pair,Data processing method,Transformation,Description,Source
0,/home/seguraab/ara-kinase-prediction/data/2021...,"1,3,5-trimethoxybenzene biosynthesis",Functional annotation,"1,3,5-trimethoxybenzene biosynthesis",continuous_1.3.5.trimethoxybenzene_biosynthesi...,Continuous,Number in pair,Binary,Log,Gene product involved in this pathway or funct...,AraCyc v.15; Mueller et al. 2003
1,/home/seguraab/ara-kinase-prediction/data/2021...,"1,3,5-trimethoxybenzene biosynthesis",Functional annotation,"1,3,5-trimethoxybenzene biosynthesis",continuous_1.3.5.trimethoxybenzene_biosynthesi...,Continuous,Number in pair,Binary,,Gene product involved in this pathway or funct...,AraCyc v.15; Mueller et al. 2003
2,/home/seguraab/ara-kinase-prediction/data/2021...,"1,3,5-trimethoxybenzene biosynthesis",Functional annotation,"1,3,5-trimethoxybenzene biosynthesis",continuous_1.3.5.trimethoxybenzene_biosynthesi...,Continuous,Number in pair,Binary,Reciprocal,Gene product involved in this pathway or funct...,AraCyc v.15; Mueller et al. 2003
3,/home/seguraab/ara-kinase-prediction/data/2021...,"1,3,5-trimethoxybenzene biosynthesis",Functional annotation,"1,3,5-trimethoxybenzene biosynthesis",continuous_1.3.5.trimethoxybenzene_biosynthesi...,Continuous,Number in pair,Binary,Squared,Gene product involved in this pathway or funct...,AraCyc v.15; Mueller et al. 2003
4,/home/seguraab/ara-kinase-prediction/data/2021...,"1,4-dihydroxy-2-naphthoate biosynthesis ii plants",Functional annotation,"1,4-dihydroxy-2-naphthoate biosynthesis ii plants",continuous_1.4.dihydroxy.2.naphthoate_biosynth...,Continuous,Number in pair,Binary,Log,Gene product involved in this pathway or funct...,AraCyc v.15; Mueller et al. 2003
...,...,...,...,...,...,...,...,...,...,...,...
2425,/home/seguraab/ara-kinase-prediction/data/2021...,"zeaxanthin, antheraxanthin and violaxanthin in...",Functional annotation,"zeaxanthin, antheraxanthin and violaxanthin in...",continuous_zeaxanthin._antheraxanthin_and_viol...,Continuous,Number in pair,Binary,Squared,Gene product involved in this pathway or funct...,AraCyc v.15; Mueller et al. 2003
2426,/home/seguraab/ara-kinase-prediction/data/2021...,zeaxanthin biosynthesis,Functional annotation,zeaxanthin biosynthesis,continuous_zeaxanthin_biosynthesis_number_in_p...,Continuous,Number in pair,Binary,Log,Gene product involved in this pathway or funct...,AraCyc v.15; Mueller et al. 2003
2427,/home/seguraab/ara-kinase-prediction/data/2021...,zeaxanthin biosynthesis,Functional annotation,zeaxanthin biosynthesis,continuous_zeaxanthin_biosynthesis_number_in_p...,Continuous,Number in pair,Binary,,Gene product involved in this pathway or funct...,AraCyc v.15; Mueller et al. 2003
2428,/home/seguraab/ara-kinase-prediction/data/2021...,zeaxanthin biosynthesis,Functional annotation,zeaxanthin biosynthesis,continuous_zeaxanthin_biosynthesis_number_in_p...,Continuous,Number in pair,Binary,Reciprocal,Gene product involved in this pathway or funct...,AraCyc v.15; Mueller et al. 2003


In [4]:
# Check for missing data: count the NaN values in each column of the merged table
func_anno_paths.isna().sum()
# For reference, these were the counts when the original (uncorrected) master
# list file from the Cusack 2021 paper was used:
# File path                       0
# Feature name                    0
# Feature category               96
# ML model feature name          96
# Feature type                   96
# Calculation for gene pair      96
# Data processing method         96
# Transformation               2436
# Description                    96
# Source                         96
# dtype: int64

File path                      0
Feature name                   0
Feature category               1
NEW Feature name               1
ML model feature name          1
Feature type                   1
Calculation for gene pair      1
Data processing method         1
Transformation               608
Description                    1
Source                         1
dtype: int64

In [5]:
# Manually check that these features were not included in the master list:
# list the unique feature names whose rows have no 'ML model feature name'
func_anno_paths.loc[func_anno_paths['ML model feature name'].isna(),'Feature name'].unique()

# Note: There are 96 rows with missing data in the original master list file from Cusack 2021 paper
# Some of these have (), sub2sub, sub3sub, sub12sub, etc. in the master list. I 
# modified the file Supplemental_table_1_corrected_KSA_20240729.csv to correct 
# the errors. See 'NEW Feature name' column in the master list file for corrected 
# feature names.
#
# array(['1d-myo-inositol hexakisphosphate biosynthesis v from ins1,3,4p3', # not found in master list, thus will not be included as a feature
#        '3e-4,8-dimethylnona-1,3,7-triene biosynthesis',                   # modified '(3e)' to '3e' in Supplemental_table_1_corrected_KSA_20240729.csv
#        '4-methyl-5beta-hydroxyethylthiazole salvage yeast',               # modified '5beta' to '5-beta' in Supplemental_table_1_corrected_KSA_20240729.csv
#        'cosub2sub fixation into oxaloacetate anapleurotic',               # modified 'co2 fixation into oxaloacetate anapleurotic' to 'cosub2sub fixation into oxaloacetate anapleurotic' in Supplemental_table_1_corrected_KSA_20240729.csv
#        'e,e-4,8,12-trimethyltrideca-1,3,7,11-tetraene biosynthesis',      # modified '(e,e)' to 'e,e' in Supplemental_table_1_corrected_KSA_20240729.csv
#        'gasub12sub biosynthesis',                                         # modified 'ga12' to 'gasub12sub' in Supplemental_table_1_corrected_KSA_20240729.csv
#        'kdo transfer to lipid ivsubasub i',                               # modified 'iva' to 'ivsubasub' in Supplemental_table_1_corrected_KSA_20240729.csv
#        'l-nsupdeltasup-acetylornithine biosynthesis',                     # modified 'ndelta' to 'nsupdeltasup' in Supplemental_table_1_corrected_KSA_20240729.csv
#        'nadnadh phosphorylation and dephosphorylation',                   # modified 'nad/nadh' to 'nadnadh' in Supplemental_table_1_corrected_KSA_20240729.csv
#        'fatty acid beta-oxidation v unsaturated, odd number, di-isomerase-dependent', # modified '(unsaturated, odd number)' to 'unsaturated, odd number'
#        'formate oxidation to cosub2sub',                                  # modified 'co2' to 'cosub2sub' in Supplemental_table_1_corrected_KSA_20240729.csv
#        'gibberellin biosynthesis iii early c-13 hydroxylation',           # modified '(early c-13 hydroxylation)' to 'early c-13 hydroxylation' in Supplemental_table_1_corrected_KSA_20240729.csv
#        'gibberellin biosynthesis ii early c-3 hydroxylation',             # modified '(early c-3 hydroxylation)' to 'early c-3 hydroxylation' in Supplemental_table_1_corrected_KSA_20240729.csv
#        'gibberellin biosynthesis i non c-3, non c-13 hydroxylation',      # modified '(non c-3, non c-13 hydroxylation)' to 'non c-3, non c-13 hydroxylation' in Supplemental_table_1_corrected_KSA_20240729.csv
#        'gibberellin inactivation i 2beta-hydroxylation',                  # modified '(2beta-hydroxylation)' to '2beta-hydroxylation' in Supplemental_table_1_corrected_KSA_20240729.csv
#        'lipid-dependent phytate biosynthesis ii via ins1,3,4psub3sub',    # modified 'ins(1,3,4)p3' to 'ins1,3,4psub3sub' in Supplemental_table_1_corrected_KSA_20240729.csv
#        'lipid-dependent phytate biosynthesis i via ins1,4,5psub3sub',     # modified 'ins(1,4,5)p3' to 'ins1,4,5psub3sub' in Supplemental_table_1_corrected_KSA_20240729.csv
#        'lipid ivsubasub biosynthesis',                                    # modified 'iva' to 'ivsubasub' in Supplemental_table_1_corrected_KSA_20240729.csv
#        'pyruvate decarboxylation to acetyl coa',                          # modified 'acetyl-coa' to 'acetyl coa' in Supplemental_table_1_corrected_KSA_20240729.csv
#        'superpathay of heme biosynthesis from glutamate',                 # modified 'superpathway' to 'superpathay' in Supplemental_table_1_corrected_KSA_20240729.csv
#        'superpathway of gibberellin gasub12sub biosynthesis',             # modified 'ga12' to 'gasub12sub' in Supplemental_table_1_corrected_KSA_20240729.csv
#        'superpathway of pantothenate and coenzymea biosynthesis',         # modified 'coenzyme a' to 'coenzymea' in Supplemental_table_1_corrected_KSA_20240729.csv
#        'trans, trans-farnesyl diphosphate biosynthesis',                  # modified 'trans-trans' to 'trans, trans' in Supplemental_table_1_corrected_KSA_20240729.csv
#        'traumatin and z-3-hexen-1-yl acetate biosynthesis'], dtype=object)# modified '(z)' to 'z' in Supplemental_table_1_corrected_KSA_20240729.csv

array(['1d-myo-inositol hexakisphosphate biosynthesis v from ins1,3,4p3'],
      dtype=object)

In [6]:
# Make the checklist file
# Missing transformations are written out as the literal string "None"
func_anno_paths['Transformation'] = func_anno_paths['Transformation'].fillna('None')
# Put two empty tracking columns at the front of the table
for position, tracking_column in enumerate(('Created by', 'Date created')):
    func_anno_paths.insert(position, tracking_column, '')
# func_anno_paths.to_csv('/home/seguraab/ara-kinase-prediction/data/2021_cusack_data/01_functional_annotation_feature_list.csv', index=False)
func_anno_paths

Unnamed: 0,Created by,Date created,File path,Feature name,Feature category,NEW Feature name,ML model feature name,Feature type,Calculation for gene pair,Data processing method,Transformation,Description,Source
0,,,/home/seguraab/ara-kinase-prediction/data/2021...,"1,3,5-trimethoxybenzene biosynthesis",Functional annotation,"1,3,5-trimethoxybenzene biosynthesis",continuous_1.3.5.trimethoxybenzene_biosynthesi...,Continuous,Number in pair,Binary,Log,Gene product involved in this pathway or funct...,AraCyc v.15; Mueller et al. 2003
1,,,/home/seguraab/ara-kinase-prediction/data/2021...,"1,3,5-trimethoxybenzene biosynthesis",Functional annotation,"1,3,5-trimethoxybenzene biosynthesis",continuous_1.3.5.trimethoxybenzene_biosynthesi...,Continuous,Number in pair,Binary,,Gene product involved in this pathway or funct...,AraCyc v.15; Mueller et al. 2003
2,,,/home/seguraab/ara-kinase-prediction/data/2021...,"1,3,5-trimethoxybenzene biosynthesis",Functional annotation,"1,3,5-trimethoxybenzene biosynthesis",continuous_1.3.5.trimethoxybenzene_biosynthesi...,Continuous,Number in pair,Binary,Reciprocal,Gene product involved in this pathway or funct...,AraCyc v.15; Mueller et al. 2003
3,,,/home/seguraab/ara-kinase-prediction/data/2021...,"1,3,5-trimethoxybenzene biosynthesis",Functional annotation,"1,3,5-trimethoxybenzene biosynthesis",continuous_1.3.5.trimethoxybenzene_biosynthesi...,Continuous,Number in pair,Binary,Squared,Gene product involved in this pathway or funct...,AraCyc v.15; Mueller et al. 2003
4,,,/home/seguraab/ara-kinase-prediction/data/2021...,"1,4-dihydroxy-2-naphthoate biosynthesis ii plants",Functional annotation,"1,4-dihydroxy-2-naphthoate biosynthesis ii plants",continuous_1.4.dihydroxy.2.naphthoate_biosynth...,Continuous,Number in pair,Binary,Log,Gene product involved in this pathway or funct...,AraCyc v.15; Mueller et al. 2003
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2425,,,/home/seguraab/ara-kinase-prediction/data/2021...,"zeaxanthin, antheraxanthin and violaxanthin in...",Functional annotation,"zeaxanthin, antheraxanthin and violaxanthin in...",continuous_zeaxanthin._antheraxanthin_and_viol...,Continuous,Number in pair,Binary,Squared,Gene product involved in this pathway or funct...,AraCyc v.15; Mueller et al. 2003
2426,,,/home/seguraab/ara-kinase-prediction/data/2021...,zeaxanthin biosynthesis,Functional annotation,zeaxanthin biosynthesis,continuous_zeaxanthin_biosynthesis_number_in_p...,Continuous,Number in pair,Binary,Log,Gene product involved in this pathway or funct...,AraCyc v.15; Mueller et al. 2003
2427,,,/home/seguraab/ara-kinase-prediction/data/2021...,zeaxanthin biosynthesis,Functional annotation,zeaxanthin biosynthesis,continuous_zeaxanthin_biosynthesis_number_in_p...,Continuous,Number in pair,Binary,,Gene product involved in this pathway or funct...,AraCyc v.15; Mueller et al. 2003
2428,,,/home/seguraab/ara-kinase-prediction/data/2021...,zeaxanthin biosynthesis,Functional annotation,zeaxanthin biosynthesis,continuous_zeaxanthin_biosynthesis_number_in_p...,Continuous,Number in pair,Binary,Reciprocal,Gene product involved in this pathway or funct...,AraCyc v.15; Mueller et al. 2003


### 02_evolutionary_properties features

In [None]:
master_list = pd.read_csv('/home/seguraab/ara-kinase-prediction/data/2021_cusack_data/Supplemental_table_1_corrected_KSA_20240729.csv')

# Subset the master list to get the evolutionary properties
is_evolutionary = master_list['Feature category'] == 'Evolutionary properties'
evo_prop_df = master_list.loc[is_evolutionary, :]

# JSON file paths for evolutionary properties
# repeat these 25
gene_family_size_json = '/home/seguraab/ara-kinase-prediction/data/2021_cusack_data/21_arabidopsis_redundancy/02_evolutionary_properties/gene_family_size/gene_family_size_dictionary.json'
lethality_json = '/home/seguraab/ara-kinase-prediction/data/2021_cusack_data/21_arabidopsis_redundancy/02_evolutionary_properties/lethality_score/lethality_dict_121817.json'
retention_rate_json = '/home/seguraab/ara-kinase-prediction/data/2021_cusack_data/21_arabidopsis_redundancy/02_evolutionary_properties/retention_rate/Retention_rate_dictionary.json'
# Not included yet:
#   'ka/ks', # ask Thilanka for a script to calculate these
#   'ka',
#   'ks',
file_paths = [gene_family_size_json, lethality_json, retention_rate_json]
# Append five more copies of the lethality score path
file_paths.extend(np.repeat(lethality_json, 5))

### 03_protein_sequence_properties features

In [None]:
master_list = pd.read_csv('/home/seguraab/ara-kinase-prediction/data/2021_cusack_data/Supplemental_table_1_corrected_KSA_20240729.csv')

# Subset the master list to get the protein properties.
# .copy() makes this an independent DataFrame, so assigning into it later with
# .loc[...] = ... does not raise SettingWithCopyWarning.
prot_prop_df = master_list.loc[master_list['Feature category']=='Protein properties',:].copy()

# JSON file paths for protein properties.
# Each path is repeated a fixed number of times (25, 4 or 15); presumably this is
# the number of master-list rows for that feature -- TODO confirm.
paths_repeated_25 = ['/home/seguraab/ara-kinase-prediction/data/2021_cusack_data/21_arabidopsis_redundancy/03_protein_sequence_properties/AA_length/aaLength.continuous.MLD_dictionary.json',
                     '/home/seguraab/ara-kinase-prediction/data/2021_cusack_data/21_arabidopsis_redundancy/03_protein_sequence_properties/isoelectric_point/isoelectric_point_dictionary.json']
paths_repeated_4 = ['/home/seguraab/ara-kinase-prediction/data/2021_cusack_data/21_arabidopsis_redundancy/03_protein_sequence_properties/acetylation/acetylation_dict.json',
                    '/home/seguraab/ara-kinase-prediction/data/2021_cusack_data/21_arabidopsis_redundancy/03_protein_sequence_properties/deamination/deamination_dict.json',
                    '/home/seguraab/ara-kinase-prediction/data/2021_cusack_data/21_arabidopsis_redundancy/03_protein_sequence_properties/formylation/formylation_dict.json',
                    '/home/seguraab/ara-kinase-prediction/data/2021_cusack_data/21_arabidopsis_redundancy/03_protein_sequence_properties/hydroxylation/hydroxylation_dict.json',
                    '/home/seguraab/ara-kinase-prediction/data/2021_cusack_data/21_arabidopsis_redundancy/03_protein_sequence_properties/myristoylation/myristoylation_dict.json',
                    '/home/seguraab/ara-kinase-prediction/data/2021_cusack_data/21_arabidopsis_redundancy/03_protein_sequence_properties/oxidation/oxidation_dict.json',
                    '/home/seguraab/ara-kinase-prediction/data/2021_cusack_data/21_arabidopsis_redundancy/03_protein_sequence_properties/propionylation/propionylation_dict.json']
pfam_path = '/home/seguraab/ara-kinase-prediction/data/2021_cusack_data/21_arabidopsis_redundancy/03_protein_sequence_properties/protein_domain/pfam'

file_paths = (list(np.repeat(paths_repeated_25, 25))
              + list(np.repeat(paths_repeated_4, 4))
              + list(np.repeat(pfam_path, 15)))

# 2*25 + 7*4 + 15 = 93 file-path rows
print(len(file_paths), prot_prop_df.shape) # 93 features

In [None]:
# Create a dataframe with the file paths and feature names
file_paths = pd.DataFrame(file_paths).sort_values(0)
# The feature name is the path component at index 9 with the '_dict.json' /
# '_dictionary.json' ending removed, underscores turned into spaces, lowercased
feat_names = file_paths[0].map(
    lambda full_path: full_path.split('/')[9]
    .split('_dict.json')[0]
    .replace('_', ' ')
    .replace(' dictionary.json', '')
    .lower())
file_paths.columns = ['File path']
file_paths['Feature name'] = feat_names

# Map the file paths to the master list of features.
# Two master-list features are pointed at the names derived from their files.
names_from_files = {'AA length': 'aalength.continuous.mld',
                    'Protein domain annotations': 'pfam'}
for master_name, name_from_file in names_from_files.items():
    prot_prop_df.loc[prot_prop_df['Feature name']==master_name, 'NEW Feature name'] = name_from_file

func_anno_paths = file_paths.merge(
    prot_prop_df,
    left_on='Feature name',
    right_on='NEW Feature name',
    how='left',
    suffixes=('', '_y'),
) # merge to master list
# Discard the name parsed from the file; the master list's name stays as 'Feature name_y'
func_anno_paths = func_anno_paths.drop(columns=['Feature name'])

# Drop duplicates created by merge
func_anno_paths = func_anno_paths.drop_duplicates('ML model feature name').reset_index(drop=True)
func_anno_paths

In [None]:
# Make the checklist file
# Restore the master list's feature-name column to its plain name
func_anno_paths = func_anno_paths.rename(columns={'Feature name_y':'Feature name'})
# Missing transformations are written out as the literal string "None"
func_anno_paths['Transformation'] = func_anno_paths['Transformation'].fillna('None')
# Put two empty tracking columns at the front of the table
for position, tracking_column in enumerate(('Created by', 'Date created')):
    func_anno_paths.insert(position, tracking_column, '')
# func_anno_paths.to_csv('/home/seguraab/ara-kinase-prediction/data/2021_cusack_data/03_protein_sequence_properties_feature_list.csv', index=False)
func_anno_paths

### 04_gene_expression features

In [None]:
master_list = pd.read_csv('/home/seguraab/ara-kinase-prediction/data/2021_cusack_data/Supplemental_table_1_corrected_KSA_20240729.csv')

# Subset the master list to get the gene expression features.
# .copy() makes this an independent DataFrame, so adding/overwriting columns on
# it later does not raise SettingWithCopyWarning.
gene_expr_df = master_list.loc[master_list['Feature category']=='Gene expression',:].copy()
print(gene_expr_df.shape) # 450

# Get all the JSON file paths. The list starts with the Hsu data dictionary,
# which is stored under 03_protein_sequence_properties rather than 04_gene_expression.
files = ['/home/seguraab/ara-kinase-prediction/data/2021_cusack_data/21_arabidopsis_redundancy/03_protein_sequence_properties/polysome_data/Hsu_data_dict.json']
path = '/home/seguraab/ara-kinase-prediction/data/2021_cusack_data/21_arabidopsis_redundancy/04_gene_expression'
for dirpath, _dirnames, filenames in os.walk(path):
    # some of the JSONs we want to keep are found in the probably_junk
    # folder, hence I did not exclude the folder from the search. There
    # will be extra files though, so will need to filter them out later.
    files.extend(os.path.join(dirpath, filename)
                 for filename in filenames if filename.endswith('.json'))

len(files) # 22

In [None]:
# Create a dataframe with the file paths and feature names
# sorted(files)[:1] is the alphabetically first path (expected to be the Hsu data file)
hsu_rows = list(np.repeat(sorted(files)[:1], 50)) # Hsu 2016 data
other_rows = list(np.repeat(files[1:], 25)) # all other features
file_paths = pd.DataFrame(hsu_rows + other_rows).sort_values(0)
# The feature name is the file name with the '_dict.json' / '_dictionary.json'
# ending removed, underscores turned into spaces, lowercased
feat_names = file_paths[0].map(
    lambda full_path: full_path.split('/')[-1]
    .split('_dict.json')[0]
    .replace('_', ' ')
    .replace(' dictionary.json', '')
    .lower())
file_paths.columns = ['File path']
file_paths['Feature name'] = feat_names

# Map the file paths to the master list of features
# Master-list feature name -> name derived from the corresponding JSON file
feat_map = {
    'Abiotic stress differential expression breadth, root': 'abiotic root breadth up down',
    'Abiotic stress downregulated expression breadth, root': 'abiotic-root expr breadth downonly',
    'Abiotic stress upregulated expression breadth, root': 'abiotic-root expr breadth uponly',
    'Abiotic stress differential expression breadth, shoot': 'abiotic shoot breadth up down',
    'Abiotic stress downregulated expression breadth, shoot': 'abiotic-shoot expr breadth downonly',
    'Abiotic stress upregulated expression breadth, shoot': 'abiotic-shoot expr breadth uponly',
    'Biotic stress differential expression breadth': 'biotic breadth up down',
    'Biotic stress downregulated expression breadth': 'biotic expr breadth downonly',
    'Biotic stress upregulated expression breadth': 'biotic expr breadth uponly',
    'Hormone treatment differential expression breadth ': 'hormone breadth up down',
    'Hormone treatment downregulated expression breadth ': 'hormone expr breadth downonly',
    'Hormone treatment upregulated expression breadth ': 'hormone expr breadth uponly',
    'Ribosome occupancy': 'hsu data',
    'RNA-seq expression': 'hsu data',
}

# Names not listed in feat_map are kept as they are and only lowercased
gene_expr_df['NEW Feature name'] = gene_expr_df['Feature name'].replace(feat_map).str.lower()

# how='right' keeps every master-list row, including those without a matching file
func_anno_paths = file_paths.merge(
    gene_expr_df,
    left_on='Feature name',
    right_on='NEW Feature name',
    how='right',
    suffixes=('', '_y'),
) # merge to master list
# Keep the master list's feature name under the plain 'Feature name' label
func_anno_paths = func_anno_paths.drop(columns=['Feature name'])
func_anno_paths = func_anno_paths.rename(columns={'Feature name_y':'Feature name'})
# Move 'Feature name' to the third column position
column_order = [label for label in func_anno_paths.columns if label != 'Feature name']
column_order.insert(2, 'Feature name')
func_anno_paths = func_anno_paths.reindex(columns=column_order)

# Drop duplicates created by merge
func_anno_paths = func_anno_paths.drop_duplicates('ML model feature name').reset_index(drop=True)
func_anno_paths

In [None]:
# Make the checklist file
# Missing transformations are written out as the literal string "None"
func_anno_paths['Transformation'] = func_anno_paths['Transformation'].fillna('None')
# Put two empty tracking columns at the front of the table
for position, tracking_column in enumerate(('Created by', 'Date created')):
    func_anno_paths.insert(position, tracking_column, '')
# func_anno_paths.to_csv('/home/seguraab/ara-kinase-prediction/data/2021_cusack_data/04_gene_expression_feature_list.csv', index=False)
func_anno_paths

### 05_epigenetics features

In [None]:
master_list = pd.read_csv('/home/seguraab/ara-kinase-prediction/data/2021_cusack_data/Supplemental_table_1_corrected_KSA_20240729.csv')

# Subset the master list to get the epigenetic modification features.
# .copy() makes this an independent DataFrame, so assigning into it later with
# .loc[...] = ... does not raise SettingWithCopyWarning.
epi_df = master_list.loc[master_list['Feature category']=='Epigenetic modifications',:].copy()

# Get all the JSON file paths, skipping any directory whose path contains
# "probably_junk", "test" or "methyl_dicts"
files = []
path = '/home/seguraab/ara-kinase-prediction/data/2021_cusack_data/21_arabidopsis_redundancy/05_epigenetics'
for dirpath, _dirnames, filenames in os.walk(path):
    if any(skipped in dirpath for skipped in ("probably_junk", "test", "methyl_dicts")):
        continue
    files.extend(os.path.join(dirpath, filename)
                 for filename in filenames if filename.endswith('.json'))
# len(files) was 31 when this cell was run; the positions used below rely on that

# Create a dataframe with the file paths and feature names.
# In alphabetical order, files 0-8 and file 30 are repeated 4 times and files
# 9-29 are repeated 25 times.
# NOTE(review): the repeat counts presumably match the number of master-list
# rows per feature -- confirm.
sorted_files = sorted(files)
file_paths = list(np.repeat(sorted_files[:9], 4)) + list(np.repeat(sorted_files[30], 4))
file_paths = file_paths + list(np.repeat(sorted_files[9:30], 25))
file_paths = pd.DataFrame(file_paths).sort_values(0)

# The feature name is the file name with the '_dict.json' / '_dictionary.json'
# ending removed and underscores turned into spaces
feat_names = file_paths.iloc[:,0].str.split('/').apply(lambda x: x[-1]).\
    apply(lambda x: x.split('_dict.json')[0].replace('_', ' ')) # get the feature name from the JSON file name
feat_names = feat_names.apply(lambda x: x.replace(' dictionary.json', ''))

file_paths.insert(1, 'Feature name', feat_names) # insert the feature name into the dataframe
file_paths.columns = ['File path', 'Feature name']
file_paths['Feature name'] = file_paths['Feature name'].str.lower()

# 10*4 + 21*25 = 565 file-path rows (this line used to be annotated "93 features",
# which looks carried over from the protein properties section)
print(len(file_paths), epi_df.shape)

In [None]:
# Map the file paths to the master list of features
# Point the gene body methylation rows at the name derived from their JSON file
is_body_methylation = epi_df['Feature name'] == 'Gene body methylation'
epi_df.loc[is_body_methylation, 'NEW Feature name'] = 'bodymethylation.mld'

func_anno_paths = file_paths.merge(
    epi_df,
    left_on='Feature name',
    right_on='NEW Feature name',
    how='left',
    suffixes=('', '_y'),
) # merge to master list
# Discard the name parsed from the file; the master list's name stays as 'Feature name_y'
func_anno_paths = func_anno_paths.drop(columns=['Feature name'])

# Drop duplicates created by merge
func_anno_paths = func_anno_paths.drop_duplicates('ML model feature name').reset_index(drop=True)
func_anno_paths

In [None]:
# Make the checklist file
# Restore the master list's feature-name column to its plain name
func_anno_paths = func_anno_paths.rename(columns={'Feature name_y':'Feature name'})
# Missing transformations are written out as the literal string "None"
func_anno_paths['Transformation'] = func_anno_paths['Transformation'].fillna('None')
# Put two empty tracking columns at the front of the table
for position, tracking_column in enumerate(('Created by', 'Date created')):
    func_anno_paths.insert(position, tracking_column, '')
# func_anno_paths.to_csv('/home/seguraab/ara-kinase-prediction/data/2021_cusack_data/05_epigenetics_feature_list.csv', index=False)
func_anno_paths

### 06_network_properties

In [None]:
master_list = pd.read_csv('/home/seguraab/ara-kinase-prediction/data/2021_cusack_data/Supplemental_table_1_corrected_KSA_20240729.csv')

# Subset the master list to get the network properties.
# .copy() makes this an independent DataFrame, so assigning into it later with
# .loc[...] = ... does not raise SettingWithCopyWarning.
ntwk_df = master_list.loc[master_list['Feature category']=='Network properties',:].copy()

# Get all the JSON file paths, skipping any directory whose path contains
# "probably_junk" or "individual_dicts"
files = []
path = '/home/seguraab/ara-kinase-prediction/data/2021_cusack_data/21_arabidopsis_redundancy/06_network_properties'
for dirpath, _dirnames, filenames in os.walk(path):
    if ("probably_junk" in dirpath) or ("individual_dicts" in dirpath):
        continue
    files.extend(os.path.join(dirpath, filename)
                 for filename in filenames if filename.endswith('.json'))

# Create a dataframe with the file paths and feature names.
# NOTE(review): files[0], files[1] and files[3] are picked by their position in
# the os.walk results, which are not sorted here; confirm these are the intended
# three files if the directory contents or the machine change.
file_paths = list(np.repeat(files[0], 175)) + list(np.repeat(files[1], 15))
file_paths = file_paths + list(np.repeat(files[3], 15))
file_paths = pd.DataFrame(file_paths).sort_values(0)

# The feature name is the file name with the '_dict_011518.json' / '_dicts.json'
# ending removed and underscores turned into spaces
feat_names = file_paths.iloc[:,0].str.split('/').apply(lambda x: x[-1]).\
    apply(lambda x: x.split('_dict_011518.json')[0].replace('_', ' ')) # get the feature name from the JSON file name
feat_names = feat_names.apply(lambda x: x.replace(' dicts.json', ''))

file_paths.insert(1, 'Feature name', feat_names) # insert the feature name into the dataframe
file_paths.columns = ['File path', 'Feature name']
file_paths['Feature name'] = file_paths['Feature name'].str.lower()

# 175 + 15 + 15 = 205 file-path rows (this line used to be annotated "93 features",
# which looks carried over from the protein properties section)
print(len(file_paths), ntwk_df.shape)

In [None]:
# Map the file paths to the master list of features
# Text found in the master-list feature name -> name derived from the JSON file
names_from_files = {'Co-expression': 'all clust',
                    'AraNet': 'aranet',
                    'Protein-protein': 'ppi'}
for name_fragment, name_from_file in names_from_files.items():
    ntwk_df.loc[ntwk_df['Feature name'].str.contains(name_fragment), 'NEW Feature name'] = name_from_file

func_anno_paths = file_paths.merge(
    ntwk_df,
    left_on='Feature name',
    right_on='NEW Feature name',
    how='left',
    suffixes=('', '_y'),
) # merge to master list
# Discard the name parsed from the file; the master list's name stays as 'Feature name_y'
func_anno_paths = func_anno_paths.drop(columns=['Feature name'])

# Drop duplicates created by merge
func_anno_paths = func_anno_paths.drop_duplicates('ML model feature name').reset_index(drop=True)
func_anno_paths

In [None]:
# Make the checklist file
# Restore the master list's feature-name column to its plain name
func_anno_paths = func_anno_paths.rename(columns={'Feature name_y':'Feature name'})
# Missing transformations are written out as the literal string "None"
func_anno_paths['Transformation'] = func_anno_paths['Transformation'].fillna('None')
# Put two empty tracking columns at the front of the table
for position, tracking_column in enumerate(('Created by', 'Date created')):
    func_anno_paths.insert(position, tracking_column, '')
# Write the checklist to disk
func_anno_paths.to_csv('/home/seguraab/ara-kinase-prediction/data/2021_cusack_data/06_network_properties_feature_list.csv', index=False)
func_anno_paths