## Check to see if Melissa's gene pairs have feature information in the 2021 Cusack data

In [1]:
import pandas as pd

# Melissa's data
ara_m = pd.read_csv("../data/20240725_melissa_ara_data/interactions_fitness.txt", sep="\t")

# 2021 Cusack data
dat_2 = pd.read_csv("../data/2021_cusack_data/Dataset_2.txt", sep="\t") # inclusive genetic redundancy gene pairs
dat_4 = pd.read_csv("../data/2021_cusack_data/Dataset_4.txt", sep="\t") # protein kinase family gene pairs
features = pd.read_excel("../data/2021_cusack_data/Supplemental_tables_revision.xlsx",
                         sheet_name="Supplemental table 10", header=0)

print(ara_m.shape, dat_2.shape, dat_4.shape)

(126, 4) (300, 4119) (10300, 184)


In [2]:
ara_m.head()

Unnamed: 0,Set,MA,MB,Interaction
0,1,At2g03450,At1g13900,0
1,2,At1g48620,At3g18035,1
2,3,At1g13020,At3g26400,1
3,5,At3g03860,At5g18120,1
4,7,At4g35470,At2g17440,0


In [3]:
dat_2.head()

Unnamed: 0,pair_ID,Class,continuous_endosperm_CHH_difference_squared,continuous_endosperm_CHH_average_squared,continuous_endosperm_CHH_max_squared,continuous_endosperm_CHH_min_squared,continuous_endosperm_CHH_pair_total_squared,continuous_endosperm_CHH_difference_log,continuous_endosperm_CHH_average_log,continuous_endosperm_CHH_max_log,...,continuous_amino_acid_similarity_squared,continuous_amino_acid_similarity_log,continuous_amino_acid_similarity_reciprocal,continuous_amino_acid_similarity_noTF,continuous_amino_acid_similarity_binned,PCA_Dim1,PCA_Dim2,PCA_Dim3,PCA_Dim4,PCA_Dim5
0,AT1G11310_AT1G61560,negative,2.93e-08,1.2e-05,1.3e-05,1.2e-05,4.9e-05,-3.766344,-2.456106,-2.445604,...,7191.04,1.928396,0.011792,84.8,4,-1.732958,2.498189,8.563027,-1.903397,1.258387
1,AT1G11310_AT2G39200,negative,1.16e-05,3e-06,1.2e-05,0.0,1.2e-05,-2.466868,-2.767898,-2.466868,...,6225.21,1.897077,0.012674,78.9,4,-2.323976,8.113223,10.797732,-3.260628,0.613043
2,AT4G11660_AT4G36990,negative,0.0,0.0,0.0,0.0,0.0,-5.280442,-4.156549,-3.855519,...,1980.25,1.64836,0.022472,44.5,2,-0.171407,16.080585,13.912486,-7.916315,5.644254
3,AT1G35720_AT2G38750,negative,4.89e-05,1.2e-05,4.9e-05,0.0,4.9e-05,-2.155336,-2.456366,-2.155336,...,2872.96,1.729165,0.018657,53.6,2,-8.630975,24.123981,6.612503,21.101815,-5.114057
4,AT1G12820_AT3G62980,negative,2.34e-06,7.5e-05,8.9e-05,6.3e-05,0.000301,-2.815643,-2.062003,-2.025306,...,5343.61,1.863917,0.01368,73.1,3,1.148119,-10.435068,-0.450987,-0.860005,4.550009


In [4]:
dat_4.head()

Unnamed: 0,pair_ID,Class,continuous_ks_pair_total_log,continuous_ks_average_squared,continuous_ks_max_reciprocal,continuous_gene_family_size_max_squared,continuous_gene_family_size_pair_total_noTF,continuous_ka_max_noTF,continuous_GOSLIM_total_squared,continuous_gene_family_size_average_noTF,...,continuous_GOSLIM_transferase_activity_number_in_pair_log,continuous_hormone_breadth_up_down_difference_squared,continuous_abiotic_root_breadth_up_down_min_reciprocal,continuous_biotic_expr_breadth_uponly_difference_noTF,binary_h25.stressfc.comp_same_or_not,continuous_GOSLIM_other_cytoplasmic_components_number_in_pair_squared,continuous_GOSLIM_plasma_membrane_number_in_pair_noTF,binary_k5.stressfc.run3_same_or_not,continuous_H3K23ac_number_in_pair_log,continuous_hormone_expr_breadth_uponly_pair_total_noTF
0,AT3G46420_AT4G20450,test,0.212587,0.665448,1.104972,51529,586,0.1842,256,9.0,...,0.30103,1,0.25,6,0,0,2,1,-1.0,0
1,AT5G01820_AT5G57630,test,1.855332,1284.114974,0.014298,85849,586,0.5755,676,293.0,...,0.30103,4,0.25,1,0,4,1,0,0.30103,2
2,AT2G37050_AT5G59660,test,1.861588,1321.646035,0.014288,108900,660,0.4802,169,330.0,...,0.0,0,10.0,0,0,0,2,0,0.0,0
3,AT3G17840_AT3G51740,test,1.795856,976.456253,0.016363,51529,454,0.6376,256,227.0,...,0.30103,0,0.166667,0,1,0,2,0,0.30103,0
4,AT1G11410_AT4G23190,test,0.544527,3.068978,0.426494,108900,660,0.646,324,330.0,...,0.30103,1,0.333333,13,0,0,2,0,0.0,1


In [5]:
# split gene pair identifiers into two columns
dat_2_genes = dat_2.pair_ID.str.split('_', expand=True)
dat_4_genes = dat_4.pair_ID.str.split('_', expand=True)

# get gene pairs with feature information in dat_2 and dat_4
ara_m['MA'] = ara_m['MA'].str.upper() # convert gene names to uppercase
ara_m['MB'] = ara_m['MB'].str.upper()

ara_m_dat_2 = ara_m.loc[(ara_m.MA.isin(dat_2_genes[0]) | ara_m.MA.isin(dat_2_genes[1])) & \
    (ara_m.MB.isin(dat_2_genes[0]) | ara_m.MB.isin(dat_2_genes[1]))]

ara_m_dat_4 = ara_m.loc[(ara_m.MA.isin(dat_4_genes[0]) | ara_m.MA.isin(dat_4_genes[1])) & \
    (ara_m.MB.isin(dat_4_genes[0]) | ara_m.MB.isin(dat_4_genes[1]))]

ara_m_features = pd.concat([ara_m_dat_2, ara_m_dat_4], axis=0).drop_duplicates()
ara_m_features.to_csv(
  '../data/20240725_melissa_ara_data/interactions_fitness_gene_pairs_in_2021_cusack.txt',
  sep="\t", index=False)

ara_m.loc[~ara_m.Set.isin(ara_m_features.Set)].to_csv(
  '../data/20240725_melissa_ara_data/interactions_fitness_gene_pairs_NOT_in_2021_cusack.txt',
  sep="\t", index=False)

print(ara_m_features.shape)

(22, 4)
