In [1]:
## The only library needed for this is pandas! So read that in first.
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [4]:

### This code takes an input correlation table (NOT A MATRIX! you must flatten it first) plus a category file and sorts it so that only comparisons of interest are included. For large files, this takes around 40 mins using 8 threads on a local machine.

# Load the correlation data. This code assumes that you have flattened your corr matrix into table format
correlation_df = pd.read_csv('Spearman_corr_table_normseqdepth_BH.csv')

# Load the mapping data. The mapping file is used by the ARGs-oap pipeline to generate counts, but another column is added to broadly categorize each gene as being for antibiotic resistance or metal resistance.
mapping_df = pd.read_csv('file_for_filtering_large_correlations.csv')

# Create an empty DataFrame to store the filtered results
filtered_results = pd.DataFrame(columns=['gene1', 'gene2', 'correlation', 'p.adjusted'])

# Iterate through each row in the correlation data
for index, row in correlation_df.iterrows():
    gene1 = row['row']
    gene2 = row['column']

    # Find the category for gene1. By category, it's going to classify it as either "metal" or "antibiotic"
   
    category_gene1 = mapping_df.loc[mapping_df['subtype'] == gene1, 'Class'].values
    if len(category_gene1) == 0:
        continue  # Skip if gene1 not found in mapping

    # Find the category for gene2. By category, it's going to classify it as either "metal" or "antibiotic"
   
    category_gene2 = mapping_df.loc[mapping_df['subtype'] == gene2, 'Class'].values
    if len(category_gene2) == 0:
        continue  # Skip if gene2 not found in mapping

    # I am only interested in correlations made exclusively between a metal and antibiotic. So, if the previous step classified gene1 and gene 2 as both metal or antibiotic, discard! If different, append them to a new file.
    # We also only consider highly and significantly correlated pairs. So, the correlation coefficient > 0.70 and the adjusted p-value < 0.05.
   
    if category_gene1[0] != category_gene2[0]:
        # Check if correlation and p-value meet criteria
        if row['cor'] > 0.70 and row['p_value'] < 0.05:
            # Append the row to the filtered results DataFrame
            filtered_results = filtered_results._append({'gene1': gene1, 'gene2': gene2, 'correlation': row['cor'], 'p.adjusted': row['p_value']},
                                                       ignore_index=True)

# Save the result to a new CSV file. Unhash the below line to use this
#filtered_results.to_csv('Spearman_corr_normseqdepth_filtered_BH_Final.csv', index=False)


  filtered_results = filtered_results._append({'gene1': gene1, 'gene2': gene2, 'correlation': row['cor'], 'p.adjusted': row['p_value']},
