In [16]:
import pandas as pd
from pathlib import Path
from pprint import pprint

In [1]:
!ls /home/koala/my_projects/metagenome_metabolic_profiling/data

cog-24.cog.csv	cog-24.fun.tab		  Readme.COG2024.txt
cog-24.def.tab	pathway_result_table.tsv


# P.1. Load COG ID to Functional Category mapping (cog-24.def.tab file)

In [9]:
# descriptions of COG functional categories
cog_fun_file = "/home/koala/my_projects/metagenome_metabolic_profiling/data/cog-24.fun.edited.tab"
cog_fun_df = pd.read_csv(
    cog_fun_file, 
    sep="\t", 
    names=['COG Functional category ID', 'Functional group', 'RGB color', 'FC description']
    )
cog_fun_df.head()

# Functional groups
# 1:	INFORMATION STORAGE AND PROCESSING
# 2: 	CELLULAR PROCESSES AND SIGNALING
# 3:	METABOLISM
# 4:	POORLY CHARACTERIZED

Unnamed: 0,COG Functional category ID,Functional group,RGB color,FC description
0,J,1,FCCCFC,"Translation, ribosomal structure and biogenesis"
1,A,1,FCDCFC,RNA processing and modification
2,K,1,FCDCEC,Transcription
3,L,1,FCDCDC,"Replication, recombination and repair"
4,B,1,FCDCCC,Chromatin structure and dynamics


# P.2. Functional Category descriptions and groups (cog-24.fun.edited.tab file)

In [10]:
# COG descriptions
cog_def_file = "/home/koala/my_projects/metagenome_metabolic_profiling/data/cog-24.def.tab"
cog_def_df = pd.read_csv(
    cog_def_file, 
    sep="\t", 
    names=['COG ID', 'COG Functional category ID', 'COG name', 'Gene', 'Pathway', 'PubMed ID', 'PDB ID']
    )
cog_def_df.head()

Unnamed: 0,COG ID,COG Functional category ID,COG name,Gene,Pathway,PubMed ID,PDB ID
0,COG0001,H,Glutamate-1-semialdehyde aminotransferase,HemL,Heme biosynthesis,,2CFB
1,COG0002,E,N-acetyl-gamma-glutamylphosphate reductase,ArgC,Arginine biosynthesis,,3DR3
2,COG0003,P,"Anion-transporting ATPase, ArsA/GET3 family",ArsA,,,1F48
3,COG0004,P,Ammonia channel protein AmtB,AmtB,,,1U77
4,COG0005,F,Purine nucleoside phosphorylase,XapA,Purine salvage,,1YQQ


In [121]:
cog_def_df[cog_def_df['COG Functional category ID'] == "H"]

Unnamed: 0,COG ID,COG Functional category ID,COG name,Gene,Pathway,PubMed ID,PDB ID
0,COG0001,H,Glutamate-1-semialdehyde aminotransferase,HemL,Heme biosynthesis,,2CFB
6,COG0007,H,Uroporphyrinogen-III methylase (siroheme synth...,CysG,Cobalamine/B12 biosynthesis,,1PJQ
10,COG0011,H,"Thiamin-binding stress-response protein YqgV, ...",YqgV,,20471400,1LXJ
28,COG0029,H,Aspartate oxidase,NadB,NAD biosynthesis,,1KNP
41,COG0043,H,3-polyprenyl-4-hydroxybenzoate decarboxylase,UbiD,Ubiquinone biosynthesis,28057757,2IDB
...,...,...,...,...,...,...,...
4702,COG5749,H,Chlorophyllide a oxygenase/letal leaf spot pro...,PobA,,14657372;22366162,
4706,COG5753,H,Bacteriochlorophyllide reductase subunit BchX,BchX,,21886856,
4707,COG5754,H,Bacteriochlorophyllide reductase subunit BchY,BchY,,21886856,
4708,COG5755,H,Bacteriochlorophyllide reductase subunit BchZ,BchZ,,21886856,


In [120]:
for path in cog_def_df['Pathway']:
    print(path)

Heme biosynthesis
Arginine biosynthesis
nan
nan
Purine salvage
nan
Cobalamine/B12 biosynthesis
Heme biosynthesis
tRNA modification
Urea cycle
nan
nan
Aminoacyl-tRNA synthetases
Proline biosynthesis
Purine biosynthesis
Aminoacyl-tRNA synthetases
Aminoacyl-tRNA synthetases
Aminoacyl-tRNA synthetases
Lysine biosynthesis
nan
Pentose phosphate pathway
Pyruvate oxidation
Translation factors
nan
nan
Purine biosynthesis
Purine biosynthesis
Isoleucine, leucine, valine biosynthesis
NAD biosynthesis
16S rRNA modification
Cysteine biosynthesis
nan
Purine biosynthesis
Pyrimidine salvage
Pentose phosphate pathway
tRNA modification
nan
TCA cycle
Histidine biosynthesis
Purine biosynthesis
tRNA modification
Ubiquinone biosynthesis
Pyrimidine biosynthesis
TCA cycle
Purine biosynthesis
Purine biosynthesis
Ribosome 30S subunit
Ribosome 30S subunit
Translation factors
Ribosome 30S subunit
Ribosome 30S subunit
nan
Riboflavin/FAD biosynthesis
FoF1-type ATP synthase
FoF1-type ATP synthase
Glycolysis
nan
Isole

In [14]:
cog_def_df.tail(10)

Unnamed: 0,COG ID,COG Functional category ID,COG name,Gene,Pathway,PubMed ID,PDB ID
4971,COG6081,U,"LydA family holin, phage or type X secretion s...",LydA,Type X secretion system,32885520,
4972,COG6082,N,"Pre-archaellin peptidase ArlK/FlaK/PibD, inclu...",ArlK,Archaella,25699024;37334237,3S0X
4973,COG6083,U,"Type IX system secreted protein, contains C-te...",CTD-A,Type IX secretion/gliding motility system,28396348,
4974,COG6084,KN,"Regulator of haloarchaeal motility, adhesion a...",TbsP,Archaella,38204420,
4975,COG6085,U,"Type IX system secreted protein, contains C-te...",CTD-B,Type IX secretion/gliding motility system,27005013;28396348,5HFS
4976,COG6086,U,"Type IX system secreted protein, contains C-te...",CTD-C,Type IX secretion/gliding motility system,24363341;27005013;28396348,
4977,COG6088,U,Membrane-anchored component of a predicted arc...,MMP0364,,25583072,
4978,COG6090,N,"Archaellum component ArlX/FlaX, crenarchaea-sp...",ArlX,Archaella,22081969,
4979,COG6091,U,Crenarchaeal DNA import system 6TM protein CedA,CedA,,26884154,
4980,COG6092,U,Crenarchaeal DNA import system 2TM protein CedA1,CedA1,,26884154,


In [11]:
cog_def_df.shape

(4981, 7)

In [8]:
cog_def_df['Pathway'].unique()[:10]

array(['Heme biosynthesis', 'Arginine biosynthesis', nan,
       'Purine salvage', 'Cobalamine/B12 biosynthesis',
       'tRNA modification', 'Urea cycle', 'Aminoacyl-tRNA synthetases',
       'Proline biosynthesis', 'Purine biosynthesis'], dtype=object)

In [54]:
for i, row in cog_def_df.iterrows():
    category = row['COG Functional category ID']
    if len(category) > 1:
        print(f"Category: {category}")

Category: EH
Category: EH
Category: EF
Category: HI
Category: EH
Category: EG
Category: EQ
Category: EQ
Category: EH
Category: OM
Category: EH
Category: EHJQ
Category: KJ
Category: EG
Category: DP
Category: FT
Category: JE
Category: LJ
Category: KT
Category: HJ
Category: IQ
Category: OV
Category: TK
Category: EM
Category: TE
Category: FP
Category: VI
Category: GH
Category: FR
Category: JHO
Category: EP
Category: KR
Category: DN
Category: EF
Category: MDT
Category: CE
Category: GEPR
Category: PT
Category: ER
Category: QR
Category: EF
Category: EH
Category: DH
Category: DL
Category: FR
Category: HC
Category: KL
Category: HR
Category: PT
Category: LX
Category: HR
Category: MU
Category: EP
Category: CR
Category: FT
Category: NU
Category: CP
Category: HC
Category: HT
Category: HR
Category: GER
Category: GM
Category: FV
Category: MU
Category: TK
Category: OK
Category: FV
Category: DM
Category: FH
Category: CO
Category: GM
Category: MN
Category: JLK
Category: ET
Category: MV
Category: CHR
Cat

In [55]:
list("KJ")

['K', 'J']

In [15]:
# pd.merge(cog_fun_df, cog_def_df, on='COG Functional category ID', how='outer')
# doesn't work when the there are more than 1 COG Functional category IDs: DK, DKO, DO

In [40]:
# Get the cog IDs from each sample
samplex_cog_file = "/home/koala/my_projects/metagenome_metabolic_profiling/data/samples/samplex_protein-cog.tsv"

sample_name = "samplex"
cog_ids_dict = {}

# Initializing the entry for this sample. The value is a list
cog_ids_dict[sample_name] = []

with open(samplex_cog_file, mode="r") as fr:
    for line in fr:
        cog_id = line.strip().split(":")[1]
        # print(f"COG ID: {cog_id}")
        
        # Appending cog_ids to the sample list value
        # It doesn't matter if there are duplicated cog_ids?
        cog_ids_dict[sample_name].append(cog_id)
        
cog_ids_dict

{'samplex': ['COG0190',
  'COG0826',
  'COG1538',
  'COG1136',
  'COG0577',
  'COG0351',
  'COG0352',
  'COG0422',
  'COG0057',
  'COG1123',
  'COG0359',
  'COG0360',
  'COG1595',
  'COG4775',
  'COG0719',
  'COG0396',
  'COG0782']}

In [None]:
for sample, cog_list in cog_ids_dict.items():
    for sample_cog_id in cog_list:
        if 

In [52]:
cog_fc_dict = {}

for i, row in cog_def_df.iterrows():
    ncbi_cog_id = row['COG ID']
    ncbi_fc = row['COG Functional category ID']
    ncbi_pathway = row['Pathway']
    
    for sample, cog_list in cog_ids_dict.items():
        # Initialize dictionary for each new sample
        cog_fc_dict[sample] = []
        
        for sample_cog_id in cog_list:
            if sample_cog_id == ncbi_cog_id:
                print(f"{sample_cog_id} same as {ncbi_cog_id}, Functional category: {ncbi_fc}, Pathway: {ncbi_pathway}")
                
                cog_fc_dict[sample].append((ncbi_cog_id, ncbi_fc, ncbi_pathway))
                
cog_fc_dict

COG0057 same as COG0057, Functional category: G, Pathway: Glycolysis
COG0190 same as COG0190, Functional category: H, Pathway: nan
COG0351 same as COG0351, Functional category: H, Pathway: Thiamine biosynthesis
COG0352 same as COG0352, Functional category: H, Pathway: Thiamine biosynthesis
COG0359 same as COG0359, Functional category: J, Pathway: Ribosome 50S subunit
COG0360 same as COG0360, Functional category: J, Pathway: Ribosome 30S subunit
COG0396 same as COG0396, Functional category: O, Pathway: nan
COG0422 same as COG0422, Functional category: H, Pathway: Thiamine biosynthesis
COG0577 same as COG0577, Functional category: V, Pathway: nan
COG0719 same as COG0719, Functional category: O, Pathway: nan
COG0782 same as COG0782, Functional category: K, Pathway: nan
COG0826 same as COG0826, Functional category: J, Pathway: 23S rRNA modification
COG1123 same as COG1123, Functional category: O, Pathway: nan
COG1136 same as COG1136, Functional category: M, Pathway: nan
COG1538 same as COG

{'samplex': []}

In [None]:
cog_def_df.loc[cog_def_df['COG ID'] == cog, 'COG Functional category ID'].values

In [58]:
for i, row in cog_def_df.iterrows():
    ncbi_cog_id = row['COG ID']
    ncbi_fc = row['COG Functional category ID']
    ncbi_pathway = row['Pathway']
    
    if ncbi_cog_id == 'COG0190':
        category = cog_def_df.loc[cog_def_df['COG ID'] == 'COG0190', 'COG Functional category ID'].values
        print(f"category: {category}")

category: ['H']


In [53]:
for sample, cog_list in cog_ids_dict.items():
    # Initialize dictionary for each new sample
    cog_fc_dict[sample] = []
    
    for sample_cog_id in cog_list:
        for i, row in cog_def_df.iterrows():
            ncbi_cog_id = row['COG ID']
            ncbi_fc = row['COG Functional category ID']
            ncbi_pathway = row['Pathway']
            if sample_cog_id == ncbi_cog_id:
                print(f"{sample_cog_id} same as {ncbi_cog_id}, Functional category: {ncbi_fc}, Pathway: {ncbi_pathway}")
                
                cog_fc_dict[sample].append((ncbi_cog_id, ncbi_fc, ncbi_pathway))
cog_fc_dict

COG0190 same as COG0190, Functional category: H, Pathway: nan
COG0826 same as COG0826, Functional category: J, Pathway: 23S rRNA modification
COG1538 same as COG1538, Functional category: M, Pathway: nan
COG1136 same as COG1136, Functional category: M, Pathway: nan
COG0577 same as COG0577, Functional category: V, Pathway: nan
COG0351 same as COG0351, Functional category: H, Pathway: Thiamine biosynthesis
COG0352 same as COG0352, Functional category: H, Pathway: Thiamine biosynthesis
COG0422 same as COG0422, Functional category: H, Pathway: Thiamine biosynthesis
COG0057 same as COG0057, Functional category: G, Pathway: Glycolysis
COG1123 same as COG1123, Functional category: O, Pathway: nan
COG0359 same as COG0359, Functional category: J, Pathway: Ribosome 50S subunit
COG0360 same as COG0360, Functional category: J, Pathway: Ribosome 30S subunit
COG1595 same as COG1595, Functional category: K, Pathway: RNA polymerase
COG4775 same as COG4775, Functional category: M, Pathway: nan
COG0719 

{'samplex': [('COG0190', 'H', nan),
  ('COG0826', 'J', '23S rRNA modification'),
  ('COG1538', 'M', nan),
  ('COG1136', 'M', nan),
  ('COG0577', 'V', nan),
  ('COG0351', 'H', 'Thiamine biosynthesis'),
  ('COG0352', 'H', 'Thiamine biosynthesis'),
  ('COG0422', 'H', 'Thiamine biosynthesis'),
  ('COG0057', 'G', 'Glycolysis'),
  ('COG1123', 'O', nan),
  ('COG0359', 'J', 'Ribosome 50S subunit'),
  ('COG0360', 'J', 'Ribosome 30S subunit'),
  ('COG1595', 'K', 'RNA polymerase'),
  ('COG4775', 'M', nan),
  ('COG0719', 'O', nan),
  ('COG0396', 'O', nan),
  ('COG0782', 'K', nan)]}

In [64]:
for sample, cog_list in cog_ids_dict.items():
    # Initialize dictionary for each new sample
    cog_fc_dict[sample] = []
    
    for sample_cog_id in cog_list:
        for i, row in cog_def_df.iterrows():
            ncbi_cog_id = row['COG ID']
            ncbi_fc = row['COG Functional category ID']
            ncbi_pathway = row['Pathway']
            if sample_cog_id == 'COG0190' and ncbi_cog_id == 'COG0190':
                print(f"COG ID: {sample_cog_id}, category: {ncbi_fc}, Pathway: {ncbi_pathway}")
                category = cog_def_df.loc[cog_def_df['COG ID'] == sample_cog_id, 'COG Functional category ID'].values
                print(f"category: {category}")

COG ID: COG0190, category: H, Pathway: nan
category: ['H']


In [22]:
samples_cog_dir = Path("/home/koala/my_projects/metagenome_metabolic_profiling/data/samples")

for item in samples_cog_dir.iterdir():
    if item.is_file():
        sample = item.stem.split("_")[0]
        print(sample)

samplex


# P.3. Script 1

In [123]:
import pandas as pd
from collections import defaultdict

# Load cog-24.def.tab file (COG ID to Functional Category mapping)
def load_cog_def(cog_def_file):
    cog_def_df = pd.read_csv(
        cog_def_file, 
        sep='\t', 
        names=['COG ID', 'COG Functional category ID', 'COG name', 'Gene', 'Pathway', 'PubMed ID', 'PDB ID']
        )
    cog_def_df['COG ID'] = cog_def_df['COG ID'].str.strip()  # Strip any spaces
    return cog_def_df

# Load cog-24.fun.edited.tab file (Functional Category descriptions and groups)
def load_cog_fun(cog_fun_file):
    cog_fun_df = pd.read_csv(
        cog_fun_file, 
        sep='\t', 
        names=['COG Functional category ID', 'Functional group', 'RGB color', 'FC description']
        )
    return cog_fun_df

# Load sample-specific protein-COG mappings
def load_sample_protein_cog(sample_protein_cog_file):
    sample_cog_df = pd.read_csv(sample_protein_cog_file, sep='\t', header=None, names=['Protein', 'COG'])
    sample_cog_df['COG'] = sample_cog_df['COG'].str.strip()
    sample_cog_df['COG'] = sample_cog_df['COG'].str.replace('COG:', '')  # Clean up COG prefixes
    return sample_cog_df

# Handle combined functional categories (like EHJQ, etc.)
def map_combined_categories(cog_categories):
    return list(cog_categories)  # Split the string into individual categories (like EHJQ -> ['E', 'H', 'J', 'Q'])

# Function to summarize the functional categories for a given sample
def summarize_functional_categories(sample_cog_df, cog_def_df, cog_fun_df):
    category_counts = defaultdict(int)  # Count occurrences of each functional category
    pathway_counts = defaultdict(int) # Count occurrences of each Pathway
    
    for cog in sample_cog_df['COG']:
        # Find the functional category/ies for the COG
        # print(f"Processing COG: {cog}")
        functional_categories = cog_def_df.loc[cog_def_df['COG ID'] == cog, 'COG Functional category ID'].values
        pathways = cog_def_df.loc[cog_def_df['COG ID'] == cog, 'Pathway'].values[0]
        
        # Handle case when no category is found
        if len(functional_categories) == 0:
            print(f"COG ID {cog} not found in cog_def_df.")
            continue
        
        # print(f"Functional categories found: {functional_categories}")
        functional_categories = functional_categories[0]  # Extract the category
        # print(f"Functional categories found: {map_combined_categories(functional_categories)}")
        # print(f"Number of Functional categories found: {len(map_combined_categories(functional_categories))}")
        
        # Get the number of categories for this COG
        num_categories = len(map_combined_categories(functional_categories))
        # Handle combined categories (split them into individual letters, e.g., 'ER' -> ['E', 'R'])
        for category in map_combined_categories(functional_categories):
            # Add 1 / num_categories to each category
            category_counts[category] += 1 / num_categories

    # Collect rows for the summary DataFrame
    summary_rows = []
    
    for category, count in category_counts.items():
        group = cog_fun_df.loc[cog_fun_df['COG Functional category ID'] == category, 'Functional group'].values[0]
        description = cog_fun_df.loc[cog_fun_df['COG Functional category ID'] == category, 'FC description'].values[0]
        color = cog_fun_df.loc[cog_fun_df['COG Functional category ID'] == category, 'RGB color'].values[0]
        # Collect the row as a dictionary
        summary_rows.append(
            {
                'Functional category': category,
                'Functional group': group,
                'Count': count,
                'FC description': description,
                'Color': color
            }
                            )

    # create the summary DataFrame with category, group, and count columns
    summary_df = pd.DataFrame(summary_rows)
    
    return summary_df

# Main function to process each sample and summarize the functional categories
def process_sample(sample, sample_protein_cog_file, cog_def_file, cog_fun_file):
    # Load the data
    cog_def_df = load_cog_def(cog_def_file)
    cog_fun_df = load_cog_fun(cog_fun_file)
    sample_cog_df = load_sample_protein_cog(sample_protein_cog_file)

    # Summarize the functional categories
    summary_df = summarize_functional_categories(sample_cog_df, cog_def_df, cog_fun_df)
    # Dataframe for comparison between samples
    summary_df['Functional group'] = summary_df['Functional group'].replace(
        {
            1: 'Information storage and processing',
            2: 'Cellular processes and signaling',
            3: 'Metabolism',
            4: 'Poorly characterized',
        }
            )
    # Group the rows with the same 'Functional group'
    summary_df = summary_df.sort_values(by='Functional group').reset_index(drop=True)
    # Create non-normalized DataFrame
    non_normalized_df = summary_df[['Functional group', 'FC description', 'Count']].copy()
    non_normalized_df = non_normalized_df.rename(columns={'Count': sample})
    # Create normalized DataFrame
    ## Normalize the counts to relative abundance
    normalized_df = summary_df[['Functional group', 'FC description', 'Count']].copy()
    normalized_df[sample] = normalized_df['Count'] / normalized_df['Count'].sum()
    normalized_df.drop(columns=['Count'], inplace=True)
    # Return DataFrames
    return non_normalized_df, normalized_df

# Example usage:
sample = "samplex"
# sample_protein_cog_file = "/home/koala/my_projects/metagenome_metabolic_profiling/data/samples/samplex_protein-cog.tsv"
sample_protein_cog_file = "/home/koala/my_projects/metagenome_metabolic_profiling/data/samples/sampley_protein-cog.tsv"
cog_def_file = "/home/koala/my_projects/metagenome_metabolic_profiling/data/cog-24.def.tab"
cog_fun_file = "/home/koala/my_projects/metagenome_metabolic_profiling/data/cog-24.fun.edited.tab"
non_normalized_df, normalized_df = process_sample(sample, sample_protein_cog_file, cog_def_file, cog_fun_file)
non_normalized_df

Unnamed: 0,Functional group,FC description,samplex
0,Cellular processes and signaling,Cell wall/membrane/envelope biogenesis,3.0
1,Cellular processes and signaling,Defense mechanisms,1.0
2,Cellular processes and signaling,"Posttranslational modification, protein turnov...",3.0
3,Information storage and processing,"Translation, ribosomal structure and biogenesis",2.5
4,Information storage and processing,Transcription,2.0
5,Metabolism,Amino acid transport and metabolism,0.5
6,Metabolism,Coenzyme transport and metabolism,3.0
7,Metabolism,Carbohydrate transport and metabolism,1.0
8,Poorly characterized,General function prediction only,1.0


In [124]:
normalized_df

Unnamed: 0,Functional group,FC description,samplex
0,Cellular processes and signaling,Cell wall/membrane/envelope biogenesis,0.176471
1,Cellular processes and signaling,Defense mechanisms,0.058824
2,Cellular processes and signaling,"Posttranslational modification, protein turnov...",0.176471
3,Information storage and processing,"Translation, ribosomal structure and biogenesis",0.147059
4,Information storage and processing,Transcription,0.117647
5,Metabolism,Amino acid transport and metabolism,0.029412
6,Metabolism,Coenzyme transport and metabolism,0.176471
7,Metabolism,Carbohydrate transport and metabolism,0.058824
8,Poorly characterized,General function prediction only,0.058824


In [125]:
samples_cog_dir = Path("/home/koala/my_projects/metagenome_metabolic_profiling/data/samples")
cog_def_file = "/home/koala/my_projects/metagenome_metabolic_profiling/data/cog-24.def.tab"
cog_fun_file = "/home/koala/my_projects/metagenome_metabolic_profiling/data/cog-24.fun.edited.tab"

# Initialize lists to hold DataFrames
non_normalized_list = []
normalized_list = []

# Iterate over files in the sample directory
for item in samples_cog_dir.iterdir():
    if item.is_file() and item.suffix == ".tsv":  # Filter for .tsv files
        sample_protein_cog_file = item
        # Extract sample name from filename
        sample = item.stem.split("_")[0]
        print(f"Processing {sample} from file: {item}")
        
        # Process the sample
        non_normalized_df, normalized_df = process_sample(sample, sample_protein_cog_file, cog_def_file, cog_fun_file)
        # Append the results to the list
        non_normalized_list.append(non_normalized_df)
        normalized_list.append(normalized_df)
        
# Merge all DataFrames by 'Functional group' and 'FC description'
non_normalized_combined = pd.concat(non_normalized_list).groupby(['Functional group', 'FC description'], as_index=False).sum()
normalized_combined = pd.concat(normalized_list).groupby(['Functional group', 'FC description'], as_index=False).sum()

Processing samplex from file: /home/koala/my_projects/metagenome_metabolic_profiling/data/samples/samplex_protein-cog.tsv
Processing sampley from file: /home/koala/my_projects/metagenome_metabolic_profiling/data/samples/sampley_protein-cog.tsv


In [126]:
non_normalized_combined

Unnamed: 0,Functional group,FC description,samplex,sampley
0,Cellular processes and signaling,Cell wall/membrane/envelope biogenesis,3.0,3.0
1,Cellular processes and signaling,Defense mechanisms,1.0,1.0
2,Cellular processes and signaling,"Posttranslational modification, protein turnov...",3.0,3.0
3,Information storage and processing,Transcription,2.0,2.0
4,Information storage and processing,"Translation, ribosomal structure and biogenesis",3.0,2.5
5,Metabolism,Amino acid transport and metabolism,0.0,0.5
6,Metabolism,Carbohydrate transport and metabolism,1.0,1.0
7,Metabolism,Coenzyme transport and metabolism,4.0,3.0
8,Poorly characterized,General function prediction only,0.0,1.0


In [127]:
normalized_combined

Unnamed: 0,Functional group,FC description,samplex,sampley
0,Cellular processes and signaling,Cell wall/membrane/envelope biogenesis,0.176471,0.176471
1,Cellular processes and signaling,Defense mechanisms,0.058824,0.058824
2,Cellular processes and signaling,"Posttranslational modification, protein turnov...",0.176471,0.176471
3,Information storage and processing,Transcription,0.117647,0.117647
4,Information storage and processing,"Translation, ribosomal structure and biogenesis",0.176471,0.147059
5,Metabolism,Amino acid transport and metabolism,0.0,0.029412
6,Metabolism,Carbohydrate transport and metabolism,0.058824,0.058824
7,Metabolism,Coenzyme transport and metabolism,0.235294,0.176471
8,Poorly characterized,General function prediction only,0.0,0.058824


**Key Improvements:**

1. **Filter for `.tsv` Files**: 
   - The check `item.suffix == ".tsv"` ensures that only `.tsv` files are processed, in case there are other types of files in the directory.
   
2. **Use `pd.concat()` Instead of `append()`**:
   - Instead of appending DataFrames to a list and then merging later, I’ve used `pd.concat()` with a **groupby** to merge the DataFrames on the fly.
   - The `groupby(['Functional group', 'FC description'])` ensures that if the same functional group and description appear in multiple samples, they are aggregated (you can sum the counts for each group).
   
3. **Summing the Data**: 
   - Using `.groupby(...).sum()` allows the counts across different samples to be aggregated into a single DataFrame.

**What Happens in the `process_all_samples` Function:**

- **Iterates over each file**: For each file in the directory, it processes the sample using `process_sample`.
- **Appends the results**: The non-normalized and normalized DataFrames are appended to their respective lists.
- **Merges the DataFrames**: At the end, the DataFrames are merged by `'Functional group'` and `'FC description'` using `pd.concat()` and `groupby`.

**Benefits**:

- **More Pythonic**: Using `pd.concat()` and filtering the file types makes the code cleaner and avoids the need for post-processing steps.
- **Efficient**: Merging on-the-fly reduces memory overhead and avoids multiple passes through the data.

In [128]:
def process_all_samples(samples_cog_dir, cog_def_file, cog_fun_file):
    # Initialize lists to hold DataFrames
    non_normalized_list = []
    normalized_list = []
    
    # Iterate over files in the sample directory
    for item in samples_cog_dir.iterdir():
        if item.is_file() and item.suffix == ".tsv":  # Filter for .tsv files
            sample_protein_cog_file = item
            sample = item.stem.split("_")[0]  # Extract sample name from filename
            print(f"Processing {sample} from file: {item}")
            
            # Process the sample
            non_normalized_df, normalized_df = process_sample(sample, sample_protein_cog_file, cog_def_file, cog_fun_file)
            
            # Append the results to the list
            non_normalized_list.append(non_normalized_df)
            normalized_list.append(normalized_df)
    
    # Concatenate and merge all DataFrames by 'Functional group' and 'FC description'
    non_normalized_combined = pd.concat(non_normalized_list).groupby(['Functional group', 'FC description'], as_index=False).sum()
    normalized_combined = pd.concat(normalized_list).groupby(['Functional group', 'FC description'], as_index=False).sum()
    
    return non_normalized_combined, normalized_combined

In [130]:
samples_cog_dir = Path("/home/koala/my_projects/metagenome_metabolic_profiling/data/samples")
cog_def_file = "/home/koala/my_projects/metagenome_metabolic_profiling/data/cog-24.def.tab"
cog_fun_file = "/home/koala/my_projects/metagenome_metabolic_profiling/data/cog-24.fun.edited.tab"

non_normalized_combined, normalized_combined = process_all_samples(samples_cog_dir, cog_def_file, cog_fun_file)

Processing samplex from file: /home/koala/my_projects/metagenome_metabolic_profiling/data/samples/samplex_protein-cog.tsv
Processing sampley from file: /home/koala/my_projects/metagenome_metabolic_profiling/data/samples/sampley_protein-cog.tsv


In [131]:
non_normalized_combined

Unnamed: 0,Functional group,FC description,samplex,sampley
0,Cellular processes and signaling,Cell wall/membrane/envelope biogenesis,3.0,3.0
1,Cellular processes and signaling,Defense mechanisms,1.0,1.0
2,Cellular processes and signaling,"Posttranslational modification, protein turnov...",3.0,3.0
3,Information storage and processing,Transcription,2.0,2.0
4,Information storage and processing,"Translation, ribosomal structure and biogenesis",3.0,2.5
5,Metabolism,Amino acid transport and metabolism,0.0,0.5
6,Metabolism,Carbohydrate transport and metabolism,1.0,1.0
7,Metabolism,Coenzyme transport and metabolism,4.0,3.0
8,Poorly characterized,General function prediction only,0.0,1.0


In [132]:
normalized_combined

Unnamed: 0,Functional group,FC description,samplex,sampley
0,Cellular processes and signaling,Cell wall/membrane/envelope biogenesis,0.176471,0.176471
1,Cellular processes and signaling,Defense mechanisms,0.058824,0.058824
2,Cellular processes and signaling,"Posttranslational modification, protein turnov...",0.176471,0.176471
3,Information storage and processing,Transcription,0.117647,0.117647
4,Information storage and processing,"Translation, ribosomal structure and biogenesis",0.176471,0.147059
5,Metabolism,Amino acid transport and metabolism,0.0,0.029412
6,Metabolism,Carbohydrate transport and metabolism,0.058824,0.058824
7,Metabolism,Coenzyme transport and metabolism,0.235294,0.176471
8,Poorly characterized,General function prediction only,0.0,0.058824


In [60]:
print(cog_def_df.columns)

Index(['COG ID', 'COG Functional category ID', 'COG name', 'Gene', 'Pathway',
       'PubMed ID', 'PDB ID'],
      dtype='object')


In [61]:
# Check and clean column names
print("Column names in cog_def_df:", cog_def_df.columns)

# Ensure column names are stripped of any leading/trailing spaces
cog_def_df.columns = cog_def_df.columns.str.strip()

# Try fetching the functional category again
functional_categories = cog_def_df.loc[cog_def_df['COG ID'] == 'COG0190', 'COG Functional category ID'].values
print(f"category: {functional_categories}")

Column names in cog_def_df: Index(['COG ID', 'COG Functional category ID', 'COG name', 'Gene', 'Pathway',
       'PubMed ID', 'PDB ID'],
      dtype='object')
category: ['H']


# P.3.5. Script 1.5

In [139]:
import pandas as pd
from collections import defaultdict

# Load cog-24.def.tab file (COG ID to Functional Category mapping)
def load_cog_def(cog_def_file):
    cog_def_df = pd.read_csv(
        cog_def_file, 
        sep='\t', 
        names=['COG ID', 'COG Functional category ID', 'COG name', 'Gene', 'Pathway', 'PubMed ID', 'PDB ID']
        )
    cog_def_df['COG ID'] = cog_def_df['COG ID'].str.strip()  # Strip any spaces
    return cog_def_df

# Load cog-24.fun.edited.tab file (Functional Category descriptions and groups)
def load_cog_fun(cog_fun_file):
    cog_fun_df = pd.read_csv(
        cog_fun_file, 
        sep='\t', 
        names=['COG Functional category ID', 'Functional group', 'RGB color', 'FC description']
        )
    return cog_fun_df

# Load sample-specific protein-COG mappings
def load_sample_protein_cog(sample_protein_cog_file):
    sample_cog_df = pd.read_csv(sample_protein_cog_file, sep='\t', header=None, names=['Protein', 'COG'])
    sample_cog_df['COG'] = sample_cog_df['COG'].str.strip()
    sample_cog_df['COG'] = sample_cog_df['COG'].str.replace('COG:', '')  # Clean up COG prefixes
    return sample_cog_df

# Handle combined functional categories (like EHJQ, etc.)
def map_combined_categories(cog_categories):
    return list(cog_categories)  # Split the string into individual categories (like EHJQ -> ['E', 'H', 'J', 'Q'])

# Function to summarize the functional categories for a given sample
def summarize_functional_categories(sample_cog_df, cog_def_df, cog_fun_df):
    category_counts = defaultdict(int)  # Count occurrences of each functional category
    pathway_counts = defaultdict(int) # Count occurrences of each Pathway
    
    for cog in sample_cog_df['COG']:
        # Find the functional category/ies for the COG
        # print(f"Processing COG: {cog}")
        functional_categories = cog_def_df.loc[cog_def_df['COG ID'] == cog, 'COG Functional category ID'].values
        pathways = cog_def_df.loc[cog_def_df['COG ID'] == cog, 'Pathway'].values[0]
        
        # Handle case when no category is found
        if len(functional_categories) == 0:
            print(f"COG ID {cog} not found in cog_def_df.")
            continue
        
        # print(f"Functional categories found: {functional_categories}")
        functional_categories = functional_categories[0]  # Extract the category
        # print(f"Functional categories found: {map_combined_categories(functional_categories)}")
        # print(f"Number of Functional categories found: {len(map_combined_categories(functional_categories))}")
        
        # Get the number of categories for this COG
        num_categories = len(map_combined_categories(functional_categories))
        # Handle combined categories (split them into individual letters, e.g., 'ER' -> ['E', 'R'])
        for category in map_combined_categories(functional_categories):
            # Add 1 / num_categories to each category
            category_counts[category] += 1 / num_categories

    # Collect rows for the summary DataFrame
    summary_rows = []
    
    for category, count in category_counts.items():
        group = cog_fun_df.loc[cog_fun_df['COG Functional category ID'] == category, 'Functional group'].values[0]
        description = cog_fun_df.loc[cog_fun_df['COG Functional category ID'] == category, 'FC description'].values[0]
        color = cog_fun_df.loc[cog_fun_df['COG Functional category ID'] == category, 'RGB color'].values[0]
        # Collect the row as a dictionary
        summary_rows.append(
            {
                'Functional category': category,
                'Functional group': group,
                'Count': count,
                'FC description': description,
                'Color': color
            }
                            )

    # create the summary DataFrame with category, group, and count columns
    summary_df = pd.DataFrame(summary_rows)
    
    return summary_df

# Main function to process each sample and summarize the functional categories
def process_sample(sample, sample_protein_cog_file, cog_def_file, cog_fun_file):
    # Load the data
    cog_def_df = load_cog_def(cog_def_file)
    cog_fun_df = load_cog_fun(cog_fun_file)
    sample_cog_df = load_sample_protein_cog(sample_protein_cog_file)

    # Summarize the functional categories
    summary_df = summarize_functional_categories(sample_cog_df, cog_def_df, cog_fun_df)
    # Dataframe for comparison between samples
    summary_df['Functional group'] = summary_df['Functional group'].replace(
        {
            1: 'Information storage and processing',
            2: 'Cellular processes and signaling',
            3: 'Metabolism',
            4: 'Poorly characterized',
        }
            )
    # Group the rows with the same 'Functional group'
    summary_df = summary_df.sort_values(by='Functional group').reset_index(drop=True)
    # Create non-normalized DataFrame
    non_normalized_df = summary_df[['Functional group', 'FC description', 'Count']].copy()
    non_normalized_df = non_normalized_df.rename(columns={'Count': sample})
    # Create normalized DataFrame
    ## Normalize the counts to relative abundance
    normalized_df = summary_df[['Functional group', 'FC description', 'Count']].copy()
    normalized_df[sample] = normalized_df['Count'] / normalized_df['Count'].sum()
    normalized_df.drop(columns=['Count'], inplace=True)
    # Return DataFrames
    return non_normalized_df, normalized_df

def process_all_samples(samples_cog_dir, output_dir, cog_def_file, cog_fun_file):
    # Initialize lists to hold DataFrames
    non_normalized_list = []
    normalized_list = []
    
    # Iterate over files in the sample directory
    for item in samples_cog_dir.iterdir():
        if item.is_file() and item.suffix == ".tsv":  # Filter for .tsv files
            sample_protein_cog_file = item
            sample = item.stem.split("_")[0]  # Extract sample name from filename
            print(f"Processing {sample} from file: {item}")
            
            # Process the sample
            non_normalized_df, normalized_df = process_sample(sample, sample_protein_cog_file, cog_def_file, cog_fun_file)
            
            # Append the results to the list
            non_normalized_list.append(non_normalized_df)
            normalized_list.append(normalized_df)
    
    # Concatenate and merge all DataFrames by 'Functional group' and 'FC description'
    non_normalized_combined = pd.concat(non_normalized_list).groupby(['Functional group', 'FC description'], as_index=False).sum()
    normalized_combined = pd.concat(normalized_list).groupby(['Functional group', 'FC description'], as_index=False).sum()
    
    # save
    non_normalized_combined.to_csv(output_dir / "non_normalized.csv", index=False)
    normalized_combined.to_csv(output_dir / "normalized.csv", index=False)
    print(f"\nNon normalized CSV saved to: {output_dir / 'non_normalized.csv'}")
    print(f"Normalized CSV saved to: {output_dir / 'normalized.csv'}")
    
    return non_normalized_combined, normalized_combined

In [140]:
samples_cog_dir = Path("/home/koala/my_projects/metagenome_metabolic_profiling/data/samples")
output_dir = Path("/home/koala/my_projects/metagenome_metabolic_profiling/data/samples_fp")
cog_def_file = "/home/koala/my_projects/metagenome_metabolic_profiling/data/cog-24.def.tab"
cog_fun_file = "/home/koala/my_projects/metagenome_metabolic_profiling/data/cog-24.fun.edited.tab"

non_normalized_combined, normalized_combined = process_all_samples(samples_cog_dir, output_dir, cog_def_file, cog_fun_file)

non_normalized_combined

Processing samplex from file: /home/koala/my_projects/metagenome_metabolic_profiling/data/samples/samplex_protein-cog.tsv
Processing sampley from file: /home/koala/my_projects/metagenome_metabolic_profiling/data/samples/sampley_protein-cog.tsv

Non normalized CSV saved to: /home/koala/my_projects/metagenome_metabolic_profiling/data/samples_fp/non_normalized.csv
Normalized CSV saved to: /home/koala/my_projects/metagenome_metabolic_profiling/data/samples_fp/normalized.csv


Unnamed: 0,Functional group,FC description,samplex,sampley
0,Cellular processes and signaling,Cell wall/membrane/envelope biogenesis,3.0,3.0
1,Cellular processes and signaling,Defense mechanisms,1.0,1.0
2,Cellular processes and signaling,"Posttranslational modification, protein turnov...",3.0,3.0
3,Information storage and processing,Transcription,2.0,2.0
4,Information storage and processing,"Translation, ribosomal structure and biogenesis",3.0,2.5
5,Metabolism,Amino acid transport and metabolism,0.0,0.5
6,Metabolism,Carbohydrate transport and metabolism,1.0,1.0
7,Metabolism,Coenzyme transport and metabolism,4.0,3.0
8,Poorly characterized,General function prediction only,0.0,1.0


In [135]:
normalized_combined

Unnamed: 0,Functional group,FC description,samplex,sampley
0,Cellular processes and signaling,Cell wall/membrane/envelope biogenesis,0.176471,0.176471
1,Cellular processes and signaling,Defense mechanisms,0.058824,0.058824
2,Cellular processes and signaling,"Posttranslational modification, protein turnov...",0.176471,0.176471
3,Information storage and processing,Transcription,0.117647,0.117647
4,Information storage and processing,"Translation, ribosomal structure and biogenesis",0.176471,0.147059
5,Metabolism,Amino acid transport and metabolism,0.0,0.029412
6,Metabolism,Carbohydrate transport and metabolism,0.058824,0.058824
7,Metabolism,Coenzyme transport and metabolism,0.235294,0.176471
8,Poorly characterized,General function prediction only,0.0,0.058824


# P.4. Script 2

In [None]:
import pandas as pd
from collections import defaultdict

# Load cog-24.def.tab file (COG ID to Functional Category mapping)
def load_cog_def(cog_def_file):
    cog_def_df = pd.read_csv(
        cog_def_file, 
        sep='\t', 
        names=['COG ID', 'COG Functional category ID', 'COG name', 'Gene', 'Pathway', 'PubMed ID', 'PDB ID']
        )
    cog_def_df['COG ID'] = cog_def_df['COG ID'].str.strip()  # Strip any spaces
    return cog_def_df

# Load cog-24.fun.edited.tab file (Functional Category descriptions and groups)
def load_cog_fun(cog_fun_file):
    cog_fun_df = pd.read_csv(
        cog_fun_file, 
        sep='\t', 
        names=['COG Functional category ID', 'Functional group', 'RGB color', 'FC description']
        )
    return cog_fun_df

# Load sample-specific protein-COG mappings
def load_sample_protein_cog(sample_protein_cog_file):
    sample_cog_df = pd.read_csv(sample_protein_cog_file, sep='\t', header=None, names=['Protein', 'COG'])
    sample_cog_df['COG'] = sample_cog_df['COG'].str.strip()
    sample_cog_df['COG'] = sample_cog_df['COG'].str.replace('COG:', '')  # Clean up COG prefixes
    return sample_cog_df

# Handle combined functional categories (like EHJQ, etc.)
def map_combined_categories(cog_categories):
    return list(cog_categories)  # Split the string into individual categories (like EHJQ -> ['E', 'H', 'J', 'Q'])

# Function to summarize the functional categories for a given sample
def summarize_functional_categories(sample_cog_df, cog_def_df, cog_fun_df):
    category_counts = defaultdict(int)  # Count occurrences of each functional category
    pathway_counts = defaultdict(int) # Count occurrences of each Pathway
    
    for cog in sample_cog_df['COG']:
        # Find the functional category/ies for the COG
        print(f"Processing COG: {cog}")
        functional_categories = cog_def_df.loc[cog_def_df['COG ID'] == cog, 'COG Functional category ID'].values
        pathways = cog_def_df.loc[cog_def_df['COG ID'] == cog, 'Pathway'].values[0]
        
        # Handle case when no category is found
        if len(functional_categories) == 0:
            print(f"COG ID {cog} not found in cog_def_df.")
            continue
        
        # print(f"Functional categories found: {functional_categories}")
        functional_categories = functional_categories[0]  # Extract the category
        # print(f"Functional categories found: {map_combined_categories(functional_categories)}")
        # print(f"Number of Functional categories found: {len(map_combined_categories(functional_categories))}")
        
        # Get the number of categories for this COG
        num_categories = len(map_combined_categories(functional_categories))
        # Handle combined categories (split them into individual letters, e.g., 'ER' -> ['E', 'R'])
        for category in map_combined_categories(functional_categories):
            # Add 1 / num_categories to each category
            category_counts[category] += 1 / num_categories

    # Collect rows for the summary DataFrame
    summary_rows = []
    
    for category, count in category_counts.items():
        group = cog_fun_df.loc[cog_fun_df['COG Functional category ID'] == category, 'Functional group'].values[0]
        description = cog_fun_df.loc[cog_fun_df['COG Functional category ID'] == category, 'FC description'].values[0]
        color = cog_fun_df.loc[cog_fun_df['COG Functional category ID'] == category, 'RGB color'].values[0]
        # Collect the row as a dictionary
        summary_rows.append(
            {
                'Functional category': category,
                'Functional group': group,
                'Count': count,
                'FC description': description,
                'Color': color
            }
                            )

    # create the summary DataFrame with category, group, and count columns
    summary_df = pd.DataFrame(summary_rows)
    
    return summary_df

# Main function to process each sample and summarize the functional categories
def process_sample(sample, sample_protein_cog_file, cog_def_file, cog_fun_file):
    # Load the data
    cog_def_df = load_cog_def(cog_def_file)
    cog_fun_df = load_cog_fun(cog_fun_file)
    sample_cog_df = load_sample_protein_cog(sample_protein_cog_file)

    # Summarize the functional categories
    summary_df = summarize_functional_categories(sample_cog_df, cog_def_df, cog_fun_df)
    # Dataframe for comparison between samples
    summary_df['Functional group'] = summary_df['Functional group'].replace(
        {
            1: 'Information storage and processing',
            2: 'Cellular processes and signaling',
            3: 'Metabolism',
            4: 'Poorly characterized',
        }
            )
    # Group the rows with the same 'Functional group'
    summary_df = summary_df.sort_values(by='Functional group').reset_index(drop=True)
    # Create non-normalized DataFrame
    non_normalized_df = summary_df[['Functional group', 'FC description', 'Count']].copy()
    non_normalized_df = non_normalized_df.rename(columns={'Count': sample})
    # Create normalized DataFrame
    ## Normalize the counts to relative abundance
    normalized_df = summary_df[['Functional group', 'FC description', 'Count']].copy()
    normalized_df[sample] = normalized_df['Count'] / normalized_df['Count'].sum()
    normalized_df.drop(columns=['Count'], inplace=True)
    # Return DataFrames
    return non_normalized_df, normalized_df

# Example usage:
sample = "samplex"
# sample_protein_cog_file = "/home/koala/my_projects/metagenome_metabolic_profiling/data/samples/samplex_protein-cog.tsv"
sample_protein_cog_file = "/home/koala/my_projects/metagenome_metabolic_profiling/data/samples/sampley_protein-cog.tsv"
cog_def_file = "/home/koala/my_projects/metagenome_metabolic_profiling/data/cog-24.def.tab"
cog_fun_file = "/home/koala/my_projects/metagenome_metabolic_profiling/data/cog-24.fun.edited.tab"
non_normalized_df, normalized_df = process_sample(sample, sample_protein_cog_file, cog_def_file, cog_fun_file)
non_normalized_df