In [1]:
# Next, keep only species-level genomes.
import pandas as pd
import sys
import os
import matplotlib.pyplot as plt
import re 
from tqdm import tqdm
import seaborn as sns

In [2]:
# Load the genome taxonomy table; the first CSV row supplies the column names.
genome_metadata = pd.read_csv("data/taxonomy_info.csv", header=0)

def clean_species_name(name):
    """
    Remove taxonomic qualifiers and return the first two remaining words.

    Each whitespace-separated word is lower-cased and stripped of leading and
    trailing dots before being compared against the qualifier list, so
    "sp.", "sp" and "SP." are all treated as the same qualifier.

    Parameters
    ----------
    name : str
        Organism name, e.g. "Salmonella enterica subsp. enterica".

    Returns
    -------
    str
        The first two non-qualifier words joined by a single space. If fewer
        than two words remain, whatever remains is returned (possibly "").
    """
    # Qualifiers are stored WITHOUT dots because each word is normalised with
    # .strip(".") before the membership test; dotted entries could never match.
    specifiers = {"x", "sp", "var", "subsp", "f", "spp", "cf", "aff", "ex"}
    cleaned_words = [
        w for w in name.split() if w.lower().strip(".") not in specifiers
    ]
    # Slicing already copes with fewer than two words, so no length check is needed.
    return " ".join(cleaned_words[:2])

def get_lui_and_species(organism_name, accession_id):
    """
    Build the (LUI, species) pair for a single genome record.

    The LUI is the accession ID, an underscore, and the first two
    whitespace-separated words of the raw organism name concatenated with no
    separator. The species is the organism name run through
    clean_species_name.

    Returns a 2-tuple ``(lui, species)``.
    """
    # First two words of the unmodified organism name, joined without a space.
    org_short = "".join(organism_name.split()[:2])
    return f"{accession_id}_{org_short}", clean_species_name(organism_name)

def alphanum_only(s):
    """Return ``s`` with every character outside A-Z, a-z and 0-9 removed."""
    non_alphanumeric = re.compile(r'[^A-Za-z0-9]')
    return non_alphanumeric.sub('', s)

# Add LUI and species columns.
# The two input columns are iterated directly instead of using a row-wise
# DataFrame.apply(axis=1) followed by a zip(*...) unpack: that unpack does not
# yield two columns when the frame has no rows, and row-wise apply builds a
# Series per row for no benefit here.
lui_species_pairs = [
    get_lui_and_species(organism, accession)
    for organism, accession in zip(genome_metadata["Organism"], genome_metadata["ID"])
]
# Keep only ASCII letters and digits. This also drops the "_" that joins the
# accession to the name in the LUI and the space inside the species name.
genome_metadata["LUI"] = [alphanum_only(lui) for lui, _ in lui_species_pairs]
genome_metadata["Species"] = [alphanum_only(species) for _, species in lui_species_pairs]