In [None]:
import pandas as pd

## I used this to quickly design a mock community with species present in my real dataset at different levels of abundance.  

# CONFIG
input_file = "full_training_set/species_read_counts_summary.tsv"

# Read the tab-separated species summary. The column labels are supplied
# here rather than taken from the file, so every row of the file is parsed
# as data (header=None is what pandas applies whenever `names` is given).
summary_columns = ["Species", "Taxon", "reads", "Proportion"]
df = pd.read_csv(input_file, sep="\t", header=None, names=summary_columns)

# Step 1: Stratified selection
# The strata are defined by rank, so rank the species by read count explicitly
# instead of trusting the row order of the input file. The sort is stable:
# species with equal read counts keep their original relative order, which
# means a file that is already sorted by descending read count yields exactly
# the same strata as plain positional slicing would.
ranked_species = df.sort_values("reads", ascending=False, kind="stable")

# Rank boundaries of the strata (end positions are exclusive)
HIGH_STRATUM_END = 20
MEDIUM_STRATUM_END = 100

# Number of species taken from the top of each stratum
N_HIGH = 10
N_MEDIUM = 5
N_LOW = 5

high_abundance = ranked_species.iloc[:HIGH_STRATUM_END]
medium_abundance = ranked_species.iloc[HIGH_STRATUM_END:MEDIUM_STRATUM_END]
low_abundance = ranked_species.iloc[MEDIUM_STRATUM_END:]

# Select species per stratum (the most abundant ones within each stratum)
high_selected = high_abundance.head(N_HIGH)
medium_selected = medium_abundance.head(N_MEDIUM)
low_selected = low_abundance.head(N_LOW)

# Combine selected species into one table with a fresh 0..n-1 index
selected_bacteria = pd.concat(
    [high_selected, medium_selected, low_selected],
    ignore_index=True,
)

print(selected_bacteria)

# Step 3: Rescale proportions to sum 98%
# The selected species are renormalised among themselves and scaled to 98,
# which leaves 2% for the fixed virus entries added afterwards. Values are
# rounded to two decimals, so the rounded column may differ from 98 by a
# small rounding error.
total_selected_prop = selected_bacteria['Proportion'].sum()
selected_bacteria['New_Proportion'] = (
    selected_bacteria['Proportion']
    .div(total_selected_prop)
    .mul(98)
    .round(2)
)

# Step 4: Add fixed viruses
# Hand-picked virus entries and their share (in percent) of the mock
# community; the five shares add up to 2.00.
virus_shares = {
    "Swine Influenza A Virus (IAV-S)": 0.14,
    "Betaarterivirus suid 1 (PRRSV-1)": 0.14,
    "Betacoronavirus 1": 0.30,
    "Porcine parvovirus": 0.06,
    "Suid betaherpesvirus 2": 1.36,
}

# One [species, taxon, proportion] row per virus, all labelled "Virus"
virus_data = [[species, "Virus", share] for species, share in virus_shares.items()]

# Same column layout as the bacterial table that gets exported
virus_df = pd.DataFrame(
    virus_data,
    columns=["Species", "Taxon", "New_Proportion"],
).round({"New_Proportion": 2})

# Step 5: Merge and export
# Keep only the columns shared with the virus table, then stack bacteria and
# viruses into a single table with a fresh 0..n-1 index.
bacteria_part = selected_bacteria.loc[:, ['Species', 'Taxon', 'New_Proportion']]
final_df = pd.concat([bacteria_part, virus_df], ignore_index=True)

# Optional sanity check: report the total so it can be compared against 100.
# Nothing is enforced here; the value is only printed.
total_prop_sum = final_df['New_Proportion'].sum()
print(f"Total proportion sum: {total_prop_sum:.2f}%")

# Write the mock community as a tab-separated file without the index column
final_df.to_csv(
    "full_training_set/mock_community_proportions.tsv",
    sep="\t",
    index=False,
)

print("Mock community proportions saved as 'full_training_set/mock_community_proportions.tsv'")


                         Species    Taxon  reads  Proportion
0   Jeotgalicoccus sp. ATCC 8456   946435   9884   22.658811
1       Lactobacillus amylovorus     1604   7982   18.298526
2                Segatella copri   165179   5494   12.594851
3          Glaesserella parasuis      738   4456   10.215263
4            Cutibacterium acnes     1747   2302    5.277275
5    Limosilactobacillus reuteri     1598   1997    4.578070
6              Segatella hominis  2518605   1540    3.530410
7          Agathobacter rectalis    39491    581    1.331927
8          Dorea formicigenerans    39486    492    1.127897
9           Megasphaera elsdenii      907    431    0.988056
10         Ruoffia tabacinasalis    87458    251    0.575411
11    Streptococcus acidominimus     1326    210    0.481419
12     Corynebacterium stationis     1705    202    0.463080
13          Neisseria shayeganii   607712    202    0.463080
14     Fusobacterium necrophorum      859    142    0.325531
15     Streptococcus sp.