# UNITE
## This script loads, normalizes, and filters abundance data obtained with the UNITE reference library

In [47]:
import pandas as pd
pd.options.mode.chained_assignment = None

In [48]:
# Batch identifier: selects the input folder and is reused to tag the output tables.
batch="batch2_20231114"
# OTU abundance table of the batch.
# NOTE(review): the table is read from the "SSU" sub-folder although this notebook labels
# its results as UNITE -- confirm this is the intended input file.
abund_tb=pd.read_excel(f"../data/{batch}/SSU/OTU.xlsx")
# Trim summary of the same batch (sample names and trimmed read counts are read from it later).
trim_tbl=pd.read_excel(f"../data/{batch}/Trim_summary.xlsx")
# Table, shared by all batches, that names the control sample of each batch.
control_sample_tbl=pd.read_excel("../data/conrol_sample_all_batches.xlsx")

  warn("Workbook contains no default style, apply openpyxl's default")


In [49]:
# Preview the first five rows of the loaded abundance table.
abund_tb.head(5)

Unnamed: 0,ID,Name,Taxonomy,Combined Abundance,Min,Max,Mean,Median,Std,BFTMS23111401 Abundance,BFTMS23111402 Abundance,BFTMS23111407 Abundance,BFTMS23111406 Abundance,BFTMS23111408 Abundance,BFTMS23111409 Abundance,Sequence
0,Pseudomonas amygdali pv. morsprunorum; AB00144...,AB001445.1.1538,Bacteria; Proteobacteria; Gammaproteobacteria;...,674,4,166,112.333333,132.0,60.327992,166,139,84,125,156,4,TACACACGTGCTACAATGGTCGGTACAGAGGGTTGCCAAGCCGCGA...
1,Dickeya phage phiDP10.3; KM209255.204.1909,KM209255.204.1909,Bacteria; Proteobacteria; Gammaproteobacteria;...,1552,38,372,258.666667,284.0,124.075246,215,251,372,317,359,38,TACACACGTGCTACAATGGCGCATACAAAGAGAAGCGACCTCGCGA...
2,Streptococcus porcinus; AB002523.1.1496,AB002523.1.1496,Bacteria; Firmicutes; Bacilli; Lactobacillales...,224,0,224,37.333333,0.0,91.447617,0,0,0,0,0,224,TATGACCTGGGCTACACACGTGCTACAATGGTTGGTACAACGAGTC...
3,Pseudomonas sp. NR25; JN082749.1.1447,JN082749.1.1447,Bacteria; Proteobacteria; Gammaproteobacteria;...,37,0,17,6.166667,3.0,7.167054,5,1,13,1,17,0,AATGCCTAGGAATCTGCCTGGTAGTGGGGGACAACGTTTCGAAAGG...
4,Bradyrhizobium sp. ORS 3635; JN085489.1.1385,JN085489.1.1385,Bacteria; Proteobacteria; Alphaproteobacteria;...,51,0,32,8.5,2.0,12.83355,32,0,15,0,2,2,ACCCTGGTAGTCCACGCCGTAAACGATGAATGCCAGCCGTTAGTGG...


In [50]:
# Strip the keyword "Abundance" and every space from the column names, so that e.g.
# "BFTMS23111401 Abundance" becomes "BFTMS23111401" and "Combined Abundance" becomes
# "Combined". The "BFTMS" prefix of the sample columns is kept.
abund_tb.columns = [col.replace("Abundance", '').replace(" ", '') for col in abund_tb.columns]

# Remove the literal " (single)" marker from the sample names in the trim summary.
# regex=False is set explicitly: interpreted as a regular expression the parentheses form
# a capture group, so the pattern would match " single" and leave " (single)" untouched
# (regex matching was the default before pandas 2.0).
trim_tbl["Name"]=trim_tbl["Name"].str.replace(' (single)','',regex=False)

# Look up the control sample specified for this batch.
control_rows=control_sample_tbl.loc[control_sample_tbl['batch'] == batch, 'control_sample']
if control_rows.empty:
    # Fail with an explicit message instead of an opaque IndexError from .iloc[0].
    raise ValueError(f"No control sample is listed for batch {batch!r} in the control sample table")
control=control_rows.iloc[0]

# Sample names as strings; they are used below to address the abundance columns.
samples=trim_tbl["Name"].astype(str).tolist()

# Tag every row with the reference library and the batch it came from.
abund_tb['reference_lib']='UNITE'
abund_tb['batch']=batch

## mark species with high risk of false positive detection in a column "false_pos_prone"
#### for UNITE it includes []

In [51]:
## mark species with high risk of false positive detection in a column "false_pos_prone"

# Split the ID column at the first ';' into the species group name and the species ID.
# n=1 caps the result at two columns, so an ID that contains a further ';' no longer
# breaks the two-column assignment.
# NOTE(review): the part after ';' keeps its leading space (e.g. " AB002523.1.1496") --
# confirm whether it should be stripped.
abund_tb[['species group', 'species id']] = abund_tb['ID'].str.split(';', n=1, expand=True)
# Default every row to 'No'; species with a high risk of false positive detection are
# switched to 'Yes' afterwards.
abund_tb['false_pos_prone']='No'


def mark_species(df, ID_column, false_pos_tag, species_list):
    """Flag rows whose ``ID_column`` value starts with any name in ``species_list``.

    Matching rows get the string 'Yes' written into the ``false_pos_tag`` column;
    all other rows are left as they are.

    Args:
        df: DataFrame to update. It is modified in place.
        ID_column: Name of the string column that is tested for the prefixes.
        false_pos_tag: Name of the column that receives 'Yes' for matching rows.
        species_list: Iterable of prefixes (species names) to look for.

    Returns:
        The same ``df`` object that was passed in.
    """
    for name in species_list:
        # na=False: a missing value in ID_column counts as "no match". Without it the
        # mask contains NaN and .loc refuses to index with it.
        matches = df[ID_column].str.startswith(name, na=False)
        df.loc[matches, false_pos_tag] = 'Yes'
    return df

# Name prefixes of the species regarded as prone to false positive detection.
# NOTE(review): '000000' is presumably a placeholder that matches no species -- confirm.
false_pos_prone_spicies=['000000']
# Set 'false_pos_prone' to 'Yes' for every row whose species group starts with a listed prefix.
abund_tb = mark_species(
    df=abund_tb,
    ID_column='species group',
    false_pos_tag='false_pos_prone',
    species_list=false_pos_prone_spicies,
)

## data normalization & correction for control & S:N

In [52]:
## data normalization and correction for control
# Constant added to both the numerator and the denominator of the signal-to-noise ratio.
# NOTE(review): at 0.00 it does not protect against division by zero -- rows whose control
# abundance is 0 get an infinite (or NaN) S:N.
resid_val=0.00

# Scale the raw abundance of every sample by that sample's trimmed read count.
# NOTE(review): the counts are multiplied by (trimmed reads / 1e6); a per-million-reads
# normalization would divide by that factor instead -- confirm the intended direction.
for samp in samples:
    # Trimmed read count of this sample, looked up by name in the trim summary.
    trimmed_reads=trim_tbl.loc[trim_tbl["Name"]==samp, "Trimmed sequences"].values[0]
    abund_tb[samp+"_Normalized"]=abund_tb[samp]*trimmed_reads/1000000

# Background level: the normalized abundance of the control sample.
control_normalized=abund_tb[control+"_Normalized"]

for samp in samples:
    # Correct the normalized abundance by subtracting the control abundance.
    abund_tb[samp+"_Normalized_corrected"]=abund_tb[samp+"_Normalized"]-control_normalized
    # Signal to noise: corrected abundance relative to the control abundance.
    abund_tb[samp+"_S:N"]=(abund_tb[samp+"_Normalized_corrected"]+resid_val)/(control_normalized+resid_val)

## pick one sample by name and review the accuracy of the normalization and background corrections

In [53]:
## check one sample and compare it against the results from manual analysis
#sample='BFTMS23111410'
#sample_df=abund_tb[['ID', sample, control,sample+'_Normalized', control+'_Normalized',sample+'_Normalized_corrected', sample+'_S:N']]
#sample_df.sort_values(by=sample+'_Normalized_corrected', ascending=False)
#sample_df.head(5)

In [54]:
## UNITE strict
# For every sample: take the 15 taxa with the highest control-corrected normalized
# abundance, keep those that pass the strict thresholds, and write all hits to one CSV.

standard_col_names=['ID','Name','Taxonomy','species group', 'species id','reference_lib','batch','false_pos_prone','Combined','Min','Max','Mean','Median','Std', 'sample_abund', 'control_abund','sample_normalized', 'control_normalized','normalized_corrected', 'S:N', 'sample', 'control']
# Columns that are identical for every sample (everything before the per-sample measurements).
shared_columns=standard_col_names[:standard_col_names.index('sample_abund')]

# Hits are collected per sample and concatenated once after the loop; this avoids growing
# a DataFrame by repeated concat onto an initially empty frame.
selected_frames=[]

for sample in samples:
    select_columns=shared_columns+[sample, control,sample+'_Normalized', control+'_Normalized',sample+'_Normalized_corrected', sample+'_S:N']
    # Explicit copy, so the columns added below never write into a selection of abund_tb.
    sample_abund=abund_tb[select_columns].copy()
    sample_abund['sample']=sample
    sample_abund['control']=control
    # Replace the sample-specific column names with the common layout.
    sample_abund.columns=standard_col_names
    # Only the 15 rows with the highest corrected abundance are candidates.
    sample_abund=sample_abund.sort_values(by='normalized_corrected', ascending=False).head(15)

    # Background check: the control has no reads, or the signal-to-noise ratio is at least 1000.
    clean_background=(sample_abund['control_abund']==0) | (sample_abund['S:N']>=1000)
    # Species flagged as false-positive prone need at least 1000 raw reads.
    prone_hit=((sample_abund['false_pos_prone']=='Yes') &
               (sample_abund['sample_abund']>=1000) &
               clean_background &
               (sample_abund['normalized_corrected']>=0))
    # All other species: at least 10 raw reads and a corrected abundance of at least 5.
    regular_hit=((sample_abund['false_pos_prone']=='No') &
                 (sample_abund['sample_abund']>=10) &
                 clean_background &
                 (sample_abund['normalized_corrected']>=5))
    select_ID=sample_abund[prone_hit | regular_hit]

    if not select_ID.empty:
        selected_frames.append(select_ID)

if selected_frames:
    select_IDs=pd.concat(selected_frames, axis=0)
else:
    # No hit in any sample: still write a table that carries the standard header.
    select_IDs=pd.DataFrame(columns=standard_col_names)

select_IDs['mode']='UNITE_strict'
select_IDs.to_csv('../output_tables/'+batch+'_UNITE-strict.csv')




In [55]:
## UNITE moderate
# For every sample: take the 15 taxa with the highest control-corrected normalized
# abundance, keep those that pass the moderate thresholds, and write all hits to one CSV.

standard_col_names=['ID','Name','Taxonomy','species group', 'species id','reference_lib','batch','false_pos_prone','Combined','Min','Max','Mean','Median','Std', 'sample_abund', 'control_abund','sample_normalized', 'control_normalized','normalized_corrected', 'S:N', 'sample', 'control']
# Columns that are identical for every sample (everything before the per-sample measurements).
shared_columns=standard_col_names[:standard_col_names.index('sample_abund')]

# Hits are collected per sample and concatenated once after the loop; this avoids growing
# a DataFrame by repeated concat onto an initially empty frame.
selected_frames=[]

for sample in samples:
    select_columns=shared_columns+[sample, control,sample+'_Normalized', control+'_Normalized',sample+'_Normalized_corrected', sample+'_S:N']
    # Explicit copy, so the columns added below never write into a selection of abund_tb.
    sample_abund=abund_tb[select_columns].copy()
    sample_abund['sample']=sample
    sample_abund['control']=control
    # Replace the sample-specific column names with the common layout.
    sample_abund.columns=standard_col_names
    # Only the 15 rows with the highest corrected abundance are candidates.
    sample_abund=sample_abund.sort_values(by='normalized_corrected', ascending=False).head(15)

    # Background check: the control has no reads, or the signal-to-noise ratio is at least 1000.
    clean_background=(sample_abund['control_abund']==0) | (sample_abund['S:N']>=1000)
    # Species flagged as false-positive prone need at least 1000 raw reads.
    prone_hit=((sample_abund['false_pos_prone']=='Yes') &
               (sample_abund['sample_abund']>=1000) &
               clean_background &
               (sample_abund['normalized_corrected']>=0))
    # All other species: at least 7 raw reads and a corrected abundance of at least 3.
    regular_hit=((sample_abund['false_pos_prone']=='No') &
                 (sample_abund['sample_abund']>=7) &
                 clean_background &
                 (sample_abund['normalized_corrected']>=3))
    select_ID=sample_abund[prone_hit | regular_hit]

    if not select_ID.empty:
        selected_frames.append(select_ID)

if selected_frames:
    select_IDs=pd.concat(selected_frames, axis=0)
else:
    # No hit in any sample: still write a table that carries the standard header.
    select_IDs=pd.DataFrame(columns=standard_col_names)

select_IDs['mode']='UNITE_moderate'
select_IDs.to_csv('../output_tables/'+batch+'_UNITE-moderate.csv')




In [56]:
## UNITE loose
# For every sample: take the 15 taxa with the highest control-corrected normalized
# abundance, keep those that pass the loose thresholds, and write all hits to one CSV.

standard_col_names=['ID','Name','Taxonomy','species group', 'species id','reference_lib','batch','false_pos_prone','Combined','Min','Max','Mean','Median','Std', 'sample_abund', 'control_abund','sample_normalized', 'control_normalized','normalized_corrected', 'S:N', 'sample', 'control']
# Columns that are identical for every sample (everything before the per-sample measurements).
shared_columns=standard_col_names[:standard_col_names.index('sample_abund')]

# Hits are collected per sample and concatenated once after the loop; this avoids growing
# a DataFrame by repeated concat onto an initially empty frame.
selected_frames=[]

for sample in samples:
    select_columns=shared_columns+[sample, control,sample+'_Normalized', control+'_Normalized',sample+'_Normalized_corrected', sample+'_S:N']
    # Explicit copy, so the columns added below never write into a selection of abund_tb.
    sample_abund=abund_tb[select_columns].copy()
    sample_abund['sample']=sample
    sample_abund['control']=control
    # Replace the sample-specific column names with the common layout.
    sample_abund.columns=standard_col_names
    # Only the 15 rows with the highest corrected abundance are candidates.
    sample_abund=sample_abund.sort_values(by='normalized_corrected', ascending=False).head(15)

    control_is_empty=sample_abund['control_abund']==0
    # Species flagged as false-positive prone: at least 1000 raw reads and, unless the
    # control has no reads, a signal-to-noise ratio of at least 1000.
    prone_hit=((sample_abund['false_pos_prone']=='Yes') &
               (sample_abund['sample_abund']>=1000) &
               (control_is_empty | (sample_abund['S:N']>=1000)) &
               (sample_abund['normalized_corrected']>=0))
    # All other species: at least 4 raw reads, a corrected abundance of at least 1.5 and,
    # unless the control has no reads, a signal-to-noise ratio of at least 3.
    regular_hit=((sample_abund['false_pos_prone']=='No') &
                 (sample_abund['sample_abund']>=4) &
                 (control_is_empty | (sample_abund['S:N']>=3)) &
                 (sample_abund['normalized_corrected']>=1.5))
    select_ID=sample_abund[prone_hit | regular_hit]

    if not select_ID.empty:
        selected_frames.append(select_ID)

if selected_frames:
    select_IDs=pd.concat(selected_frames, axis=0)
else:
    # No hit in any sample: still write a table that carries the standard header.
    select_IDs=pd.DataFrame(columns=standard_col_names)

select_IDs['mode']='UNITE_loose'
select_IDs.to_csv('../output_tables/'+batch+'_UNITE-loose.csv')


