In [8]:
library(tidyverse) 
library(dplyr)
library(ggplot2)
library(ggpubr)

# Move into the project's data directory; the files read further down are
# addressed with "./" paths relative to it. The working directory is shown
# before and after the change.
data_dir <- "/home/strawberry/Documents/Collaborations/sinusite_edwin/data"

print("Working directory before:")
getwd()

setwd(data_dir)
print("Working directory after:")
getwd()

# Earlier RefSeq-based reference (accession number <-> taxID, obtained by
# parsing the gff3 file provided by NCBI); kept commented out:
# RefseqTaxID <- read.delim("./RefseqTaxID.txt", h=T)
# head(RefseqTaxID)

# Load the SILVA reference table linking each sequence id to its taxonomy.
# `header = TRUE` is spelled out instead of `h=T`, which relied on partial
# argument matching and on `T` (a variable that can be reassigned).
SILVATaxID <- read.csv("./silva_taxonomy.csv", header = TRUE)

# Clean the 'id' column: drop everything from the first dot onwards.
# NOTE(review): ids that only differed after the dot become identical here,
# so 'id' may no longer be unique - confirm before joining on it.
SILVATaxID$id <- sub("\\..*", "", SILVATaxID$id)
head(SILVATaxID)


# TaxID full-lineage file: not needed with the SILVA database, whose table
# already links each id to the full taxonomy. Kept commented out:
# TaxID_FullLineage <- read.delim("./referencetable_taxonomy_RefseqNCBI_16S.txt", quote = "")
# head(TaxID_FullLineage)

# Load the metadata file, which includes read counts after quality check.
# `header = TRUE` is spelled out instead of `h=T` (partial argument matching
# plus the reassignable `T`).
metadata_full_clean <- read.delim("./metadata_full_clean.tsv", header = TRUE)
head(metadata_full_clean)

[1] "Working directory before:"


[1] "Working directory after:"


Unnamed: 0_level_0,id,Kingdom,Phylum,Class,Order,Family,Genus,Species
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1,AB000393,Bacteria,Pseudomonadota,Gammaproteobacteria,Enterobacterales,Vibrionaceae,Vibrio,Vibrio halioticoli
2,AB000480,Bacteria,Pseudomonadota,Alphaproteobacteria,Rhodospirillales,Rhodospirillaceae,Insolitispirillum,Insolitispirillum peregrinum
3,AB001783,Bacteria,Chlamydiota,Chlamydiia,Chlamydiales,Chlamydiaceae,Chlamydia,Chlamydia abortus
4,FZ423313,Bacteria,Bacillota,Bacilli,Bacillales,Bacillaceae,Anoxybacillus,unidentified
5,HG529990,Bacteria,Bacteroidota,Bacteroidia,Cytophagales,Cyclobacteriaceae,Algoriphagus,Algoriphagus sp. AK58
6,AB001778,Bacteria,Chlamydiota,Chlamydiia,Chlamydiales,Chlamydiaceae,Chlamydia,Chlamydia psittaci


Unnamed: 0_level_0,ID_Sample,SNOT22,SNOT22nasal,LK_Total,LK_Secret,ClinicCtrl,AntUse,Cortuse,Patient,Filename,⋯,Timepoint,Sex,Age,Atopic_Symptoms,Asthma,NSAID_Intolerance,Total_IgE,Blood_Eosinophilia,Tissue_Eosinophilia,Final_LundMackay
Unnamed: 0_level_1,<chr>,<int>,<int>,<int>,<int>,<chr>,<int>,<int>,<chr>,<chr>,⋯,<chr>,<int>,<dbl>,<int>,<int>,<int>,<dbl>,<int>,<int>,<int>
1,0001967K_T1,38.0,12.0,4.0,2.0,NC,1.0,1,0001967K,0001967K_T1.fastq,⋯,T1,1,58.7,1,1,0,38,100,10,15
2,0001967K_T3,96.0,26.0,10.0,4.0,NC,1.0,0,0001967K,0001967K_T3.fastq,⋯,T3,1,58.7,1,1,0,38,100,10,15
3,0001967K_T6,79.0,19.0,6.0,4.0,NC,1.0,1,0001967K,0001967K_T6.fastq,⋯,T6,1,58.7,1,1,0,38,100,10,15
4,0088303G_T0,102.0,37.0,8.0,2.0,NC,,0,0088303G,0088303G_T0.fastq,⋯,T0,2,60.3,1,1,0,667,800,58,22
5,0088303G_T1,32.0,7.0,4.0,1.0,NC,1.0,0,0088303G,0088303G_T1.fastq,⋯,T1,2,60.3,1,1,0,667,800,58,22
6,0088303G_T3,,,,,,0.0,0,0088303G,0088303G_T3.fastq,⋯,T3,2,60.3,1,1,0,667,800,58,22


In [9]:
# List the KMA output files in ./fragfiles.
# The dot is escaped so that only names ending in the literal ".frag"
# extension match; an unescaped "." would match any character before "frag".
fragfiles <- list.files("./fragfiles", pattern = "\\.frag$")
fragfiles

In [10]:
# Placeholder for a combined table; it starts as NULL (not yet a data frame).
dfall <- NULL

# Read one KMA fragment file. There is no header row, so the columns get the
# default names V1, V2, ...
# `header = FALSE` is spelled out instead of `h=F` (partial argument matching
# plus the reassignable `F`).
tab <- read.delim("./fragfiles/1628845C_T1_filt_kma.frag", header = FALSE)
print(head(tab))

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

In [11]:
# Count how many rows of the fragment table carry each reference id, after
# cutting the value in column V6 at its first dot.
template_ids <- gsub("\\..*", "", tab$V6)
# dnn = "" keeps the table's dimension unnamed, as it is when table() is
# given an expression rather than a variable.
freqtab <- table(template_ids, dnn = "")
print(freqtab)


    AB009937     AB233328     AB680534     AB680900     AB681292     AB681715 
           4            1            1            2            2            4 
    AE015929 AFGU01000017 AGEF01000001 AGEF01000005 AGEF01000016 AIYD01000002 
          19           17           12          122           18            1 
    AJ308297     AJ938182 AJCB01000004     AM600682     AM697499     AP017895 
           1            1            1            1            1            1 
    AP018338     AP018922     AY281086     AY308046     AY560519     AY587778 
           2            8            2            1            1            3 
    BX571857     CP000253     CP001837     CP002110     CP002727     CP003045 
           1            1            8            5            1            5 
    CP003979     CP005288     CP006937     CP009046     CP012011     CP012013 
           9            1            1           52            1            2 
    CP012972     CP012974     CP012978     CP014022

In [12]:
# Convert the frequency table into a two-column data frame: the reference id
# and the number of times it was counted (stored as a double).
df <- data.frame(
  id = names(freqtab),
  Counts = as.double(freqtab)
)
print(df)

              id Counts
1       AB009937      4
2       AB233328      1
3       AB680534      1
4       AB680900      2
5       AB681292      2
6       AB681715      4
7       AE015929     19
8   AFGU01000017     17
9   AGEF01000001     12
10  AGEF01000005    122
11  AGEF01000016     18
12  AIYD01000002      1
13      AJ308297      1
14      AJ938182      1
15  AJCB01000004      1
16      AM600682      1
17      AM697499      1
18      AP017895      1
19      AP018338      2
20      AP018922      8
21      AY281086      2
22      AY308046      1
23      AY560519      1
24      AY587778      3
25      BX571857      1
26      CP000253      1
27      CP001837      8
28      CP002110      5
29      CP002727      1
30      CP003045      5
31      CP003979      9
32      CP005288      1
33      CP006937      1
34      CP009046     52
35      CP012011      1
36      CP012013      2
37      CP012972      2
38      CP012974      3
39      CP012978      1
40      CP014022    100
41      CP014023

In [14]:
# Attach the SILVA taxonomy to the per-id counts (inner join on "id").
#
# The ids in SILVATaxID were cut at the first dot when the table was loaded,
# so several taxonomy rows may share one id. merge() would then repeat the
# matching Counts row once per duplicate and inflate every total computed
# from Counts. Keep a single taxonomy row per id to prevent that fan-out.
# NOTE(review): keeping the first row assumes that rows sharing an id carry
# the same lineage - confirm against the SILVA table.
dup_ids <- duplicated(SILVATaxID$id)
if (any(dup_ids)) {
  warning(
    sum(dup_ids),
    " duplicated id(s) in SILVATaxID; keeping the first row for each",
    call. = FALSE
  )
}
dfTaxID <- merge(SILVATaxID[!dup_ids, , drop = FALSE], df, by = "id")
print(head(dfTaxID))

        id  Kingdom    Phylum   Class            Order            Family
1 AB009937 Bacteria Bacillota Bacilli Staphylococcales Staphylococcaceae
2 AB233328 Bacteria Bacillota Bacilli Staphylococcales Staphylococcaceae
3 AB680534 Bacteria Bacillota Bacilli  Lactobacillales  Streptococcaceae
4 AB680900 Bacteria Bacillota Bacilli  Lactobacillales Carnobacteriaceae
5 AB681292 Bacteria Bacillota Bacilli Staphylococcales Staphylococcaceae
6 AB681715 Bacteria Bacillota Bacilli Staphylococcales Staphylococcaceae
           Genus                                   Species Counts
1 Staphylococcus Staphylococcus capitis subsp. urealyticus      4
2 Staphylococcus  Staphylococcus cohnii subsp. urealyticus      1
3  Streptococcus                  Streptococcus salivarius      1
4 Dolosigranulum                     Dolosigranulum pigrum      2
5 Staphylococcus                Staphylococcus epidermidis      2
6 Staphylococcus                     Staphylococcus aureus      4


In [15]:
# freqtable_tax <- merge(dfTaxID, TaxID_FullLineage, by.x = "TaxID", by.y = "id") 
# print(freqtable_tax)

In [18]:
# Label every row with the sample id, obtained by removing the KMA suffix
# from the fragment file name. The suffix is matched literally
# (fixed = TRUE) so that its "." is not treated as a regex wildcard.
dfTaxID$ID_Sample <- gsub(
  "_filt_kma.frag", "", "1628845C_T1_filt_kma.frag",
  fixed = TRUE
)
print(head(dfTaxID))

        id  Kingdom    Phylum   Class            Order            Family
1 AB009937 Bacteria Bacillota Bacilli Staphylococcales Staphylococcaceae
2 AB233328 Bacteria Bacillota Bacilli Staphylococcales Staphylococcaceae
3 AB680534 Bacteria Bacillota Bacilli  Lactobacillales  Streptococcaceae
4 AB680900 Bacteria Bacillota Bacilli  Lactobacillales Carnobacteriaceae
5 AB681292 Bacteria Bacillota Bacilli Staphylococcales Staphylococcaceae
6 AB681715 Bacteria Bacillota Bacilli Staphylococcales Staphylococcaceae
           Genus                                   Species Counts   ID_Sample
1 Staphylococcus Staphylococcus capitis subsp. urealyticus      4 1628845C_T1
2 Staphylococcus  Staphylococcus cohnii subsp. urealyticus      1 1628845C_T1
3  Streptococcus                  Streptococcus salivarius      1 1628845C_T1
4 Dolosigranulum                     Dolosigranulum pigrum      2 1628845C_T1
5 Staphylococcus                Staphylococcus epidermidis      2 1628845C_T1
6 Staphylococcus     

In [20]:
# Join the metadata onto the taxonomy counts. Only rows whose ID_Sample is
# present in both tables are kept.
freqtable_tax_metadata <- merge(
  x = dfTaxID,
  y = metadata_full_clean,
  by = "ID_Sample"
)
print(head(freqtable_tax_metadata))

    ID_Sample       id  Kingdom    Phylum   Class            Order
1 1628845C_T1 AB009937 Bacteria Bacillota Bacilli Staphylococcales
2 1628845C_T1 AB233328 Bacteria Bacillota Bacilli Staphylococcales
3 1628845C_T1 AB680534 Bacteria Bacillota Bacilli  Lactobacillales
4 1628845C_T1 AB680900 Bacteria Bacillota Bacilli  Lactobacillales
5 1628845C_T1 AB681292 Bacteria Bacillota Bacilli Staphylococcales
6 1628845C_T1 AB681715 Bacteria Bacillota Bacilli Staphylococcales
             Family          Genus                                   Species
1 Staphylococcaceae Staphylococcus Staphylococcus capitis subsp. urealyticus
2 Staphylococcaceae Staphylococcus  Staphylococcus cohnii subsp. urealyticus
3  Streptococcaceae  Streptococcus                  Streptococcus salivarius
4 Carnobacteriaceae Dolosigranulum                     Dolosigranulum pigrum
5 Staphylococcaceae Staphylococcus                Staphylococcus epidermidis
6 Staphylococcaceae Staphylococcus                     Staphylococcus

In [25]:
# Relative abundance (%) of each reference id within its own sample.
# The denominator is the total of Counts per ID_Sample rather than the total
# of the whole table, so the values stay correct when the table holds more
# than one sample. With a single sample both totals are the same.
freqtable_tax_metadata$RelativeAbundance <- ave(
  freqtable_tax_metadata$Counts,
  freqtable_tax_metadata$ID_Sample,
  FUN = function(counts) (counts / sum(counts)) * 100
)
print(head(freqtable_tax_metadata))

print("Relative Abundance Summary:")
print(summary(freqtable_tax_metadata$RelativeAbundance))

    ID_Sample       id  Kingdom    Phylum   Class            Order
1 1628845C_T1 AB009937 Bacteria Bacillota Bacilli Staphylococcales
2 1628845C_T1 AB233328 Bacteria Bacillota Bacilli Staphylococcales
3 1628845C_T1 AB680534 Bacteria Bacillota Bacilli  Lactobacillales
4 1628845C_T1 AB680900 Bacteria Bacillota Bacilli  Lactobacillales
5 1628845C_T1 AB681292 Bacteria Bacillota Bacilli Staphylococcales
6 1628845C_T1 AB681715 Bacteria Bacillota Bacilli Staphylococcales
             Family          Genus                                   Species
1 Staphylococcaceae Staphylococcus Staphylococcus capitis subsp. urealyticus
2 Staphylococcaceae Staphylococcus  Staphylococcus cohnii subsp. urealyticus
3  Streptococcaceae  Streptococcus                  Streptococcus salivarius
4 Carnobacteriaceae Dolosigranulum                     Dolosigranulum pigrum
5 Staphylococcaceae Staphylococcus                Staphylococcus epidermidis
6 Staphylococcaceae Staphylococcus                     Staphylococcus

In [29]:
# Alignment efficiency (%) per sample: the sum of Counts for the sample
# divided by the sample's nreads value.
# unique(nreads) is expected to give a single value per sample.
# NOTE(review): a result above 100 means sum(Counts) exceeds nreads for that
# sample; if that happens, check how Counts and nreads were produced.
# ungroup() is applied at the end so the stored table does not stay grouped
# by ID_Sample and silently change the behaviour of later dplyr verbs.
freqtable_tax_metadata <- freqtable_tax_metadata %>%
  group_by(ID_Sample) %>%
  mutate(AlignmentEffic = (sum(Counts) / unique(nreads)) * 100) %>%
  ungroup()

head(freqtable_tax_metadata)
print("Alignment Efficiency Summary:")
print(summary(freqtable_tax_metadata$AlignmentEffic))

ID_Sample,id,Kingdom,Phylum,Class,Order,Family,Genus,Species,Counts,⋯,Age,Atopic_Symptoms,Asthma,NSAID_Intolerance,Total_IgE,Blood_Eosinophilia,Tissue_Eosinophilia,Final_LundMackay,RelativeAbundance,AlignmentEffic
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,⋯,<dbl>,<int>,<int>,<int>,<dbl>,<int>,<int>,<int>,<dbl>,<dbl>
1628845C_T1,AB009937,Bacteria,Bacillota,Bacilli,Staphylococcales,Staphylococcaceae,Staphylococcus,Staphylococcus capitis subsp. urealyticus,4,⋯,37.8,1,0,0,58,200,160,17,0.05180676,119.0043
1628845C_T1,AB233328,Bacteria,Bacillota,Bacilli,Staphylococcales,Staphylococcaceae,Staphylococcus,Staphylococcus cohnii subsp. urealyticus,1,⋯,37.8,1,0,0,58,200,160,17,0.01295169,119.0043
1628845C_T1,AB680534,Bacteria,Bacillota,Bacilli,Lactobacillales,Streptococcaceae,Streptococcus,Streptococcus salivarius,1,⋯,37.8,1,0,0,58,200,160,17,0.01295169,119.0043
1628845C_T1,AB680900,Bacteria,Bacillota,Bacilli,Lactobacillales,Carnobacteriaceae,Dolosigranulum,Dolosigranulum pigrum,2,⋯,37.8,1,0,0,58,200,160,17,0.02590338,119.0043
1628845C_T1,AB681292,Bacteria,Bacillota,Bacilli,Staphylococcales,Staphylococcaceae,Staphylococcus,Staphylococcus epidermidis,2,⋯,37.8,1,0,0,58,200,160,17,0.02590338,119.0043
1628845C_T1,AB681715,Bacteria,Bacillota,Bacilli,Staphylococcales,Staphylococcaceae,Staphylococcus,Staphylococcus aureus,4,⋯,37.8,1,0,0,58,200,160,17,0.05180676,119.0043


[1] "Alignment Efficiency Summary:"
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    119     119     119     119     119     119 


In [31]:
# Save the combined table as a tab-separated file in the working directory.
output_file <- "freqtable_tax_metadata.tsv"
write_tsv(freqtable_tax_metadata, output_file)