In [1]:
import os

import pandas as pd
from Bio import SeqIO

# Create metadata

In [2]:
# Load the tab-separated per-sample metadata table.
# skiprows=[1] drops the second physical line of the file, i.e. the line right
# after the header (presumably a non-data annotation row -- TODO confirm).
df_meta = pd.read_csv(
    "sample-metadata.tsv",
    sep="\t",
    skiprows=[1],
)
df_meta.head()

Unnamed: 0,sample-id,sample-name,isolation-source
0,SRR5989622,Donor_F,feces
1,SRR5989621,Donor_O,saliva
2,SRR5989559,HOMA_Ce01,gut_luminal_content
3,SRR5989558,HOMA_Ce02,gut_luminal_content
4,SRR5989557,HOMA_Ce03,gut_luminal_content


In [14]:
# Export the metadata row(s) whose sample-name is "Donor_O" to the saliva
# directory, renaming the columns for the output file. The DataFrame index is
# not written.
(
    df_meta.loc[df_meta['sample-name'] == 'Donor_O']
    .rename(
        columns={
            "sample-id": "SampleID",
            "sample-name": "SampleName",
            "isolation-source": "Environment",
        }
    )
    .to_csv("../saliva/sample_meta.txt", sep="\t", index=False)
)

# Create feature table

In [3]:
# Load the tab-separated feature (count) table. skiprows=[0] skips the first
# physical line of the file, so the second line is used as the header; the
# first column becomes the row index, which is then named "ASV".
df_count = pd.read_csv(
    "feature-table/feature-table.from_biom.txt",
    sep="\t",
    skiprows=[0],
    index_col=0,
).rename_axis("ASV")
df_count.head()

Unnamed: 0_level_0,SRR5989520,SRR5989522,SRR5989523,SRR5989524,SRR5989525,SRR5989528,SRR5989529,SRR5989530,SRR5989533,SRR5989534,...,SRR5989617,SRR5989618,SRR5989619,SRR5989620,SRR5989621,SRR5989622,SRR5989623,SRR5989625,SRR5989626,SRR5989628
ASV,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
b9779a83e705307c6596e4741240726c,203.0,181.0,115.0,232.0,187.0,24.0,72.0,32.0,29.0,20.0,...,231.0,89.0,84.0,39.0,8.0,0.0,76617.0,164.0,168.0,33.0
e0b0817162b44d0e17a1c50486839924,11116.0,14203.0,12241.0,14080.0,13601.0,7799.0,15523.0,11110.0,7702.0,9871.0,...,15059.0,9045.0,14116.0,12072.0,2587.0,43.0,404.0,12599.0,12654.0,13118.0
5279cf5bf096ad48a90a13a9caa1643e,9742.0,12953.0,10803.0,12682.0,12389.0,7108.0,13776.0,10192.0,6910.0,8719.0,...,13534.0,8114.0,12721.0,10570.0,114.0,0.0,324.0,11503.0,11704.0,12222.0
75e69ac35f8b5786cf860f3f0be74929,9970.0,7052.0,10186.0,8885.0,8671.0,4783.0,6117.0,10648.0,4662.0,4737.0,...,10673.0,6480.0,7319.0,12696.0,121.0,0.0,238.0,8500.0,9857.0,6136.0
c764d61dc0ee11a504bfcf2d01ab4aa6,94.0,72.0,48.0,108.0,50.0,10.0,34.0,15.0,11.0,0.0,...,104.0,25.0,29.0,14.0,2.0,0.0,27596.0,56.0,88.0,17.0


In [4]:
# Keep only the column(s) of the "Donor_O" sample, then keep the rows (ASVs)
# that have at least one non-zero count in those columns.
df_count_source = (
    df_count[df_meta.loc[df_meta['sample-name'] == 'Donor_O', 'sample-id']]
    .loc[lambda counts: (counts != 0).any(axis=1)]
)
df_count_source.to_csv("../saliva/feature_table.txt", sep="\t")
df_count_source.head()

Unnamed: 0_level_0,SRR5989621
ASV,Unnamed: 1_level_1
b9779a83e705307c6596e4741240726c,8.0
e0b0817162b44d0e17a1c50486839924,2587.0
5279cf5bf096ad48a90a13a9caa1643e,114.0
75e69ac35f8b5786cf860f3f0be74929,121.0
c764d61dc0ee11a504bfcf2d01ab4aa6,2.0


In [36]:
# Keep only the columns of samples whose sample-name contains "HOMA", then
# keep the rows (ASVs) that have at least one non-zero count in those columns.
df_count_query = (
    df_count[df_meta.loc[df_meta['sample-name'].str.contains("HOMA"), 'sample-id']]
    .loc[lambda counts: (counts != 0).any(axis=1)]
)
df_count_query.to_csv("../query/feature_table.txt", sep="\t")
df_count_query.head()

Unnamed: 0_level_0,SRR5989559,SRR5989558,SRR5989557,SRR5989567,SRR5989566,SRR5989585,SRR5989530,SRR5989529,SRR5989528,SRR5989535,...,SRR5989578,SRR5989575,SRR5989576,SRR5989581,SRR5989592,SRR5989593,SRR5989594,SRR5989548,SRR5989547,SRR5989550
ASV,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
b9779a83e705307c6596e4741240726c,17.0,35.0,15.0,39.0,35.0,27.0,32.0,72.0,24.0,24.0,...,9963.0,573.0,44524.0,2203.0,1053.0,10711.0,4508.0,7642.0,15951.0,140.0
e0b0817162b44d0e17a1c50486839924,9103.0,7304.0,11443.0,10428.0,11872.0,10936.0,11110.0,15523.0,7799.0,13225.0,...,4750.0,8660.0,462.0,3069.0,10615.0,325.0,1480.0,4274.0,1642.0,8042.0
5279cf5bf096ad48a90a13a9caa1643e,8184.0,6503.0,10247.0,9420.0,10770.0,9782.0,10192.0,13776.0,7108.0,11875.0,...,4411.0,7664.0,433.0,2690.0,9952.0,332.0,1307.0,3815.0,1611.0,7370.0
75e69ac35f8b5786cf860f3f0be74929,7864.0,3808.0,8488.0,11183.0,6367.0,5787.0,10648.0,6117.0,4783.0,8826.0,...,7392.0,13380.0,2656.0,15121.0,7669.0,1900.0,2377.0,4408.0,3882.0,7270.0
c764d61dc0ee11a504bfcf2d01ab4aa6,5.0,12.0,4.0,13.0,11.0,14.0,15.0,34.0,10.0,6.0,...,2437.0,208.0,14478.0,565.0,378.0,3035.0,1237.0,2209.0,4582.0,57.0


# Create ASV sequence files

In [46]:
# Write the sequences of the ASVs present in the saliva (source) feature table.
# Each kept record is written as a two-line entry: ">" + record id, then the
# full sequence on a single line.

# Build the lookup once; membership tests against a set are O(1), whereas the
# previous list(...) was rebuilt and scanned for every FASTA record.
source_asv_ids = set(df_count_source.index)

# Both files are opened in the same `with` so the input handle is closed too
# (it used to be opened inline and never closed).
with open("asv-sequences/dna-sequences.fasta") as in_file, \
        open("../saliva/asv_sequences.fasta", 'w') as out_file:
    all_asv_sequences = SeqIO.parse(in_file, 'fasta')
    for fasta in all_asv_sequences:
        asv, seq = fasta.id, str(fasta.seq)
        if asv in source_asv_ids:
            out_file.write(">%s\n%s\n"%(asv, seq))

In [47]:
# Write the sequences of the ASVs present in the query feature table.
# Each kept record is written as a two-line entry: ">" + record id, then the
# full sequence on a single line.

# Build the lookup once; membership tests against a set are O(1), whereas the
# previous list(...) was rebuilt and scanned for every FASTA record.
query_asv_ids = set(df_count_query.index)

# Both files are opened in the same `with` so the input handle is closed too
# (it used to be opened inline and never closed).
with open("asv-sequences/dna-sequences.fasta") as in_file, \
        open("../query/asv_sequences.fasta", 'w') as out_file:
    all_asv_sequences = SeqIO.parse(in_file, 'fasta')
    for fasta in all_asv_sequences:
        asv, seq = fasta.id, str(fasta.seq)
        if asv in query_asv_ids:
            out_file.write(">%s\n%s\n"%(asv, seq))

# Source mapping file

In [5]:
# Build the source mapping file: one row per sample whose sample-name contains
# "HOMA", each mapped to the "saliva" source and the directory holding it.
#
# SourceDir is derived from "../saliva" -- the same relative location this
# notebook writes the saliva sample metadata, feature table and ASV sequences
# to -- instead of a hard-coded absolute path on one user's machine. It is
# resolved against the current working directory, so run the notebook from its
# own folder (the other relative paths in this notebook require that as well).
# NOTE(review): assumes the consumer of this file expects an absolute path, as
# the previous hard-coded value was -- confirm.
df_map = (
    df_meta.loc[df_meta['sample-name'].str.contains("HOMA"), ['sample-id']]
    .rename(columns={"sample-id": "SampleID"})
    .assign(SourceID="saliva", SourceDir=os.path.abspath("../saliva"))
)
df_map.to_csv("../query/source_mapping_file.txt", sep="\t", index=False)