In [1]:
import requests
from bs4 import BeautifulSoup
import wget
import os
import missingno as msno
import matplotlib.pyplot as plt
import pandas as pd

pd.set_option('display.max_rows', None)

In [4]:
##### Fetch and download amplicon data #####

# URL of the page with the .fastq.gz files
url = 'https://ibdmdb.org/downloads/html/rawfiles_16s_2018-01-08.html'

# Send a GET request to fetch the page content
response = requests.get(url)
response.raise_for_status()  # Check for successful request

# Parse the HTML content
soup = BeautifulSoup(response.text, 'html.parser')

# Create a directory to save downloaded files
os.makedirs("fastq_files", exist_ok=True)

# Find all links to .fastq.gz files and download them
for link in soup.find_all('a', href=True):
    file_url = link['href']
    if file_url.endswith('.fastq.gz'):
        # Full URL of the file
        print(f"Downloading {file_url}")
        # Download and save the file
        wget.download(file_url, out='fastq_files')

print("Download complete.")

Downloading https://g-227ca.190ebd.75bc.data.globus.org/ibdmdb/raw/HMP2/16S/2018-01-08/206534.fastq.gz
Downloading https://g-227ca.190ebd.75bc.data.globus.org/ibdmdb/raw/HMP2/16S/2018-01-08/206536.fastq.gz
Downloading https://g-227ca.190ebd.75bc.data.globus.org/ibdmdb/raw/HMP2/16S/2018-01-08/206538.fastq.gz
Downloading https://g-227ca.190ebd.75bc.data.globus.org/ibdmdb/raw/HMP2/16S/2018-01-08/206547.fastq.gz
Downloading https://g-227ca.190ebd.75bc.data.globus.org/ibdmdb/raw/HMP2/16S/2018-01-08/206548.fastq.gz
Downloading https://g-227ca.190ebd.75bc.data.globus.org/ibdmdb/raw/HMP2/16S/2018-01-08/206561.fastq.gz
Downloading https://g-227ca.190ebd.75bc.data.globus.org/ibdmdb/raw/HMP2/16S/2018-01-08/206562.fastq.gz
Downloading https://g-227ca.190ebd.75bc.data.globus.org/ibdmdb/raw/HMP2/16S/2018-01-08/206563.fastq.gz
Downloading https://g-227ca.190ebd.75bc.data.globus.org/ibdmdb/raw/HMP2/16S/2018-01-08/206564.fastq.gz
Downloading https://g-227ca.190ebd.75bc.data.globus.org/ibdmdb/raw/HMP2/1

In [5]:
###### Create manifest ######

# Directory containing the downloaded fastq files
directory = './fastq_files'

# Prepare data for the manifest
data = []
for filename in os.listdir(directory):
    if filename.endswith('.fastq.gz'):
        sample_id = filename.split('.')[0]  # Extract sample ID from filename
        absolute_path = os.path.join('$PWD', directory, filename)  # Create absolute path with $PWD
        data.append({'sample-id': sample_id, 'absolute-filepath': absolute_path}) # for colab
        # data.append({'sample-id': sample_id, 'forward-absolute-filepath': absolute_path}) # debug

# Create a DataFrame
manifest_df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
manifest_df.to_csv('manifest.tsv',sep='\t', index=None)

# Show the manifest DataFrame
manifest_df.head()

Unnamed: 0,sample-id,absolute-filepath
0,206630,$PWD/./fastq_files/206630.fastq.gz
1,219668,$PWD/./fastq_files/219668.fastq.gz
2,219644,$PWD/./fastq_files/219644.fastq.gz
3,206750,$PWD/./fastq_files/206750.fastq.gz
4,206681,$PWD/./fastq_files/206681.fastq.gz
