In [2]:
%%bash
# How to run Linux commands in a Jupyter notebook:
# start the cell with the %%bash cell magic, then write ordinary shell
# commands below it. The magic has to be the very first line of the cell,
# so explanatory comments go after it.
ls -al

total 54892
drwxrwxr-x  3 liangl5 liangl5     4096 May 17 19:23 .
drwxr-xr-x 12 liangl5 liangl5     4096 May 17 13:38 ..
-rw-rw-r--  1 liangl5 liangl5 56089025 May 17 05:59 assembly_summary.txt
-rw-rw-r--  1 liangl5 liangl5     3327 May 17 13:57 crispr.ipynb
-rw-rw-r--  1 liangl5 liangl5   100787 May 17 19:23 download_archaea.ipynb
-rw-rw-r--  1 liangl5 liangl5        0 May 17 14:06 ftpdirpaths
-rw-rw-r--  1 liangl5 liangl5        0 May 17 14:09 ftpfilepaths
drwxrwxr-x  2 liangl5 liangl5     4096 May 17 13:53 .ipynb_checkpoints


In [6]:
%%bash
# How can I download RefSeq data for all complete archaea genomes?
#
# Step (1) Download the /refseq/archaea/assembly_summary.txt file
#   https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/assembly_summary.txt
# May 12, 2020: RefSeq Release 200 is available for FTP
#
# -O pins the output file name. Without it, wget writes to
# assembly_summary.txt.1 whenever assembly_summary.txt already exists, and
# anything that later reads assembly_summary.txt would silently use the old file.
wget -O assembly_summary.txt 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/assembly_summary.txt'

--2020-05-18 12:12:46--  https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/assembly_summary.txt
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 130.14.250.13, 2607:f220:41e:250::11
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|130.14.250.13|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 343595 (336K) [text/plain]
Saving to: ‘assembly_summary.txt’

     0K .......... .......... .......... .......... .......... 14% 1.06M 0s
    50K .......... .......... .......... .......... .......... 29% 2.22M 0s
   100K .......... .......... .......... .......... .......... 44% 88.9M 0s
   150K .......... .......... .......... .......... .......... 59% 2.13M 0s
   200K .......... .......... .......... .......... .......... 74%  105M 0s
   250K .......... .......... .......... .......... .......... 89%  126M 0s
   300K .......... .......... .......... .....                100%  132M=0.09s

2020-05-18 12:12:46 (3.54 MB/s) - ‘assembly_summary.txt’ saved [343595/343595]

In [7]:
%%bash
# Give the downloaded summary an archaea-specific name so it is less confusing.
downloaded=assembly_summary.txt
renamed=assembly_summary_archaea.txt
mv "$downloaded" "$renamed"

In [8]:
%%bash
# Step (2) List the FTP path (column 20) for the assemblies of interest, in this case those that have
# "Complete Genome" assembly_level (column 12) and "latest" version_status (column 11).
#
# FS is set in a BEGIN block so that every record, including the first one, is
# split on tabs. Assigning FS inside a main rule only takes effect from the
# next record onward, which leaves the first line split on whitespace.
#
# Equivalent one-liner:
# awk -F "\t" '$12=="Complete Genome" && $11=="latest"{print $20}' assembly_summary_archaea.txt > ftpdirpaths_archaea
awk 'BEGIN { FS = "\t" } $12 == "Complete Genome" && $11 == "latest" { print $20 }' \
    assembly_summary_archaea.txt > ftpdirpaths_archaea

In [10]:
%%bash
# Print the line count of the filtered path list and of the full summary table,
# one file per wc call so no combined "total" row is printed.
for listing in ftpdirpaths_archaea assembly_summary_archaea.txt; do
    wc -l "$listing"
done

351 ftpdirpaths_archaea
1053 assembly_summary_archaea.txt


In [14]:
%%bash
# Step (3) Append the filename of interest, in this case "*genomic.fna.gz", to the FTP directory names.
# (%%bash has to be the first line of the cell, so the notes live below it.)
#
# Each input line is an FTP directory path. With "/" as both input and output
# separator, the script takes field 10 as the assembly name (asm), builds
# "<asm>_genomic.fna.gz" and prints "<directory path>/<file name>".
awk 'BEGIN { FS = OFS = "/"; filesuffix = "genomic.fna.gz" }
     { ftpdir = $0; asm = $10; file = asm "_" filesuffix; print ftpdir, file }' \
    ftpdirpaths_archaea > ftpfilepaths_archaea

In [15]:
%%bash
# Step (4) Create a folder named genome_archaea for storing all genome fasta files.
# (%%bash has to be the first line of the cell, so the notes live below it.)
#
# -p keeps the cell re-runnable: plain mkdir errors out when the folder already exists.
# There is deliberately no `cd` here: every %%bash cell runs in its own shell
# process, so a directory change would be lost as soon as the cell finishes.
mkdir -p genome_archaea

In [16]:
# Step (5) Download each genome individually using the filepath

import subprocess

# Read one URL per line. The context manager closes the file afterwards, and
# strip() removes the trailing newline; blank lines are skipped so wget is
# never started without a URL.
with open('ftpfilepaths_archaea', 'r') as FH:
    urls = [line.strip() for line in FH if line.strip()]

failed = []
for url in urls:
    # Argument list instead of a concatenated shell string: the URL reaches
    # wget verbatim and is never interpreted by a shell.
    result = subprocess.run(['wget', url])
    if result.returncode != 0:
        # Keep going on errors, but remember what failed instead of dropping it.
        failed.append(url)

if failed:
    print('{} of {} downloads failed:'.format(len(failed), len(urls)))
    for url in failed:
        print('  ' + url)

In [None]:
%%bash
# Step (6) Move all downloaded genomes into genome_archaea
#
# mkdir -p guarantees the destination folder exists. The trailing slash makes mv
# treat the destination strictly as a directory, so a lone matching file can
# never be renamed to "genome_archaea" by accident.
mkdir -p genome_archaea
mv -- *.fna.gz genome_archaea/

In [None]:
# Part 2. Downloading every genome's gff.gz file

In [12]:
%%bash
# Step (1) Append the filename of interest, in this case "*genomic.gff.gz", to the FTP directory names.
#
# Fields are "/"-separated on input and output. For every directory path the
# script prints the path itself followed by "<field 10>_genomic.gff.gz".
awk -F/ -v OFS=/ -v suffix=genomic.gff.gz \
    '{ print $0, $10 "_" suffix }' \
    ftpdirpaths_archaea > ftpfilepaths_archaea_gff

In [13]:
%%bash
# Step (2) Create a new folder for the gff files.
# -p keeps the cell re-runnable: plain mkdir errors out when the folder already exists.
mkdir -p gff_archaea

In [14]:
# Step (3) Download each gff file individually using the filepath

import subprocess

# Read one URL per line. The context manager closes the file afterwards, and
# strip() removes the trailing newline; blank lines are skipped so wget is
# never started without a URL.
with open('ftpfilepaths_archaea_gff', 'r') as FH:
    urls = [line.strip() for line in FH if line.strip()]

failed = []
for url in urls:
    # Argument list instead of a concatenated shell string: the URL reaches
    # wget verbatim and is never interpreted by a shell.
    result = subprocess.run(['wget', url])
    if result.returncode != 0:
        # Keep going on errors, but remember what failed instead of dropping it.
        failed.append(url)

if failed:
    print('{} of {} downloads failed:'.format(len(failed), len(urls)))
    for url in failed:
        print('  ' + url)

In [15]:
%%bash
# Step (4) Move all files into gff_archaea
#
# mkdir -p guarantees the destination folder exists. The trailing slash makes mv
# treat the destination strictly as a directory, so a lone matching file can
# never be renamed to "gff_archaea" by accident.
mkdir -p gff_archaea
mv -- *.gff.gz gff_archaea/