# Genome sequence analysis using Bash

### Download the goldfish reference genome

In [None]:
#Download the gzip-compressed goldfish genome sequence file (.fna.gz) from the NCBI FTP server
ftp ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/003/368/295/GCF_003368295.1_ASM336829v1/GCF_003368295.1_ASM336829v1_genomic.fna.gz

Trying 2607:f220:41e:250::13...
Connected to ftp.wip.ncbi.nlm.nih.gov.
220-
 applicable federal laws, directives, and other federal guidance for accessing 
 this Government system, which includes all devices/storage media attached to 
 this system. This system is provided for Government-authorized use only. 
 Unauthorized or improper use of this system is prohibited and may result in 
 disciplinary action and/or civil and criminal penalties. At any time, and for 
 any lawful Government purpose, the government may monitor, record, and audit 
 your system usage and/or intercept, search and seize any communication or data 
 transiting or stored on this system. Therefore, you have no reasonable 
 expectation of privacy. Any communication or data transiting or stored on this 
 system may be disclosed or used for any lawful Government purpose.
220 FTP Server ready.
331 Anonymous login ok, send your complete email address as your password
230 Anonymous access granted, restrictions apply
Remot

### Count the number of scaffolds using shell commands

In [1]:
#Show the list of files to confirm that the downloaded genome file is present
ls

GCF_003368295.1_ASM336829v1_genomic.fna.gz
bash.ipynb


In [1]:
#Decompress the genome sequence file
#(the file listing in the next cell shows the .gz file replaced by the decompressed .fna file)
gunzip GCF_003368295.1_ASM336829v1_genomic.fna.gz

In [23]:
#Show the list of files after decompression
ls

GCF_003368295.1_ASM336829v1_genomic.fna	bash.ipynb


In [4]:
#Show the first five sequence header lines (lines beginning with ">")
grep "^>" GCF_003368295.1_ASM336829v1_genomic.fna | head -n 5

>NC_039243.1 Carassius auratus strain Wakin chromosome 1, ASM336829v1, whole genome shotgun sequence
>NC_039244.1 Carassius auratus strain Wakin chromosome 2, ASM336829v1, whole genome shotgun sequence
>NC_039245.1 Carassius auratus strain Wakin chromosome 3, ASM336829v1, whole genome shotgun sequence
>NC_039246.1 Carassius auratus strain Wakin chromosome 4, ASM336829v1, whole genome shotgun sequence
>NC_039247.1 Carassius auratus strain Wakin chromosome 5, ASM336829v1, whole genome shotgun sequence


In [5]:
#Count the header lines (lines beginning with ">"), i.e. the number of scaffolds
grep "^>" GCF_003368295.1_ASM336829v1_genomic.fna | wc -l

    6216


### Count the number of scaffolds using bioawk

In [7]:
#List the names of the first five scaffolds
#($name holds the sequence identifier without the description text, as the output shows)
bioawk -c fastx  '{print $name}' GCF_003368295.1_ASM336829v1_genomic.fna | head -n 5

NC_039243.1
NC_039244.1
NC_039245.1
NC_039246.1
NC_039247.1


In [8]:
#Count the number of scaffolds (one name is printed per sequence, then the lines are counted)
bioawk -c fastx  '{print $name}' GCF_003368295.1_ASM336829v1_genomic.fna | wc -l

    6216


### Compute the sequence length of each scaffold

In [4]:
#Print the name and sequence length (number of characters in $seq) of the first five scaffolds
bioawk -c fastx  '{print $name,length($seq)}' GCF_003368295.1_ASM336829v1_genomic.fna | head -n 5

NC_039243.1	34767119
NC_039244.1	30202465
NC_039245.1	30025718
NC_039246.1	29272933
NC_039247.1	33106987


In [21]:
#Total length of genomic DNA in Mb (sum of all sequence lengths divided by 10^6)
bioawk -c fastx  '{SUM += length($seq)}END{print(SUM/(10^6))}' GCF_003368295.1_ASM336829v1_genomic.fna

1820.64


In [5]:
#List the 20 longest scaffolds with their lengths in Mb
#(sort -k2nr orders by the second column, numerically, largest first)
bioawk -c fastx  '{print $name, length($seq)/(10^6)}' GCF_003368295.1_ASM336829v1_genomic.fna | sort -k2nr | head -n 20

NC_039251.1	37.1851
NC_039249.1	36.167
NC_039274.1	35.7353
NC_039264.1	34.771
NC_039243.1	34.7671
NC_039256.1	33.7302
NC_039247.1	33.107
NC_039269.1	31.5269
NC_039250.1	30.8394
NC_039244.1	30.2025
NC_039245.1	30.0257
NC_039272.1	29.8231
NC_039261.1	29.3396
NC_039246.1	29.2729
NC_039262.1	28.6241
NC_039273.1	28.3799
NC_039248.1	28.2434
NC_039258.1	28.066
NC_039276.1	27.8395
NC_039259.1	27.6483


In [6]:
#Report every scaffold longer than 1 Mb together with its length in Mb, largest first
bioawk -c fastx 'length($seq) > 1000000 { print $name, length($seq) / 1000000 }' \
    GCF_003368295.1_ASM336829v1_genomic.fna \
    | sort -k2nr

NC_039251.1	37.1851
NC_039249.1	36.167
NC_039274.1	35.7353
NC_039264.1	34.771
NC_039243.1	34.7671
NC_039256.1	33.7302
NC_039247.1	33.107
NC_039269.1	31.5269
NC_039250.1	30.8394
NC_039244.1	30.2025
NC_039245.1	30.0257
NC_039272.1	29.8231
NC_039261.1	29.3396
NC_039246.1	29.2729
NC_039262.1	28.6241
NC_039273.1	28.3799
NC_039248.1	28.2434
NC_039258.1	28.066
NC_039276.1	27.8395
NC_039259.1	27.6483
NC_039270.1	26.7925
NC_039283.1	26.7285
NC_039263.1	26.2641
NC_039255.1	26.064
NC_039257.1	25.4097
NC_039260.1	24.8561
NC_039280.1	24.4252
NC_039268.1	24.1975
NC_039271.1	23.6009
NC_039267.1	23.4256
NC_039266.1	23.1792
NC_039285.1	22.7634
NC_039252.1	21.9349
NC_039254.1	21.3215
NC_039282.1	20.9434
NC_039292.1	20.7149
NC_039278.1	20.7073
NC_039287.1	19.898
NC_039265.1	19.8852
NC_039277.1	19.8191
NC_039288.1	19.2356
NC_039279.1	18.5279
NC_039253.1	18.5272
NC_039284.1	18.5037
NC_039291.1	16.7329
NC_039289.1	15.8348
NC_039286.1	15.5725
NC_039281.1	13.3063
NC_039290.1	11.6251
NC_039293.1	10.0899
NC_039

In [11]:
#Sum of lengths (in Mb) of scaffolds whose sizes are greater than 1 Mb
#The END block prints a single number, so no sorting step is needed afterwards.
bioawk -c fastx 'BEGIN{SUM=0}length($seq)>(10^6){SUM += length($seq)}END{print SUM/(10^6)}' GCF_003368295.1_ASM336829v1_genomic.fna

1342.12


In [22]:
#Percentage of the total genome size contained in scaffolds whose sizes are greater than 1 Mb
#(1342.12 and 1820.64 are the Mb totals printed by the cells above; scale=1 keeps one decimal digit)
echo "scale=1; 100*1342.12/1820.64" | bc

73.7
