## Set working directory

By default, the working directory will be My Drive/PB_course.

In [None]:
# Mount Google Drive and make the course folder the working directory (~ 1 min)
from google.colab import drive
drive.mount('/content/gdrive')                         # if using for the first time, you will be requested to grant permission to link your Google Drive

import os

# Single place to change the course folder. The same value is used both to
# create the folder and to enter it, so the two steps cannot get out of sync.
WORK_DIR = "/content/gdrive/My Drive/PB_course/"         # change this path if necessary

if os.path.isdir(WORK_DIR):
  print("directory already exist. OK to continue")
else:
  os.makedirs(WORK_DIR)                                  # also creates any missing parent folders
os.chdir(WORK_DIR)

In [None]:
# Print the current working directory to confirm the chdir above took effect
!pwd

## Package installation and downloads for workshop (~ 5 minutes)

1.   conda (for simple installation of packages)
2.   (optional) Download ready prepared files for analysis.
3.   lofreq
4.   bedtools

In [None]:
# install conda via condacolab (~ 1 min).
# Colab will show a message saying that the session has crashed - don't worry
# about this: the session restarts following the conda installation.
!pip install -q condacolab
import condacolab
condacolab.install()

In [None]:
# install lofreq (~ 1 mins)
!conda install -c bioconda lofreq

In [None]:
# install bedtools (~ 1 mins)
!conda install -c bioconda bedtools

In [None]:
import os
os.chdir("/content/gdrive/My Drive/PB_course/Datasets/")
if os.path.isfile("/content/gdrive/MyDrive/PB_course/Datasets/BAM/WXS_example_sorted.bam"):    # check if the file exist
  print("file already exit, OK to continue.") 
else:
  !wget -O WXS_BAMs.zip https://github.com/jasonwong-lab/HKU-Practical-Bioinformatics/raw/main/files/WXS_BAMs.zip
  !unzip -o WXS_BAMs.zip   #unzip file
  !rm WXS_BAMs.zip

## Variant calling command line

1.1. Download file

1.2. Variant calling using LoFreq

1.3. Using bedtools to identify coding variants

In [None]:
# double check that we are in right directory
import os
os.chdir("/content/gdrive/My Drive/PB_course/Datasets")

# use the file from previous class
!ls -l BAM/

In [None]:
# Run lofreq with no arguments - presumably prints its usage text, which also
# confirms that the installation worked (verify against the cell output)
!lofreq

In [None]:
# Variant calling using LoFreq
# `lofreq call` is run here without an input file - presumably to display the
# options of the `call` sub-command before using it (verify against the cell output)
!lofreq call

In [5]:
#Step 1 – Create directory to store VCF files
!mkdir VCF/

In [None]:
#Step 2 – Run LoFreq call
#!rm -rf VCF/ 
!lofreq call ./BAM/WXS_example_sorted.bam -o ./VCF/WXS_germline.vcf -f ../DB_trunc/chr2.fa --verbose

In [None]:
#check result: show the first 20 lines of the VCF file
!head -n 20 ./VCF/WXS_germline.vcf

### Using bedtools to identify coding variants. 
UCSC genome browser: https://genome.ucsc.edu/cgi-bin/hgGateway?redirect=manual&source=genome.ucsc.edu

TABLE BROWSER : https://genome.ucsc.edu/cgi-bin/hgTables   
Output file: hg19_ucsc_exons_coding.bed

In [8]:
#Create a folder called Annotations
!mkdir Annotations/

In [None]:
#download the annotation file and save it inside Annotations/
# (judging by its name, a BED file of hg38 coding exons on chr2)
!wget -O Annotations/hg38_ucsc_exons_coding_chr2.bed https://github.com/jasonwong-lab/HKU-Practical-Bioinformatics/raw/main/files/hg38_ucsc_exons_coding_chr2.bed

In [10]:
#Use intersectBed to find out which variants in the VCF file are in coding regions.
#   -a  the variants (VCF) to test
#   -b  the coding-exon annotation (BED) to test against
#   -u  per the bedtools documentation, report each -a entry once if it
#       overlaps any -b feature
# The result is redirected into WXS_germline_coding.vcf.
!intersectBed -a ./VCF/WXS_germline.vcf -b ./Annotations/hg38_ucsc_exons_coding_chr2.bed -u > ./VCF/WXS_germline_coding.vcf

In [None]:
#Count the number of lines in WXS_germline_coding.vcf
# NOTE(review): this equals the number of coding variants only if the file has
# no header lines - confirm by inspecting the file.
!wc -l ./VCF/WXS_germline_coding.vcf

In [None]:
#Count the number of lines in WXS_germline.vcf. Remove the header by grep
!grep -v '#' ./VCF/WXS_germline.vcf | wc -l

### Check the functional impact of variants

VEP online: http://grch37.ensembl.org/Homo_sapiens/Tools/VEP

upload file: HOME > Datasets > VCF > WXS_germline.vcf

### IGV browser

In [None]:
#Install igv-notebook (provides the igv_notebook module used to embed the IGV browser)
!pip install igv-notebook

In [None]:
# List BAM/ to check that the .bam file and its .bam.bai index are both present;
# the IGV track below refers to both
!ls BAM/

In [None]:
# Show the local BAM alignments in an embedded IGV browser
import igv_notebook

igv_notebook.init()

# Browser settings: hg38 reference genome, opened on a short window of chr2
browser_config = {
    "genome": "hg38",
    "locus": "chr2:47,806,395-47,806,445",
}

# Alignment track read from local paths (BAM file plus its .bai index)
bam_track = {
    "name": "Local BAM",
    "path": "./BAM/WXS_example_sorted.bam",
    "indexPath": "./BAM/WXS_example_sorted.bam.bai",
    "format": "bam",
    "type": "alignment",
}

b = igv_notebook.Browser(browser_config)
b.load_track(bam_track)

