# DNABERT

## 00. Setup

## 00-1. Mount Google Drive
Mount your Google Drive with the cell below so the notebook can read data stored there.
To authenticate, open the link that appears in the cell output and enter the verification code in the box below it.

In [None]:
import os, sys
from google.colab import drive
drive.mount('/content/drive')
!pwd

## 00-2. Install Miniconda


In [None]:
%%bash
# Install Miniconda 4.5.4 into /usr/local.
set -euo pipefail  # stop at the first failed step instead of running the rest

MINICONDA_INSTALLER_SCRIPT=Miniconda3-4.5.4-Linux-x86_64.sh
MINICONDA_PREFIX=/usr/local

# -O overwrites any earlier download, so re-running the cell never leaves
# "<name>.1" copies behind while the old file is the one that gets executed.
wget -O "$MINICONDA_INSTALLER_SCRIPT" "https://repo.anaconda.com/miniconda/$MINICONDA_INSTALLER_SCRIPT"
chmod +x "$MINICONDA_INSTALLER_SCRIPT"
# -b: batch mode (no prompts), -f: accept an existing prefix, -p: install prefix.
"./$MINICONDA_INSTALLER_SCRIPT" -b -f -p "$MINICONDA_PREFIX"

In [None]:
# Show which conda executable the shell resolves.
!which conda

In [None]:
# Print the version of the conda that is in use.
!conda --version

conda 4.5.4


In [None]:
# Print the version of the `python` that the shell resolves.
!python --version

In [None]:
%%bash
# Put the conda installation on Python 3.6, then update every package from the
# defaults channel. -y answers the confirmation prompts; -c selects the channel.
conda install -y -c defaults conda python=3.6
conda update -y -c defaults --all

# 1. Installation

## 01-1. Create conda environment

In [None]:
%%bash
# Create the `dnabert` environment on Python 3.6.
# --yes is required here: a %%bash cell has no interactive stdin, so conda's
# "Proceed ([y]/n)?" prompt could never be answered. The other conda calls in
# this notebook already pass --yes.
conda create -n dnabert python=3.6 --yes

In [None]:
%%bash
# Activate the new environment and list all environments to confirm it exists.
# The activation only applies inside this cell's own bash process.
source activate dnabert && conda env list

## 01-2. Install packages


In [None]:
# Install PyTorch and torchvision from the pytorch channel with the CUDA 10.0 toolkit.
# NOTE(review): this runs in a fresh shell, so it presumably targets conda's
# default environment rather than `dnabert` — confirm that is intended.
!conda install pytorch torchvision cudatoolkit=10.0 -c pytorch --yes

In [None]:
# Fetch the DNABERT sources into ./DNABERT.
!git clone https://github.com/jerryji1993/DNABERT

In [None]:
# Work from the repository root. The path is relative, so run this cell only once per session.
os.chdir("./DNABERT")

In [None]:
# Install the cloned package from the current directory in editable mode.
!python3 -m pip install --editable .

In [None]:
# Move into DNABERT/examples; the scripts in the following sections are run from here.
os.chdir("./examples")

## 01-3. Install example requirements and make a new directory for the fine-tuned model

In [None]:
# Install the dependencies listed in the current directory's requirements.txt.
!python3 -m pip install -r requirements.txt

In [None]:
# Directory that will hold the fine-tuned model. makedirs(..., exist_ok=True)
# keeps the cell re-runnable; os.mkdir raised FileExistsError on a second run.
os.makedirs("./ft", exist_ok=True)

In [None]:
# Step into ./ft to create the per-k-mer subdirectory there.
os.chdir("./ft")

In [None]:
# Subdirectory for the 6-mer model files. exist_ok=True keeps the cell
# re-runnable; os.mkdir raised FileExistsError on a second run.
os.makedirs("./6", exist_ok=True)

In [None]:
# Return to the parent of ./ft so the relative paths used below resolve.
os.chdir("..")  # now in /content/DNABERT/examples

# 2. Prediction with fine-tuned model

## ********** ***Important*** **********
- For prediction with the fine-tuned model, you need to store the fine-tuned model in the `DNABERT/examples/ft/$KMER` directory.
- Since the fine-tuned model is quite large, the authors provide it through a Google Drive link.
- The 6-mer pre-trained model can be downloaded via https://drive.google.com/file/d/1BJjqb5Dl2lNMg2warsFQ0-Xvn1xxfFXC/view
- After unzipping `6-new-12w-0.zip`, you should find 5 files in the directory (`config.json`, `pytorch_model.bin`, `special_tokens_map.json`, `tokenizer_config.json`, `vocab.txt`).
- Please put those 5 files into `DNABERT/examples/ft/6`. Now you can proceed to the next step.

- Task : promoter prediction
- Input : 1000 sequences (pre-processed in 6-mer)
- Output : 1000 probability scores

In [None]:
%%bash
# Run promoter prediction with the 6-mer model stored in ./ft/6 on the sample data.
export KMER=6
export MODEL_PATH=./ft/$KMER
export DATA_PATH=sample_data/ft/$KMER
export PREDICTION_PATH=./result/$KMER

# Create the output directory up front so that writing the prediction results
# does not depend on the script creating it.
mkdir -p "$PREDICTION_PATH"

# NOTE(review): --n_process 48 is kept as-is; confirm it suits the number of
# CPU cores available on the runtime.
python run_finetune.py \
    --model_type dna \
    --tokenizer_name="dna$KMER" \
    --model_name_or_path "$MODEL_PATH" \
    --task_name dnaprom \
    --do_predict \
    --data_dir "$DATA_PATH" \
    --max_seq_length 75 \
    --per_gpu_pred_batch_size=128 \
    --output_dir "$MODEL_PATH" \
    --predict_dir "$PREDICTION_PATH" \
    --n_process 48

In [None]:
import numpy as np
import pandas as pd

# Load the saved prediction array from the result directory and report its shape.
pred_results = np.load("./result/6/pred_results.npy")
print(pred_results.shape)

In [None]:
# Preview the first 30 entries rather than dumping the whole array.
print(pred_results[:30])

# 3. Visualization

## 03-1. Calculate attention scores

## Some modifications
1. Move `data_process_template/process_pretrain_data.py` into the `examples` directory.

In [None]:
%%bash
# Put process_pretrain_data.py in the current (examples) directory.
# The source directory is data_process_template/ — a single directory, not
# data_process/template/ as this cell previously had it.
# Skip the move when the file is already here so the cell can be re-run.
if [ ! -f ./process_pretrain_data.py ]; then
    mv ./data_process_template/process_pretrain_data.py .
fi

In [None]:
%%bash
# Run run_finetune.py in --do_visualize mode for the 6-mer model on the sample data.
export KMER=6
export MODEL_PATH=./ft/$KMER
export DATA_PATH=sample_data/ft/$KMER
export PREDICTION_PATH=./result/$KMER

# Create the output directory up front so this cell does not depend on the
# script (or an earlier cell) having created it.
mkdir -p "$PREDICTION_PATH"

# NOTE(review): --n_process 96 is kept as-is; confirm it suits the number of
# CPU cores available on the runtime.
python run_finetune.py \
    --model_type dna \
    --tokenizer_name="dna$KMER" \
    --model_name_or_path "$MODEL_PATH" \
    --task_name dnaprom \
    --do_visualize \
    --visualize_data_dir "$DATA_PATH" \
    --visualize_models "$KMER" \
    --data_dir "$DATA_PATH" \
    --max_seq_length 81 \
    --per_gpu_pred_batch_size=16 \
    --output_dir "$MODEL_PATH" \
    --predict_dir "$PREDICTION_PATH" \
    --n_process 96

In [None]:
# Extra packages, presumably needed by visualize.py in the next cell — TODO confirm.
!pip install seaborn
!pip install ipykernel

In [None]:
%%bash
# Run visualize.py for the 6-mer model directory ./ft/6.
export KMER=6
export MODEL_PATH="./ft/${KMER}"

python visualize.py --kmer "${KMER}" --model_path "${MODEL_PATH}"

# 4. Motif Analysis
- Once the attention scores are generated, we can perform motif analysis using `motif/find_motifs.py`.

In [None]:
# Switch to the sibling `motif` directory, where find_motifs.py is run from.
os.chdir("../motif")

In [None]:
%%bash
# Run find_motifs.py on the sample data and the results stored under ../examples/result/6.
export KMER=6
export DATA_PATH="../examples/sample_data/ft/${KMER}"
export PREDICTION_PATH="../examples/result/${KMER}"
export MOTIF_PATH="./result/${KMER}"

# Collect the options in an array so each one sits on its own line without
# backslash continuations.
motif_args=(
    --data_dir "${DATA_PATH}"
    --predict_dir "${PREDICTION_PATH}"
    --window_size 24
    --min_len 5
    --pval_cutoff 0.005
    --min_n_motif 3
    --align_all_ties
    --save_file_dir "${MOTIF_PATH}"
    --verbose
)

python find_motifs.py "${motif_args[@]}"