# Prerequisites

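- [jellyfish](https://github.com/gmarcais/Jellyfish)
- [bedtools](https://github.com/arq5x/bedtools2) (`bedtools nuc`)
- `bigBedToBed` from the [UCSC command-line utilities](https://hgdownload.soe.ucsc.edu/admin/exe/)
- [snakemake](https://snakemake.readthedocs.io)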

# 2. Count the number of CAC occurrences in mouse and human genomes

## 2.1 Download mouse genomes (mm9 and mm10)

In [1]:
import os
from ftplib import FTP
import pandas as pd
from tqdm.auto import tqdm
import urllib.request

In [2]:
# Every download in this notebook is written below data/, so make sure the
# directory exists before the first file is opened for writing.
os.makedirs("data", exist_ok=True)

# Login to UCSC ftp server (no credentials are passed, i.e. anonymous login)

ftp = FTP("hgdownload.soe.ucsc.edu")
ftp.login()

# Change directory to mm9 and download genome

ftp.cwd("goldenPath/mm9/bigZips")
with open("data/mm9.chromFa.tar.gz", "wb") as fp:
    # retrbinary hands each received chunk to fp.write, streaming the archive to disk
    ftp.retrbinary('RETR chromFa.tar.gz', fp.write)

In [3]:
# tar flags: x = extract, z = gunzip, O = write extracted members to stdout,
# f = archive file. All members of the tarball are streamed into one FASTA file.
!tar xzOf data/mm9.chromFa.tar.gz > data/mm9.genome.fa

In [5]:
# Reuses the `ftp` session opened for the mm9 download: the relative path below
# climbs out of goldenPath/mm9/bigZips and into the mm10 directory, so this cell
# only works after the mm9 cell has run in the same kernel.
ftp.cwd("../../mm10/bigZips")
with open("data/mm10.fa.gz", "wb") as mm10_archive:
    ftp.retrbinary('RETR mm10.fa.gz', mm10_archive.write)
# Last FTP download of the notebook: close the session.
ftp.quit()

'221 Goodbye.'

In [6]:
# -c streams the decompressed data to stdout, so the .gz archive stays on disk
# and the plain FASTA is written next to it.
!gunzip -c data/mm10.fa.gz > data/mm10.genome.fa

## 2.2 Download human genome (chm13-v1.1)

> https://s3-us-west-2.amazonaws.com/human-pangenomics/T2T/CHM13/assemblies/chm13.draft_v1.1.fasta.gz

In [4]:
class DownloadProgressBar(tqdm):
    """tqdm progress bar with a callback usable as a ``urlretrieve`` reporthook."""

    def update_to(self, b=1, bsize=1, tsize=None):
        """Advance the bar to the absolute position ``b * bsize``.

        Parameters
        ----------
        b : int
            Number of blocks transferred so far.
        bsize : int
            Size of one block (the bar is created with unit 'B' by its caller).
        tsize : int or None
            Total size; when given, it becomes the bar's total.
        """
        if tsize is not None:
            self.total = tsize
        # tqdm.update expects an increment, so subtract what is already shown.
        transferred = b * bsize
        self.update(transferred - self.n)

def download_url(url, output_path):
    """Download ``url`` to ``output_path`` while displaying a progress bar.

    The bar is labelled with the last path component of ``url`` and is removed
    once the download finishes (``leave=False``).
    """
    file_label = url.split('/')[-1]
    progress_bar = DownloadProgressBar(unit='B', unit_scale=True, leave=False, miniters=1, desc=file_label)
    with progress_bar as t:
        urllib.request.urlretrieve(url, filename=output_path, reporthook=t.update_to)

In [5]:
# Fetch the gzipped CHM13 v1.1 draft assembly into data/.
download_url(
    url="https://s3-us-west-2.amazonaws.com/human-pangenomics/T2T/CHM13/assemblies/chm13.draft_v1.1.fasta.gz",
    output_path="data/chm13.draft_v1.1.fasta.gz",
)

chm13.draft_v1.1.fasta.gz: 0.00B [00:00, ?B/s]

In [6]:
# -c streams the decompressed data to stdout, so the .gz archive stays on disk
# and the plain FASTA is written next to it.
!gunzip -c data/chm13.draft_v1.1.fasta.gz > data/chm13.draft_v1.1.fasta

### 2.2.1 Download RepeatMasker tracks for chm13-v1.1

> S. J. Hoyt, J. M. Storer, G. A. Hartley, P. G. Grady, A. Gershman, C. Limouse, R. Halabian, L. Wojenski, and R. J. O’Neill. From telomere to telomere: the transcriptional and epigenetic state of human repeat elements analysis code: T2T-CHM13, Jan. 2022. URL [https://doi.org/10.5281/zenodo.5895031](https://doi.org/10.5281/zenodo.5895031)

In [7]:
# Fetch the RepeatMasker bigBed track from the Zenodo record cited above.
download_url(
    url="https://zenodo.org/record/5895031/files/t2t-chm13-v1.1.rmsk.bigBed",
    output_path="data/t2t-chm13-v1.1.rmsk.bigBed",
)

t2t-chm13-v1.1.rmsk.bigBed: 0.00B [00:00, ?B/s]

In [8]:
# Convert the downloaded bigBed track into a BED text file (first argument is
# the input, second the output).
!bigBedToBed data/t2t-chm13-v1.1.rmsk.bigBed data/t2t-chm13-v1.1.rmsk.bed

In [11]:
# The BED file has no header row, so columns are addressed by position below.
chm13_rmsk = pd.read_csv("data/t2t-chm13-v1.1.rmsk.bed", sep="\t", header=None)

# For each simple-repeat motif, write the intervals whose column 3 matches the
# motif label or the label of its reverse complement to a separate BED file.

complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}

for repeat_sequence in tqdm(['CA', 'CAA', 'CAT', 'CAG', 'CAC'], leave=False):
    # Labels have the form "(MOTIF)n", e.g. "(CA)n" and its reverse complement "(TG)n".
    repeat_sequence_label = f"({repeat_sequence})n"
    rc_repeat_sequence_label = "(" + "".join(complement[base] for base in reversed(repeat_sequence)) + ")n"

    is_motif = chm13_rmsk[3].isin([repeat_sequence_label, rc_repeat_sequence_label])

    # Keep only the first three columns, sorted by them, and write without
    # header or index.
    (
        chm13_rmsk.loc[is_motif, [0, 1, 2]]
        .sort_values(by=[0, 1, 2])
        .to_csv(f"data/chm13.{repeat_sequence}n.bed", sep="\t", header=False, index=False)
    )

  0%|          | 0/5 [00:00<?, ?it/s]

```bash
$ snakemake jellyfish -p -j 2
```

In [8]:
!bedtools nuc -fi data/chm13.draft_v1.1.fasta -bed data/chm13.CAn.bed -pattern CAC -C > data/chm13.CAn.nuc
!bedtools nuc -fi data/mm9.genome.fa -bed data/mm9.CAn.bed -pattern CAC -C > data/mm9.CAn.nuc
!bedtools nuc -fi data/mm10.genome.fa -bed data/mm10.CAn.bed -pattern CAC -C > data/mm10.CAn.nuc
!bedtools nuc -fi data/chm13.draft_v1.1.fasta -bed data/chm13.CAn.bed -pattern GTG -C > data/chm13.CAn.GTG.nuc
!bedtools nuc -fi data/mm9.genome.fa -bed data/mm9.CAn.bed -pattern GTG -C > data/mm9.CAn.GTG.nuc
!bedtools nuc -fi data/mm10.genome.fa -bed data/mm10.CAn.bed -pattern GTG -C > data/mm10.CAn.GTG.nuc



In [9]:
def print_genome_kmer_count(label, counts_path, kmer="CAC"):
    """Print the genome-wide count recorded for ``kmer`` in a k-mer counts file.

    For every line of ``counts_path`` that starts with ``kmer``, the line that
    follows it is read, its first character is dropped, and the remainder is
    parsed as an integer and printed after ``label`` with thousands separators.

    NOTE(review): the count is taken from the line AFTER the k-mer line. If the
    counts file is plain `jellyfish dump` FASTA output, the `>count` header
    comes BEFORE its k-mer, in which case this would report the next record's
    count. Confirm the layout produced by the Snakemake `jellyfish` rule.
    """
    with open(counts_path) as counts_file:
        for line in counts_file:
            if line.strip().startswith(kmer):
                # Consume the following line from the same iterator; [1:] drops its leading character.
                print(label, "{:,}".format(int(next(counts_file).strip()[1:])))


def print_repeat_pattern_count(label, nuc_paths):
    """Print the summed ``13_user_patt_count`` column of several bedtools nuc tables.

    ``label`` is printed verbatim (including any trailing padding) directly in
    front of the total, which is formatted with thousands separators.
    """
    nuc_tables = [pd.read_csv(path, sep="\t") for path in nuc_paths]
    total = pd.concat(nuc_tables)["13_user_patt_count"].sum()
    print(label + "{:,}".format(total))


# (genome label, k-mer counts file, BED label incl. its original padding, nuc path prefix)
cac_reports = [
    ("mm9.genome", "data/mm9.chromFa.tar.gz.3mer.counts", "mm9.CAn.bed  ", "data/mm9.CAn"),
    ("mm10.genome", "data/mm10.fa.gz.3mer.counts", "mm10.CAn.bed  ", "data/mm10.CAn"),
    ("chm13.genome", "data/chm13.draft_v1.1.fasta.gz.3mer.counts", "chm13.CAn.bed    ", "data/chm13.CAn"),
]

for position, (genome_label, counts_path, bed_label, nuc_prefix) in enumerate(cac_reports):
    if position:
        # Blank line between consecutive genomes.
        print("")
    print_genome_kmer_count(genome_label, counts_path)
    # CAC and GTG hits are summed, i.e. the pattern is counted on both strands.
    print_repeat_pattern_count(bed_label, [nuc_prefix + ".nuc", nuc_prefix + ".GTG.nuc"])

mm9.genome 109,196,385
mm9.CAn.bed  6,987,442

mm10.genome 110,367,609
mm10.CAn.bed  6,490,524

chm13.genome 120,794,608
chm13.CAn.bed    949,940
