# 1. Download RepeatMasker loci for mouse genome

## 1.1 mm9

In [1]:
import os
from ftplib import FTP
import pandas as pd
from tqdm.auto import tqdm

# Make sure the output directory exists; otherwise the open(..., "wb") calls
# below raise FileNotFoundError on a fresh checkout.

os.makedirs("data", exist_ok=True)

# Login to UCSC ftp server (no credentials are passed to login())

ftp = FTP("hgdownload.soe.ucsc.edu")
ftp.login()

# Change directory to mm9 and download list of chromosomes

ftp.cwd("goldenPath/mm9/bigZips")
with open("data/mm9.chrom.sizes", "wb") as fp:
    ftp.retrbinary('RETR mm9.chrom.sizes', fp.write)
mm9_chrom_sizes = pd.read_csv("data/mm9.chrom.sizes", sep="\t", header=None)


# Download the RepeatMasker track of every chromosome listed in
# mm9.chrom.sizes. All downloads are written back to back into one file,
# which is read below with compression="gzip".

ftp.cwd("../database")
with open("data/mm9.rmsk.txt.gz", "wb") as fp:
    for chromosome in tqdm(mm9_chrom_sizes[0].values, leave=False, desc="Downloading rmsk.txt.gz for mm9"):
        ftp.retrbinary(f'RETR {chromosome}_rmsk.txt.gz', fp.write)

# Load the repeat masker loci as pandas.DataFrame and keep only the rows whose
# column 5 value appears in the chromosome list
# (column 5 is presumably the chromosome name - TODO confirm against the rmsk schema)

mm9_rmsk = pd.read_csv("data/mm9.rmsk.txt.gz", sep="\t", compression="gzip", header=None, comment="#")
mm9_rmsk = mm9_rmsk[mm9_rmsk[5].isin(mm9_chrom_sizes[0].values)]

In [2]:
# Fetch the mm9 -> mm10 liftOver chain file over the FTP session that is
# already open, then close that session.
ftp.cwd("../liftOver")
mm9_to_mm10_chain_path = "windows/mm9ToMm10.over.chain.gz"
with open(mm9_to_mm10_chain_path, "wb") as chain_file:
    ftp.retrbinary('RETR mm9ToMm10.over.chain.gz', chain_file.write)
ftp.quit()

'221 Goodbye.'

In [3]:
!gunzip -d windows/mm9ToMm10.over.chain.gz

In [4]:
# Open a new FTP session, fetch the mm10 -> mm9 liftOver chain file and
# close the session again.
ftp = FTP("hgdownload.cse.ucsc.edu")
ftp.login()
ftp.cwd("goldenPath/mm10/liftOver")
mm10_to_mm9_chain_path = "windows/mm10ToMm9.over.chain.gz"
with open(mm10_to_mm9_chain_path, "wb") as chain_file:
    ftp.retrbinary('RETR mm10ToMm9.over.chain.gz', chain_file.write)
ftp.quit()

'221 Goodbye.'

In [5]:
!gunzip -d windows/mm10ToMm9.over.chain.gz

### 1.1.1 Save repeat loci for different contexts in bed files

In [2]:
# For every repeat unit, select the rows whose column 10 equals "(unit)n" or
# the same label built from the reverse complement of the unit, then write
# columns 5, 6 and 7 (sorted by columns 5 and 6) to a tab-separated bed file.

complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}

for repeat_sequence in tqdm(['CA', 'CAA', 'CAT', 'CAG', 'CAC'], leave=False):
    reverse_complement = "".join(complement.get(base) for base in repeat_sequence[::-1])
    wanted_labels = [f"({repeat_sequence})n", "(" + reverse_complement + ")n"]
    matching_loci = mm9_rmsk.loc[mm9_rmsk[10].isin(wanted_labels), [5, 6, 7]]
    matching_loci.sort_values(by=[5, 6]).to_csv(
        f"data/mm9.{repeat_sequence}n.bed", sep="\t", header=None, index=None
    )

  0%|          | 0/5 [00:00<?, ?it/s]

```bash

$ cat mm9.CAAn.bed mm9.CACn.bed mm9.CAGn.bed mm9.CATn.bed | sort -k1,1 -k2,2n > mm9.CANn.bed
```

### 1.1.2 Save all simple repeats except (CA)n and (TG)n in a bed file

In [3]:
# Rows with "Simple_repeat" in column 11 whose column 10 is neither "(CA)n"
# nor "(TG)n"; columns 5, 6 and 7 are written sorted by columns 5 and 6.
mm9_is_simple_repeat = mm9_rmsk[11] == "Simple_repeat"
mm9_is_ca_or_tg = mm9_rmsk[10].isin(["(CA)n", "(TG)n"])
(
    mm9_rmsk.loc[mm9_is_simple_repeat & ~mm9_is_ca_or_tg, [5, 6, 7]]
    .sort_values(by=[5, 6])
    .to_csv("data/mm9.except.CAn.bed", sep="\t", header=None, index=None)
)

## 1.2 mm10

In [4]:
# Download the combined RepeatMasker track and the chromosome sizes for mm10.
#
# A dedicated FTP session is opened here instead of reusing the notebook-wide
# `ftp` object with a relative path: the liftOver cells earlier in the notebook
# call ftp.quit(), so on a top-to-bottom run that connection is already closed
# by the time this cell executes. The `with` block closes the session on exit.

with FTP("hgdownload.soe.ucsc.edu") as mm10_ftp:
    mm10_ftp.login()

    mm10_ftp.cwd("goldenPath/mm10/database")
    with open("data/mm10.rmsk.txt.gz", "wb") as fp:
        mm10_ftp.retrbinary('RETR rmsk.txt.gz', fp.write)

    mm10_ftp.cwd("../bigZips")
    with open("data/mm10.chrom.sizes", "wb") as fp:
        mm10_ftp.retrbinary('RETR mm10.chrom.sizes', fp.write)

mm10_chrom_sizes = pd.read_csv("data/mm10.chrom.sizes", sep="\t", header=None)

# Load the repeat masker loci as pandas.DataFrame and keep only the rows whose
# column 5 value appears in the first column of mm10.chrom.sizes

mm10_rmsk = pd.read_csv("data/mm10.rmsk.txt.gz", sep="\t", compression="gzip", header=None, comment="#")
mm10_rmsk = mm10_rmsk[mm10_rmsk[5].isin(mm10_chrom_sizes[0].values)]

### 1.2.1 Save repeat loci for different contexts in bed files

In [5]:
# For every repeat unit, select the rows whose column 10 equals "(unit)n" or
# the same label built from the reverse complement of the unit, then write
# columns 5, 6 and 7 (sorted by columns 5 and 6) to a tab-separated bed file.
# `complement` is the base-pairing dict defined in the mm9 cell earlier in the notebook.

for repeat_sequence in tqdm(['CA', 'CAA', 'CAT', 'CAG', 'CAC'], leave=False):
    reverse_complement = "".join(complement.get(base) for base in repeat_sequence[::-1])
    wanted_labels = [f"({repeat_sequence})n", "(" + reverse_complement + ")n"]
    matching_loci = mm10_rmsk.loc[mm10_rmsk[10].isin(wanted_labels), [5, 6, 7]]
    matching_loci.sort_values(by=[5, 6]).to_csv(
        f"data/mm10.{repeat_sequence}n.bed", sep="\t", header=None, index=None
    )

  0%|          | 0/5 [00:00<?, ?it/s]

```bash

$ cat mm10.CAAn.bed mm10.CACn.bed mm10.CAGn.bed mm10.CATn.bed | sort -k1,1 -k2,2n > mm10.CANn.bed
```

### 1.2.2 Save all simple repeats except (CA)n and (TG)n in a bed file

In [6]:
# Rows with "Simple_repeat" in column 11 whose column 10 is neither "(CA)n"
# nor "(TG)n"; columns 5, 6 and 7 are written sorted by columns 5 and 6.
mm10_is_simple_repeat = mm10_rmsk[11] == "Simple_repeat"
mm10_is_ca_or_tg = mm10_rmsk[10].isin(["(CA)n", "(TG)n"])
(
    mm10_rmsk.loc[mm10_is_simple_repeat & ~mm10_is_ca_or_tg, [5, 6, 7]]
    .sort_values(by=[5, 6])
    .to_csv("data/mm10.except.CAn.bed", sep="\t", header=None, index=None)
)

# 2. Generate genomic windows

```bash
$ snakemake windows repeat_windows -p -j 1
```