# Differences in Processing two files
* Same input length
* Output from "Unique" layout has a much longer centromere in annotation than sequence
* **Annotation Track does not have equivalent Ns to get `.replace('N', '')`**

## Solution:
**Leave N's in sequence file, iterate through both sequence and annotation simultaneously, where there is an N, remove from both files.**  
**Result:** Annotation and sequence files stay in sync, even as Unique layout removes majority of sequence.

In [1]:
from DNASkittleUtils.Contigs import *

In [2]:
chr_names = 'chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22 chrX chrY'.split()

In [22]:
anno_loc = r"E:\Projects\FluentDNA-2.4.1\www-data\dnadata\Human Unique Gene Annotations Gencode v30 vs PanTro6_\Human Unique Gene Annotations Gencode v30 vs PanTro6___285391001bp.fa"
annotations = read_contigs(anno_loc)
len(annotations)

24

In [26]:
seq_loc = r"E:\Projects\FluentDNA\DDV\www-data\dnadata\Unique Human Hg38 vs Chimpanzee PanTro6_\sources\Unique Human Hg38 vs Chimpanzee PanTro6___285391001bp.fa"
sequences = read_contigs(seq_loc)
len(sequences)

24

In [28]:
list(zip([c.name for c in sequences], [c.name for c in annotations]))

[('chr1_Hg38_unique', 'chr1_gencode_unique'),
 ('chr2_Hg38_unique', 'chr2_gencode_unique'),
 ('chr3_Hg38_unique', 'chr3_gencode_unique'),
 ('chr4_Hg38_unique', 'chr4_gencode_unique'),
 ('chr5_Hg38_unique', 'chr5_gencode_unique'),
 ('chr6_Hg38_unique', 'chr6_gencode_unique'),
 ('chr7_Hg38_unique', 'chr7_gencode_unique'),
 ('chr8_Hg38_unique', 'chr8_gencode_unique'),
 ('chr9_Hg38_unique', 'chr9_gencode_unique'),
 ('chr10_Hg38_unique', 'chr10_gencode_unique'),
 ('chr11_Hg38_unique', 'chr11_gencode_unique'),
 ('chr12_Hg38_unique', 'chr12_gencode_unique'),
 ('chr13_Hg38_unique', 'chr13_gencode_unique'),
 ('chr14_Hg38_unique', 'chr14_gencode_unique'),
 ('chr15_Hg38_unique', 'chr15_gencode_unique'),
 ('chr16_Hg38_unique', 'chr16_gencode_unique'),
 ('chr17_Hg38_unique', 'chr17_gencode_unique'),
 ('chr18_Hg38_unique', 'chr18_gencode_unique'),
 ('chr19_Hg38_unique', 'chr19_gencode_unique'),
 ('chr20_Hg38_unique', 'chr20_gencode_unique'),
 ('chr21_Hg38_unique', 'chr21_gencode_unique'),
 ('chr22_H

In [29]:
anno_contigs = []
seq_contigs = []
for s_chr, a_chr in zip(sequences, annotations):
    purged_a = []
    purged_s = []
    assert len(a_chr.seq) == len(s_chr.seq), (len(a_chr.seq), len(s_chr.seq))
    for a, s in zip(a_chr.seq, s_chr.seq):
        if s != 'N':
            purged_a.append(a)
            purged_s.append(s)
    name = s_chr.name.split('_')[0]
    anno_contigs.append(Contig(name, ''.join(purged_a)))
    seq_contigs.append(Contig(name, ''.join(purged_s)))
write_contigs_to_file(r"E:\Genomes\Human\Human Unique Genes Gencode30.fa", anno_contigs)
write_contigs_to_file(r"E:\Genomes\Human\Human Unique Sequence.fa", seq_contigs)

Done writing  24 contigs and 134,760,773bp
Done writing  24 contigs and 134,760,773bp


# Merge Gene Annotation and Centromere Annotation into One Track

In [6]:
anno_contigs = read_contigs(r"E:\Genomes\Human\Human Unique Genes Gencode30.fa")
centromere_seq = read_contigs(r"E:\Genomes\Human\Unique Human Centromere Locations.gff3_extracted.fa")
# list(zip([c.name for c in anno_contigs], [c.name for c in centromere_seq]))

In [9]:
fasta = []
for c_chr, a_chr in zip(centromere_seq, anno_contigs):
    current_contig = []
    for c, a in zip(c_chr.seq, a_chr.seq):
        if a == '-':
            current_contig.append('R' if c != '-' else c)  # centromere marked with 'R'
        else:
            current_contig.append(a)
    fasta.append(Contig(a_chr.name, ''.join(current_contig)))
write_contigs_to_file(r"E:\Genomes\Human\Human Unique Annotation merged.fa", fasta)

Done writing  24 contigs and 134,760,677bp


./FluentDNA --outname="Test chr21 unique annotation" --fasta="E:\Genomes\Human\gencode.v30.annotation.gff3__chr21.fa" --chainfile="E:\Genomes\Human\hg38ToPanTro6.over.chain" --layout=unique --contigs chr21

./FluentDNA --outname="Test chr21 unique seq" --fasta="E:\Genomes\Human\chroms\chr21.fa" --chainfile="E:\Genomes\Human\hg38ToPanTro6.over.chain" --layout=unique --contigs chr21

./FluentDNA --fasta="E:\Genomes\Human\Human Unique Annotation merged.fa_squished.fa"
--extrafastas
"E:\Genomes\Human\Human Unique Sequence.fa"
--outname="Unique Human Genes and Centromere vs Chimpanzee PanTro6"
--column_widths="[20,100]"