**Table of contents**<a id='toc0_'></a>    
- 1. [IMPORTS](#toc1_)    
- 2. [IMPORT FILES](#toc2_)    
  - 2.1. [Import and parse shatter files](#toc2_1_)    
    - 2.1.1. [FX: `get_shatter_file`](#toc2_1_1_)    
    - 2.1.2. [FX: `get_shatter_alignments`](#toc2_1_2_)    
  - 2.2. [Import matching TEs .tsv file](#toc2_2_)    
    - 2.2.1. [FX: `get_matching_tsv_file`](#toc2_2_1_)    
- 3. [COMPARE TE POSITIONS TO ALIGNMENT POSITIONS](#toc3_)    
  - 3.1. [FX: `compare_tes_to_shatter`](#toc3_1_)    
- 4. [COMPARE TE START POSITIONS WITH ALIGNMENT POSITIONS](#toc4_)    
  - 4.1. [FX: `do_comparison`](#toc4_1_)    
- 5. [COMPARE POSITIONS AND DETERMINE MOVEMENT](#toc5_)    
  - 5.1. [CREATE DF OF ONLY TES THAT ACTUALLY MOVED](#toc5_1_)    
  - 5.2. [FX: `create_allmoved_bedfile`](#toc5_2_)    
  - 5.3. [FX: `create_rideogram_tsv`](#toc5_3_)    

<!-- vscode-jupyter-toc-config
	numbering=true
	anchor=true
	flat=false
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

# 1. <a id='toc1_'></a>[IMPORTS](#toc0_)

In [5]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

from matplotlib import cm
import seaborn as sns
sns.set(style="whitegrid")
from Bio import SeqIO
from Bio.Seq import Seq
# from matplotlib_venn import venn2, venn2_circles, venn3, venn3_circles
from __future__ import with_statement
import numpy as np
import random
import matplotlib.pyplot as plt
import pandas as pd
from matplotlib.pyplot import figure as fig
import csv
import sys
import seaborn as sns
import scipy
from scipy import stats as stats
from scipy.stats import norm as norm
# import xlsxwriter
# from dataprep.eda import create_report
import argparse
# %load_ext nbtutor

  from IPython.core.display import display, HTML


# 2. <a id='toc2_'></a>[IMPORT FILES](#toc0_)
## 2.1. <a id='toc2_1_'></a>[Import and parse shatter files](#toc0_)
### 2.1.1. <a id='toc2_1_1_'></a>[FX: `get_shatter_file`](#toc0_)
- import shatter alignment file and return list of the lines of the file

### 2.1.2. <a id='toc2_1_2_'></a>[FX: `get_shatter_alignments`](#toc0_)
- parse through the shatter lines and return two lists, one with chain info and one with alignment info

In [6]:
def get_shatter_file(shatter_file: str):
    """Read a shatter alignment file into a list of parsed lines.

    shatter_file: str, filepath of shatter alignment file

    Returns a list with one entry per line of the file; each entry is the
    list of fields obtained by splitting that line on single spaces.
    """
    with open(shatter_file, newline='') as handle:
        return list(csv.reader(handle, delimiter=' '))

def get_shatter_alignments(shatter_lines: list):
    """Split parsed shatter lines into chain rows and per-base alignment rows.

    shatter_lines: list, output from `get_shatter_file` fx

    A row is classified by its first field:
      - alignment row: the first field is exactly one of A, T, G, C, - or N;
        fields 1 and 3 are converted to int.
      - chain row: the first field starts with neither ">" nor one of those
        characters; fields 2, 3, 5 and 6 are converted to int.
      - anything else (e.g. rows starting with ">") is ignored.

    Blank rows and rows with an empty first field are skipped instead of
    raising IndexError.

    Returns (chain_info, alignments), two lists of converted rows. The rows
    in `shatter_lines` are copied before conversion, so the input is left
    untouched.
    """
    bases = frozenset("ATGC-N")

    chain_info = []
    alignments = []

    for row in shatter_lines:
        # csv.reader yields [] for a blank line and '' as the first field
        # when a line starts with the delimiter; neither can be classified.
        if not row or not row[0]:
            continue

        first_field = row[0]

        if first_field[0] != ">" and first_field[0] not in bases:
            chain_line = list(row)
            for column in (2, 3, 5, 6):
                chain_line[column] = int(chain_line[column])
            chain_info.append(chain_line)

        elif first_field in bases:
            align_line = list(row)
            for column in (1, 3):
                align_line[column] = int(align_line[column])
            alignments.append(align_line)

    return chain_info, alignments

## 2.2. <a id='toc2_2_'></a>[Import matching TEs .tsv file](#toc0_)
### 2.2.1. <a id='toc2_2_1_'></a>[FX: `get_matching_tsv_file`](#toc0_)
- import the .tsv file generated using the `create_tsv_file` function with the `include_ids` option set to **True** in the TRACKING notebook

In [16]:
def get_matching_tsv_file(matching_file: str):
    """Read the matching-TEs .tsv file into a list of rows.

    matching_file: str, filepath to matchingTEs .tsv file

    Returns a list with one entry per line; each entry is the list of
    tab-separated fields of that line (all fields are strings).
    """
    with open(matching_file) as handle:
        rows = list(csv.reader(handle, delimiter='\t'))
    return rows

# 3. <a id='toc3_'></a>[COMPARE TE POSITIONS TO ALIGNMENT POSITIONS](#toc0_)

## 3.1. <a id='toc3_1_'></a>[FX: `compare_tes_to_shatter`](#toc0_)

1. for each row in TE dataframe, get N2 start position from shatter and the corresponding CB start position from the alignment
2. for that row in the TE dataframe, check the CB TE start position against the CB alignment position
    - if the same: 'movement' is due to position differences because of alignment 
    - if different: 'movement' is real

In [10]:
def compare_tes_to_shatter(tes_df, aligns_df):
    """Classify each TE by whether its CB start matches the aligned CB position.

    tes_df: TE dataframe with `n2_chrom`, `n2_start`, `cb_chrom` and
        `cb_start` columns
    aligns_df: alignment dataframe with `n2_pos` and `cb_pos` columns

    Each row of `tes_df` ends up in exactly one of four groups:
      - same: same chromosome and `cb_start` equals the `cb_pos` aligned to
        `n2_start` (no real movement)
      - diff samechr: same chromosome but `cb_start` differs from that
        aligned position (movement)
      - diff diffchr: `n2_chrom` differs from `cb_chrom` (movement)
      - what: same chromosome but `n2_start` is absent from
        `aligns_df['n2_pos']`, so no comparison is possible

    Prints the size of each group and returns
    (same_df, diff_samechr_df, diff_diffchr_df, what_df).
    """
    # Build the N2 -> CB position lookup once instead of scanning the whole
    # alignment for every TE. keep='first' mirrors taking the first matching
    # alignment row when an N2 position occurs more than once.
    cb_pos_by_n2_pos = (
        aligns_df.drop_duplicates(subset='n2_pos', keep='first')
        .set_index('n2_pos')['cb_pos']
        .to_dict()
    )

    same_list = []
    different_samechr_list = []
    different_diffchr_list = []
    what_list = []

    for _, row in tes_df.iterrows():

        if row['n2_chrom'] != row['cb_chrom']:
            # different chromosomes in N2 and CB, so there was movement
            different_diffchr_list.append(row)
            continue

        n2_start = row['n2_start']

        if n2_start not in cb_pos_by_n2_pos:
            # No aligned position to compare against: record the row and stop
            # here, otherwise it would be compared with an unbound or stale
            # alignment position and counted a second time.
            what_list.append(row)
            continue

        if row['cb_start'] == cb_pos_by_n2_pos[n2_start]:
            same_list.append(row)
        else:
            different_samechr_list.append(row)

    print('same', len(same_list))
    print('diff samechr', len(different_samechr_list))
    print('diff diffchr', len(different_diffchr_list)) 
    print('n2 start pos not in alignment file', len(what_list))

    same_df = pd.DataFrame(same_list)
    diff_samechr_df = pd.DataFrame(different_samechr_list)
    diff_diffchr_df = pd.DataFrame(different_diffchr_list)
    what_df = pd.DataFrame(what_list)

    return same_df, diff_samechr_df, diff_diffchr_df, what_df

In [11]:
## chromosome V file was organized in opposite way so needed separate function

def compare_tes_to_shatterV(tes_df, aligns_df):
    """Chromosome V variant of `compare_tes_to_shatter` with the lookup reversed.

    tes_df: TE dataframe with `n2_chrom`, `n2_start`, `cb_chrom` and
        `cb_start` columns
    aligns_df: alignment dataframe with `n2_pos` and `cb_pos` columns

    Here the TE's `cb_start` is looked up in `aligns_df['n2_pos']`, and the
    `cb_pos` value found there is compared with the TE's `n2_start`. (The
    chromosome V file is organised the opposite way round, so the column
    labels are swapped relative to their content.)

    Each row of `tes_df` ends up in exactly one of four groups:
      - same: same chromosome and `n2_start` equals the aligned position
      - diff samechr: same chromosome but `n2_start` differs from it
      - diff diffchr: `n2_chrom` differs from `cb_chrom`
      - what: same chromosome but `cb_start` is absent from
        `aligns_df['n2_pos']`, so no comparison is possible

    Prints the size of each group and returns
    (same_df, diff_samechr_df, diff_diffchr_df, what_df).
    """
    # Build the lookup once instead of scanning the alignment for every TE.
    # keep='first' mirrors taking the first matching alignment row.
    aligned_pos_by_lookup_pos = (
        aligns_df.drop_duplicates(subset='n2_pos', keep='first')
        .set_index('n2_pos')['cb_pos']
        .to_dict()
    )

    same_list = []
    different_samechr_list = []
    different_diffchr_list = []
    what_list = []

    for _, row in tes_df.iterrows():

        if row['n2_chrom'] != row['cb_chrom']:
            # different chromosomes in N2 and CB, so there was movement
            different_diffchr_list.append(row)
            continue

        cb_start = row['cb_start']

        if cb_start not in aligned_pos_by_lookup_pos:
            # No aligned position to compare against: record the row and stop
            # here, otherwise it would be compared with an unbound or stale
            # alignment position and counted a second time.
            what_list.append(row)
            continue

        if row['n2_start'] == aligned_pos_by_lookup_pos[cb_start]:
            same_list.append(row)
        else:
            different_samechr_list.append(row)

    print('same', len(same_list))
    print('diff samechr', len(different_samechr_list))
    print('diff diffchr', len(different_diffchr_list)) 
    # Message kept as-is; in this variant the missing lookups are CB starts.
    print('n2 start pos not in alignment file', len(what_list))

    same_df = pd.DataFrame(same_list)
    diff_samechr_df = pd.DataFrame(different_samechr_list)
    diff_diffchr_df = pd.DataFrame(different_diffchr_list)
    what_df = pd.DataFrame(what_list)

    return same_df, diff_samechr_df, diff_diffchr_df, what_df

# 4. <a id='toc4_'></a>[COMPARE TE START POSITIONS WITH ALIGNMENT POSITIONS](#toc0_)
## 4.1. <a id='toc4_1_'></a>[FX: `do_comparison`](#toc0_)
- this function wraps the previous functions to import the shatter file, get the TE dataframe, and compare the alignment positions to the TE locations; it returns the alignment dataframe, the TE dataframe, and 4 dataframes containing the info for the unique TE in N2 and CB
- can also save the results to an excel file with 4 different sheets (one sheet per list)
  - `same_df`: N2 TEs whose CB start position is the same as the CB alignment adjusted position, indicating no real TE movement
  - `diff_samechr_df`: N2 TEs whose CB start position is different than the CB alignment adjusted position, indicating movement
  - `diff_diffchr_df`: N2 TEs whose CB start position is on a different chromosome, indicating movement
  - `what_df`: N2 TEs whose N2 start position is not in the alignment file
- output excel file has the following format \
**n2_id	| n2_chrom |	n2_start |	n2_stop |	cb_id |	cb_chrom |	cb_start |	cb_stop**

In [14]:
def do_comparison(shatter_file_list_index: int, tes_df_list_index: int, chr_title: str, to_save: bool = False):
    """Run the TE-versus-alignment comparison for one chromosome.

    shatter_file_list_index: int, index of the shatter_file needed in the shatter_file_list
    tes_df_list_index: int, index of the tes_df needed in the tes_df_list
    chr_title: str, printed before the counts and used to name the excel file
    to_save: bool, whether to save results to excel file
        (`{chr_title}_tes_alignments.xlsx`, one sheet per result dataframe)

    Returns (aligns_df, tes_df, same_df, diff_samechr_df, diff_diffchr_df, what_df).
    """
    chromosome = get_shatter_file(shatter_file_list[shatter_file_list_index])

    # Only the per-base alignment rows are needed; chain rows are discarded.
    _, chr_aligns = get_shatter_alignments(chromosome)

    aligns_df = pd.DataFrame(chr_aligns, columns=['n2', 'n2_pos', 'cb', 'cb_pos'])
    aligns_df = aligns_df.astype({'n2_pos': int, 'cb_pos': int})

    tes_df = tes_df_list[tes_df_list_index]

    print(chr_title)
    same_df, diff_samechr_df, diff_diffchr_df, what_df = compare_tes_to_shatter(tes_df, aligns_df)

    if to_save:
        # The context manager writes and closes the workbook on exit;
        # ExcelWriter.save() is no longer available in pandas 2.x.
        with pd.ExcelWriter(f'{chr_title}_tes_alignments.xlsx', engine='xlsxwriter') as writer:
            same_df.to_excel(writer, sheet_name='samepos')
            diff_samechr_df.to_excel(writer, sheet_name='diffpos_samechr')
            diff_diffchr_df.to_excel(writer, sheet_name='diffpos_diffchr')
            what_df.to_excel(writer, sheet_name='what')

    return aligns_df, tes_df, same_df, diff_samechr_df, diff_diffchr_df, what_df

In [15]:
## chromosome V file was organized in opposite way so needed separate function

def do_comparisonV(shatter_file_list_index, tes_df_list_index, chr_title, to_save: bool = False):
    """Run the TE-versus-alignment comparison for chromosome V.

    Same steps as `do_comparison`, but the comparison is done with
    `compare_tes_to_shatterV` because the chromosome V file is organised
    the opposite way round.

    shatter_file_list_index: int, index of the shatter_file needed in the shatter_file_list
    tes_df_list_index: int, index of the tes_df needed in the tes_df_list
    chr_title: str, printed before the counts and used to name the excel file
    to_save: bool, whether to save results to excel file
        (`{chr_title}_tes_alignments.xlsx`, one sheet per result dataframe);
        defaults to False

    Returns (aligns_df, tes_df, same_df, diff_samechr_df, diff_diffchr_df, what_df).
    """
    chromosome = get_shatter_file(shatter_file_list[shatter_file_list_index])

    # Only the per-base alignment rows are needed; chain rows are discarded.
    _, chr_aligns = get_shatter_alignments(chromosome)

    aligns_df = pd.DataFrame(chr_aligns, columns=['n2', 'n2_pos', 'cb', 'cb_pos'])
    aligns_df = aligns_df.astype({'n2_pos': int, 'cb_pos': int})

    tes_df = tes_df_list[tes_df_list_index]

    print(chr_title)
    same_df, diff_samechr_df, diff_diffchr_df, what_df = compare_tes_to_shatterV(tes_df, aligns_df)

    if to_save:
        # The context manager writes and closes the workbook on exit;
        # ExcelWriter.save() is no longer available in pandas 2.x.
        with pd.ExcelWriter(f'{chr_title}_tes_alignments.xlsx', engine='xlsxwriter') as writer:
            same_df.to_excel(writer, sheet_name='samepos')
            diff_samechr_df.to_excel(writer, sheet_name='diffpos_samechr')
            diff_diffchr_df.to_excel(writer, sheet_name='diffpos_diffchr')
            what_df.to_excel(writer, sheet_name='what')

    return aligns_df, tes_df, same_df, diff_samechr_df, diff_diffchr_df, what_df

# 5. <a id='toc5_'></a>[COMPARE POSITIONS AND DETERMINE MOVEMENT](#toc0_)

In [4]:
# Shatter alignment files, one per chromosome in the order I, II, III, IV, V, X.
# The chromosome V file is the only one named CB_to_N2 rather than N2_to_CB.
shatter_file_list = [
    'N2_to_CB_chrI.N2net.sort.axt.shatter',
    'N2_to_CB_chrII.N2net.sort.axt.shatter',
    'N2_to_CB_chrIII.N2net.sort.axt.shatter',
    'N2_to_CB_chrIV.N2net.sort.axt.shatter',
    'CB_to_N2_chrV.N2net.sort.axt.shatter',
    'N2_to_CB_chrX.N2net.sort.axt.shatter',
]

# Chromosome lengths, six entries per strain.
n2_chrom_lengths = np.array([
    15_114_068, 15_311_845, 13_819_453, 17_493_838, 20_953_657, 17_739_129,
])
cb_chrom_lengths = np.array([
    15_045_644, 15_257_363, 13_206_755, 17_183_882, 20_547_529, 17_584_915,
])

In [17]:
matching_file = 'matchingTEs_info_ids.tsv'
matching_lines_list = get_matching_tsv_file(matching_file)

# Every column is cast to int; the dict keys double as the column order.
convert_dict2 = dict.fromkeys(
    ['n2_id', 'n2_chrom', 'n2_start', 'n2_stop', 'cb_id', 'cb_chrom', 'cb_start', 'cb_stop'],
    int,
)

# The first parsed row is dropped (presumably the header line of the .tsv).
tes_df = pd.DataFrame(matching_lines_list[1:], columns=list(convert_dict2))
tes_df = tes_df.astype(convert_dict2)
# print(tes_df.dtypes)

In [9]:
# One TE dataframe per N2 chromosome id: 1-5 for chr1-chr5, 6 for chrX.
tes_df_list = [tes_df[tes_df['n2_chrom'] == chrom_id] for chrom_id in range(1, 7)]

(chr1_tes_df, chr2_tes_df, chr3_tes_df,
 chr4_tes_df, chr5_tes_df, chrX_tes_df) = tes_df_list

In [None]:
# Run the comparison per chromosome; each call passes the same index into
# shatter_file_list and tes_df_list. Chromosome V uses do_comparisonV.
(chr1_aligns_df, chr1_tes_df, chr1_same_df,
 chr1_diff_samechr_df, chr1_diff_diffchr_df, chr1_what_df) = do_comparison(0, 0, 'chr1')

(chr2_aligns_df, chr2_tes_df, chr2_same_df,
 chr2_diff_samechr_df, chr2_diff_diffchr_df, chr2_what_df) = do_comparison(1, 1, 'chr2')

(chr3_aligns_df, chr3_tes_df, chr3_same_df,
 chr3_diff_samechr_df, chr3_diff_diffchr_df, chr3_what_df) = do_comparison(2, 2, 'chr3')

(chr4_aligns_df, chr4_tes_df, chr4_same_df,
 chr4_diff_samechr_df, chr4_diff_diffchr_df, chr4_what_df) = do_comparison(3, 3, 'chr4')

(chr5_aligns_df, chr5_tes_df, chr5_same_df,
 chr5_diff_samechr_df, chr5_diff_diffchr_df, chr5_what_df) = do_comparisonV(4, 4, 'chr5')

(chrX_aligns_df, chrX_tes_df, chrX_same_df,
 chrX_diff_samechr_df, chrX_diff_diffchr_df, chrX_what_df) = do_comparison(5, 5, 'chrX')

## 5.1. <a id='toc5_1_'></a>[CREATE DF OF ONLY TES THAT ACTUALLY MOVED](#toc0_)

In [None]:
# For each chromosome: the same-chromosome movers, then the
# different-chromosome movers, flattened into one list in chromosome order.
actually_moved = [
    moved_df
    for chrom_pair in (
        (chr1_diff_samechr_df, chr1_diff_diffchr_df),
        (chr2_diff_samechr_df, chr2_diff_diffchr_df),
        (chr3_diff_samechr_df, chr3_diff_diffchr_df),
        (chr4_diff_samechr_df, chr4_diff_diffchr_df),
        (chr5_diff_samechr_df, chr5_diff_diffchr_df),
        (chrX_diff_samechr_df, chrX_diff_diffchr_df),
    )
    for moved_df in chrom_pair
]

# Stack everything into a single dataframe with a fresh 0..n-1 index.
all_moved = pd.concat(actually_moved, ignore_index=True)
all_moved

In [113]:
# Chromosome ids as strings, with the matching N2 / CB chromosome names at the same index.
moved_chrom_id = [str(chrom_number) for chrom_number in range(1, 7)]
n2_chromosomes = [f'N2_chr{numeral}' for numeral in ('I', 'II', 'III', 'IV', 'V', 'X')]
cb_chromosomes = [f'CB_chr{numeral}' for numeral in ('I', 'II', 'III', 'IV', 'V', 'X')]
# for i in range(6):
        
#         gff_df['Chromosome'] = gff_df['Chromosome'].replace(gff_chrom_id[i], chromosomes[i])

## 5.2. <a id='toc5_2_'></a>[FX: `create_allmoved_bedfile`](#toc0_)
- creates a bedfile of the N2 start and stop positions of the TEs that moved 

In [19]:
def create_allmoved_bedfile(all_moved, to_save: bool = False):
    """Build a BED-style table of the N2 positions of the TEs that moved.

    all_moved: dataframe of moved TEs; the `n2_chrom`, `n2_start` and
        `n2_stop` columns are read
    to_save: bool, whether to save results to bed file (`all_moved.bed`,
        tab separated, without header or index)

    Returns a new dataframe with the columns
    n2_chrom, n2_start, n2_stop, genome_name, tags, where `n2_chrom` ids
    listed in `moved_chrom_id` are replaced by the matching name from
    `n2_chromosomes`, `genome_name` is 'N2' and `tags` is 'mt:o;mc:red;ms:3'.

    Only the columns that reach the returned table are computed, on a copy,
    so `all_moved` is not modified and an empty dataframe is handled.
    """
    # Chromosome ids are compared as strings; ids without an entry in
    # moved_chrom_id are kept unchanged (as their string form).
    n2_name_by_id = dict(zip(moved_chrom_id, n2_chromosomes))

    all_moved_bed = all_moved[['n2_chrom', 'n2_start', 'n2_stop']].copy()
    all_moved_bed['n2_chrom'] = (
        all_moved_bed['n2_chrom']
        .astype(str)
        .map(lambda chrom_id: n2_name_by_id.get(chrom_id, chrom_id))
    )

    # Constant annotation columns, identical for every row.
    all_moved_bed['genome_name'] = 'N2'
    all_moved_bed['tags'] = 'mt:o;mc:red;ms:3'

    if to_save:
        all_moved_bed.to_csv('all_moved.bed', index=False, header=False, sep='\t')

    return all_moved_bed

## 5.3. <a id='toc5_3_'></a>[FX: `create_rideogram_tsv`](#toc0_)
- creates a tsv file of the actually moved TEs that is compatible with RIdeogram

In [None]:
def create_rideogram_tsv(all_moved, to_save: bool = False, include_fill: bool = False, fill: str = '000000'):
    """Build the table of moved TEs that is written for RIdeogram.

    all_moved: dataframe of moved TEs; the `n2_chrom`, `n2_start`, `n2_stop`,
        `cb_chrom`, `cb_start` and `cb_stop` columns are read
    to_save: bool, whether to save results to RIdeogram compatible tsv file
        (`rideogram_allmovedTEs.tsv`, tab separated, without header or index)
    include_fill: bool, whether to append a constant `fill` column
    fill: str, value used for the `fill` column when `include_fill` is True

    Returns the new dataframe; `all_moved` itself is not modified.

    NOTE(review): the earlier code built a '000000' fill column but did not
    include it in the exported columns, so it stays opt-in here. Confirm
    whether the RIdeogram input is expected to carry that column.
    """
    rideogram_cols = ['n2_chrom', 'n2_start', 'n2_stop', 'cb_chrom', 'cb_start', 'cb_stop']

    # Copy so that adding the optional column never touches the caller's frame.
    rideogram_df = all_moved[rideogram_cols].copy()

    if include_fill:
        rideogram_df['fill'] = fill

    if to_save:
        rideogram_df.to_csv('rideogram_allmovedTEs.tsv', index=False, header=False, sep='\t')

    return rideogram_df

    
