# Parse New Gene Table

**from:** Maria D. Vibranovski

Here attached is a list from Yong Zhang group based on our paper from 2010. But this is a still not published updated version that he shared with me but you can use.

If you need details about the columns, please look at https://genome.cshlp.org/content/suppl/2010/08/27/gr.107334.110.DC1/SupplementalMaterial.pdf  table 2a.

But mainly, what you need to select is the child genes with:

gene_type = D or R or DL or RL
m_type= M
note that contains "chrX-"

D and R stands for DNA-based Duplication and RNA-based duplication
L means that the assignment of the parental genes is less reliable.
M indicates that is between chromosome movement.

Hope it helps. If you need I can parse for you. please, do not hesitate to ask. But I thought you would prefer a complete list where you can look at subsets.

cheers

Maria


In [1]:
import os
import sys
from pathlib import Path
import re

from IPython.display import display, HTML, Markdown
import numpy as np
import pandas as pd
from scipy.stats import fisher_exact, chi2_contingency
from scipy.stats.contingency import margins
import statsmodels.formula.api as smf

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Project level imports
sys.path.insert(0, '../lib')
from larval_gonad.notebook import Nb
from larval_gonad.plotting import make_figs
from larval_gonad.config import memory

In [2]:
# Setup notebook: build the shared config object. Later cells read
# `assembly`, `tag`, `seurat` and `short_cluster_annot` from it.
# NOTE(review): presumably this call also prints the "last updated" / git-hash
# banner shown below -- confirm in larval_gonad.notebook.
nbconfig = Nb.setup_notebook(seurat_dir='../output/scrnaseq-wf/scrnaseq_combine_force')

last updated: 2019-03-14 
Git hash: 657094fd753a190e845ba8d3b2f5dbc8874cf279


In [3]:
def adjusted_residuals(observed, expected):
    """Adjusted residuals for a two-way contingency table.

    Each cell's deviation ``observed - expected`` is divided by the square
    root of ``csum * rsum * (n - rsum) * (n - csum) / n**3``, where ``rsum``
    and ``csum`` are the row and column marginal sums of ``observed`` and
    ``n`` is its grand total.

    Parameters
    ----------
    observed : 2-D table of counts
        Must support ``.sum().sum()`` and be accepted by
        ``scipy.stats.contingency.margins``.
    expected : array-like
        Expected counts, broadcastable against ``observed``.

    Returns
    -------
    The per-cell adjusted residuals, same shape as ``observed``.
    """
    n = observed.sum().sum()
    rsum, csum = margins(observed)
    # Per-cell scaling term; rsum is a column vector and csum a row vector,
    # so the product broadcasts to the full table shape.
    v = csum * rsum * (n - rsum) * (n - csum) / n**3
    return (observed - expected) / np.sqrt(v)

## Import data from Maria

## FBgn sanitizer

I don't know which annotation release these FBgns come from, so I need to sanitize them against my current annotation (mapping any secondary FBgn to its current primary FBgn).

In [4]:
assembly = nbconfig.assembly
tag = nbconfig.tag
pth = Path(os.environ['REFERENCES_DIR'], f'{assembly}/{tag}/fb_annotation/{assembly}_{tag}.fb_annotation')

# Lookup table: every FBgn listed in the annotation (primary or secondary)
# points at the primary FBgn of its row.
mapper = {}

for row in pd.read_csv(pth, sep='\t').itertuples():
    current = row.primary_FBgn
    mapper[current] = current

    try:
        retired_ids = row.secondary_FBgn.split(',')
    except AttributeError:
        # Nothing to split for this row (e.g. an empty cell is read as a
        # float NaN, which has no .split) -- only the primary ID is mapped.
        continue

    for old_id in retired_ids:
        mapper[old_id] = current

In [5]:
# Chromosome arms treated as autosomal when classifying gene movement.
autosomes = [
    'chr2L',
    'chr2R',
    'chr3L',
    'chr3R',
]

In [8]:
# Between-chromosome duplications (m_type "M") of DNA- or RNA-based origin,
# re-keyed to the current annotation and flagged by direction of movement.
# The gene_type values in this spreadsheet are spelled "Dl"/"Rl" (lower-case l).
movement = (
    pd.read_excel('../data/external/maria/dm6_ver78_genetype.new.xlsx')
    .query('gene_type == ["D", "R", "Dl", "Rl"] and m_type == "M"')
    .assign(
        # Assumes `note` starts "<child chrom>-<parent chrom>" followed by ':' or ';'
        child_chrom=lambda df: df.note.str.extract('(chr.*?)-'),
        parent_chrom=lambda df: df.note.str.extract('-(chr.*?)[:;]'),
        # IDs absent from `mapper` become NaN and are removed by dropna() below
        FBgn=lambda df: df.child_id.map(mapper),
        parent_FBgn=lambda df: df.parent_id.map(mapper),
    )
    .drop(columns=['child_id', 'parent_id', 'note', 'm_type'])
    .dropna()
    .set_index('FBgn')
    .assign(
        moved_x_to_a=lambda df: (df.parent_chrom == 'chrX') & df.child_chrom.isin(autosomes),
        moved_a_to_a=lambda df: df.parent_chrom.isin(autosomes) & df.child_chrom.isin(autosomes),
        moved_a_to_x=lambda df: df.parent_chrom.isin(autosomes) & (df.child_chrom == 'chrX'),
    )
    # Keep only rows that fall in one of the three movement classes
    .query('moved_x_to_a | moved_a_to_a | moved_a_to_x')
)

movement.head()

Unnamed: 0_level_0,gene_type,child_chrom,parent_chrom,parent_FBgn,moved_x_to_a,moved_a_to_a,moved_a_to_x
FBgn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
FBgn0000246,Dl,chr3R,chrX,FBgn0002873,True,False,False
FBgn0002562,D,chrX,chr2L,FBgn0002563,False,False,True
FBgn0003060,D,chr3R,chr2L,FBgn0032282,False,True,False
FBgn0003086,Dl,chrX,chr3L,FBgn0036393,False,False,True
FBgn0003357,R,chr3R,chr3L,FBgn0250815,False,True,False


In [9]:
# One entry per FBgn: the short names of every annotated cluster the gene is a
# biomarker for, joined with '|'. Clusters annotated 'UNK' are dropped first.
# The series name keeps its existing spelling because `out_order` refers to it.
biomarkers = (
    nbconfig.seurat.get_biomarkers('res.0.6')
    .cluster.map(nbconfig.short_cluster_annot)
    .loc[lambda annot: annot != 'UNK']
    .reset_index()
    .groupby('FBgn')
    .cluster.apply(lambda names: '|'.join(names))
    .rename('biomakrer_cluster')
)

In [10]:
# Direction of the gonia-vs-cyte contrast per gene, taken from the sign of
# avg_logFC: positive -> 'gonia', negative -> 'cyte'.
# A fold change of exactly 0 (or a missing one) is left as NaN rather than
# being assigned a direction. (Taking idxmax(axis=1) over boolean gonia/cyte
# columns would label such all-False rows with the first column, 'gonia'.)
germ_comp = (
    pd.read_csv('../output/scrnaseq-wf/germcell_deg/gonia_vs_cytes.tsv', sep='\t')
    .assign(FBgn = lambda df: df.primary_FBgn)
    .set_index('FBgn')
    .avg_logFC
    .pipe(np.sign)
    # sign() yields -1.0 / 0.0 / 1.0 / NaN; values missing from the dict map to NaN
    .map({1.0: 'gonia', -1.0: 'cyte'})
    .rename('bias_gonia_vs_cyte')
)

In [12]:
# Peek at the per-gene, '|'-joined cluster strings
biomarkers.head()

FBgn
FBgn0000014       PC|TE
FBgn0000015          TE
FBgn0000017    TE|PC|LC
FBgn0000024          PC
FBgn0000028         E1º
Name: biomakrer_cluster, dtype: object

In [23]:
# The same gonia/cyte call is attached twice: once for the child gene (matched
# on the index) and once for its parent (matched on the parent_FBgn column).
child_bias = germ_comp.rename('bias_gonia_vs_cyte_child')
parent_bias = germ_comp.rename('bias_gonia_vs_cyte_parent')

df = (
    movement
    .join(biomarkers, how='left')
    .join(child_bias, how='left')
    .join(parent_bias, on='parent_FBgn', how='left')
)

In [31]:
# Show the merged table: movement calls plus the biomarker and gonia/cyte bias columns
df

Unnamed: 0_level_0,gene_type,child_chrom,parent_chrom,parent_FBgn,moved_x_to_a,moved_a_to_a,moved_a_to_x,biomakrer_cluster,bias_gonia_vs_cyte_child,bias_gonia_vs_cyte_parent
FBgn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
FBgn0000246,Dl,chr3R,chrX,FBgn0002873,True,False,False,,,
FBgn0002562,D,chrX,chr2L,FBgn0002563,False,False,True,,,
FBgn0003060,D,chr3R,chr2L,FBgn0032282,False,True,False,,,
FBgn0003086,Dl,chrX,chr3L,FBgn0036393,False,False,True,,,
FBgn0003357,R,chr3R,chr3L,FBgn0250815,False,True,False,,,
FBgn0004414,Rl,chr3L,chr2L,FBgn0283658,False,True,False,,,
FBgn0010549,Dl,chr2R,chr3R,FBgn0039645,False,True,False,,,
FBgn0011559,Dl,chr2L,chr3L,FBgn0052133,False,True,False,,,
FBgn0015008,Rl,chr3R,chrX,FBgn0025633,True,False,False,E1º|L1º|M1º,cyte,
FBgn0015025,Rl,chr3L,chr3R,FBgn0039860,False,True,False,,,


In [37]:
# Column order of the exported table. The index (the child gene's FBgn) is not
# listed; reset_index() below turns it into the first column.
# 'biomakrer_cluster' keeps the spelling used when the biomarker series is named.
out_order = [
    'child_chrom',
    'parent_chrom',
    'parent_FBgn',
    'gene_type',
    'moved_x_to_a',
    'moved_a_to_a',
    'moved_a_to_x',
    'biomakrer_cluster',
    'bias_gonia_vs_cyte_child',
    'bias_gonia_vs_cyte_parent'
]

(
    df
    .reindex(columns=out_order)
    .reset_index()
    .rename(columns={'FBgn': 'child_FBgn'})
    # Missing values are written as the text 'nan'; the positional row index is
    # not written (index is documented as a bool, so pass False explicitly).
    .to_csv('../output/notebook/2019-03-14_movement_data.csv', na_rep='nan', index=False)
)

In [38]:
# List the exported column names, one per line
print(*out_order, sep='\n')

child_chrom
parent_chrom
parent_FBgn
gene_type
moved_x_to_a
moved_a_to_a
moved_a_to_x
biomakrer_cluster
bias_gonia_vs_cyte_child
bias_gonia_vs_cyte_parent
