# X To A Gene Lists

In [32]:
import os
import sys
from pathlib import Path

import numpy as np
import pandas as pd
from IPython.display import HTML, Markdown, display

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Project level imports
sys.path.insert(0, '../lib')
from larval_gonad.notebook import Nb
from larval_gonad.x_to_a import CHROMS_CHR, AUTOSOMES_CHR

# Setup notebook
# `Nb.setup_notebook` is a project helper; the returned object is used below only for
# its `cache` attribute (presumably a cache directory path -- confirm in larval_gonad.notebook).
nbconfig = Nb.setup_notebook('04.01_X_to_A_gene_lists')

# Turn on cache
# NOTE(review): `memory` is never used in the cells visible here.
# NOTE(review): newer joblib releases replace the `cachedir` argument with `location`;
# confirm against the pinned joblib version before upgrading.
from joblib import Memory
memory = Memory(cachedir=nbconfig.cache, verbose=0)

last updated: 2018-01-31 
Git hash: f95a97ffaf07b81fdbccd6c0e49ed8ad3f0e208e


In [3]:
# Directory the Seurat exports are read from and the gene-list TSVs below are written to.
DATA_DIR = Path('../output/testis_scRNAseq_pilot')
# Root of the reference files (annotation table and GTF); raises KeyError when the
# REFERENCES_DIR environment variable is not set.
REF = Path(os.environ['REFERENCES_DIR'])

In [4]:
# Create fbgn2symbol and symbol2fbgn map
# The annotation table is indexed by its second column, which supplies the keys of
# fbgn2symbol (presumably the primary FBgn -- confirm against the file header).
annot = pd.read_csv(Path(REF, 'dmel/r6-16/fb_annotation/dmel_r6-16.fb_annotation'),
                    sep='\t', index_col=1)

fbgn2symbol = annot['gene_symbol'].to_dict()
symbol2fbgn = {symbol: fbgn for fbgn, symbol in fbgn2symbol.items()}

# Create fbgn2chrom
# Every GTF line is split on whitespace: field 0 is the chromosome, field 2 the feature
# type and field 9 the quoted value of the first attribute (assumed to be gene_id --
# TODO confirm for this GTF).
genes = []
with Path(REF, 'dmel/r6-16/gtf/dmel_r6-16.gtf').open() as fh:
    for row in fh:
        # Header/comment lines carry no feature record.
        if row.startswith('#'):
            continue

        rows = row.strip().split()

        # Blank or truncated lines used to raise IndexError on rows[2] / rows[9];
        # skip them instead.
        if len(rows) < 10:
            continue

        if rows[2] == 'gene':
            genes.append((rows[0], rows[9].replace('"', '').replace(';', '')))

# Plain FBgn -> chromosome dict; if an FBgn appears more than once the last row wins.
fbgn2chrom = {fbgn: chrom for chrom, fbgn in genes}

In [23]:
# Import data from seurat
# Per-cell table; its `ident` column is used further down to pick cells by cluster.
clusters = pd.read_csv(Path(DATA_DIR, 'clusters.tsv'), sep='\t')
clusters.index.name = 'cellID'

# Cluster 3 vs cluster 4 differential expression, indexed by (FBgn, gene).
c3_v_c4 = pd.read_csv(
    Path(DATA_DIR, '2018_01_25_testis_c3_c4_differential_expression.tsv'),
    sep='\t',
    index_col=[0, 1],
)
c3_v_c4.index.names = ['FBgn', 'gene']
# Look up the chromosome of every row from its FBgn, then append it as a third index level.
c3_v_c4['chrom'] = [fbgn2chrom[fbgn] for fbgn, _ in c3_v_c4.index]
c3_v_c4 = c3_v_c4.set_index('chrom', append=True)

# Normalized read counts; the row index is labelled FBgn.
norm = pd.read_csv(Path(DATA_DIR, 'normalized_read_counts.tsv'), sep='\t')
norm.index.name = 'FBgn'

## Genes with large differential expression between clusters 3 and 4

In [45]:
# Keep rows with adjusted p-value <= 0.05 and |avg_logFC| >= 2, ordered by avg_logFC.
is_significant = c3_v_c4['p_val_adj'] <= 0.05
is_large_change = c3_v_c4['avg_logFC'].abs() >= 2
big_diff = c3_v_c4.loc[is_significant & is_large_change].sort_values(by='avg_logFC')
big_diff.to_csv(DATA_DIR / '04.01-X_to_A_gene_list_c3_c4_large_diff.tsv', sep='\t')

## Genes not differentially expressed between cluster 3 and cluster 4

Next we are interested in which genes are not different between clusters 3 and 4 (germline). Here I look for genes with an adjusted p-value > 0.01 and require that the gene is expressed in at least 80% of the cells in both cluster 3 and cluster 4.

In [37]:
# Keep rows with adjusted p-value > 0.01 whose pct.1 and pct.2 are both >= 0.8,
# ordered by avg_logFC.
not_significant = c3_v_c4['p_val_adj'] > 0.01
expressed_in_both = (c3_v_c4['pct.1'] >= .8) & (c3_v_c4['pct.2'] >= .8)
noDiff = c3_v_c4.loc[not_significant & expressed_in_both].sort_values(by=['avg_logFC'])
noDiff.to_csv(DATA_DIR / '04.01-X_to_A_gene_list_c3_c4_not_diff.tsv', sep='\t')

## Look for genes on in cluster 3 and off in cluster 4

First I grab the cell IDs for cells in cluster 3 or cluster 4. For each gene I sum the normalized coverage across all cells within a cluster. Finally I look for genes that have greater than 0 read counts in cluster 3, but 0 read counts in cluster 4. In other words, these are genes that are on in cluster 3 but have no expression in cluster 4. If you sort by cluster 3 expression you can estimate the magnitude of expression.

In [6]:
# Prep for comparing 3 vs 4

# Cell ids belonging to each cluster
c3 = clusters.index[clusters['ident'] == 3].tolist()
c4 = clusters.index[clusters['ident'] == 4].tolist()

# Per-gene totals of the normalized values over the cells of each cluster
dat3 = norm[c3].sum(axis=1)
dat4 = norm[c4].sum(axis=1)

# One frame holding both totals, with gene symbol and chromosome appended to the index
ddat = pd.DataFrame({'c3': dat3, 'c4': dat4})
ddat['gene'] = [fbgn2symbol[fbgn] for fbgn in ddat.index]
ddat['chrom'] = [fbgn2chrom[fbgn] for fbgn in ddat.index]
ddat = ddat.set_index(['gene', 'chrom'], append=True)

In [35]:
# Genes whose summed value is positive in cluster 3 and exactly zero in cluster 4.
expressed_in_c3 = ddat['c3'] > 0
silent_in_c4 = ddat['c4'] == 0
on3_off4 = ddat.loc[expressed_in_c3 & silent_in_c4]
on3_off4.to_csv(DATA_DIR / '04.01-X_to_A_gene_list_on3_off4.tsv', sep='\t')

## Look for genes off in cluster 3 and on in cluster 4

Using the above dataset, I look for genes that have 0 read counts in cluster 3, but greater than 0 read counts in cluster 4. In other words, these are genes that are off in cluster 3 and on in cluster 4. If you sort by cluster 4 expression you can estimate the magnitude of expression.

In [36]:
# Genes whose summed value is exactly zero in cluster 3 and positive in cluster 4.
silent_in_c3 = ddat['c3'] == 0
expressed_in_c4 = ddat['c4'] > 0
off3_on4 = ddat.loc[silent_in_c3 & expressed_in_c4]
off3_on4.to_csv(DATA_DIR / '04.01-X_to_A_gene_list_off3_on4.tsv', sep='\t')