Sharvani is trying to figure out what different cell types are present. There are a number of molecular markers that she knows separates germ cell and somatic cells. However, not all of these showed up in the tSNE clustering plots. To get a better understanding of this I am generating a counts matrix which has all of the genes (~17k) as columns and all of the cells (~500) as rows for ovary or testis. Numbers indicate the number of "reads" that aligned to that gene. This will allow Sharvani to look and make sure these genetic markers are present in the dataset.

In [1]:
# %load ../start.py
# Load useful extensions

# Activate the autoreload extension for easy reloading of external packages
%reload_ext autoreload
%autoreload 2

# Trun on the water mark
%reload_ext watermark
%watermark -u -d -g

# Load ipycache extension
%reload_ext ipycache
from ipycache import CacheMagics
CacheMagics.cachedir = '../cachedir'

# The usual suspects
import os
import numpy as np
import pandas as pd

# plotting
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
sns.set_context('poster')

# Turn off scientific notation
np.set_printoptions(precision=5, suppress=True)


last updated: 2017-09-28 
Git hash: a625c0d748cc18b2dde399c179a2d2b6161675d2


In [2]:
# imports
import csv
import scipy.io

In [18]:
# build matrices for ovary and testis data
ovary_dir = '../../output/ovary1/outs/filtered_gene_bc_matrices/dm6.16'
testis_dir = '../../output/testis1/outs/filtered_gene_bc_matrices/dm6.16'

symbols = pd.read_csv('/data/LCDB/lcdb-references/dmel/r6-16/fb_annotation/dmel_r6-16.fb_annotation', sep='\t')
symbols = symbols[['gene_symbol', 'primary_FBgn']].copy().set_index('primary_FBgn')

def get_matrix(curr_dir, symbols):
    # import data matrix in sparse format
    mat = scipy.io.mmread(os.path.join(curr_dir, 'matrix.mtx'))

    # import row names "gene"
    genes_path = os.path.join(curr_dir, "genes.tsv")
    gene_ids = [row[0] for row in csv.reader(open(genes_path), delimiter="\t")]
    assert mat.shape[0] == len(gene_ids)

    # import column names "cell barcode"
    barcodes_path = os.path.join(curr_dir, "barcodes.tsv")
    barcodes = [row[0] for row in csv.reader(open(barcodes_path), delimiter="\t")]
    assert mat.shape[1] == len(barcodes)

    # Make data frame
    df = pd.DataFrame(mat.todense(), index=gene_ids, columns=barcodes)
    assert df.shape == mat.shape

    # merge on symbols and return
    return df.join(symbols, how='left').set_index('gene_symbol', append=True)

ovary = get_matrix(ovary_dir, symbols)
testis = get_matrix(testis_dir, symbols)

In [19]:
# linkify gene names for excel
url = '=HYPERLINK("http://flybase.org/reports/{fbgn}.html", "{gene}")'
ovary.index = ovary.index.map(lambda x: url.format(fbgn=x[0], gene=x[1]))
testis.index = testis.index.map(lambda x: url.format(fbgn=x[0], gene=x[1]))

In [None]:
# Write out to an excel workbook for sharvani (this takes a while)
writer = pd.ExcelWriter('../../output/single_cell_matrix.xlsx')
ovary.T.to_excel(writer, sheet_name='ovary')
testis.T.to_excel(writer, sheet_name='testis')
writer.save()