In [44]:
from git import Repo
import pandas as pd
import numpy as np
import gzip
import os
from collections import defaultdict
from pybiomart import Server
import shutil

In [45]:
# Source repository holding the benchmark single-cell datasets.
git_url = "https://github.com/LuyiTian/sc_mixology.git"
# Local checkout location: <parent of the working directory>/data/empirical_data/raw/.
# Built with os.path.join so a separator always sits between the working directory
# and ".." (plain string concatenation produced "<cwd>../data/...", a different path).
# The trailing separator is kept so string concatenation onto these paths still works.
repo_dir = os.path.join(
    os.path.normpath(os.path.join(os.getcwd(), os.pardir, "data", "empirical_data", "raw")),
    "",
)
# Directory inside the checkout that holds the gzipped CSV files (ends with a separator).
raw_data_dir = os.path.join(repo_dir, "data", "csv", "")
# Cell lines kept for the downstream analysis.
relevant_cell_types = ["H2228", "H1975", "HCC827"]

In [5]:
# Data source: https://github.com/LuyiTian/sc_mixology
# Clone it into repo_dir, but skip the clone when a checkout is already there so
# the cell can be re-run without git failing on a non-empty target directory.
if not os.path.isdir(os.path.join(repo_dir, ".git")):
    Repo.clone_from(git_url, repo_dir)

<git.repo.base.Repo '/groups/itay_mayrose/halabikeren/unsupervised_learning_final_project/data/empirical_data/raw/.git'>

In [4]:
# Collect every entry of raw_data_dir whose name starts with "sc_".
# os.listdir returns names in arbitrary order, so they are sorted to make the
# loading order (and anything that depends on it) reproducible across runs.
relevant_paths = [
    os.path.join(raw_data_dir, file_name)
    for file_name in sorted(os.listdir(raw_data_dir))
    if file_name.startswith("sc_")
]

In [46]:
# Read every gzipped CSV into a nested mapping: data[technology][data_type].
# Both keys are parsed from the file name: the technology is the first
# "_"-separated token left after removing "sc_" from the first "."-separated
# field, and the data type is the second "."-separated field.
# NOTE(review): files that yield the same technology token and data type share a
# key, so whichever is read last replaces the earlier one -- confirm this is intended.
data = defaultdict(dict)
for path in relevant_paths:
    name_fields = os.path.basename(path).split(".")
    technology = name_fields[0].replace("sc_", "").split("_")[0]
    data_type = name_fields[1]
    with gzip.open(path, mode='rb') as handle:
        data[technology][data_type] = pd.read_csv(handle)

In [47]:
# Per technology, print the count matrix dimensions (rows reported as genes,
# columns as cells) and the percentage of entries that are exactly zero.
for technology in data:
    raw_counts = data[technology]['count']
    n_genes, n_cells = raw_counts.shape
    n_zero_entries = (raw_counts == 0).sum().sum()
    perc_dropouts = n_zero_entries / (n_genes * n_cells) * 100
    print(f"technology = {technology} has {n_cells:,} cells, {n_genes:,} genes, and {np.round(perc_dropouts,2)}% dropout rate")

technology = 10x has 3,918 cells, 11,786 genes, and 63.01% dropout rate
technology = celseq2 has 305 cells, 13,426 genes, and 64.27% dropout rate
technology = dropseq has 225 cells, 15,127 genes, and 62.07% dropout rate


In [51]:
# Filter every technology to the relevant cell types.
for technology in data:
    counts = data[technology]['count']
    metadata = data[technology]['metadata']
    # Use the shared relevant_cell_types constant instead of repeating the list here.
    is_relevant = metadata.cell_line_demuxlet.isin(relevant_cell_types)
    # Explicit copy: the filtered frame is independent of the original, so columns
    # can be added to it later without chained-assignment warnings.
    metadata = metadata.loc[is_relevant].copy()
    # Keep only the count columns named by the remaining metadata index labels.
    # No need to normalize, normalization will be done within scIGANs.
    counts = counts[metadata.index.tolist()]
    data[technology]['count'] = counts
    data[technology]['metadata'] = metadata

In [52]:
# Query the 'hsapiens_gene_ensembl' dataset of the Ensembl BioMart for the
# gene ID / gene name attribute pair; the result is used as an ID -> name table.
server = Server(host='http://www.ensembl.org')
ensembl_mart = server.marts['ENSEMBL_MART_ENSEMBL']
dataset = ensembl_mart.datasets['hsapiens_gene_ensembl']
requested_attributes = ['ensembl_gene_id', 'external_gene_name']
gene_id_to_name_map = dataset.query(attributes=requested_attributes)

In [53]:
# Build the gene ID -> gene name lookup once; it is the same for every technology,
# so there is no reason to rebuild it on each loop iteration.
gene_name_by_id = gene_id_to_name_map.set_index("Gene stable ID")["Gene name"].to_dict()
for technology in data:
    counts = data[technology]['count']
    # Relabel the rows through the lookup; labels without an entry are left as they are.
    counts = counts.rename(index=gene_name_by_id)
    # Drop rows whose label is missing after the relabeling.
    counts = counts.loc[counts.index.notna()]
    data[technology]['count'] = counts

In [54]:
# compute intersection genes
intersection_genes = set(data['10x']['count'].index) & set(data['celseq2']['count'].index) & set(data['dropseq']['count'].index) 
print(f"# intersection genes = {len(intersection_genes):,}")

# intersection genes = 9,305


In [80]:
# Restrict every technology to the shared genes and assemble the combined outputs:
#   full_counts       - count matrix with the cells of all technologies side by side
#   cell_type_labels  - cell_line_demuxlet value of every cell
#   technology_labels - technology of every cell
# All three stay None when `data` is empty.
technologies = list(data.keys())
count_blocks = []
cell_type_parts = []
technology_parts = []
for technology in technologies:
    counts = data[technology]['count']
    counts = counts.loc[counts.index.isin(intersection_genes)]
    data[technology]['count'] = counts
    metadata = data[technology]['metadata']
    assert "cell_line_demuxlet" in metadata.columns
    # One metadata row per count column.
    assert metadata.shape[0] == counts.shape[1]
    # assign() returns a new frame, so no column is written into a filtered slice;
    # the result is stored back so `data` carries the technology column as well.
    metadata = metadata.assign(technology=technology)
    data[technology]['metadata'] = metadata
    perc_dropouts = (counts == 0).sum().sum() / (counts.shape[0]*counts.shape[1])*100
    print(f"technology = {technology} has {counts.shape[1]:,} cells, {counts.shape[0]:,} genes, and {np.round(perc_dropouts,2)}% dropout rate")
    count_blocks.append(counts)
    cell_type_parts.append(metadata.cell_line_demuxlet)
    technology_parts.append(metadata.technology)

# Concatenate the per-technology labels once instead of growing them inside the loop.
cell_type_labels = pd.concat(cell_type_parts) if cell_type_parts else None
technology_labels = pd.concat(technology_parts) if technology_parts else None

# Join the count matrices on their index, keeping the row order of the first one.
full_counts = None
for count_block in count_blocks:
    full_counts = count_block if full_counts is None else full_counts.join(count_block)

technology = 10x has 1,786 cells, 9,305 genes, and 55.7% dropout rate
technology = celseq2 has 112 cells, 9,305 genes, and 53.53% dropout rate
technology = dropseq has 225 cells, 9,305 genes, and 49.83% dropout rate


In [64]:
# Dimensions of the merged count matrix and the percentage of its entries that are zero.
n_total_genes, n_total_cells = full_counts.shape
n_zero_entries = (full_counts == 0).sum().sum()
perc_dropouts = n_zero_entries / (n_total_genes * n_total_cells) * 100
print(f"# total genes = {n_total_genes:,}, # total cells = {n_total_cells:,}")
print(f"% united dropout = {np.round(perc_dropouts, 2)}%")

# total genes = 9,305, # total cells = 2,123
% united dropout = 54.96%


In [65]:
# Write the merged count matrix (index included) to disk.
full_counts_path = "../data/empirical_data/full_counts.csv"
full_counts.to_csv(full_counts_path)

In [88]:
# Order the cell type labels like the columns of full_counts, then write the
# values only (index=False leaves the index labels out of the file).
ordered_cell_type_labels = cell_type_labels.loc[full_counts.columns]
ordered_cell_type_labels.to_csv("../data/empirical_data/cell_type_labels.csv", index=False)

  """Entry point for launching an IPython kernel.


In [89]:
# Order the technology labels like the columns of full_counts, then write the
# values only (index=False leaves the index labels out of the file).
ordered_technology_labels = technology_labels.loc[full_counts.columns]
ordered_technology_labels.to_csv("../data/empirical_data/technology_labels.csv", index=False)

  """Entry point for launching an IPython kernel.


In [None]:
# Remove the cloned repository. Guarded so that re-running the cell after the
# directory is already gone does not raise.
if os.path.isdir(repo_dir):
    shutil.rmtree(repo_dir)