# Dataset Download for Transmet

This script downloads the original datasets to be used with Transmet.

Three datasets are available: `Norman`, `Dixit`, and `Adamson`.

Each dataset is generated from the original data, with cells filtered so that only those selected by GEARS are included.

For each dataset, the output is a `.h5ad` file that contains the gene expression matrix for each sample, along with the condition (i.e., which gene(s) were perturbed).

The datasets are saved in `.h5ad` format in the `../datasets/[dataset_name]/processed` folder.


## Libraries and functions

### Load libraries

In [139]:
import os
import sys
import scanpy as sc
import pandas as pd
import gzip
import shutil
import anndata as ad

sys.path.append("../gene_expression")
import utils

### Define functions

In [156]:
def download_and_extract_tar(tar_link, dir_name):
    """
    Downloads a tar archive into a directory and extracts it there.

    The archive is saved as `<dir_name>/data.tar` and then unpacked into `dir_name`.
    Downloading and extraction are delegated to `utils.download_file` and
    `utils.extract_tar`.

    Args:
    - tar_link (str): URL of the tar archive to download.
    - dir_name (str): Destination directory; created (including parents) if missing.
    """
    # Create the directory from Python instead of shelling out to `mkdir -p`:
    # no shell quoting issues with unusual paths, works on every platform, and
    # a failure raises instead of being silently ignored.
    os.makedirs(dir_name, exist_ok=True)
    utils.download_file(tar_link, dir_name + "/data.tar")
    utils.extract_tar(dir_name + "/data.tar", dir_name)


def organize_adamson_experiments(root_folder):
    """
    Organizes the folder by creating a new directory for each experiment and moving the respective files
    into the new directories with updated names.

    A file belongs to an experiment when its name has the form
    `<GSM id>_<run id>_<rest>` (e.g., GSM2406681_10X010_barcodes.tsv.gz); the experiment
    identifier is the first two underscore-separated parts. Files are matched on the exact
    identifier, so an experiment never picks up files of another experiment whose identifier
    merely starts with the same characters. Entries that do not start with "GSM", are not
    regular files (e.g., experiment folders left by a previous run), or have no `<rest>` part
    are left untouched.

    Args:
    - root_folder (str): Path to the folder containing the experiment files.
    """
    # Substring that identifies each file type -> file name inside the experiment folder
    new_file_names = {
        "barcodes": "raw_barcodes.tsv.gz",
        "matrix": "raw_matrix.mtx.gz",
        "genes": "raw_features.tsv.gz",
        "cell_identities": "raw_cell_identities.csv.gz",
    }

    # Group the files by their exact experiment identifier in a single pass
    files_by_experiment = {}
    for file in sorted(os.listdir(root_folder)):
        if not file.startswith("GSM"):
            continue
        if not os.path.isfile(os.path.join(root_folder, file)):
            continue
        parts = file.split("_", 2)
        if len(parts) < 3:
            # No file-type part after the identifier: nothing to rename, and using
            # the whole name as a folder name would collide with the file itself.
            continue
        experiment = "_".join(parts[:2])
        files_by_experiment.setdefault(experiment, []).append(file)

    # For each experiment, create a folder and move respective files
    for experiment, experiment_files in files_by_experiment.items():
        experiment_dir = os.path.join(root_folder, experiment)
        os.makedirs(experiment_dir, exist_ok=True)

        # Move and rename the respective files
        for file_type, new_name in new_file_names.items():
            for file in experiment_files:
                if file_type in file:
                    old_path = os.path.join(root_folder, file)
                    new_path = os.path.join(experiment_dir, new_name)
                    shutil.move(old_path, new_path)
                    print(f"Moved and renamed {file} to {new_path}")

    print("Folder organization complete.")


# Function to load a single dataset and return an AnnData object
def load_dixit_dataset(experiment_dir_root):
    """
    Build an AnnData object from the three files of one Dixit experiment.

    The file paths are derived from a common prefix by appending fixed suffixes:
    `.mtx.txt.gz` (expression matrix), `_genenames.csv.gz` (gene names) and
    `_cellnames.csv.gz` (cell names). The matrix is transposed after reading, gene
    identifiers become `var_names` and cell names become `obs_names`.

    Parameters:
    experiment_dir_root (str): Path prefix shared by the experiment files,
        e.g. `<raw_dir>/GSM2396858_k562_tfs_7`.

    Returns:
    AnnData: The transposed expression matrix annotated with gene IDs and cell names.
    """
    matrix_path, genes_path, cells_path = (
        experiment_dir_root + suffix
        for suffix in (".mtx.txt.gz", "_genenames.csv.gz", "_cellnames.csv.gz")
    )

    # Read the MTX matrix and flip its axes
    # (presumably stored genes x cells, so that cells end up as observations)
    adata = sc.read_mtx(matrix_path).transpose()

    # Column "0" of the gene table holds the gene entries; only the part before
    # the first underscore is kept as the variable name
    gene_table = pd.read_csv(genes_path, index_col=0)
    adata.var_names = gene_table["0"].str.split("_").str[0]

    # Column "0" of the cell table holds the cell names
    cell_table = pd.read_csv(cells_path, index_col=0)
    adata.obs_names = cell_table["0"]

    return adata


def modify_features_file(genes_path):
    """
    Modifies the features file to append 'Gene Expression' if not already present.

    Every line of the gzip-compressed file gets an extra tab-separated column with the
    value "Gene Expression". The file is left untouched when it is empty or when its
    first line already ends with "Gene Expression". The rewrite goes through a temporary
    file placed next to the target, which then replaces the original.

    Args:
    - genes_path (str): Path to the gzip-compressed features/genes TSV file.
    """
    with gzip.open(genes_path, "rt") as f_in:
        lines = f_in.readlines()

    if not lines:
        # Nothing to annotate; also avoids an IndexError on lines[0]
        print(f"{genes_path} is empty, no modification needed.")
        return

    if lines[0].strip().endswith("Gene Expression"):
        print(
            f"'Gene Expression' already present in {genes_path}, no modification needed."
        )
        return

    # Keep the temporary file beside the target (not in the current working directory):
    # the final os.replace then stays on one filesystem and runs on different files
    # cannot clash over a shared temp name.
    temp_file_path = f"{genes_path}.tmp"
    try:
        with gzip.open(temp_file_path, "wt") as f_out:
            # Strip only the line terminator so empty trailing columns are preserved
            f_out.writelines(
                line.rstrip("\r\n") + "\tGene Expression\n" for line in lines
            )
        os.replace(temp_file_path, genes_path)
    finally:
        # Only still present if writing or replacing failed
        if os.path.exists(temp_file_path):
            os.remove(temp_file_path)
    print(f"Modified {genes_path} to add 'Gene Expression'.")


def filter_barcodes_and_add_condition(adata, barcode_file, verbose=False):
    """
    Keeps only the cells listed in a barcode file and labels them with their condition.

    The barcode file is a comma-separated table with the columns `cell_barcode` and
    `condition`. Cells of `adata` whose name appears in `cell_barcode` are kept, and the
    matching `condition` value is stored in the `condition` column of `.obs`.

    Args:
    - adata (AnnData): AnnData object containing the gene expression data.
    - barcode_file (str): Path to the CSV file with cell barcodes and their conditions.
    - verbose (bool): If True, also report the barcodes that are listed in the file but
      absent from `adata`.

    Returns:
    - AnnData: A filtered copy of `adata` with the added `condition` column.
    """
    mapping_df = pd.read_csv(barcode_file, sep=",")
    wanted_barcodes = mapping_df["cell_barcode"].values

    if verbose:
        # Barcodes listed in the file that have no matching cell in adata
        absent = set(wanted_barcodes) - set(adata.obs_names)
        if not absent:
            print("All barcodes from the file are present in the AnnData object.")
        else:
            print(f"Barcodes not found in AnnData: {len(absent)}")
            print(absent)

    # Boolean mask over the cells, then an independent copy of the selection
    keep_mask = adata.obs_names.isin(wanted_barcodes)
    adata_filtered = adata[keep_mask].copy()

    # Look up the condition of every remaining cell by its barcode
    condition_by_barcode = dict(
        zip(mapping_df["cell_barcode"], mapping_df["condition"])
    )
    adata_filtered.obs["condition"] = adata_filtered.obs_names.map(
        condition_by_barcode
    )

    n_cells = adata_filtered.shape[0]
    print(f"Filtered AnnData to {n_cells} cells based on barcodes.")
    return adata_filtered


def save_h5ad(adata, output_path):
    """
    Writes the AnnData object to disk with `write_h5ad` and reports the destination.

    Args:
    - adata (AnnData): Object to save.
    - output_path (str): Path of the `.h5ad` file to write.
    """
    adata.write_h5ad(output_path)
    message = f"AnnData object saved to {output_path}."
    print(message)

### Create data directories

All the data is stored in the `../datasets` directory. Each dataset has its own folder with two subdirectories: `raw` and `processed`. The `raw` directory stores the raw data files as published on NCBI GEO for the original paper. The `processed` directory stores, for each perturbation dataset, an `.h5ad` file that contains the expression matrix of the selected cells together with their condition.

In [None]:
# Root directory that holds one sub-folder per dataset
datasets_dir = "../datasets"
# os.makedirs instead of a shell `mkdir -p`: portable, and failures raise
# instead of being silently ignored
os.makedirs(datasets_dir, exist_ok=True)

# Norman

The original dataset from Norman et al., titled "Exploring genetic interaction manifolds constructed from rich single-cell phenotypes," is available at [GEO: GSE133344](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE133344).

First, we create the directories where the data will be stored.

In [None]:
# Norman directory
norman_dataset_dir = datasets_dir + "/norman"

# Norman raw data directory
norman_raw_dir = norman_dataset_dir + "/raw"

# Norman processed data directory
norman_processed_dir = norman_dataset_dir + "/processed"

# Create both leaf directories; os.makedirs also creates the missing parents
# (the Norman directory itself) and raises if creation fails
os.makedirs(norman_raw_dir, exist_ok=True)
os.makedirs(norman_processed_dir, exist_ok=True)

Download the Norman raw data

In [None]:
# Download the Norman dataset: (local file name, source URL) pairs, fetched in order.
# A file that already exists in the raw directory is not downloaded again.
norman_downloads = (
    (
        "raw_barcodes.tsv.gz",
        "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE133344&format=file&file=GSE133344%5Fraw%5Fbarcodes%2Etsv%2Egz",
    ),
    (
        "raw_cell_identities.csv.gz",
        "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE133344&format=file&file=GSE133344%5Fraw%5Fcell%5Fidentities%2Ecsv%2Egz",
    ),
    (
        "raw_features.tsv.gz",
        "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE133344&format=file&file=GSE133344%5Fraw%5Fgenes%2Etsv%2Egz",
    ),
    (
        "raw_matrix.mtx.gz",
        "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE133nnn/GSE133344/suppl/GSE133344%5Fraw%5Fmatrix%2Emtx%2Egz",
    ),
)
for norman_file_name, norman_file_url in norman_downloads:
    norman_file_path = norman_raw_dir + "/" + norman_file_name
    if os.path.exists(norman_file_path):
        continue
    utils.download_file(norman_file_url, norman_file_path)

The `raw_features.tsv.gz` file needs a third column with the value "Gene Expression" so that it can be loaded with `sc.read_10x_mtx`.

In [None]:
# Append the "Gene Expression" column to the Norman features file if it is missing
modify_features_file(f"{norman_raw_dir}/raw_features.tsv.gz")

Define a prefix for the files in the Norman dataset. This is needed to run sc.read_10x_mtx method from scanpy.

In [133]:
# File-name prefix shared by the downloaded Norman files (raw_barcodes, raw_features, raw_matrix)
norman_prefix = "raw_"

Load the Norman data into an AnnData object. This may take a few minutes.

In [None]:
# Read the prefixed matrix/features/barcodes files from the Norman raw directory,
# using the gene IDs as variable names and without writing a cache file
adata = sc.read_10x_mtx(
    norman_raw_dir,
    prefix=norman_prefix,
    var_names="gene_ids",
    cache=False,
)
print(adata)

Filter the AnnData object to keep only the cells present in the GEARS barcodes file.

In [None]:
# Keep only the cells listed in the GEARS barcode file and attach their condition
adata = filter_barcodes_and_add_condition(
    adata,
    barcode_file="../data/gears_barcodes/norman_barcodes.csv",
    verbose=True,
)
print(adata)

Output the adata into a .h5ad file

In [None]:
# Write the filtered Norman data to the processed directory
save_h5ad(adata, f"{norman_processed_dir}/norman.h5ad")

# Dixit

A. Dixit et al. “Perturb-Seq: Dissecting Molecular Circuits with Scalable Single-Cell RNA Profiling
of Pooled Genetic Screens”

First, we create the directories where the data will be stored.

In [None]:
# Dixit directory
dixit_dataset_dir = datasets_dir + "/dixit"

# Dixit raw data directory
dixit_raw_dir = dixit_dataset_dir + "/raw"

# Dixit processed data directory
dixit_processed_dir = dixit_dataset_dir + "/processed"

# Create both leaf directories; os.makedirs also creates the missing parents
# (the Dixit directory itself) and raises if creation fails
os.makedirs(dixit_raw_dir, exist_ok=True)
os.makedirs(dixit_processed_dir, exist_ok=True)

Download the data. In this case, the data is in a `.tar` format and should be extracted.

In [None]:
# GEO download link for the whole GSE90063 series archive
dixit_tar_link = "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE90063&format=file"
# Download the archive into the Dixit raw directory and extract it there
download_and_extract_tar(dixit_tar_link, dixit_raw_dir)

There are several experiments in the Dixit dataset; however, GEARS only uses two of them, whose files have the following prefixes:
1. GSM2396858_k562_tfs_7
2. GSM2396861_k562_ccycle

In [None]:
# Load the first experiment (TFs)
experiment1 = "GSM2396858_k562_tfs_7"
adata1 = load_dixit_dataset(dixit_raw_dir + "/" + experiment1)

# Load the second experiment (Cell Cycle)
experiment2 = "GSM2396861_k562_ccycle"
adata2 = load_dixit_dataset(dixit_raw_dir + "/" + experiment2)

# Concatenate the two AnnData objects along the cell axis (axis=0), keeping the
# union of the genes (join="outer").
# NOTE(review): index_unique=None leaves the cell names exactly as loaded and does
# not make them unique; a cell name present in both experiments would appear twice.
adata_combined = ad.concat([adata1, adata2], axis=0, join="outer", index_unique=None)

# Output the combined AnnData object
print(adata_combined)

Filter the AnnData object to keep only the cells present in the GEARS barcodes file.

In [None]:
# Keep only the cells listed in the GEARS barcode file and attach their condition.
# The barcode files live in ../data/gears_barcodes, the same location the Norman and
# Adamson cells read from; the previous path lacked the "../data/" part.
adata = filter_barcodes_and_add_condition(
    adata_combined, "../data/gears_barcodes/dixit_barcodes.csv", verbose=True
)
print(adata)

Note that 89 barcodes which are used in GEARS data are not found in our repository.

Output the adata into a .h5ad file

In [None]:
# Write the filtered Dixit data to the processed directory
save_h5ad(adata, f"{dixit_processed_dir}/dixit.h5ad")

# Adamson

B. Adamson et al. “A Multiplexed Single-Cell CRISPR Screening Platform Enables Systematic
Dissection of the Unfolded Protein Response”

First, we create the directories where the data will be stored.

In [None]:
# Adamson directory
adamson_dataset_dir = datasets_dir + "/adamson"

# Adamson raw data directory
adamson_raw_dir = adamson_dataset_dir + "/raw"

# Adamson processed data directory
adamson_processed_dir = adamson_dataset_dir + "/processed"

# Create both leaf directories; os.makedirs also creates the missing parents
# (the Adamson directory itself) and raises if creation fails
os.makedirs(adamson_raw_dir, exist_ok=True)
os.makedirs(adamson_processed_dir, exist_ok=True)

Download the data in .tar format and extract it

In [None]:
# GEO download link for the whole GSE90546 series archive
adamson_tar_link = "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE90546&format=file"
# Download the archive into the Adamson raw directory and extract it there
download_and_extract_tar(adamson_tar_link, adamson_raw_dir)

The Adamson 10x Genomics output files need to be organized into one folder per experiment so that they can be loaded with the `sc.read_10x_mtx` function.

In [None]:
# Organize the Adamson directory: move the files of each experiment into their own
# sub-folder of the raw directory and rename them with the "raw_" prefix
organize_adamson_experiments(adamson_raw_dir)

Some of the experiments need an extra "Gene Expression" column in their features file for the `sc.read_10x_mtx` function to work.

In [None]:
# Patch the features file of every experiment folder found in adamson_raw_dir;
# entries that are not directories (e.g. the downloaded archive) are skipped
for dir_name in os.listdir(adamson_raw_dir):
    dir_path = os.path.join(adamson_raw_dir, dir_name)
    if not os.path.isdir(dir_path):
        continue
    modify_features_file(f"{dir_path}/raw_features.tsv.gz")

Define a prefix for adamson experiment data

In [167]:
# File-name prefix given to the Adamson files by organize_adamson_experiments (raw_barcodes, raw_features, raw_matrix)
adamson_prefix = "raw_"

Load the AnnData objects of the experiments and combine them into one. This may take a few minutes.

In [None]:
# Read the three Adamson experiments, one folder each, using the gene IDs as
# variable names and without writing a cache file
adata1 = sc.read_10x_mtx(
    adamson_raw_dir + "/GSM2406675_10X001",
    var_names="gene_ids",
    cache=False,
    prefix=adamson_prefix,
)

adata2 = sc.read_10x_mtx(
    adamson_raw_dir + "/GSM2406677_10X005",
    var_names="gene_ids",
    cache=False,
    prefix=adamson_prefix,
)

adata3 = sc.read_10x_mtx(
    adamson_raw_dir + "/GSM2406681_10X010",
    var_names="gene_ids",
    cache=False,
    prefix=adamson_prefix,
)

# Concatenate the datasets along the cell axis (axis=0), keeping the union of the
# genes (join="outer").
# NOTE(review): index_unique=None leaves the obs_names exactly as read and does not
# make them unique; a barcode present in several experiments would appear repeatedly.
adata_combined = ad.concat(
    [adata1, adata2, adata3], axis=0, join="outer", index_unique=None
)  # Passing a delimiter such as "-" as index_unique would make obs_names unique

# Output the combined AnnData object
print(adata_combined)

Filter the AnnData object to keep only the cells present in the GEARS barcodes file.

In [None]:
# Keep only the cells listed in the GEARS barcode file and attach their condition
adata = filter_barcodes_and_add_condition(
    adata_combined,
    barcode_file="../data/gears_barcodes/adamson_barcodes.csv",
    verbose=True,
)
print(adata)

Note that 172 barcodes which are used in GEARS data are not found in our repository.

Output the adata into a .h5ad file

In [None]:
# Write the filtered Adamson data to the processed directory
save_h5ad(adata, f"{adamson_processed_dir}/adamson.h5ad")

# Replogle et al. 2022

In [180]:
# Figshare download link for the Replogle data (presumably a raw .h5ad file, going by the variable name)
replogle_raw_h5ad_link = "https://plus.figshare.com/ndownloader/files/35775606"

# TODO: Download the Replogle dataset (the link above is not used yet)