# Dataset download for Transmet

This script downloads the original datasets to be used with Transmet.

Currently, the only fully processed dataset is Norman.

The datasets are stored in h5ad format in the `processed` folder.


### Needed libraries

In [45]:
import os
import sys
import scanpy as sc
import pandas as pd
import gzip

sys.path.append("../gene_expression")
import utils

### Needed functions

In [46]:
def download_and_extract_tar(tar_link, dir_name):
    """
    Fetches a tar archive into `dir_name` and extracts it in the same directory.

    The archive is stored as `<dir_name>/data.tar`. The download and extraction
    are delegated to `utils.download_file` and `utils.extract_tar`.

    Args:
    - tar_link (str): URL of the tar archive.
    - dir_name (str): Target directory; created (with parents) if it does not exist.
    """
    # os.makedirs does not go through a shell, so directory names with spaces
    # or shell metacharacters are handled correctly, it works on platforms
    # without `mkdir -p`, and a failure raises instead of being ignored.
    os.makedirs(dir_name, exist_ok=True)

    tar_path = dir_name + "/data.tar"
    utils.download_file(tar_link, tar_path)
    utils.extract_tar(tar_path, dir_name)


def modify_features_file(genes_path):
    """
    Appends a 'Gene Expression' column to a gzipped features file, in place.

    Only the first line is inspected to decide whether the file was already
    modified: if it ends with 'Gene Expression', the file is left untouched.
    Otherwise every line is stripped of surrounding whitespace and gets a
    tab-separated 'Gene Expression' field appended. An empty file is left
    untouched.

    Args:
    - genes_path (str): Path to the gzip-compressed, tab-separated features file.
    """
    with gzip.open(genes_path, "rt") as f_in:
        lines = f_in.readlines()

    # An empty file has no first line to inspect and nothing to annotate.
    if not lines:
        print(f"{genes_path} is empty, no modification needed.")
        return

    if lines[0].strip().endswith("Gene Expression"):
        print(
            f"'Gene Expression' already present in {genes_path}, no modification needed."
        )
        return

    # Stage the rewrite next to the target: os.replace then never has to cross
    # a filesystem boundary, and the temp name is unique per features file.
    temp_file_path = f"{genes_path}.tmp"
    try:
        with gzip.open(temp_file_path, "wt") as f_out:
            f_out.writelines(
                line.strip() + "\tGene Expression\n" for line in lines
            )
        os.replace(temp_file_path, genes_path)
    finally:
        # After a successful replace the temp path no longer exists; this only
        # cleans up a partially written file when something failed.
        if os.path.exists(temp_file_path):
            os.remove(temp_file_path)

    print(f"Modified {genes_path} to add 'Gene Expression'.")


def filter_barcodes_and_add_condition(adata, barcode_file):
    """
    Restricts an AnnData object to the cells listed in a barcode CSV and labels each kept cell with its condition.

    The CSV is read with a comma separator and must provide the columns
    `cell_barcode` and `condition`. The input object is not modified; the
    result is a copy of the selected cells with a new `condition` column in `obs`.

    Args:
    - adata (AnnData): AnnData object containing the gene expression data.
    - barcode_file (str): Path to the CSV file mapping cell barcodes to conditions.

    Returns:
    - AnnData: Copy of `adata` limited to the listed barcodes, with `obs["condition"]` set.
    """
    mapping_table = pd.read_csv(barcode_file, sep=",")

    # Boolean mask over the cells: True where the cell's barcode is listed.
    keep_mask = adata.obs_names.isin(mapping_table["cell_barcode"].values)
    subset = adata[keep_mask].copy()

    # Barcode -> condition lookup, applied to the cells that survived the filter.
    condition_by_barcode = dict(
        zip(mapping_table["cell_barcode"], mapping_table["condition"])
    )
    subset.obs["condition"] = subset.obs_names.map(condition_by_barcode)

    n_cells = subset.shape[0]
    print(f"Filtered AnnData to {n_cells} cells based on barcodes.")
    return subset


def save_h5ad(adata, output_path):
    """
    Writes the AnnData object to disk through its `write_h5ad` method and prints the destination.

    Args:
    - adata (AnnData): Object to serialize.
    - output_path (str): Destination path of the `.h5ad` file.
    """
    adata.write_h5ad(output_path)
    confirmation = f"AnnData object saved to {output_path}."
    print(confirmation)

### Create data directories

All the data is stored in the `data` directory. There are two subdirectories: `raw` and `processed`. The `raw` directory stores the raw data files as they were uploaded to NCBI alongside each original paper. The `processed` directory stores, for each perturbed dataset, an `h5ad` file that contains the raw counts and the normalized data.

In [None]:
# Create the directory for the raw data.
# os.makedirs is used instead of shelling out to `mkdir -p`: it is portable,
# does not depend on a shell, and raises if the directory cannot be created.
raw_data_dir = "raw"
os.makedirs(raw_data_dir, exist_ok=True)

# Create the directory for the processed data
processed_data_dir = "processed"
os.makedirs(processed_data_dir, exist_ok=True)

# Norman

The original dataset from Norman et al., titled "Exploring genetic interaction manifolds constructed from rich single-cell phenotypes," is available at [GEO: GSE133344](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE133344).

In [48]:
# Norman directory
norman_dir = raw_data_dir + "/norman"
os.makedirs(norman_dir, exist_ok=True)

# Local file name -> download URL for each file of the Norman dataset.
# The local names all share the "raw_" prefix. Note that the file published
# as "raw_genes.tsv.gz" is stored locally as "raw_features.tsv.gz".
norman_files = {
    "raw_barcodes.tsv.gz": "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE133344&format=file&file=GSE133344%5Fraw%5Fbarcodes%2Etsv%2Egz",
    "raw_cell_identities.csv.gz": "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE133344&format=file&file=GSE133344%5Fraw%5Fcell%5Fidentities%2Ecsv%2Egz",
    "raw_features.tsv.gz": "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE133344&format=file&file=GSE133344%5Fraw%5Fgenes%2Etsv%2Egz",
    "raw_matrix.mtx.gz": "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE133nnn/GSE133344/suppl/GSE133344%5Fraw%5Fmatrix%2Emtx%2Egz",
}

# Download only what is missing, so re-running the cell does not fetch the
# files again.
for norman_file_name, norman_url in norman_files.items():
    norman_target = norman_dir + "/" + norman_file_name
    if not os.path.exists(norman_target):
        utils.download_file(norman_url, norman_target)

The features.tsv.gz file needs to have a third column with the value "Gene Expression"

In [None]:
# Rewrites raw_features.tsv.gz in place; the call leaves the file untouched
# when its first line already ends with 'Gene Expression'.
modify_features_file(norman_dir + "/raw_features.tsv.gz")

Define the prefix shared by the files of the Norman dataset. It is passed to scanpy's `sc.read_10x_mtx` so that it can locate the files.

In [13]:
# Common leading part of the downloaded file names (raw_barcodes.tsv.gz,
# raw_features.tsv.gz, raw_matrix.mtx.gz); passed as `prefix` to sc.read_10x_mtx.
norman_prefix = "raw_"

Load the Norman data into an AnnData object. This may take a few minutes.

In [None]:
# Read the matrix/features/barcodes files from norman_dir. var_names="gene_ids"
# asks scanpy to index variables by gene id; cache=False disables scanpy's
# read cache for this call.
adata = sc.read_10x_mtx(
    norman_dir, var_names="gene_ids", cache=False, prefix=norman_prefix
)
print(adata)

Filter the AnnData object to keep only the cells present in the GEARS barcodes file.

In [None]:
# Keep only the cells listed in the barcode CSV and attach their condition
# label as adata.obs["condition"]. The CSV path is relative to the working
# directory and is not downloaded by this notebook.
adata = filter_barcodes_and_add_condition(adata, "gears_barcodes/norman_barcodes.csv")
print(adata)

Write the AnnData object to an `.h5ad` file.

In [None]:
# Build the path from processed_data_dir rather than repeating the literal
# "processed", so the output follows the directory created earlier.
save_h5ad(adata, processed_data_dir + "/norman.h5ad")

# Dixit

In [None]:
# Single tar archive bundling all files of the GEO series GSE90063.
dixit_tar_link = "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE90063&format=file"

# Dixit directory
dixit_dir = raw_data_dir + "/dixit"
# Download the archive to dixit_dir/data.tar and extract it into dixit_dir.
# NOTE(review): unlike the Norman cell there is no os.path.exists guard here,
# so re-running this cell presumably downloads the archive again - confirm
# against utils.download_file.
download_and_extract_tar(dixit_tar_link, dixit_dir)

There are several experiments in the Dixit dataset. However, GEARS only uses two of them, whose files have the following prefixes:

In [43]:
# File-name prefixes of the two Dixit experiments used by GEARS.
# NOTE(review): prefix_p7d has no trailing underscore while prefix_cc7d has
# one; the listed tfs_7 files are named both "<prefix>_cellnames..." and
# "<prefix>.mtx.txt.gz", so check which form the loader needs.
prefix_p7d = "GSM2396858_k562_tfs_7"
prefix_cc7d = "GSM2396861_k562_ccycle_"

In [None]:
# GSM2396858_k562_tfs_7_cellnames.csv.gz
# GSM2396858_k562_tfs_7_genenames.csv.gz
# GSM2396858_k562_tfs_7.mtx.txt.gz

In [None]:
# Load adata_p7d
# NOTE(review): sc.read_10x_mtx looks for "<prefix>matrix.mtx(.gz)",
# "<prefix>genes.tsv"/"<prefix>features.tsv.gz" and "<prefix>barcodes.tsv(.gz)".
# The Dixit files listed in this notebook are named "*_cellnames.csv.gz",
# "*_genenames.csv.gz" and "*.mtx.txt.gz", so this call is expected to fail
# with a missing-file error. A loader based on sc.read_mtx plus the two CSV
# name files is probably needed - TODO confirm and replace.
adata_p7d = sc.read_10x_mtx(
    dixit_dir, var_names="gene_ids", cache=False, prefix=prefix_p7d
)

# Adamson

In [None]:
raw_tar = "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE90546&format=file"

# Raw directory