# Curate GWS CRISPR IFNg raw data

In [None]:
import lamindb as ln
import bionty as bt

# Start lamindb tracking for this notebook (identified by the given uid) and
# associate the run with the "Schmidt22" project.
ln.track("M0BgdFXT7Az7", project="Schmidt22")

This raw data was uploaded through the UI.

In [None]:
# Query artifacts in the `laminlabs/lamindata` instance rather than the current one.
lamindata_artifacts = ln.Artifact.using("laminlabs/lamindata")
# Fetch the uploaded raw-data artifact by its uid, then load it into memory.
raw_artifact = lamindata_artifacts.get("MDG7BbeFVPvEyyUb0000")
df = raw_artifact.load()
# Trailing expression so the notebook displays the loaded table.
df

Create dedicated features for this project.

In [None]:
# Feature type that groups every feature created for this study.
schmidt22_features = ln.Feature(
    name="Schmidt22",
    is_type=True,
    description="Features from Schmidt et al. 2022, Genome-wide CRISPRa screen with IFN-gamma readout in melanoma cells",
).save()

# Gene targeted in the screen, typed against gene symbols in the bionty registry.
target_gene_symbol = ln.Feature(
    name="target_gene_symbol",
    dtype=bt.Gene.symbol,
    type=schmidt22_features,
    description="Target gene of CRISPRa experiment",
).save()

# The two p-value features differ only in name and description, so they are
# created from a single table (negative first, then positive).
crispr_ifng_p_value_neg, crispr_ifng_p_value_pos = (
    ln.Feature(
        name=feature_name,
        dtype=float,
        type=schmidt22_features,
        description=feature_description,
    ).save()
    for feature_name, feature_description in (
        ("crispr_ifng_p_value_neg", "Negative CRISPR IFN-gamma p-value"),
        ("crispr_ifng_p_value_pos", "Positive CRISPR IFN-gamma p-value"),
    )
)

Then create a schema type for the project and a schema that groups these features.

In [None]:
# Schema type that groups every schema belonging to this study.
schmidt22_schemas = ln.Schema(
    name="Schmidt22",
    is_type=True,
    description="Schemas from Schmidt et al. 2022, Genome-wide CRISPRa screen with IFN-gamma readout in melanoma cells",
)
schmidt22_schemas.save()

# Columns the curated dataframe is expected to carry.
schema_features = [
    target_gene_symbol,
    crispr_ifng_p_value_neg,
    crispr_ifng_p_value_pos,
]
schema = ln.Schema(
    name="GWS_CRISPRa_IFN-gamma_readout",
    features=schema_features,
    type=schmidt22_schemas,
    description="Genome-wide CRISPRa screen with IFN-gamma readout in melanoma cells",
)
schema.save()
# Print a summary of the saved schema.
schema.describe()

Rename the columns so that they match the feature names in the schema.

In [None]:
# Map raw column headers to the feature names registered for this project.
column_renames = {
    "gene_target": "target_gene_symbol",
    "neg|p-value": "crispr_ifng_p_value_neg",
    "pos|p-value": "crispr_ifng_p_value_pos",
}
# Rename in place so `df` keeps referring to the same object.
df.rename(columns=column_renames, inplace=True)

Standardize gene symbols and update gene registry.

In [None]:
# Standardize gene symbols against the public ontology and write them back.
standardized_symbols = bt.Gene.standardize(df["target_gene_symbol"])
df["target_gene_symbol"] = standardized_symbols
# Boolean mask marking which symbols validate against the gene registry.
validated = bt.Gene.validate(df["target_gene_symbol"])
# Collect the distinct symbols that failed validation ...
unvalidated_symbols = df["target_gene_symbol"][~validated].unique()
# ... and add them to the gene registry as new records.
bt.Gene.from_values(unvalidated_symbols, create=True).save()

Convert the gene symbol column to a categorical dtype.

In [None]:
# Store the gene symbols as a pandas categorical column.
df["target_gene_symbol"] = df["target_gene_symbol"].astype("category")

Inspect result.

In [None]:
# Display the first rows of the curated dataframe as a quick sanity check.
df.head()

Save the artifact with validation through `schema`.

In [None]:
# Dataset-level annotations attached to the artifact.
artifact_features = {
    "biosample": "Schmidt22-S001",
    "experiment": "Schmidt22-EXP001",
    "assays_efo": ["gRNA-seq"],
    "readouts_efo": ["interferon gamma"],
    "original_publication": "Schmidt22",
}
# Build the artifact from the curated dataframe; passing `schema` validates
# the dataframe against it.
artifact = ln.Artifact.from_dataframe(
    df,
    key="schmidt22/gws-crispr-ifng-readout.parquet",
    description="Genome-wide CRISPRa screen with IFN-gamma readout in melanoma cells from Schmidt et al. (2022)",
    schema=schema,
    features=artifact_features,
)
# Persist the artifact and print its summary.
artifact.save().describe()

In [None]:
# Mark the tracked notebook run started by `ln.track` as finished.
ln.finish()