# CRISPR-studio Quickstart (Colab-friendly)

This notebook runs the demo pipeline on bundled sample data with MAGeCK disabled and annotations skipped to keep runtime under two minutes. It is designed to work in Google Colab or locally without fetching any external datasets.

**Environment setup**
- Install the package with report extras: `pip install "crispr_screen_expert[reports]"`
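- If running in Colab, ensure the repository is available so `sample_data/` can be read (e.g., `git clone https://github.com/jameshyojaelee/CRISPR-studio`), and set `DATA_DIR` in the "Locate bundled sample data" cell to the cloned `sample_data/` directory if it is not found.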

In [None]:
import sys
from pathlib import Path

# Treat the kernel as Colab when the `google.colab` module is already loaded;
# nothing is imported to make this check.
IN_COLAB = "google.colab" in sys.modules
if IN_COLAB:
    print("Detected Colab; installing crispr_screen_expert with report extras...")
    import subprocess

    # Run pip through the kernel's own interpreter so the package is installed
    # into the environment this notebook executes in. check_call raises
    # CalledProcessError if the install fails, stopping the notebook early.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "crispr_screen_expert[reports]"])
else:
    print("Using existing environment; ensure crispr_screen_expert[reports] is available.")


In [None]:
import time
from datetime import datetime

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

from crispr_screen_expert.data_loader import (
    load_counts,
    load_library,
    load_metadata,
    match_counts_to_library,
    validate_metadata_against_counts,
)
from crispr_screen_expert.pipeline import DataPaths, PipelineSettings, run_analysis


## Locate bundled sample data

Update `DATA_DIR` if you want to point at your own files instead.

In [None]:
# Relative paths are resolved against the kernel's working directory, which
# differs between setups. Probe the likely locations in order and keep the
# first directory that exists:
#   1. ../sample_data             - the original notebook-relative location
#   2. sample_data                - kernel started in the repository root
#   3. CRISPR-studio/sample_data  - kernel started next to a fresh `git clone`
#                                   (the Colab workflow described in the intro)
# To use your own files, replace this lookup with
# DATA_DIR = Path("/path/to/your/data").
_SAMPLE_DATA_CANDIDATES = (
    Path("../sample_data"),
    Path("sample_data"),
    Path("CRISPR-studio/sample_data"),
)
DATA_DIR = next(
    (candidate.resolve() for candidate in _SAMPLE_DATA_CANDIDATES if candidate.is_dir()),
    None,
)
if DATA_DIR is None:
    raise FileNotFoundError(
        "sample_data directory not found. Clone the repository and set DATA_DIR to your data directory."
    )

# Input files for the demo run, all expected directly inside DATA_DIR.
counts_path = DATA_DIR / "demo_counts.csv"
library_path = DATA_DIR / "demo_library.csv"
metadata_path = DATA_DIR / "demo_metadata.json"

print(f"Counts: {counts_path}")
print(f"Library: {library_path}")
print(f"Metadata: {metadata_path}")


## Validate the dataset structure

In [None]:
# Load the three demo inputs from the paths chosen in the previous cell.
counts = load_counts(counts_path)
library = load_library(library_path)
metadata = load_metadata(metadata_path)

# Check the metadata against the counts table, then line the counts up with
# the guide library. The matcher returns three values; only the first two are
# summarised below.
validate_metadata_against_counts(metadata, counts)
aligned_counts, missing_guides, merged = match_counts_to_library(counts, library)

# One-line-per-item summary of what was loaded.
print(
    f"Loaded counts shape: {counts.shape}",
    f"Aligned counts shape: {aligned_counts.shape}",
    f"Missing guides: {len(missing_guides)}",
    sep="\n",
)


## Run the pipeline (MAGeCK off, annotations skipped)

Native backends are optional; this quickstart sticks to pure-Python execution for portability.

In [None]:
# Pipeline outputs are written under this directory (created if missing).
output_root = Path("artifacts/notebooks")
output_root.mkdir(parents=True, exist_ok=True)

# Quickstart configuration: MAGeCK and both native backends are switched off,
# and no Enrichr libraries are requested.
# NOTE(review): cache_annotations=False presumably disables annotation caching;
# confirm against PipelineSettings.
settings = PipelineSettings(
    use_mageck=False,
    use_native_rra=False,
    use_native_enrichment=False,
    cache_annotations=False,
    enrichr_libraries=[],
    output_root=output_root,
)

# Time the run with perf_counter: it is monotonic, so the measured duration is
# unaffected by system clock adjustments, and it avoids datetime.utcnow(),
# which is deprecated as of Python 3.12.
start = time.perf_counter()
result = run_analysis(
    config=metadata,
    paths=DataPaths(counts=counts_path, library=library_path, metadata=metadata_path),
    settings=settings,
)
elapsed_seconds = time.perf_counter() - start

print(f"Runtime: {elapsed_seconds:.2f}s")
print(f"Genes scored: {len(result.gene_results)}")
print(f"Warnings: {len(result.warnings)}")


## Visualize top genes and warnings

In [None]:
# --- Top genes -------------------------------------------------------------
# Flatten each gene result into a plain dict so pandas can tabulate it.
gene_records = [gene.model_dump() for gene in result.gene_results]
gene_df = pd.DataFrame(gene_records)
if gene_df.empty:
    raise SystemExit("No gene results available; check inputs.")

# Keep the 15 rows with the smallest FDR (ascending sort).
sorted_genes = gene_df.sort_values(by="fdr").head(15)

top_fig = px.bar(
    sorted_genes,
    x="gene_symbol",
    y="score",
    color="fdr",
    hover_data={"log2_fold_change": True, "rank": True},
    title="Top hits by FDR",
)
top_fig.update_layout(xaxis_title="Gene", yaxis_title="Score")
top_fig.show()

# --- Pipeline warnings -----------------------------------------------------
warning_rows = [warning.model_dump() for warning in result.warnings]
if not warning_rows:
    # Show a single placeholder row so the table is never empty.
    warning_rows = [{"code": "clean_run", "message": "No warnings raised", "details": {}}]

# Table cells are supplied column-wise: one list per header.
warning_codes = [row.get("code", "?") for row in warning_rows]
warning_messages = [row.get("message", "") for row in warning_rows]

warning_table = go.Table(
    header={
        "values": ["Code", "Message"],
        "fill_color": "#202433",
        "font": {"color": "white"},
    },
    cells={"values": [warning_codes, warning_messages]},
)
warning_fig = go.Figure(data=[warning_table])
warning_fig.update_layout(title="Pipeline warnings")
warning_fig.show()
