# Search and query RxRx1, RxRx2

In [1]:
# Load the LaminDB instance that holds the RxRx1/RxRx2 records.
# Per the CLI output, this locks the cloud SQLite file until `lamin close` is run.
!lamin load sunnyosun/rxrx1-2

2023-09-21 15:19:17,808:INFO - Found credentials in shared credentials file: ~/.aws/credentials
❗ updating local SQLite & locking cloud SQLite (sync back & unlock: lamin close)
💡 loaded instance: sunnyosun/rxrx1-2
[0m

In [2]:
import lamindb as ln
import lnschema_bionty as lb
import lnschema_lamin1 as ln1

2023-09-21 15:19:24,972:INFO - Found credentials in shared credentials file: ~/.aws/credentials


💡 loaded instance: sunnyosun/rxrx1-2 (lamindb 0.54.0)


In [3]:
# Start tracking this notebook; the output reports the Transform and Run records it is tracked under.
ln.track()

💡 notebook imports: duckdb==0.8.1 lamindb==0.54.0 lnschema_bionty==0.31.1 lnschema_lamin1==0.23.0
💡 Transform(id='sx3wFSwnhCYYz8', name='Search and query RxRx1, RxRx2', short_name='query-rxrx', version='0', type='notebook', updated_at=2023-09-21 12:30:46, created_by_id='kmvZDIX9')
💡 Run(id='lyIGDhFyutVCERkiriqp', run_at=2023-09-21 13:19:28, transform_id='sx3wFSwnhCYYz8', created_by_id='kmvZDIX9')
💡   parent transforms:
   - Transform(id='Zo0qJt4IQPsbz8', name='Validate and register RxRx1 metadata', short_name='rxrx1-register', version='0', type='notebook', updated_at=2023-09-21 13:03:50, created_by_id='kmvZDIX9')
   - Transform(id='kq1P1Aho94siz8', name='Register RxRx1 metadata and embedding files', short_name='rxrx1-download', version='0', type='notebook', updated_at=2023-09-19 10:57:46, created_by_id='kmvZDIX9')


In [4]:
# Lookup objects give attribute-style access to registry records. With
# `return_field`, each attribute resolves to that field's value, e.g.
# `cell_lines.hep_g2_cell` is "HEPG2" (see the filter string printed further down).
features = ln.Feature.lookup(return_field="name")
cell_lines = lb.CellLine.lookup(return_field="abbr")
sirnas = ln1.Treatment.lookup(return_field="name")
wells = ln1.Well.lookup(return_field="name")

## Pandas

In [5]:
# Fetch the File record registered under this key.
# NOTE(review): `.one()` presumably expects exactly one match — confirm against the lamindb docs.
file = ln.File.filter(key="rxrx1/metadata.parquet").one()
# Load the file's content into memory; it is used as a pandas DataFrame in the next cell.
df = file.load()

In [6]:
# Select the rows for a single imaging site: HepG2 cells, siRNA s19486,
# well L20, plate 3, site 2. Plate and site are compared as strings.
df.loc[
    (df["cell_type"] == cell_lines.hep_g2_cell)
    & (df["sirna"] == sirnas.s19486)
    & (df["well"] == wells.l20)
    & (df["plate"] == "3")
    & (df["site"] == "2")
]

Unnamed: 0,site_id,well_id,cell_type,dataset,experiment,plate,well,site,well_type,sirna,sirna_id,file_keys
54582,HEPG2-11_3_L20_2,HEPG2-11_3_L20,HEPG2,test,HEPG2-11,3,L20,2,treatment,s19486,848,images/test/HEPG2-11/Plate3/L20_s2_w1.png
54583,HEPG2-11_3_L20_2,HEPG2-11_3_L20,HEPG2,test,HEPG2-11,3,L20,2,treatment,s19486,848,images/test/HEPG2-11/Plate3/L20_s2_w2.png
54584,HEPG2-11_3_L20_2,HEPG2-11_3_L20,HEPG2,test,HEPG2-11,3,L20,2,treatment,s19486,848,images/test/HEPG2-11/Plate3/L20_s2_w3.png
54585,HEPG2-11_3_L20_2,HEPG2-11_3_L20,HEPG2,test,HEPG2-11,3,L20,2,treatment,s19486,848,images/test/HEPG2-11/Plate3/L20_s2_w4.png
54586,HEPG2-11_3_L20_2,HEPG2-11_3_L20,HEPG2,test,HEPG2-11,3,L20,2,treatment,s19486,848,images/test/HEPG2-11/Plate3/L20_s2_w5.png
54587,HEPG2-11_3_L20_2,HEPG2-11_3_L20,HEPG2,test,HEPG2-11,3,L20,2,treatment,s19486,848,images/test/HEPG2-11/Plate3/L20_s2_w6.png


## duckdb

Use DuckDB to query the Parquet file directly in cloud storage, without downloading it first:

In [7]:
# pip install duckdb
import duckdb

In [8]:
# Assemble the filter expression for DuckDB, one clause per condition.
# Column names and values come from the lookup objects rather than hand-typed strings.
filters = " and ".join(
    [
        f"{features.cell_type} == '{cell_lines.hep_g2_cell}'",
        f"{features.sirna} == '{sirnas.s19486}'",
        f"{features.well} == '{wells.l20}'",
        f"{features.plate} == '3'",
        f"{features.site} == '2'",
    ]
)

In [9]:
# Display the assembled filter expression to check what will be sent to DuckDB.
filters

"cell_type == 'HEPG2' and sirna == 's19486' and well == 'L20' and plate == '3' and site == '2'"

In [29]:
# Create a DuckDB relation over the Parquet file at the File's storage path.
# Reading from a remote path requires DuckDB's httpfs extension.
parquet_data = duckdb.from_parquet(str(file.path))

In [30]:
# Apply the filter expression to the Parquet relation and display the matching rows.
parquet_data.filter(filters)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

┌──────────────────┬────────────────┬───────────┬─────────┬───┬───────────┬─────────┬──────────┬──────────────────────┐
│     site_id      │    well_id     │ cell_type │ dataset │ … │ well_type │  sirna  │ sirna_id │      file_keys       │
│     varchar      │    varchar     │  varchar  │ varchar │   │  varchar  │ varchar │ varchar  │       varchar        │
├──────────────────┼────────────────┼───────────┼─────────┼───┼───────────┼─────────┼──────────┼──────────────────────┤
│ HEPG2-11_3_L20_2 │ HEPG2-11_3_L20 │ HEPG2     │ test    │ … │ treatment │ s19486  │ 848      │ images/test/HEPG2-…  │
│ HEPG2-11_3_L20_2 │ HEPG2-11_3_L20 │ HEPG2     │ test    │ … │ treatment │ s19486  │ 848      │ images/test/HEPG2-…  │
│ HEPG2-11_3_L20_2 │ HEPG2-11_3_L20 │ HEPG2     │ test    │ … │ treatment │ s19486  │ 848      │ images/test/HEPG2-…  │
│ HEPG2-11_3_L20_2 │ HEPG2-11_3_L20 │ HEPG2     │ test    │ … │ treatment │ s19486  │ 848      │ images/test/HEPG2-…  │
│ HEPG2-11_3_L20_2 │ HEPG2-11_3_L20 │ HE

Load the embeddings file and retrieve the embedding for the same site:

In [12]:
# Fetch the File record registered under the embeddings key.
# NOTE(review): `.one()` presumably expects exactly one match — confirm against the lamindb docs.
embedding_file = ln.File.filter(key="rxrx1/embeddings.h5ad").one()
# Load the .h5ad content into memory; the next cell shows it is an AnnData object.
embedding = embedding_file.load()

In [13]:
# Display the AnnData summary (dimensions and obs columns).
embedding

AnnData object with n_obs × n_vars = 125510 × 128
    obs: 'well_id', 'cell_type', 'dataset', 'experiment', 'plate', 'well', 'site', 'well_type', 'sirna', 'sirna_id', 'file_keys'

In [21]:
# Select the observation whose index label is the site_id returned by the
# metadata queries above, and show its feature vector as a DataFrame.
embedding["HEPG2-11_3_L20_2", :].to_df()

Unnamed: 0_level_0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_118,feature_119,feature_120,feature_121,feature_122,feature_123,feature_124,feature_125,feature_126,feature_127
site_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HEPG2-11_3_L20_2,2.021484,0.39917,1.97168,3.021484,0.157104,2.697266,-2.542969,-0.251709,0.732422,-0.24707,...,-2.638672,-0.043182,-0.63623,0.874023,-0.013046,-0.657715,1.760742,-0.081848,-1.27832,0.573242


In [22]:
# Close the instance. Per the CLI output, this updates the cloud SQLite file
# and releases the lock taken by `lamin load`.
!lamin close

2023-09-21 15:23:14,392:INFO - Found credentials in shared credentials file: ~/.aws/credentials
❗ updating & unlocking cloud SQLite 's3://lamindata/rxrx1-2.lndb' of instance 'sunnyosun/rxrx1-2'
✅ closed instance: sunnyosun/rxrx1-2
