In [1]:
from webdav4.fsspec import WebdavFileSystem
import duckdb

## SQL With DuckDB

#### Download the Data

For this notebook, we'll be exploring some data from the Steinmetz et al 20XX study, processed here into JSON files for our tabular analysis, along with some other familiar file types.  Please run the code below to download the data.

In [139]:
# https://uni-bonn.sciebo.de/s/oZql1bk0p1AvK0w
fs = WebdavFileSystem("https://uni-bonn.sciebo.de/public.php/webdav", auth=("oZql1bk0p1AvK0w", ""))
fs.download("/", "data/steinmetz", recursive=True)

## Loading Data with DuckDB

| Code | Description |
| :-- | :-- |
| `duckdb.sql('FROM "path/to/file.json"')` |  |
| `duckdb.sql('FROM "path/to/file.csv"')` |  |
| `duckdb.sql('FROM "path/to/file.parquet"')` |  |
| `duckdb.sql('FROM read_json("path/to/file.json", filename=true)')` |  |





**Exercises**

Use duckdb to read the `session.json` file from the session recorded on 2017-11-02.  What metadata fields were recorded on that day?

In [3]:
duckdb.sql('FROM "data/steinmetz/steinmetz_2017-11-02_Forssmann/session.json"')

┌──────────────┬───────────┬────────────┬──────────┬─────────┐
│ session_date │   mouse   │ stim_onset │ bin_size │   id    │
│     date     │  varchar  │   double   │  double  │ varchar │
├──────────────┼───────────┼────────────┼──────────┼─────────┤
│ 2017-11-02   │ Forssmann │        0.5 │     0.01 │ dda4    │
└──────────────┴───────────┴────────────┴──────────┴─────────┘

Use duckdb to read the 'trials.csv' file recorded on 2017-11-02.  What trial variables were recorded for that session?  (Note: if you cannot see all the columns, either add `"DESCRIBE"` to the front of the SQL statement, or you can convert the output to a Pandas DataFrame with `.to_df()`)

In [31]:
duckdb.sql('FROM "data/steinmetz/steinmetz_2017-11-02_Forssmann/trials.csv"')

┌───────────────┬────────────────┬────────────────────┬───┬─────────────────────┬───────────────┬────────────┐
│ contrast_left │ contrast_right │       gocue        │ … │     prev_reward     │ active_trials │ session_id │
│     int64     │     int64      │       double       │   │       double        │    boolean    │  varchar   │
├───────────────┼────────────────┼────────────────────┼───┼─────────────────────┼───────────────┼────────────┤
│             0 │            100 │ 0.5821628508391683 │ … │               -10.0 │ true          │ dda4       │
│             0 │              0 │ 0.4842213627248384 │ … │ -5.5394480599438225 │ true          │ dda4       │
│            25 │             25 │ 0.6884175083797288 │ … │ -4.1964409490420564 │ true          │ dda4       │
│             0 │              0 │  0.628083671326948 │ … │ -2.9373264076337335 │ true          │ dda4       │
│             0 │              0 │ 0.7482593361695535 │ … │ -2.3423466082563067 │ true          │ dda4       │
│

Use duckdb to read **all** of the `session.json` files in the dataset.

In [156]:
duckdb.sql('FROM "data/steinmetz/**/session.json"')

┌──────────────┬───────────┬────────────┬──────────┬─────────┐
│ session_date │   mouse   │ stim_onset │ bin_size │   id    │
│     date     │  varchar  │   double   │  double  │ varchar │
├──────────────┼───────────┼────────────┼──────────┼─────────┤
│ 2017-01-07   │ Muller    │        0.5 │     0.01 │ b5b6    │
│ 2017-01-08   │ Muller    │        0.5 │     0.01 │ 49bb    │
│ 2017-01-08   │ Radnitz   │        0.5 │     0.01 │ 769e    │
│ 2017-01-09   │ Muller    │        0.5 │     0.01 │ 31dc    │
│ 2017-01-09   │ Radnitz   │        0.5 │     0.01 │ 99f4    │
│ 2017-01-10   │ Radnitz   │        0.5 │     0.01 │ a400    │
│ 2017-01-11   │ Radnitz   │        0.5 │     0.01 │ 6207    │
│ 2017-01-12   │ Radnitz   │        0.5 │     0.01 │ 3e6c    │
│ 2017-05-15   │ Moniz     │        0.5 │     0.01 │ 40f7    │
│ 2017-05-16   │ Moniz     │        0.5 │     0.01 │ 2474    │
│     ·        │   ·       │         ·  │       ·  │  ·      │
│     ·        │   ·       │         ·  │       ·  │  ·

Use DuckDB to read all the `trials.csv` files in the dataset.

In [94]:
duckdb.sql('DESCRIBE FROM "data/steinmetz/*/trials.csv"')

┌────────────────┬─────────────┬─────────┬─────────┬─────────┬─────────┐
│  column_name   │ column_type │  null   │   key   │ default │  extra  │
│    varchar     │   varchar   │ varchar │ varchar │ varchar │ varchar │
├────────────────┼─────────────┼─────────┼─────────┼─────────┼─────────┤
│ contrast_left  │ BIGINT      │ YES     │ NULL    │ NULL    │ NULL    │
│ contrast_right │ BIGINT      │ YES     │ NULL    │ NULL    │ NULL    │
│ gocue          │ DOUBLE      │ YES     │ NULL    │ NULL    │ NULL    │
│ stim_onset     │ DOUBLE      │ YES     │ NULL    │ NULL    │ NULL    │
│ feedback_type  │ DOUBLE      │ YES     │ NULL    │ NULL    │ NULL    │
│ feedback_time  │ DOUBLE      │ YES     │ NULL    │ NULL    │ NULL    │
│ response_type  │ DOUBLE      │ YES     │ NULL    │ NULL    │ NULL    │
│ response_time  │ DOUBLE      │ YES     │ NULL    │ NULL    │ NULL    │
│ reaction_type  │ DOUBLE      │ YES     │ NULL    │ NULL    │ NULL    │
│ reaction_time  │ DOUBLE      │ YES     │ NULL    

Use DuckDB to get all the cells that were recorded from in the dataset.

In [95]:
duckdb.sql('DESCRIBE FROM "data/steinmetz/*/cells.parquet"')

┌────────────────┬─────────────┬─────────┬─────────┬─────────┬─────────┐
│  column_name   │ column_type │  null   │   key   │ default │  extra  │
│    varchar     │   varchar   │ varchar │ varchar │ varchar │ varchar │
├────────────────┼─────────────┼─────────┼─────────┼─────────┼─────────┤
│ ccf_ap         │ DOUBLE      │ YES     │ NULL    │ NULL    │ NULL    │
│ ccf_dv         │ DOUBLE      │ YES     │ NULL    │ NULL    │ NULL    │
│ ccf_lr         │ DOUBLE      │ YES     │ NULL    │ NULL    │ NULL    │
│ brain_area     │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ brain_groups   │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ trough_to_peak │ TINYINT     │ YES     │ NULL    │ NULL    │ NULL    │
│ session_id     │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ cell           │ INTEGER     │ YES     │ NULL    │ NULL    │ NULL    │
└────────────────┴─────────────┴─────────┴─────────┴─────────┴─────────┘

What variables were stored when recording lick behaviors?

In [96]:
duckdb.sql('DESCRIBE FROM "data/steinmetz/*/licks.parquet"')

┌─────────────┬─────────────┬─────────┬─────────┬─────────┬─────────┐
│ column_name │ column_type │  null   │   key   │ default │  extra  │
│   varchar   │   varchar   │ varchar │ varchar │ varchar │ varchar │
├─────────────┼─────────────┼─────────┼─────────┼─────────┼─────────┤
│ licks       │ TINYINT     │ YES     │ NULL    │ NULL    │ NULL    │
│ session_id  │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ trial       │ INTEGER     │ YES     │ NULL    │ NULL    │ NULL    │
│ time        │ DOUBLE      │ YES     │ NULL    │ NULL    │ NULL    │
└─────────────┴─────────────┴─────────┴─────────┴─────────┴─────────┘

## Filtering Data with SELECT, DISTINCT, WHERE, and LIMIT

**Exercises**

In [113]:
duckdb.sql(
"""
SELECT DISTINCT 
    mouse,
FROM 
    "data/steinmetz/*/session.json"
""")

┌───────────┐
│   mouse   │
│  varchar  │
├───────────┤
│ Hench     │
│ Muller    │
│ Moniz     │
│ Forssmann │
│ Radnitz   │
│ Theiler   │
│ Tatum     │
│ Richards  │
│ Lederberg │
│ NULL      │
├───────────┤
│  10 rows  │
└───────────┘

What unique ("distinct") contrast levels of the left stimulus were there in this experiment?  To make it easier to read, order the rows in the resulting table.

In [124]:
duckdb.sql(
"""
SELECT DISTINCT
    contrast_left
FROM 
    "data/steinmetz/*/trials.csv"
""")

┌───────────────┐
│ contrast_left │
│     int64     │
├───────────────┤
│            50 │
│             0 │
│           100 │
│            25 │
└───────────────┘

What unique ("distinct") combinations of contrast levels between the left and right stimulus were there in this experiment?  To make it easier to read, order the rows in the resulting table.

In [128]:
duckdb.sql(
"""
SELECT DISTINCT
    contrast_left,
    contrast_right
FROM 
    "data/steinmetz/*/trials.csv"
ORDER BY contrast_left, contrast_right
""")

┌───────────────┬────────────────┐
│ contrast_left │ contrast_right │
│     int64     │     int64      │
├───────────────┼────────────────┤
│             0 │              0 │
│             0 │             25 │
│             0 │             50 │
│             0 │            100 │
│            25 │              0 │
│            25 │             25 │
│            25 │             50 │
│            25 │            100 │
│            50 │              0 │
│            50 │             25 │
│            50 │             50 │
│            50 │            100 │
│           100 │              0 │
│           100 │             25 │
│           100 │             50 │
│           100 │            100 │
├───────────────┴────────────────┤
│ 16 rows              2 columns │
└────────────────────────────────┘

What were the different stimulus onset time settings used this experiment?

In [130]:
duckdb.sql(
"""
SELECT DISTINCT
    stim_onset
FROM 
    "data/steinmetz/*/trials.csv"
""")

┌────────────┐
│ stim_onset │
│   double   │
├────────────┤
│        0.5 │
└────────────┘

Which general areas of the  brain (let's use "brain_groups" here) were the cells in this study recorded from?

In [137]:
duckdb.sql(
"""
SELECT DISTINCT
    brain_groups
FROM 
    "data/steinmetz/*/cells.parquet"
ORDER BY brain_groups
""")

┌───────────────────┐
│   brain_groups    │
│      varchar      │
├───────────────────┤
│ TT                │
│ basal ganglia     │
│ cortical subplate │
│ hippocampus       │
│ midbrain          │
│ non-visual cortex │
│ root              │
│ thalamus          │
│ visual cortex     │
└───────────────────┘

Which brain groups were associated with which smaller brain areas?

In [139]:
duckdb.sql(
"""
SELECT DISTINCT
    brain_groups,
    brain_area
FROM 
    "data/steinmetz/*/cells.parquet"
ORDER BY brain_groups
""")

┌───────────────┬────────────┐
│ brain_groups  │ brain_area │
│    varchar    │  varchar   │
├───────────────┼────────────┤
│ TT            │ TT         │
│ basal ganglia │ GPe        │
│ basal ganglia │ CP         │
│ basal ganglia │ LSc        │
│ basal ganglia │ MS         │
│ basal ganglia │ SI         │
│ basal ganglia │ ACB        │
│ basal ganglia │ SNr        │
│ basal ganglia │ OT         │
│ basal ganglia │ LSr        │
│    ·          │ ·          │
│    ·          │ ·          │
│    ·          │ ·          │
│ thalamus      │ MD         │
│ thalamus      │ TH         │
│ thalamus      │ LD         │
│ thalamus      │ CL         │
│ visual cortex │ VISa       │
│ visual cortex │ VISam      │
│ visual cortex │ VISp       │
│ visual cortex │ VISpm      │
│ visual cortex │ VISrl      │
│ visual cortex │ VISl       │
├───────────────┴────────────┤
│     72 rows (20 shown)     │
└────────────────────────────┘

Let's say we're interested mainly in the thalamus.  Which areas of the thalamus were recorded from (i.e. "get a table WHERE only the "thalamus" brain group is included.") in this experiment?  (Tip: use single quotes `'` to reference text.)

In [143]:
duckdb.sql(
"""
SELECT DISTINCT
    brain_groups,
    brain_area
FROM 
    'data/steinmetz/*/cells.parquet'
WHERE brain_groups = 'thalamus'
ORDER BY brain_groups
""")

┌──────────────┬────────────┐
│ brain_groups │ brain_area │
│   varchar    │  varchar   │
├──────────────┼────────────┤
│ thalamus     │ RT         │
│ thalamus     │ VAL        │
│ thalamus     │ SPF        │
│ thalamus     │ PT         │
│ thalamus     │ MG         │
│ thalamus     │ LGd        │
│ thalamus     │ VPL        │
│ thalamus     │ VPM        │
│ thalamus     │ PO         │
│ thalamus     │ POL        │
│ thalamus     │ CL         │
│ thalamus     │ LH         │
│ thalamus     │ LP         │
│ thalamus     │ LD         │
│ thalamus     │ TH         │
│ thalamus     │ MD         │
├──────────────┴────────────┤
│ 16 rows         2 columns │
└───────────────────────────┘

In [145]:
duckdb.sql(
"""
SELECT
    min(ccf_lr),
    max(ccf_lr)
FROM 
    "data/steinmetz/*/cells.parquet"
""")

┌─────────────┬─────────────┐
│ min(ccf_lr) │ max(ccf_lr) │
│   double    │   double    │
├─────────────┼─────────────┤
│      1078.8 │      6346.6 │
└─────────────┴─────────────┘

How many sessions were recorded?

In [157]:
duckdb.sql('SELECT count(*) FROM "data/steinmetz/**/session.json"')

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│           37 │
└──────────────┘

How many total trials were done in the experiment, across all sessions?

In [158]:
duckdb.sql('SELECT count(*) FROM "data/steinmetz/*/trials.csv"')

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│        13617 │
└──────────────┘

How many total sessions were done by the mouse named "Richards"?

In [167]:
duckdb.sql(
"""
SELECT count(*) FROM  "data/steinmetz/*/session.json" WHERE mouse = 'Richards'
""")

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│            5 │
└──────────────┘

In [154]:
duckdb.sql('SELECT count(*) FROM "data/steinmetz/steinmetz_2017-11-02_Forssmann/session.json"')

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│            1 │
└──────────────┘

In [112]:
duckdb.sql('DESCRIBE FROM "../data_processing/data7/**/trials.csv"')

┌────────────────┬─────────────┬─────────┬─────────┬─────────┬─────────┐
│  column_name   │ column_type │  null   │   key   │ default │  extra  │
│    varchar     │   varchar   │ varchar │ varchar │ varchar │ varchar │
├────────────────┼─────────────┼─────────┼─────────┼─────────┼─────────┤
│ contrast_left  │ BIGINT      │ YES     │ NULL    │ NULL    │ NULL    │
│ contrast_right │ BIGINT      │ YES     │ NULL    │ NULL    │ NULL    │
│ gocue          │ DOUBLE      │ YES     │ NULL    │ NULL    │ NULL    │
│ stim_onset     │ DOUBLE      │ YES     │ NULL    │ NULL    │ NULL    │
│ feedback_type  │ DOUBLE      │ YES     │ NULL    │ NULL    │ NULL    │
│ feedback_time  │ DOUBLE      │ YES     │ NULL    │ NULL    │ NULL    │
│ response_type  │ DOUBLE      │ YES     │ NULL    │ NULL    │ NULL    │
│ response_time  │ DOUBLE      │ YES     │ NULL    │ NULL    │ NULL    │
│ reaction_type  │ DOUBLE      │ YES     │ NULL    │ NULL    │ NULL    │
│ reaction_time  │ DOUBLE      │ YES     │ NULL    

In [77]:
duckdb.sql(
"""
SELECT
    brain_groups,
    count(*),
    round(avg(num_cells), 1),
FROM
    (SELECT 
        brain_groups,
        session_id,
        count(*) AS num_cells
    FROM "../data_processing/data7/**/cells.parquet"
    GROUP BY (brain_groups, session_id)
    )
GROUP BY brain_groups
ORDER BY lower(brain_groups)
""")

┌───────────────────┬──────────────┬──────────────────────────┐
│   brain_groups    │ count_star() │ round(avg(num_cells), 1) │
│      varchar      │    int64     │          double          │
├───────────────────┼──────────────┼──────────────────────────┤
│ basal ganglia     │           12 │                    169.0 │
│ cortical subplate │            3 │                     82.3 │
│ hippocampus       │           18 │                    149.9 │
│ midbrain          │           11 │                    240.8 │
│ non-visual cortex │           21 │                    288.7 │
│ root              │           22 │                    134.3 │
│ thalamus          │           19 │                    222.4 │
│ TT                │            4 │                     45.3 │
│ visual cortex     │           17 │                    149.7 │
└───────────────────┴──────────────┴──────────────────────────┘

In [72]:
%load_ext sql

In [74]:
%sql duckdb://

In [174]:
result = duckdb.sql(
"""
PIVOT (
    SELECT
        mouse,
        session_id,
        session_date,
        brain_groups,
        count(cell) as num_cells,
    FROM "../data_processing/data7/**/cells.parquet" cells
    INNER JOIN "../data_processing/data7/**/session.json" sessions ON cells.session_id = sessions.id
    GROUP BY ALL
    )
ON brain_groups
USING sum(num_cells)
GROUP BY mouse
"""
).to_df()
result.set_index('mouse').convert_dtypes().style.format()

Unnamed: 0_level_0,TT,basal ganglia,cortical subplate,hippocampus,midbrain,non-visual cortex,root,thalamus,visual cortex
mouse,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Hench,137.0,450.0,,506,389.0,1019,60,819,593
Muller,10.0,92.0,,459,714.0,408,583,366,619
Theiler,,,,209,,497,442,59,141
Tatum,,351.0,185.0,329,703.0,757,433,372,205
Lederberg,,681.0,173.0,429,787.0,951,816,1250,221
Richards,,562.0,195.0,235,344.0,1387,136,957,19
,,,,220,,78,100,155,145
Forssmann,,494.0,52.0,661,,1068,1241,864,219
Radnitz,34.0,430.0,,111,1017.0,1229,348,204,430
Moniz,,,,297,185.0,377,44,801,379


**Exercises**

## SQL with SQLite3

In [18]:
type(dd)
dd.DataFrame()

Unnamed: 0,exposure_time,laser_power,num_frames,frame_rate,region_of_interest,start_time
0,300,15,292,30,ROI1,376.0
1,100,10,226,10,ROI2,
2,100,15,317,30,ROI1,3101.0
3,200,15,271,10,ROI1,2788.0
4,100,10,297,20,ROI3,1800.0
5,300,15,339,10,ROI3,
6,100,5,225,20,ROI2,
7,100,5,329,30,ROI1,4465.0
8,300,5,206,30,ROI2,
9,200,10,253,30,ROI2,585.0


In [16]:
duckdb.sql('FROM "webdav://data1.json"')

┌─────────┬───────┐
│  name   │  age  │
│ varchar │ int64 │
├─────────┼───────┤
│ George  │    32 │
└─────────┴───────┘

In [17]:
duckdb.sql('FROM "webdav://*.json"')

┌─────────┬───────┐
│  name   │  age  │
│ varchar │ int64 │
├─────────┼───────┤
│ George  │    32 │
│ Forrest │    50 │
└─────────┴───────┘

In [28]:
fs.ls("/", detail=False)

['data1.json', 'data2.json']

In [26]:
fs.glob("*.json")

['data1.json', 'data2.json']

In [29]:
fs.read_text("data1.json")

'{"name": "George", "age": 32}\n'

In [34]:
datas

['{"name": "George", "age": 32}\n', '{"name": "Forrest", "age": 50}']

In [40]:
datas = [json.loads(fs.read_text(fname)) for fname in fs.glob("*.json")]
df = pd.DataFrame(datas)
# duckdb.sql("FROM df")
df

Unnamed: 0,name,age
0,George,32
1,Forrest,50


In [9]:
from webdav4.client import Client

In [10]:
client = Client("https://uni-bonn.sciebo.de/public.php/webdav", auth=("Zg5pBvtfasoz9uB", ""))
client

<webdav4.client.Client at 0x1288712fb10>

In [7]:
from webdav4.fsspec import WebdavFileSystem

In [11]:
fs = WebdavFileSystem("https://uni-bonn.sciebo.de/public.php/webdav", auth=("Zg5pBvtfasoz9uB", ""))
fs

<webdav4.fsspec.WebdavFileSystem at 0x1288711bbd0>

In [16]:
fs.download("/", "./webdav", recursive=True)

In [17]:
fs.download?

[1;31mSignature:[0m [0mfs[0m[1;33m.[0m[0mdownload[0m[1;33m([0m[0mrpath[0m[1;33m,[0m [0mlpath[0m[1;33m,[0m [0mrecursive[0m[1;33m=[0m[1;32mFalse[0m[1;33m,[0m [1;33m**[0m[0mkwargs[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m Alias of `AbstractFileSystem.get`.
[1;31mFile:[0m      c:\users\nickdg\miniconda3\envs\duckdb\lib\site-packages\fsspec\spec.py
[1;31mType:[0m      method

In [4]:
# %pip install pyocclient

In [5]:
import owncloud

In [6]:
client = owncloud.Client.from_public_link("https://uni-bonn.sciebo.de/s/Zg5pBvtfasoz9uB")
objs = client.list("/", depth=30)
objs
# , auth=("Zg5pBvtfasoz9uB", "")))

[File(path=/data1.json,file_type=file,attributes={'{DAV:}getlastmodified': 'Wed, 14 Feb 2024 18:59:35 GMT', '{DAV:}getcontentlength': '30', '{DAV:}resourcetype': None, '{DAV:}getetag': '"f4b12075861ac116b30978c1977a3648"', '{DAV:}getcontenttype': 'application/json'}),
 File(path=/data2.json,file_type=file,attributes={'{DAV:}getlastmodified': 'Wed, 14 Feb 2024 19:03:18 GMT', '{DAV:}getcontentlength': '30', '{DAV:}resourcetype': None, '{DAV:}getetag': '"e7516f9c3607841daea0b59ea93409cb"', '{DAV:}getcontenttype': 'application/json'}),
 File(path=/fold/,file_type=dir,attributes={'{DAV:}getlastmodified': 'Thu, 15 Feb 2024 07:27:53 GMT', '{DAV:}resourcetype': None, '{DAV:}quota-used-bytes': '28', '{DAV:}quota-available-bytes': '8171673147', '{DAV:}getetag': '"65cdbcf965e3a"'}),
 File(path=/fold/data3.json,file_type=file,attributes={'{DAV:}getlastmodified': 'Thu, 15 Feb 2024 07:27:53 GMT', '{DAV:}getcontentlength': '28', '{DAV:}resourcetype': None, '{DAV:}getetag': '"5213c97191d189de9e70c724a

In [20]:
obj = objs[0]
obj

File(path=/data1.json,file_type=file,attributes={'{DAV:}getlastmodified': 'Wed, 14 Feb 2024 18:59:35 GMT', '{DAV:}getcontentlength': '30', '{DAV:}resourcetype': None, '{DAV:}getetag': '"f4b12075861ac116b30978c1977a3648"', '{DAV:}getcontenttype': 'application/json'})

In [21]:
obj.file_type

'file'

In [22]:
obj.name

'data1.json'

In [23]:
obj.get_path()

'/'

In [24]:
obj.attributes

{'{DAV:}getlastmodified': 'Wed, 14 Feb 2024 18:59:35 GMT',
 '{DAV:}getcontentlength': '30',
 '{DAV:}resourcetype': None,
 '{DAV:}getetag': '"f4b12075861ac116b30978c1977a3648"',
 '{DAV:}getcontenttype': 'application/json'}

In [45]:
client.get_file_contents(obj)

b'{"name": "Bruno", "age": 43}'

In [32]:
obj.path

'/data1.json'

In [33]:
obj.file_type

'file'

In [47]:
datas = []
for obj in client.list("/", depth=30):
    if obj.file_type == "file" and obj.path.endswith(".json"):
        text = client.get_file_contents(obj)
        data = json.loads(text)
        datas.append(data)

pd.DataFrame(datas)


Unnamed: 0,name,age
0,George,32
1,Forrest,50
2,Bruno,43


In [38]:
from duckdb import sql

In [39]:
%load_ext sql

## SQL FROM Statement to extract all data

#### Glob patterns

In [3]:
%%sql

UsageError: Cell magic `%%sql` not found.


#### Include Filenames

In [3]:
sql('FROM read_json_auto("data3/**/*.json", filename=true, hive_partitioning=true)')

┌───────────┬───────┬────────────────┬──────────────────────────────┬───────────┐
│  author   │  age  │      job       │           filename           │   sess    │
│  varchar  │ int64 │    varchar     │           varchar            │  varchar  │
├───────────┼───────┼────────────────┼──────────────────────────────┼───────────┤
│ Sangeetha │ 30000 │ animal trainer │ data3\sess=afternoon\bb.json │ afternoon │
│ Nick      │  1003 │ NULL           │ data3\sess=morning\aa.json   │ morning   │
└───────────┴───────┴────────────────┴──────────────────────────────┴───────────┘

## SQL SELECT to select and rename columns

In [24]:
sql('SELECT author, job FROM "data3/**/*.json"')

┌───────────┬────────────────┐
│  author   │      job       │
│  varchar  │    varchar     │
├───────────┼────────────────┤
│ Sangeetha │ animal trainer │
│ Nick      │ NULL           │
└───────────┴────────────────┘

## SQL WHERE for Filtering

In [27]:
sql("FROM 'data3/**/*.json' WHERE sess='morning'")

┌─────────┬───────┬─────────┬─────────┐
│ author  │  age  │   job   │  sess   │
│ varchar │ int64 │ varchar │ varchar │
├─────────┼───────┼─────────┼─────────┤
│ Nick    │  1003 │ NULL    │ morning │
└─────────┴───────┴─────────┴─────────┘

## SQL ORDER BY and LIMIT

In [30]:
sql("FROM 'data3/**/*.json' LIMIT 1")

┌───────────┬───────┬────────────────┬───────────┐
│  author   │  age  │      job       │   sess    │
│  varchar  │ int64 │    varchar     │  varchar  │
├───────────┼───────┼────────────────┼───────────┤
│ Sangeetha │ 30000 │ animal trainer │ afternoon │
└───────────┴───────┴────────────────┴───────────┘

In [32]:
sql("FROM 'data3/**/*.json' ORDER BY age")

┌───────────┬───────┬────────────────┬───────────┐
│  author   │  age  │      job       │   sess    │
│  varchar  │ int64 │    varchar     │  varchar  │
├───────────┼───────┼────────────────┼───────────┤
│ Nick      │  1003 │ NULL           │ morning   │
│ Sangeetha │ 30000 │ animal trainer │ afternoon │
└───────────┴───────┴────────────────┴───────────┘

## (extra)  SQL GROUP BY for basic Session Reporting

In [3]:
sql("SELECT sess, count(*) as total, max(age) FROM 'data3/**/*.json' GROUP BY sess")

┌───────────┬───────┬──────────┐
│   sess    │ total │ max(age) │
│  varchar  │ int64 │  int64   │
├───────────┼───────┼──────────┤
│ afternoon │     1 │    30000 │
│ morning   │     1 │     1003 │
└───────────┴───────┴──────────┘

In [5]:
df = sql("SELECT sess, count(*) as total, max(age) FROM 'data3/**/*.json' GROUP BY sess").to_df()
df

Unnamed: 0,sess,total,max(age)
0,afternoon,1,30000
1,morning,1,1003


## Describing Data Location in Session Tabel

In [155]:
sql(
r"""
SELECT 
    -- *, 
    -- strlen(filename), 
    parse_dirpath(filename, '/') as path,
    'spikes.h5' as spikefile,
    FROM read_json_auto('data3/**/*.json', filename=true)
""")

┌──────────────────────┬───────────┐
│         path         │ spikefile │
│       varchar        │  varchar  │
├──────────────────────┼───────────┤
│ data3\sess=afternoon │ spikes.h5 │
│ data3\sess=morning   │ spikes.h5 │
└──────────────────────┴───────────┘

In [193]:
sql(
"""
SELECT
  parse_dirpath(filename) AS session_path,
  * EXCLUDE(filename),
  'spikes.h5' as s,
FROM read_json("data4/**/*.json", filename=true)
"""
)

┌──────────────┬───────────┬───────┬────────────────┬───────────┐
│ session_path │  author   │  age  │      job       │     s     │
│   varchar    │  varchar  │ int64 │    varchar     │  varchar  │
├──────────────┼───────────┼───────┼────────────────┼───────────┤
│ data4\s1     │ Sangeetha │ 30000 │ animal trainer │ spikes.h5 │
│ data4\s2     │ Nick      │  1003 │ NULL           │ spikes.h5 │
└──────────────┴───────────┴───────┴────────────────┴───────────┘

## Pivoting Data

In [154]:
def get_file_table(path):
    return sql(
        f"""
        PIVOT (
            SELECT
                parse_dirpath(file) as path,
                parse_filename(path) as session_id,
                parse_filename(file) as filename,
            FROM glob('{path}/**')
        )
        ON filename;
        """ 
    ).to_df()

get_file_table("data4")

Unnamed: 0,path,session_id,behav.mov,session.json,spikes.h5
0,data4\s1,s1,0,1,1
1,data4\s2,s2,1,1,1


In [114]:
from pathlib import Path

In [146]:
import pandas as pd
from glob import glob

df = pd.DataFrame(data=glob("data4/**/*"), columns=['file'])
df['path'] = df['file'].apply(lambda x: Path(x).parent)
df['file'] = df['file'].apply(lambda x: Path(x).name)
df['session_id'] = df['path'].apply(lambda x: Path(x).name)
df = df[['session_id', 'path', 'file']]

pd.crosstab([df.session_id, df.path], df.file).reset_index()

file,session_id,path,behav.mov,session.json,spikes.h5
0,s1,data4\s1,0,1,1
1,s2,data4\s2,1,1,1


┌──────────┬────────────┬──────────────┬───────────┐
│   path   │ session_id │ session.json │ spikes.h5 │
│ varchar  │  varchar   │   varchar    │  varchar  │
├──────────┼────────────┼──────────────┼───────────┤
│ data4\s1 │ s1         │ session.json │ spikes.h5 │
│ data4\s2 │ s2         │ session.json │ spikes.h5 │
└──────────┴────────────┴──────────────┴───────────┘