In [429]:
# Load dependencies
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sqlalchemy
from sqlalchemy import create_engine, text
import pymc as pm
import arviz as az
from scipy.stats import zscore

# Define the SQLalchemy engine
engine = create_engine(f"sqlite:////Users/JO/PhD/neuro-ascertainment/data/db.sqlite")

# Read the SQL query from the file
with open('/Users/JO/PhD/neuro-ascertainment/candidate-queries/karolinska-nsicu-cohort/karolinska-cohort.sql', 'r') as file:
    query = file.read()

SEED = 20
rng = np.random.default_rng(SEED)

In [409]:
query_PAR_HADM = query + "SELECT * FROM PAR_HADM"
PAR_HADM = pd.read_sql(query_PAR_HADM, engine)

In [410]:
def count_id(df):
    LopNr = df['LopNr'].nunique() if 'LopNr' in df else 'Column missing'
    VtfId_LopNr = df['VtfId_LopNr'].nunique() if 'VtfId_LopNr' in df else 'Column missing'
    HADM_ID = df['HADM_ID'].nunique() if 'HADM_ID' in df else 'Column missing'
    return print(f'Unique patients: {LopNr} | Unique SIR admits: {VtfId_LopNr} | Unique PAR admits: {HADM_ID}')

In [411]:
count_id(PAR_HADM)

Unique patients: 59333 | Unique SIR admits: Column missing | Unique PAR admits: 359305


In [412]:
query_K_ICU_ADMISSIONS = query + "SELECT * FROM K_ICU_ADMISSIONS"
K_ICU_ADMISSIONS = pd.read_sql(query_K_ICU_ADMISSIONS, engine)

In [413]:
count_id(K_ICU_ADMISSIONS)

Unique patients: 6454 | Unique SIR admits: 7673 | Unique PAR admits: Column missing


In [414]:
query_K_ICU_ADMISSIONS_MATCHED_WITH_PAR = query + "SELECT * FROM K_ICU_ADMISSIONS_MATCHED_WITH_PAR"
K_ICU_ADMISSIONS_MATCHED_WITH_PAR = pd.read_sql(query_K_ICU_ADMISSIONS_MATCHED_WITH_PAR, engine)

In [415]:
count_id(K_ICU_ADMISSIONS_MATCHED_WITH_PAR)

Unique patients: 5649 | Unique SIR admits: 6498 | Unique PAR admits: 6621


In [416]:
query_K_ICU_ADMISSIONS_MATCHED_WITH_PAR_WITH_DX = query + "SELECT * FROM K_ICU_ADMISSIONS_MATCHED_WITH_PAR_WITH_DX"
K_ICU_ADMISSIONS_MATCHED_WITH_PAR_WITH_DX = pd.read_sql(query_K_ICU_ADMISSIONS_MATCHED_WITH_PAR_WITH_DX, engine)

In [417]:
count_id(K_ICU_ADMISSIONS_MATCHED_WITH_PAR_WITH_DX)

Unique patients: 5649 | Unique SIR admits: 6498 | Unique PAR admits: 6621


In [418]:
K_ICU_ADMISSIONS_MATCHED_WITH_PAR_WITH_DX

Unnamed: 0,VtfId_LopNr,LopNr,InskrTidPunkt,UtskrTidPunkt,LopNr:1,indexdatum1,indexdatum2,indexdatum3,indexdatum4,indexdatum5,...,OPD28,OPD29,OPD30,LK,slutrapporterad,HADM_ID,HADM_ID:1,LopNr:2,Diagnos:1,DX_GROUP
0,175619.0,33177.0,1606887420,1607366520,33177.0,18598,18609.0,,,,...,,,,0182,,183658,183658,33177.0,S3280 S8290 S831 S2720 S2240 C900 I489 I109 N1...,OTHER
1,175620.0,33177.0,1607879820,1609839480,33177.0,18598,18609.0,,,,...,,,,0182,,183660,183660,33177.0,J969 J958A C900 I109 N189 I489 A410 Z930,OTHER
2,175617.0,21183.0,1609025100,1609623000,21183.0,18622,,,,,...,,,,0180,,117964,117964,21183.0,I608 S8260 I639 J9690 G819 R470 R139,ASAH
3,176206.0,33694.0,1610649900,1612279200,33694.0,18641,18641.0,,,,...,,,,0180,,186729,186729,33694.0,S065 S020 F102A Z930 R139 Z720A,TBI
4,176749.0,14417.0,1610759340,1611749700,14417.0,18643,18659.0,,,,...,,,,0180,,80792,80792,14417.0,U071 J128 R651 J809B I109 I693 N179 N183 G728 ...,OTHER
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7300,145337.0,12526.0,1475668260,1477704540,12526.0,17078,17079.0,,,,...,,,,0136,,70328,70328,12526.0,J809X J172 B599 M332 J969 E119 E039 I109 G473 ...,OTHER
7301,144960.0,12526.0,1475607660,1475638200,12526.0,17078,17079.0,,,,...,,,,0136,,70328,70328,12526.0,J809X J172 B599 M332 J969 E119 E039 I109 G473 ...,OTHER
7302,146669.0,14255.0,1482789960,1483287600,14255.0,17161,17161.0,,,,...,,,,0123,,79829,79829,14255.0,G001 J014 Z721 F319 F419,ABM
7303,146638.0,14255.0,1482769200,1482789960,14255.0,17161,17161.0,,,,...,,,,0123,,79829,79829,14255.0,G001 J014 Z721 F319 F419,ABM


In [419]:
query_DESCRIPTIVE = query + "SELECT * FROM DESCRIPTIVE "
DESCRIPTIVE  = pd.read_sql(query_DESCRIPTIVE, engine)

In [420]:
count_id(DESCRIPTIVE)

Unique patients: 5649 | Unique SIR admits: 6498 | Unique PAR admits: 6621


In [421]:
result = DESCRIPTIVE.groupby('DX_GROUP', dropna=False).agg({'DX_GROUP': 'size', 'age': 'mean', 'sex_female': 'mean', 'SAPS_GCS': 'median', 'd30': 'mean', 'any_AMV': 'mean', 'MVO': lambda x: x.mode().iloc[0] if not x.mode().empty else None})
result = result.rename(columns={'DX_GROUP': 'count'})
result

Unnamed: 0_level_0,count,age,sex_female,SAPS_GCS,d30,any_AMV,MVO
DX_GROUP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ABM,245,55.64898,0.493878,12.0,0.159184,0.771429,121
AIS,548,62.244526,0.304745,10.0,0.251825,0.843066,221
ASAH,1398,57.694564,0.639485,14.0,0.121602,0.61588,331
AVM,45,47.422222,0.533333,15.0,0.044444,0.533333,331
CFX,300,60.833333,0.26,15.0,0.106667,0.706667,331
CVT,23,43.956522,0.434783,11.0,0.173913,0.521739,221
ENC,44,46.659091,0.431818,10.0,0.068182,0.795455,121
HC,62,56.306452,0.580645,11.0,0.080645,0.822581,331
ICH,743,58.663526,0.401077,8.0,0.309556,0.846568,331
OTHER,2092,55.221797,0.339388,13.0,0.186902,0.768642,301


In [422]:
DESCRIPTIVE[DESCRIPTIVE.DX_GROUP != "OTHER"].VtfId_LopNr.nunique()

4748

In [423]:
K_ICU_ADMISSIONS_MATCHED_WITH_PAR_WITH_DX.columns

Index(['VtfId_LopNr', 'LopNr', 'InskrTidPunkt', 'UtskrTidPunkt', 'LopNr:1',
       'indexdatum1', 'indexdatum2', 'indexdatum3', 'indexdatum4',
       'indexdatum5', 'indexdatum6', 'indexdatum7', 'indexdatum8',
       'indexdatum9', 'indexdatum10', 'indexdatum11', 'indexdatum12',
       'indexdatum13', 'indexdatum14', 'indexdatum15', 'indexdatum16',
       'indexdatum17', 'indexdatum18', 'indexdatum19', 'indexdatum20',
       'indexdatum21', 'indexdatum22', 'indexdatum23', 'indexdatum24',
       'indexdatum25', 'indexdatum26', 'AR', 'KON', 'ALDER', 'LKF', 'SJUKHUS',
       'MVO', 'INDATUMA', 'UTDATUMA', 'INDATUM', 'UTDATUM', 'VTID', 'UTSATT',
       'PVARD', 'HDIA', 'DIAGNOS', 'EKOD1', 'OP', 'EKOD2', 'OPD1', 'OPD2',
       'OPD3', 'OPD4', 'OPD5', 'OPD6', 'OPD7', 'OPD8', 'OPD9', 'OPD10',
       'OPD11', 'OPD12', 'NATION', 'EKOD3', 'EKOD4', 'EKOD5', 'OPD13', 'OPD14',
       'OPD15', 'OPD16', 'OPD17', 'OPD18', 'OPD19', 'OPD20', 'OPD21', 'OPD22',
       'OPD23', 'OPD24', 'OPD25', 'OPD26', '

In [424]:
small_d = K_ICU_ADMISSIONS_MATCHED_WITH_PAR_WITH_DX[['VtfId_LopNr','InskrTidPunkt', 'UtskrTidPunkt', 'HADM_ID', 'MVO', 'LopNr', 'INDATUM', 'UTDATUM', 'DX_GROUP', 'DIAGNOS']]
small_d['INDATUM'] = pd.to_datetime(small_d['INDATUM'], unit='D')
small_d['UTDATUM'] = pd.to_datetime(small_d['UTDATUM'], unit='D')
small_d['InskrTidPunkt'] = pd.to_datetime(small_d['InskrTidPunkt'], unit='s').dt.normalize()
small_d['UtskrTidPunkt'] = pd.to_datetime(small_d['UtskrTidPunkt'], unit='s').dt.normalize()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  small_d['INDATUM'] = pd.to_datetime(small_d['INDATUM'], unit='D')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  small_d['UTDATUM'] = pd.to_datetime(small_d['UTDATUM'], unit='D')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  small_d['InskrTidPunkt'] = pd.to_datetime(small_d['InskrTidPunkt'], unit=

In [425]:
filtered_df = small_d[small_d.groupby('VtfId_LopNr')['VtfId_LopNr'].transform('size') > 1]

In [426]:
filtered_df['timediff_to_sir'] = filtered_df['InskrTidPunkt'] - filtered_df['INDATUM']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['timediff_to_sir'] = filtered_df['InskrTidPunkt'] - filtered_df['INDATUM']


In [430]:
check_vtf = rng.choice(filtered_df['VtfId_LopNr'], 1)
filtered_df.query("VtfId_LopNr == check_vtf")

Unnamed: 0,VtfId_LopNr,InskrTidPunkt,UtskrTidPunkt,HADM_ID,MVO,LopNr,INDATUM,UTDATUM,DX_GROUP,DIAGNOS,timediff_to_sir
7155,112100.0,2011-11-26,2011-12-01,231545,221,41651.0,2011-11-24,2011-12-02,ICH,I613 I109 E119,2 days
7156,112100.0,2011-11-26,2011-12-01,231544,331,41651.0,2011-11-20,2011-11-24,ICH,I613 I109 E669,6 days
7167,159834.0,2018-11-05,2018-11-07,292196,221,52629.0,2018-11-07,2018-11-13,ICH,I619 I675,-2 days
7168,159834.0,2018-11-05,2018-11-07,292195,331,52629.0,2018-11-05,2018-11-07,ICH,I619 I675,0 days
7174,159362.0,2018-10-02,2018-10-07,332763,107,59783.0,2018-09-23,2018-09-24,OTHER,I959 N185 Z936,9 days
7175,159362.0,2018-10-02,2018-10-07,332764,151,59783.0,2018-09-24,2018-11-12,OTHER,L899 B965 L899E N185 G821 Z936 E639 D509 E871B...,8 days
7189,159373.0,2018-10-16,2018-10-17,222823,107,40155.0,2018-10-16,2018-10-16,OTHER,T814,0 days
7190,159373.0,2018-10-16,2018-10-17,222822,304,40155.0,2018-10-13,2018-10-15,OTHER,I652 I509 I252 I109,3 days
7191,159373.0,2018-10-16,2018-10-17,222824,304,40155.0,2018-10-17,2018-10-22,OTHER,T814 I652 I509 I252 I109 Z955,-1 days
7192,159373.0,2018-10-16,2018-10-17,222821,221,40155.0,2018-10-12,2018-10-13,OTHER,I652 I109 I509 I252,4 days
