In [2]:
import pandas as pd
import numpy as np


### Goal (for now):
Given a single row, representing a single service, predict whether or not that patient will end up requiring inpatient services in the next 18 months (540 days)

Constructing the outcome will require a sliding window over the entire dataframe

If svc_period is > 0, then the patient has attended inpatient services

In this case, any service row for that patient that has a svc_start > (svc_start of inpatient - 18*30 days) will be flagged as positive
Any rows indicating inpatient procedures can be removed from the dataset (in this case since is is directing the outcome)

Would be better to find architectures that make use of multiple rows, and can find patterns in pre-inpatient visits across different patients who end up eventually getting admitted

In [3]:
data = pd.read_csv('data/anon_claims_data_traindev_15to22.csv')

In [4]:
data.head(n=10)

Unnamed: 0.1,Unnamed: 0,FNAME,LNAME,SERVICE_RENDERED_ROW_ID,MCI_UNIQ_ID,DOB,RACE,GENDER,AGE,AGE_GROUP,...,DIAGNOSIS_COMMON_SORT,DIAGNOSIS_CAT,SVC1ST,SERVICE_SEQ,svc_start,svc_end,svc_period,set,mci_anon,svc_id_anon
0,0,,,,,,White,Female,62,45-64,...,Bipolar D/O,Bipolar Disorder,,,2021-01-21,2021-01-21,0 days 00:00:00.000000000,dev,0,0
1,12,,,,,,Black,Male,56,45-64,...,Org Mental D/O,Organic Mental Disorders,,,2019-08-02,2019-08-02,0 days 00:00:00.000000000,train,2,12
2,13,,,,,,Black,Male,56,45-64,...,Org Mental D/O,Organic Mental Disorders,,,2019-09-16,2019-09-16,0 days 00:00:00.000000000,train,2,13
3,14,,,,,,Black,Male,56,45-64,...,Org Mental D/O,Organic Mental Disorders,,,2019-11-08,2019-11-08,0 days 00:00:00.000000000,train,2,14
4,15,,,,,,Black,Male,56,45-64,...,Org Mental D/O,Organic Mental Disorders,,,2019-12-13,2019-12-13,0 days 00:00:00.000000000,train,2,15
5,16,,,,,,Black,Male,57,45-64,...,Org Mental D/O,Organic Mental Disorders,,,2020-01-13,2020-01-13,0 days 00:00:00.000000000,train,2,16
6,17,,,,,,Black,Male,57,45-64,...,Org Mental D/O,Organic Mental Disorders,,,2020-02-14,2020-02-14,0 days 00:00:00.000000000,train,2,17
7,18,,,,,,Black,Male,57,45-64,...,Org Mental D/O,Organic Mental Disorders,,,2020-03-11,2020-03-11,0 days 00:00:00.000000000,train,2,18
8,19,,,,,,Black,Male,54,45-64,...,Depressive D/O,Major Depression,,,2017-05-22,2017-05-22,0 days 00:00:00.000000000,train,2,19
9,20,,,,,,Black,Male,54,45-64,...,Org Mental D/O,Organic Mental Disorders,,,2017-09-18,2017-09-18,0 days 00:00:00.000000000,train,2,20


In [5]:
# Drop first column (old index)
data.drop(columns=['Unnamed: 0'])

# Change svc start and end to datetime
data['svc_start'] = pd.to_datetime(data['svc_start'])
data['svc_end'] = pd.to_datetime(data['svc_end'])

# Change svc_period to a timedelta of days
data['svc_period'] = data['svc_period'].apply(lambda x: pd.Timedelta(days = int(x.split(' ')[0])))

# For efficiency purposes, we change svc_period to an interger representing days instead of an actual pd.Timedelta object
# If needed, this will be used to create an actual Timedelta object to perform operations on other columns, this is not needed for most rows


In [6]:
def drop_null_cols(df: pd.DataFrame) -> pd.DataFrame:

    dropcols = [col for col in df.columns if df[col].notna().sum() == 0]

    return df.drop(columns=dropcols)


data = drop_null_cols(data)


In [7]:
test = data.loc[data['mci_anon']==2]

test['svc_period'].max()

Timedelta('0 days 00:00:00')

User service category: 
inpatient mental health
pshycheatric inpatien hospital
IP-MH

if they have any row in the services table that has one of those categories then that row is considered inpatient services

service category also has crisis: MH-CRISIS-OUTCOME variable, can be second label that can be predicted on

In [8]:
def construct_outcome(
    df: pd.DataFrame, window: pd.Timedelta = pd.Timedelta(days=540)
) -> pd.DataFrame:
    """ """

    # Initialize outcome column
    df["outcome"] = 0

    null_period = pd.Timedelta(days=0)

    for idx, mci_group in df.groupby(by="mci_anon"):
        if mci_group["svc_period"].max() != null_period:
            crit_period_cutoff = (
                mci_group["svc_start"].loc[mci_group["svc_period"] != null_period].min()
                - window
            )

            df["outcome"].iloc[(mci_group["svc_start"] >= crit_period_cutoff).index] = 1

    return df


In [9]:
data = construct_outcome(data)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['outcome'].iloc[(mci_group['svc_start']>=crit_period_cutoff).index] = 1


In [10]:
data.loc[data['outcome']==1]

Unnamed: 0.1,Unnamed: 0,RACE,GENDER,AGE,AGE_GROUP,SVC_START_DT,SVC_END_DT,FISCAL_YEAR,FISCAL_QUARTER,PAYER,...,DIAGNOSIS_DESC,DIAGNOSIS_COMMON_SORT,DIAGNOSIS_CAT,svc_start,svc_end,svc_period,set,mci_anon,svc_id_anon,outcome
38,53,Black,Female,29,21-44,2021-06-01 00:00:00.000,2021-06-30 00:00:00.000,FY20/21,Q4,CCBH,...,"Cannabis dependence, uncomplicated",Cannabis,Cannabis Use Disorder,2021-06-01,2021-06-30,29 days,dev,6,53,1
39,54,Black,Female,29,21-44,2021-10-01 00:00:00.000,2021-10-31 00:00:00.000,FY21/22,Q2,CCBH,...,"Cannabis abuse, uncomplicated",Cannabis,Cannabis Use Disorder,2021-10-01,2021-10-31,30 days,dev,6,54,1
108,229,Black,Male,56,45-64,2020-07-03 00:00:00.000,2020-07-03 00:00:00.000,FY20/21,Q1,CCBH,...,"Bipolar disorder, unspecified",Bipolar D/O,Bipolar Disorder,2020-07-03,2020-07-03,0 days,dev,23,229,1
109,230,Black,Male,56,45-64,2020-07-08 00:00:00.000,2020-07-08 00:00:00.000,FY20/21,Q1,CCBH,...,"Bipolar disorder, unspecified",Bipolar D/O,Bipolar Disorder,2020-07-08,2020-07-08,0 days,dev,23,230,1
110,231,Black,Male,56,45-64,2020-07-09 00:00:00.000,2020-07-09 00:00:00.000,FY20/21,Q1,CCBH,...,"Bipolar disorder, unspecified",Bipolar D/O,Bipolar Disorder,2020-07-09,2020-07-09,0 days,dev,23,231,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1669708,2909956,White,Female,53,45-64,2015-10-07 00:00:00.000,2015-10-07 00:00:00.000,FY15/16,Q2,CCBH,...,"Schizoaffective disorder, bipolar type",Schizophrenia,Schizoaffective Disorders,2015-10-07,2015-10-07,0 days,train,70807,2909956,1
1669710,2909960,White,Female,54,45-64,2016-05-09 00:00:00.000,2016-05-09 00:00:00.000,FY15/16,Q4,COUNTY,...,"Schizoaffective disorder, bipolar type",Schizophrenia,Schizoaffective Disorders,2016-05-09,2016-05-09,0 days,train,70807,2909960,1
1669712,2909978,White,Female,52,45-64,2017-05-30 00:00:00.000,2017-05-30 00:00:00.000,FY16/17,Q4,COUNTY,...,"Bipolar disord, crnt episode manic w/o psych f...",Bipolar D/O,Bipolar Disorder,2017-05-30,2017-05-30,0 days,train,58587,2909978,1
1669713,2909981,White,Female,21,21-44,2022-05-11 00:00:00.000,2022-05-11 00:00:00.000,FY21/22,Q4,CCBH,...,Bipolar II disorder,Bipolar D/O,Bipolar Disorder,2022-05-11,2022-05-11,0 days,train,80968,2909981,1


In [14]:
print(data['mci_anon'].loc[data['outcome']==1].unique())

9286


In [12]:
data.loc[data['mci_anon']==83115]

Unnamed: 0.1,Unnamed: 0,RACE,GENDER,AGE,AGE_GROUP,SVC_START_DT,SVC_END_DT,FISCAL_YEAR,FISCAL_QUARTER,PAYER,...,DIAGNOSIS_DESC,DIAGNOSIS_COMMON_SORT,DIAGNOSIS_CAT,svc_start,svc_end,svc_period,set,mci_anon,svc_id_anon,outcome
589871,870113,White,Female,36,21-44,2019-06-12 00:00:00.000,2019-06-12 00:00:00.000,FY18/19,Q4,CCBH,...,"Bipolar disord, crnt epsd depress, sev, w/o ps...",Bipolar D/O,Bipolar Disorder,2019-06-12,2019-06-12,0 days,dev,83115,870113,1
693900,1117023,White,Female,33,21-44,2016-02-24 00:00:00.000,2016-02-29 00:00:00.000,FY15/16,Q3,CCBH,...,"Major depressive disorder, single episode, mod...",Maj Depression,Major Depression,2016-02-24,2016-02-29,5 days,dev,83115,1117023,1
717698,1173280,White,Female,36,21-44,2019-07-10 00:00:00.000,2019-07-10 00:00:00.000,FY19/20,Q1,CCBH,...,"Bipolar disord, crnt episode manic w/o psych f...",Bipolar D/O,Bipolar Disorder,2019-07-10,2019-07-10,0 days,dev,83115,1173280,1
769223,1296304,White,Female,37,21-44,2020-11-01 00:00:00.000,2020-11-30 00:00:00.000,FY20/21,Q2,CCBH,...,"Bipolar disorder, current episode depressed, m...",Bipolar D/O,Bipolar Disorder,2020-11-01,2020-11-30,29 days,dev,83115,1296304,1
792888,1352318,White,Female,33,21-44,2016-08-22 00:00:00.000,2016-08-22 00:00:00.000,FY16/17,Q1,CCBH,...,"Bipolar disord, crnt epsd depress, sev, w/o ps...",Bipolar D/O,Bipolar Disorder,2016-08-22,2016-08-22,0 days,dev,83115,1352318,1
792892,1352331,White,Female,38,21-44,2021-04-01 00:00:00.000,2021-04-30 00:00:00.000,FY20/21,Q4,CCBH,...,"Bipolar disorder, current episode depressed, m...",Bipolar D/O,Bipolar Disorder,2021-04-01,2021-04-30,29 days,dev,83115,1352331,1
807008,1385729,White,Female,37,21-44,2020-06-01 00:00:00.000,2020-06-30 00:00:00.000,FY19/20,Q4,CCBH,...,"Bipolar disorder, current episode depressed, m...",Bipolar D/O,Bipolar Disorder,2020-06-01,2020-06-30,29 days,dev,83115,1385729,1
1401151,2270210,White,Female,33,21-44,2016-06-01 00:00:00.000,2016-06-01 00:00:00.000,FY15/16,Q4,CCBH,...,"Bipolar disord, crnt epsd depress, sev, w/o ps...",Bipolar D/O,Bipolar Disorder,2016-06-01,2016-06-01,0 days,dev,83115,2270210,1
1424576,2326230,White,Female,34,21-44,2017-06-26 00:00:00.000,2017-06-26 00:00:00.000,FY16/17,Q4,CCBH,...,"Bipolar disord, crnt epsd depress, sev, w/o ps...",Bipolar D/O,Bipolar Disorder,2017-06-26,2017-06-26,0 days,dev,83115,2326230,1
1429323,2337483,White,Female,34,21-44,2017-05-03 00:00:00.000,2017-05-03 00:00:00.000,FY16/17,Q4,CCBH,...,Bipolar II disorder,Bipolar D/O,Bipolar Disorder,2017-05-03,2017-05-03,0 days,dev,83115,2337483,1
