<a href="https://colab.research.google.com/github/kayserim/prj_id/blob/main/generate_features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')
%cd '/content/drive/MyDrive/cse6250_proj' 

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/cse6250_proj


In [2]:
import pandas as pd
import numpy as np
from datetime import datetime,timedelta
from collections import defaultdict, Counter
import dask
import math
import dask.dataframe as dd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from dateutil.relativedelta import relativedelta
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

HOURS_IN_A_DAY = 24
HOURS_LIMIT = 48

path = './data/all/'
#path = './data/demo/'

def fill_missing_values(current_value, map_value, default_value):
  if pd.notna(current_value):
    return current_value
  if pd.notna(map_value):
    return map_value
  return default_value

In [3]:
def regress_features(df: pd.DataFrame, 
                    x_val: str,
                    feature_list: list) -> pd.DataFrame:
  '''This function regresses a features against a common predictor.

  Args:
    data: Pandas dataframe to regress
    x_val: String value of X value to use as predictor.
    feature_list: The features to regress.

  Returns:
    feature_dict: Dictionary of features storing feature mapped to a tuple.
  '''
  icu_map = dict()
  feature_dict = dict()
  data = df.copy()
  icu_id_list = data['ICUSTAY_ID'].unique()
  for icu_id in icu_id_list:
    data_temp = data[data['ICUSTAY_ID'] == icu_id]
    temp_list = []
    for f in feature_list:
      x = np.array(data_temp[x_val]).reshape(-1, 1)
      y = np.array(data_temp[f])
      reg = LinearRegression().fit(x, y)
      feature_dict[icu_id, f] = (reg.coef_.item(), reg.intercept_)

  feature_df = pd.DataFrame.from_dict(feature_dict, orient='index').reset_index()
  feature_df[['ICUSTAY_ID', 'FEATURE']]= pd.DataFrame(feature_df['index'].to_list())
  feature_df = feature_df.rename(columns = {0:'RATE', 1:'BIAS'}).drop(columns = ['index'])

  return feature_df
    

In [4]:
def diagnoses_map(col:any) -> str:
  '''A mapping of ICD code to diagnosis.

  Args: 
    col = the column to input, any value
  Returns:
    result = string corresponding to the ICD9 code
  '''
  if str(col).startswith(('E', 'V')):
    result = 'DIAG_EXTERNAL'
  elif math.isnan(float(col)):
    result = 'NONE'
  else:
    val = int(col)

    if val < 140:
      result = 'DIAG_INFECTIOUS'
    elif val < 240:
      result = 'DIAG_NEOPLASMS'
    elif val < 280:
      result = 'DIAG_IMMUNITY'
    elif val < 290:
      result = 'DIAG_BLOOD'
    elif val < 320:
      result = 'DIAG_MENTAL'
    elif val < 390:
      result = 'DIAG_NERVOUS_SYS'
    elif val < 460:
      result = 'DIAG_CIRCULATORY_SYS'
    elif val < 520:
      result = 'DIAG_RESP_SYS'
    elif val < 580:
      result = 'DIAG_DIGESTIVE_SYS'
    elif val < 630:
      result = 'DIAG_GENITOURINARY'
    elif val < 680:
      result = 'DIAG_PREGNANCY'
    elif val < 710:
      result = 'DIAG_SKIN'
    elif val < 740:
      result = 'DIAG_MUSC'
    elif val < 760:
      result = 'DIAG_CONGENITAL'
    elif val < 780:
      result = 'DIAG_PERINATAL'
    elif val < 800:
      result = 'DIAG_ILL_DEFINED'
    elif val < 1000:
      result = 'DIAG_INJURY'
    else:
      result = 'OTHER'
  return result

In [5]:
def create_diag_dataset(diagnoses_icd: pd.DataFrame) -> pd.DataFrame:
  '''This function takes in the diagnoses data and returns columns that can be
  input into the model.

  Params:
    diagnoses_icd: the diagnosis dataframe
  Returns:
    df_diag: dignosis vals to columns
  '''
  diagnoses_icd['GROUP'] = diagnoses_icd["ICD9_CODE"].map(diagnoses_map)
  df_diag = diagnoses_icd[(diagnoses_icd['GROUP'] != 'OTHER') & (diagnoses_icd['GROUP'] != 'NONE')]
  df_diag['value'] = 1
  df_diag = df_diag.pivot_table(values = 'value', index = ['SUBJECT_ID', 'HADM_ID'], columns = 'GROUP').reset_index()
  df_diag.fillna(0, inplace = True)
  return df_diag

In [6]:
def create_demographics_dataset(admissions, patients):
  demographics_merged = admissions.merge(patients,
                                       on = "SUBJECT_ID")[[
                                           "SUBJECT_ID",
                                           "DOB",
                                           "ADMITTIME",
                                           "ETHNICITY",
                                           "GENDER"]]

  demographics_merged["ADMITTIME"] = pd.to_datetime(demographics_merged["ADMITTIME"]).dt.date
  demographics_merged["DOB"] = pd.to_datetime(demographics_merged["DOB"]).dt.date
  demographics_merged['AGE'] = demographics_merged.apply(lambda e: (e['ADMITTIME'] - e['DOB']).days/365, axis=1)

  demographics = demographics_merged.groupby(["SUBJECT_ID",
                                                    "DOB",
                                                    "ADMITTIME",
                                                    "ETHNICITY",
                                                    "GENDER"])["AGE"].min().reset_index()

  demographics["value"] = 1
  demographics["GENDER"] = np.where(demographics["GENDER"] == "M", 1, 0)
  demographics = demographics.pivot_table(values = "value",
                                          index = ['SUBJECT_ID', 'AGE', 'GENDER'],
                                          columns = ["ETHNICITY"]).reset_index()      

  demographics.fillna(0, inplace = True)        

  return demographics

## Import Datasets

In [7]:
file = 'CHARTEVENTS_LITE.csv'
chartevents = dd.read_csv(path+file)
#chartevents['CHARTTIME'] = pd.to_datetime(chartevents['CHARTTIME'])

In [8]:
file = 'ICUSTAYS_LITE.csv'
icustays = pd.read_csv(path+file)
icustays['OUTTIME'] = pd.to_datetime(icustays['OUTTIME'])

In [9]:
file = 'DIAGNOSES_ICD.csv'
diagnoses_icd = pd.read_csv(path+file)

In [10]:
file = 'D_ICD_DIAGNOSES.csv'
d_icd_diagnoses = pd.read_csv(path+file)

In [11]:
file = 'ADMISSIONS.csv'
admissions = pd.read_csv(path+file)

In [12]:
file = 'PATIENTS.csv'
patients = pd.read_csv(path+file)

# Process Data

In [13]:
diagnosis = create_diag_dataset(diagnoses_icd)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_diag['value'] = 1


In [14]:
demographics = create_demographics_dataset(admissions, patients)

In [15]:
chartevents_merged = chartevents.merge(dd.from_pandas(icustays, npartitions=1), on='ICUSTAY_ID', how='inner').compute().dropna(subset=['ICUSTAY_ID'])
chartevents_merged['CHARTTIME'] = pd.to_datetime(chartevents_merged['CHARTTIME'])

In [16]:
# only one HADMID per user
max(chartevents_merged.groupby("ICUSTAY_ID")["HADM_ID"].nunique())

1

In [None]:
base_df=pd.DataFrame([])
base_df['ICUSTAY_ID'] = icustays.ICUSTAY_ID
base_df['SUBJECT_ID'] = icustays.SUBJECT_ID
base_df['HADM_ID'] = icustays.HADM_ID # if we want to add hadm id
base_df['HOUR'] = 1
base_df_extended = pd.concat([pd.DataFrame({'ICUSTAY_ID': row.ICUSTAY_ID,
                                            'HADM_ID' : row.HADM_ID,
                                            'SUBJECT_ID' : row.SUBJECT_ID,
                                            'HOUR': pd.RangeIndex(1,HOURS_LIMIT+1)}) for i, row in base_df.iterrows()], ignore_index=True)

data_all = base_df_extended.copy()
list_of_features = [
    {'ID':223761, 'DESC':'HRLY_TEMP', 'CESTAT':True}, 
    {'ID':220050, 'DESC':'HRLY_BPRS_SYS', 'CESTAT':True}, 
    {'ID':220051, 'DESC':'HRLY_BPRS_DIA', 'CESTAT':True}, 
    {'ID':220045, 'DESC':'HRLY_HRT_RATE', 'CESTAT':True}, 
    {'ID':225664, 'DESC':'HRLY_GLCS', 'CESTAT':True}, 
    {'ID':220210, 'DESC':'HRLY_RSP_RATE', 'CESTAT':True}, 
    {'ID':223830, 'DESC':'HRLY_PH', 'CESTAT':True}]#later will use HRLY_ prefix when giving datasets to models
# SKIPPING FOLLOWING:
# TOO FEW MEASUREMENTS FOR: capillary refill rate, Cholesterol
# TOO MANY POSSIBILITIES: URINE
# OPEN ENDED TEXT DATA: Glascow coma eye, Verbal Response, motor response parameters   
    
for elem in list_of_features:
  ID = elem['ID']
  DESC = elem['DESC']
  data = chartevents_merged.loc[chartevents_merged.ITEMID==ID]
  data['HOUR'] = np.ceil((data['OUTTIME']-data['CHARTTIME'])/pd.Timedelta(1,'h'))
  data['HOUR'] = data.HOUR.astype('int64')
  data = data.loc[data.HOUR <= HOURS_LIMIT]#last 48 hours only
  #SHOWS THAT MEASUREMENTS ARE NOT UNIFORMLY TAKEN (SO MISSING DATA EXPECTED)
  #data.HOUR.plot.hist(bins=HOURS_LIMIT)
  #WHEN CNT>1 SHOWS THAT MULTIPLE DATA POINTS EXISTS PER HOUR
  #print(data.groupby(['ICUSTAY_ID', 'HOUR']).size().reset_index(name='CNT').sort_values(by='CNT').groupby(['ICUSTAY_ID']).last().reset_index().head(20))

  #TODO AVERAGING WON'T WORK FOR CATEGORICAL DATA
  data_avg = data.groupby(['ICUSTAY_ID', 'HOUR', 'HADM_ID'])['VALUENUM'].mean().reset_index()
  ALL_AVG = data_avg.VALUENUM.mean() #tobe used if no data exists for the icu stay
  icustay_most_recent_data = data_avg.sort_values(by='HOUR').groupby('ICUSTAY_ID').first().reset_index()[['ICUSTAY_ID', 'VALUENUM']] 
  icustay_most_recent_data_map = defaultdict(lambda:np.NaN, dict(zip(icustay_most_recent_data.ICUSTAY_ID, icustay_most_recent_data.VALUENUM)))#tobe used for missing values i.e. use most recent measurement

  #filling missing values
  data_extended = base_df_extended.merge(data_avg, on=['ICUSTAY_ID', 'HOUR'], how='left')
  data_extended['VALUENUM'] = data_extended.apply(lambda row: fill_missing_values(row['VALUENUM'], icustay_most_recent_data_map[row['ICUSTAY_ID']], ALL_AVG), axis=1)
  data_all[DESC] = data_extended['VALUENUM'] #assuming order is maintained 

regress_ft_list = [feature['DESC'] for feature in list_of_features if feature['CESTAT']]
regressed_features = regress_features(data_all, 'HOUR', regress_ft_list)

regression_df = pd.DataFrame()
for ft in regress_ft_list:
  data_temp = regressed_features[regressed_features['FEATURE'] == ft][['ICUSTAY_ID', 'RATE', 'BIAS']]
  data_temp = data_temp.rename(columns = {'RATE': 'CESTAT_'+ft+'_RATE', 'BIAS': 'CESTAT_'+ft+'_BIAS'})#later will use CESTAT prefix when giving datasets to models
  regression_df = data_temp if regression_df.empty else regression_df.merge(data_temp, on = ['ICUSTAY_ID'])
  

In [18]:
df_final = data_all.pivot(index='ICUSTAY_ID', columns=['HOUR']).reset_index()
reordered_columns = [(desc,hour) for hour in range(1,1+HOURS_LIMIT) for desc in [feature['DESC'] for feature in list_of_features]]
df_final = df_final.reindex([('ICUSTAY_ID', '')]+reordered_columns, axis=1)
df_final.columns = [str(col[0])+str(col[1]) for col in df_final.columns.values] #converting tuples to string for better display as well as making ICUSTAY_ID column name simpler
df_final = df_final.merge(data_all[["ICUSTAY_ID", "HADM_ID", "SUBJECT_ID"]].drop_duplicates(), on="ICUSTAY_ID")
df_final = df_final.merge(regression_df, on='ICUSTAY_ID', how='inner')
print(df_final.shape)
df_final = df_final.merge(demographics, on='SUBJECT_ID', how='inner')
print(df_final.shape)
df_final = df_final.merge(diagnosis, on='HADM_ID', how='inner')
print(df_final.shape)
df_final = df_final.merge(icustays[['ICUSTAY_ID', 'POSITIVE']], on='ICUSTAY_ID', how='inner')
#df_final.head(5)

(49128, 353)
(101763, 396)
(83873, 413)


## Normalization

In [19]:
# Min Max Scaler
scaler = MinMaxScaler()
df_final = df_final.drop(['HADM_ID', 'SUBJECT_ID_x','SUBJECT_ID_y'], axis=1)
df_final = pd.DataFrame(scaler.fit_transform(df_final.drop(["ICUSTAY_ID"], axis = 1)),
                        columns = df_final.drop(["ICUSTAY_ID"], axis = 1).columns)

## Create Final Datasets

In [20]:
X=df_final.iloc[:,:-1]
y=df_final.iloc[:,-1:]

# Smote Sampling
sm = SMOTE(random_state = 42)
X_res, y_res = sm.fit_resample(X, y)

X_train, X_temp, y_train, y_temp = train_test_split(X_res, y_res, test_size=0.2, random_state=0, stratify=y_res)
X_test, X_validation, y_test, y_validation = train_test_split(X_temp, y_temp, test_size=0.5, random_state=0, stratify=y_temp)


In [22]:
pd.concat([X_train, y_train], axis=1).to_csv('./data/all/XY_train_LITE.csv', index=False)
pd.concat([X_test, y_test], axis=1).to_csv('./data/all/XY_test_LITE.csv', index=False)
pd.concat([X_validation, y_validation], axis=1).to_csv('./data/all/XY_validation_LITE.csv', index=False)