In [1]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch

In [2]:
diabetes = pd.read_csv('../../diabetes_data/diabetic_data.csv')
diabetes.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [3]:
diabetes.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')

In [4]:
diabetes_dropped = diabetes[['age', 'gender', 'race', 'admission_type_id', 'diag_1', 'diag_2', 'diag_3', \
    'discharge_disposition_id', 'insulin', 'readmitted', 'time_in_hospital', 'A1Cresult', 'number_inpatient']]
diabetes_dropped.head()

Unnamed: 0,age,gender,race,admission_type_id,diag_1,diag_2,diag_3,discharge_disposition_id,insulin,readmitted,time_in_hospital,A1Cresult,number_inpatient
0,[0-10),Female,Caucasian,6,250.83,?,?,25,No,NO,1,,0
1,[10-20),Female,Caucasian,1,276.0,250.01,255,1,Up,>30,3,,0
2,[20-30),Female,AfricanAmerican,1,648.0,250,V27,1,No,NO,2,,1
3,[30-40),Male,Caucasian,1,8.0,250.43,403,1,Up,NO,2,,0
4,[40-50),Male,Caucasian,1,197.0,157,250,1,Steady,NO,1,,0


In [5]:
diabetes_dropped['age'] = diabetes_dropped['age'].apply(lambda x: x[1:-1])
diabetes_dropped['race'] = diabetes_dropped['race'].apply(lambda x: 'African American' if x == 'AfricanAmerican' else x)
diabetes_dropped.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  diabetes_dropped['age'] = diabetes_dropped['age'].apply(lambda x: x[1:-1])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  diabetes_dropped['race'] = diabetes_dropped['race'].apply(lambda x: 'African American' if x == 'AfricanAmerican' else x)


Unnamed: 0,age,gender,race,admission_type_id,diag_1,diag_2,diag_3,discharge_disposition_id,insulin,readmitted,time_in_hospital,A1Cresult,number_inpatient
0,0-10,Female,Caucasian,6,250.83,?,?,25,No,NO,1,,0
1,10-20,Female,Caucasian,1,276.0,250.01,255,1,Up,>30,3,,0
2,20-30,Female,African American,1,648.0,250,V27,1,No,NO,2,,1
3,30-40,Male,Caucasian,1,8.0,250.43,403,1,Up,NO,2,,0
4,40-50,Male,Caucasian,1,197.0,157,250,1,Steady,NO,1,,0


In [6]:
admission_type_map = {
    1: 'Emergency',
    2: 'Urgent',
    3: 'Elective',
    4: 'Newborn',
    5: 'Not Available',
    6: 'NULL',
    7: 'Trauma Center',
    8: 'Not Mapped'
}

diabetes_dropped['admission_type'] = diabetes_dropped['admission_type_id'].map(admission_type_map)

# replace nulls
diabetes_dropped['admission_type'] = diabetes_dropped['admission_type'].replace(['Not Available', 'NULL', 'Not Mapped'], 'Unknown')
diabetes_dropped.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  diabetes_dropped['admission_type'] = diabetes_dropped['admission_type_id'].map(admission_type_map)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  diabetes_dropped['admission_type'] = diabetes_dropped['admission_type'].replace(['Not Available', 'NULL', 'Not Mapped'], 'Unknown')


Unnamed: 0,age,gender,race,admission_type_id,diag_1,diag_2,diag_3,discharge_disposition_id,insulin,readmitted,time_in_hospital,A1Cresult,number_inpatient,admission_type
0,0-10,Female,Caucasian,6,250.83,?,?,25,No,NO,1,,0,Unknown
1,10-20,Female,Caucasian,1,276.0,250.01,255,1,Up,>30,3,,0,Emergency
2,20-30,Female,African American,1,648.0,250,V27,1,No,NO,2,,1,Emergency
3,30-40,Male,Caucasian,1,8.0,250.43,403,1,Up,NO,2,,0,Emergency
4,40-50,Male,Caucasian,1,197.0,157,250,1,Steady,NO,1,,0,Emergency


In [7]:
def map_icd9_to_category(code):
    try:
        code = float(code)
    except:
        return "Unknown"

    if 390 <= code <= 459 or code == 785:
        return "Circulatory"
    elif 460 <= code <= 519 or code == 786:
        return "Respiratory"
    elif 520 <= code <= 579 or code == 787:
        return "Digestive"
    elif 250.0 <= code <= 250.99:
        return "Diabetes"
    elif 800 <= code <= 999:
        return "Injury"
    elif 710 <= code <= 739:
        return "Musculoskeletal"
    elif 580 <= code <= 629 or code == 788:
        return "Genitourinary"
    elif 140 <= code <= 239:
        return "Neoplasms"
    elif 240 <= code <= 279:
        return "Endocrine/Metabolic"
    elif 280 <= code <= 289:
        return "Blood"
    elif 290 <= code <= 319:
        return "Mental Disorders"
    else:
        return "Other"
diabetes_dropped['diag_1'] = diabetes_dropped['diag_1'].apply(map_icd9_to_category)
diabetes_dropped['diag_2'] = diabetes_dropped['diag_2'].apply(map_icd9_to_category)
diabetes_dropped['diag_3'] = diabetes_dropped['diag_3'].apply(map_icd9_to_category)
diabetes_dropped.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  diabetes_dropped['diag_1'] = diabetes_dropped['diag_1'].apply(map_icd9_to_category)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  diabetes_dropped['diag_2'] = diabetes_dropped['diag_2'].apply(map_icd9_to_category)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  diabetes_dropped['diag_3'] = diabetes

Unnamed: 0,age,gender,race,admission_type_id,diag_1,diag_2,diag_3,discharge_disposition_id,insulin,readmitted,time_in_hospital,A1Cresult,number_inpatient,admission_type
0,0-10,Female,Caucasian,6,Diabetes,Unknown,Unknown,25,No,NO,1,,0,Unknown
1,10-20,Female,Caucasian,1,Endocrine/Metabolic,Diabetes,Endocrine/Metabolic,1,Up,>30,3,,0,Emergency
2,20-30,Female,African American,1,Other,Diabetes,Unknown,1,No,NO,2,,1,Emergency
3,30-40,Male,Caucasian,1,Other,Diabetes,Circulatory,1,Up,NO,2,,0,Emergency
4,40-50,Male,Caucasian,1,Neoplasms,Neoplasms,Diabetes,1,Steady,NO,1,,0,Emergency


In [8]:
discharge_status_dict = {
    1: "Discharged to home",
    2: "Discharged/transferred to another short term hospital",
    3: "Discharged/transferred to SNF",
    4: "Discharged/transferred to ICF",
    5: "Discharged/transferred to another type of inpatient care institution",
    6: "Discharged/transferred to home with home health service",
    11: "Expired",
    18: "NULL",
    22: "Discharged/transferred to another rehab fac including rehab units of a hospital.",
    25: "Not Mapped"
}

diabetes_dropped['discharge_status'] = diabetes_dropped['discharge_disposition_id'].map(discharge_status_dict)

# replace nulls
diabetes_dropped['discharge_status'] = diabetes_dropped['discharge_status'].replace(['NULL', 'Not Mapped'], 'Unknown')
diabetes_dropped['discharge_status'] = diabetes_dropped['discharge_status'].fillna('Other')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  diabetes_dropped['discharge_status'] = diabetes_dropped['discharge_disposition_id'].map(discharge_status_dict)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  diabetes_dropped['discharge_status'] = diabetes_dropped['discharge_status'].replace(['NULL', 'Not Mapped'], 'Unknown')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returnin

In [9]:
diabetes_dropped['insulin'].unique()

array(['No', 'Up', 'Steady', 'Down'], dtype=object)

In [10]:
pd.set_option('display.max_colwidth', None)

In [11]:
diabetes_dropped['diag_1'].unique()

array(['Diabetes', 'Endocrine/Metabolic', 'Other', 'Neoplasms',
       'Circulatory', 'Respiratory', 'Injury', 'Musculoskeletal',
       'Digestive', 'Unknown', 'Genitourinary', 'Mental Disorders',
       'Blood'], dtype=object)

In [12]:
def create_summary(row):
    # SENTENCE 1
    age = f"{row['age']}-year-old"
    if row['gender'] == 'Unknown/Invalid':
        gender = ''
    else:
        gender = f"{row['gender'].lower()}"
    if row['race'] == '?':
        race = ''
    else:
        race = f"{row['race']}"
    gender = f"{row['gender'].lower()}"
    # diagnoses
    if row['diag_1'] == 'Other':
        primary = "a condition not categorized under common diagnostic groups"
    else:
        primary = f"a {row['diag_1']} condition"
    # admission type
    if row['admission_type'] == 'Emergency':
        admission = 'through emergency'
    elif row['admission_type'] == 'Urgent':
        admission = 'urgently'
    elif row['admission_type'] == 'Elective':
        admission = 'electively'
    elif row['admission_type'] == 'Newborn':
        admission = 'as a newborn'
    elif row['admission_type'] == 'Trauma Center':
        admission = 'through the trauma center'
    else:
        admission = ''

    # SENTENCE 2
    if row['gender'] == 'Female':
        pronoun = 'She'
    else:
        pronoun = 'He'
    duration = row['time_in_hospital']

    # SENTENCE 3
    if not pd.isna(row['A1Cresult']):
        a1c = f"HbA1C {row['A1Cresult']}."
    else:
        a1c = ''

    # SENTENCE 4
    if row['insulin'] == 'Steady':
        insulin = 'No change in insulin.'
    elif row['insulin'] == 'Up':
        insulin = 'Insulin was increased.'
    elif row['insulin'] == 'Down':
        insulin = 'Insulin was decreased.'
    else: 
        insulin=''

    # SENTENCE 5
    if row['number_inpatient'] != 0:
        inpatient = f"{row['number_inpatient']} previous inpatient visits."
    else:
        inpatient = ''

    # SENTENCE 6
    if row['discharge_status'] != 'Unknown':
        discharge_status = f'{row['discharge_status']}.'
    else:
        discharge_status = ''

    # SENTENCE 7
    diagnoses = [row['diag_1'], row['diag_2'], row['diag_3']]
    diags = np.unique([d for d in diagnoses if pd.notna(d) and d not in ['Unknown', 'Other']])
    diag_str = f"Diagnoses: {', '.join(diags)}"
    
    summary = (
        f"A {age} {race} {gender} was admitted {admission} with {primary}. "
        f"{pronoun} stayed for {duration} days. "
        f"{a1c} "
        f"{insulin} "
        f"{inpatient} "
        f"{discharge_status} "
        f"{diag_str}"
    )
    return summary

diabetes_dropped['Summary'] = diabetes_dropped.apply(create_summary, axis=1)
diabetes_dropped['Summary'] = diabetes_dropped['Summary'].str.replace('  ', ' ')
diabetes_dropped[['Summary']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  diabetes_dropped['Summary'] = diabetes_dropped.apply(create_summary, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  diabetes_dropped['Summary'] = diabetes_dropped['Summary'].str.replace('  ', ' ')


Unnamed: 0,Summary
0,A 0-10-year-old Caucasian female was admitted with a Diabetes condition. She stayed for 1 days. Diagnoses: Diabetes
1,"A 10-20-year-old Caucasian female was admitted through emergency with a Endocrine/Metabolic condition. She stayed for 3 days. Insulin was increased. Discharged to home. Diagnoses: Diabetes, Endocrine/Metabolic"
2,A 20-30-year-old African American female was admitted through emergency with a condition not categorized under common diagnostic groups. She stayed for 2 days. 1 previous inpatient visits. Discharged to home. Diagnoses: Diabetes
3,"A 30-40-year-old Caucasian male was admitted through emergency with a condition not categorized under common diagnostic groups. He stayed for 2 days. Insulin was increased. Discharged to home. Diagnoses: Circulatory, Diabetes"
4,"A 40-50-year-old Caucasian male was admitted through emergency with a Neoplasms condition. He stayed for 1 days. No change in insulin. Discharged to home. Diagnoses: Diabetes, Neoplasms"
...,...
101761,"A 70-80-year-old African American male was admitted through emergency with a Diabetes condition. He stayed for 3 days. HbA1C >8. Insulin was decreased. Discharged/transferred to SNF. Diagnoses: Circulatory, Diabetes, Mental Disorders"
101762,"A 80-90-year-old African American female was admitted through emergency with a Digestive condition. She stayed for 5 days. No change in insulin. 1 previous inpatient visits. Discharged/transferred to ICF. Diagnoses: Digestive, Endocrine/Metabolic"
101763,"A 70-80-year-old Caucasian male was admitted through emergency with a condition not categorized under common diagnostic groups. He stayed for 1 days. Insulin was decreased. Discharged to home. Diagnoses: Genitourinary, Mental Disorders"
101764,"A 80-90-year-old Caucasian female was admitted urgently with a Injury condition. She stayed for 10 days. Insulin was increased. 1 previous inpatient visits. Discharged/transferred to SNF. Diagnoses: Blood, Injury"


In [13]:
diabetes_dropped[diabetes_dropped['discharge_disposition_id'] == 11][['Summary']]

Unnamed: 0,Summary
34,"A 70-80-year-old Caucasian female was admitted urgently with a Circulatory condition. She stayed for 5 days. Insulin was decreased. Expired. Diagnoses: Circulatory, Genitourinary"
44,"A 80-90-year-old Caucasian female was admitted through emergency with a Circulatory condition. She stayed for 7 days. No change in insulin. Expired. Diagnoses: Circulatory, Injury"
64,"A 50-60-year-old African American female was admitted through emergency with a Circulatory condition. She stayed for 4 days. Insulin was decreased. Expired. Diagnoses: Circulatory, Genitourinary"
78,"A 50-60-year-old Caucasian female was admitted urgently with a Injury condition. She stayed for 2 days. Insulin was increased. Expired. Diagnoses: Circulatory, Injury"
100,"A 70-80-year-old Caucasian male was admitted through emergency with a Circulatory condition. He stayed for 9 days. No change in insulin. Expired. Diagnoses: Circulatory, Endocrine/Metabolic, Respiratory"
...,...
101308,"A 80-90-year-old Other female was admitted through emergency with a Circulatory condition. She stayed for 2 days. Expired. Diagnoses: Circulatory, Respiratory"
101494,"A 80-90-year-old Asian male was admitted urgently with a Diabetes condition. He stayed for 9 days. HbA1C >7. Insulin was decreased. Expired. Diagnoses: Diabetes, Musculoskeletal, Respiratory"
101507,"A 70-80-year-old Caucasian female was admitted urgently with a Injury condition. She stayed for 4 days. 1 previous inpatient visits. Expired. Diagnoses: Diabetes, Injury, Respiratory"
101547,"A 80-90-year-old Caucasian female was admitted through emergency with a Respiratory condition. She stayed for 3 days. 1 previous inpatient visits. Expired. Diagnoses: Circulatory, Respiratory"


In [14]:
diabetes_dropped.to_csv('nlp.csv')

In [15]:
# load in pretrained model and tokenizer
# model_name = 'emilyalsentzer/Bio_ClinicalBERT'
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModel.from_pretrained(model_name)

In [16]:
# # Tokenize Summary column
# def get_bert_embedding(text):
#     encoded_inputs = tokenizer(
#         diabetes_dropped['Summary'].to_list(),
#         padding=True, #all inputs are the same length
#         truncation=True, #cuts off tokens if the input is too long
#         max_length=128, #max 128 tokens
#         return_tensors='pt' #return pyTorch sensors --> what BERT takes in as input
#     )
#     with torch.no_grad():
#         outputs = model(**encoded_inputs)
#         cls_embedding = outputs.last_hidden_state[:,0,:].squeeze().numpy()
#     return cls_embedding

In [17]:
# selected = diabetes_dropped[diabetes_dropped['readmitted'] != 'NO']
# selected.shape[0]

In [18]:
# embeddings = np.stack(selected['Summary'].apply(get_bert_embedding).values)
# embeddings.shape

In [19]:
# import torch
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print(device)

In [20]:
# diabetes_dropped['readmitted']