In [1]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch

In [2]:
diabetes = pd.read_csv('../../diabetes_data/diabetic_data.csv')
diabetes.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [3]:
diabetes.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')

In [4]:
diabetes_dropped = diabetes[['age', 'gender', 'race', 'admission_type_id', 'diag_1', 'diag_2', 'diag_3', \
    'discharge_disposition_id', 'insulin']]
diabetes_dropped.head()

Unnamed: 0,age,gender,race,admission_type_id,diag_1,diag_2,diag_3,discharge_disposition_id,insulin
0,[0-10),Female,Caucasian,6,250.83,?,?,25,No
1,[10-20),Female,Caucasian,1,276.0,250.01,255,1,Up
2,[20-30),Female,AfricanAmerican,1,648.0,250,V27,1,No
3,[30-40),Male,Caucasian,1,8.0,250.43,403,1,Up
4,[40-50),Male,Caucasian,1,197.0,157,250,1,Steady


In [5]:
diabetes_dropped['age'] = diabetes_dropped['age'].apply(lambda x: x[1:-1])
diabetes_dropped['race'] = diabetes_dropped['race'].apply(lambda x: 'African American' if x == 'AfricanAmerican' else x)
diabetes_dropped.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  diabetes_dropped['age'] = diabetes_dropped['age'].apply(lambda x: x[1:-1])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  diabetes_dropped['race'] = diabetes_dropped['race'].apply(lambda x: 'African American' if x == 'AfricanAmerican' else x)


Unnamed: 0,age,gender,race,admission_type_id,diag_1,diag_2,diag_3,discharge_disposition_id,insulin
0,0-10,Female,Caucasian,6,250.83,?,?,25,No
1,10-20,Female,Caucasian,1,276.0,250.01,255,1,Up
2,20-30,Female,African American,1,648.0,250,V27,1,No
3,30-40,Male,Caucasian,1,8.0,250.43,403,1,Up
4,40-50,Male,Caucasian,1,197.0,157,250,1,Steady


In [6]:
admission_type_map = {
    1: 'Emergency',
    2: 'Urgent',
    3: 'Elective',
    4: 'Newborn',
    5: 'Not Available',
    6: 'NULL',
    7: 'Trauma Center',
    8: 'Not Mapped'
}

diabetes_dropped['admission_type'] = diabetes_dropped['admission_type_id'].map(admission_type_map)

# replace nulls
diabetes_dropped['admission_type'] = diabetes_dropped['admission_type'].replace(['Not Available', 'NULL', 'Not Mapped'], 'Unknown')
diabetes_dropped.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  diabetes_dropped['admission_type'] = diabetes_dropped['admission_type_id'].map(admission_type_map)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  diabetes_dropped['admission_type'] = diabetes_dropped['admission_type'].replace(['Not Available', 'NULL', 'Not Mapped'], 'Unknown')


Unnamed: 0,age,gender,race,admission_type_id,diag_1,diag_2,diag_3,discharge_disposition_id,insulin,admission_type
0,0-10,Female,Caucasian,6,250.83,?,?,25,No,Unknown
1,10-20,Female,Caucasian,1,276.0,250.01,255,1,Up,Emergency
2,20-30,Female,African American,1,648.0,250,V27,1,No,Emergency
3,30-40,Male,Caucasian,1,8.0,250.43,403,1,Up,Emergency
4,40-50,Male,Caucasian,1,197.0,157,250,1,Steady,Emergency


In [7]:
def map_icd9_to_category(code):
    try:
        code = float(code)
    except:
        return "Unknown"

    if 390 <= code <= 459 or code == 785:
        return "Circulatory"
    elif 460 <= code <= 519 or code == 786:
        return "Respiratory"
    elif 520 <= code <= 579 or code == 787:
        return "Digestive"
    elif 250.0 <= code <= 250.99:
        return "Diabetes"
    elif 800 <= code <= 999:
        return "Injury"
    elif 710 <= code <= 739:
        return "Musculoskeletal"
    elif 580 <= code <= 629 or code == 788:
        return "Genitourinary"
    elif 140 <= code <= 239:
        return "Neoplasms"
    elif 240 <= code <= 279:
        return "Endocrine/Metabolic"
    elif 280 <= code <= 289:
        return "Blood"
    elif 290 <= code <= 319:
        return "Mental Disorders"
    else:
        return "Other"
diabetes_dropped['diag_1'] = diabetes_dropped['diag_1'].apply(map_icd9_to_category)
diabetes_dropped['diag_2'] = diabetes_dropped['diag_2'].apply(map_icd9_to_category)
diabetes_dropped['diag_3'] = diabetes_dropped['diag_3'].apply(map_icd9_to_category)
diabetes_dropped.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  diabetes_dropped['diag_1'] = diabetes_dropped['diag_1'].apply(map_icd9_to_category)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  diabetes_dropped['diag_2'] = diabetes_dropped['diag_2'].apply(map_icd9_to_category)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  diabetes_dropped['diag_3'] = diabetes

Unnamed: 0,age,gender,race,admission_type_id,diag_1,diag_2,diag_3,discharge_disposition_id,insulin,admission_type
0,0-10,Female,Caucasian,6,Diabetes,Unknown,Unknown,25,No,Unknown
1,10-20,Female,Caucasian,1,Endocrine/Metabolic,Diabetes,Endocrine/Metabolic,1,Up,Emergency
2,20-30,Female,African American,1,Other,Diabetes,Unknown,1,No,Emergency
3,30-40,Male,Caucasian,1,Other,Diabetes,Circulatory,1,Up,Emergency
4,40-50,Male,Caucasian,1,Neoplasms,Neoplasms,Diabetes,1,Steady,Emergency


In [8]:
discharge_status_dict = {
    1: "Discharged to home",
    2: "Discharged/transferred to another short term hospital",
    3: "Discharged/transferred to SNF",
    4: "Discharged/transferred to ICF",
    5: "Discharged/transferred to another type of inpatient care institution",
    6: "Discharged/transferred to home with home health service",
    11: "Expired",
    18: "NULL",
    22: "Discharged/transferred to another rehab fac including rehab units of a hospital.",
    25: "Not Mapped"
}

diabetes_dropped['discharge_status'] = diabetes_dropped['discharge_disposition_id'].map(discharge_status_dict)

# replace nulls
diabetes_dropped['discharge_status'] = diabetes_dropped['discharge_status'].replace(['NULL', 'Not Mapped'], 'Unknown')
diabetes_dropped['discharge_status'] = diabetes_dropped['discharge_status'].fillna('Other')
diabetes_dropped

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  diabetes_dropped['discharge_status'] = diabetes_dropped['discharge_disposition_id'].map(discharge_status_dict)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  diabetes_dropped['discharge_status'] = diabetes_dropped['discharge_status'].replace(['NULL', 'Not Mapped'], 'Unknown')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returnin

Unnamed: 0,age,gender,race,admission_type_id,diag_1,diag_2,diag_3,discharge_disposition_id,insulin,admission_type,discharge_status
0,0-10,Female,Caucasian,6,Diabetes,Unknown,Unknown,25,No,Unknown,Unknown
1,10-20,Female,Caucasian,1,Endocrine/Metabolic,Diabetes,Endocrine/Metabolic,1,Up,Emergency,Discharged to home
2,20-30,Female,African American,1,Other,Diabetes,Unknown,1,No,Emergency,Discharged to home
3,30-40,Male,Caucasian,1,Other,Diabetes,Circulatory,1,Up,Emergency,Discharged to home
4,40-50,Male,Caucasian,1,Neoplasms,Neoplasms,Diabetes,1,Steady,Emergency,Discharged to home
...,...,...,...,...,...,...,...,...,...,...,...
101761,70-80,Male,African American,1,Diabetes,Mental Disorders,Circulatory,3,Down,Emergency,Discharged/transferred to SNF
101762,80-90,Female,African American,1,Digestive,Endocrine/Metabolic,Digestive,4,Steady,Emergency,Discharged/transferred to ICF
101763,70-80,Male,Caucasian,1,Other,Genitourinary,Mental Disorders,1,Down,Emergency,Discharged to home
101764,80-90,Female,Caucasian,2,Injury,Blood,Injury,3,Up,Urgent,Discharged/transferred to SNF


In [9]:
diabetes_dropped['insulin'].unique()

array(['No', 'Up', 'Steady', 'Down'], dtype=object)

In [10]:
def create_summary(row):
    # SENTENCE 1
    age = f"{row['age']}-year-old"
    if row['gender'] == 'Unknown/Invalid':
        gender = ''
    else:
        gender = f"{row['gender'].lower()}"
    if row['race'] == '?':
        race = ''
    else:
        race = f"{row['race']}"
    gender = f"{row['gender'].lower()}"
    # diagnoses
    if row['diag_1'] == 'Other':
        primary = "a condition not categorized under common diagnostic groups"
    else:
        primary = f"a {row['diag_1']} condition"
    if row['diag_2'] != 'Unknown' and row['diag_2'] != 'Other'\
          and row['diag_3'] != 'Unknown' and row['diag_3']!='Other':
        primary += f", a secondary diagnosis related to the {row['diag_2']} system, and a third diagnosis related to the {row['diag_3']} system"
    if row['diag_2'] != 'Unknown' and row['diag_2'] != 'Other'\
        and row['diag_3'] == 'Unknown':
        primary += f" and a secondary diagnosis related to the {row['diag_2']} system"
    # admission type
    if row['admission_type'] == 'Emergency':
        admission = 'through emergency'
    elif row['admission_type'] == 'Urgent':
        admission = 'urgently'
    elif row['admission_type'] == 'Elective':
        admission = 'electively'
    elif row['admission_type'] == 'Newborn':
        admission = 'as a newborn'
    elif row['admission_type'] == 'Trauma Center':
        admission = 'through the trauma center'
    else:
        admission = ''
    
    # SENTENCE 2
    if row['insulin'] == 'Steady':
        insulin = 'No change in insulin.'
    elif row['insulin'] == 'Up':
        insulin = 'Insulin was increased.'
    elif row['insulin'] == 'Down':
        insulin = 'Insulin was decreased.'
    else: 
        insulin=''

    # SENTENCE 3
    if row['discharge_status'] != 'Unknown':
        discharge_status = f'{row['discharge_status']}.'
    else:
        discharge_status = ''
    
    summary = (
        f"A {age} {race} {gender} was admitted {admission} with {primary}. "
        f"{insulin} "
        f"{discharge_status}"
    )
    return summary

diabetes_dropped['Summary'] = diabetes_dropped.apply(create_summary, axis=1)
diabetes_dropped['Summary'] = diabetes_dropped['Summary'].str.replace('  ', ' ')
diabetes_dropped[['Summary']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  diabetes_dropped['Summary'] = diabetes_dropped.apply(create_summary, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  diabetes_dropped['Summary'] = diabetes_dropped['Summary'].str.replace('  ', ' ')


Unnamed: 0,Summary
0,A 0-10-year-old Caucasian female was admitted ...
1,A 10-20-year-old Caucasian female was admitted...
2,A 20-30-year-old African American female was a...
3,A 30-40-year-old Caucasian male was admitted t...
4,A 40-50-year-old Caucasian male was admitted t...
...,...
101761,A 70-80-year-old African American male was adm...
101762,A 80-90-year-old African American female was a...
101763,A 70-80-year-old Caucasian male was admitted t...
101764,A 80-90-year-old Caucasian female was admitted...


In [11]:
# load in pretrained model and tokenizer
model_name = 'emilyalsentzer/Bio_ClinicalBERT'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [None]:
# Tokenize Summary column
def get_bert_embedding(text):
    encoded_inputs = tokenizer(
        diabetes_dropped['Summary'].to_list(),
        padding=True, #all inputs are the same length
        truncation=True, #cuts off tokens if the input is too long
        max_length=128, #max 128 tokens
        return_tensors='pt' #return pyTorch sensors --> what BERT takes in as input
    )
    with torch.no_grad():
        outputs = model(**encoded_inputs)
        cls_embedding = outputs.last_hidden_state[:,0,:].squeeze().numpy()
    return cls_embedding

: 

In [None]:
embeddings = np.stack(diabetes_dropped['Summary'].apply(get_bert_embedding).values)
embeddings.shape