In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pymongo import MongoClient
client = MongoClient() # Creates a client that uses the default port on localhost.
database = client.medical_notes_kaggle # Connect to medical_notes_kaggle database

# Import Data

In [2]:
# import labeled data
mongodb_query = database.train.find({})
train_tuple = list(mongodb_query)

In [3]:
# Store data in a dataframe
train_df = pd.DataFrame(columns=['index_', 'note_text', 'section_headers', 'clinical_domain']) # Create empty dataframe
keys_to_exclude = set(('_id', 'index_', 'clinical_domain'))

for i in range(len(train_tuple)):
    index_int = int(str({v for k,v in train_tuple[i].items() if k in 'index_'})[2:-2])
    note_text_str = str({v for k,v in train_tuple[i].items() if k not in keys_to_exclude})[1:-1]
    section_headers_str = str({k for k,v in train_tuple[i].items() if k not in keys_to_exclude})[1:-1]
    clinical_domain_str = str({v for k,v in train_tuple[i].items() if k in 'clinical_domain'})[2:-2]

    train_df = train_df.append(
        {'index_': index_int,
         'clinical_domain': clinical_domain_str,
         'note_text': note_text_str,
         'section_headers': section_headers_str}, ignore_index=True)

In [4]:
train_df = train_df.sort_values('index_').reset_index(drop=True) # Sort values to ensure data is in the same order

In [5]:
# Drop missing values
train_df.section_headers = train_df.section_headers.replace('et(', np.nan)
train_df.note_text = train_df.note_text.replace('et(', np.nan)
train_df = train_df.dropna().reset_index(drop=True)

In [6]:
train_df['text_length'] = train_df.note_text.apply(len) # Add text_length column
train_df['section_headers_count'] = train_df.section_headers.apply(lambda x: x.count(',') +1) # Count section headers

In [7]:
train_df # View data

Unnamed: 0,index_,note_text,section_headers,clinical_domain,text_length,section_headers_count
0,1001,"'Ortho-Novum 7-7-7 (started 2/3/96), and ASA (...","'SHX', 'PMH', 'FHX', 'COURSE', 'EXAM', 'MEDS',...",Neurology,5433,8
1,1002,"'After a successful anesthetic, the patient wa...","'POSTOPERATIVE DIAGNOSIS', 'ESTIMATED BLOOD LO...",Orthopedic,1432,5
2,1003,"'Round French 10 JP drain.', 'Cervical myelopa...","'FLUIDS', 'INDICATIONS FOR THE OPERATION', 'CO...",Orthopedic,6452,13
3,1004,'There is extensive supraspinatus tendinosis a...,"'CLINICAL', 'EXAM', 'FINDINGS', 'IMPRESSION'",Radiology,2765,4
4,1005,'X-rays taken today; three views to the right ...,"'PLAN/TREATMENT', 'PHYSICAL EXAMINATION', 'ASS...",Orthopedic,3119,8
5,1006,'The patient is a 61-year-old white female sta...,'INSTRUCTIONS GIVEN TO THE PATIENT AT THE TIME...,Orthopedic,6717,9
6,1007,'Informed consent was obtained prior to the pr...,"'IMPRESSION', 'INDICATIONS', 'PROCEDURE', 'PRO...",Gastroenterology,3040,5
7,1008,"'Transient visual field loss.', 'Colace, Quini...","'SHX', 'PMH', 'FHX', 'COURSE', 'EXAM', 'MEDS',...",Radiology,1645,8
8,1009,'This 34-year-old gentleman has come to the of...,"'POSTOPERATIVE DIAGNOSIS', 'COMPLICATIONS', 'I...",Urology,1985,8
9,1010,'The patient is married and lives with his wif...,"'ALLERGIES', 'SOCIAL HISTORY', 'PHYSICAL EXAMI...",Orthopedic,4657,7


# Modeling

In [9]:
train_df.columns

Index(['index_', 'note_text', 'section_headers', 'clinical_domain',
       'text_length', 'section_headers_count'],
      dtype='object')