In [1]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, auc, roc_auc_score, roc_curve, precision_score, recall_score, accuracy_score, precision_recall_curve, confusion_matrix
from sklearn.preprocessing import StandardScaler, label_binarize, normalize
from sklearn.base import BaseEstimator, TransformerMixin

from pymongo import MongoClient
client = MongoClient() # Creates a client that uses the default port on localhost.
database = client.medical_notes_kaggle # Connect to medical_notes_kaggle database

In [2]:
# Classes

class TextSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on text columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.key]

    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]

In [3]:
def compute_multiclass_ROC(y_test, y_score):
    """
    Takes "y_test" and "y_score" as inputs and returns the fpr, tpr, and roc_auc scores dictionary.
    
    Taken from scikit-learn's plot_roc tutorial. See below:
    https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
    """
    
    fpr, tpr, thresholds, roc_auc = dict(), dict(), dict(), dict()
    
    # Compute micro-average ROC curve and ROC AUC
    for i in range(5):
        fpr[i], tpr[i], thresholds[i] = roc_curve(y_test[:, i], y_score[:, i])
        roc_auc[i] = roc_auc_score(y_test[:,i], y_score[:,i])

    # Collect results
    fpr["micro"], tpr["micro"], thresholds["micro"] = roc_curve(y_test.ravel(), y_score.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
    
    # Compute macro-average ROC curve and ROC area
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(5)])) # Aggregate all false positive rates

    # Interpolate all ROC curves at this points
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(5):
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
    mean_tpr /= 5 # Average it

    # Collect Results
    fpr["macro"], tpr["macro"] = all_fpr, mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
    
    return fpr, tpr, thresholds, roc_auc

# Import Data

In [4]:
# import labeled data
mongodb_query = database.train.find({})
train_tuple = list(mongodb_query)

# Store data in a dataframe
train_df = pd.DataFrame(columns=['index_', 'note_text', 'section_headers', 'clinical_domain']) # Create empty dataframe
keys_to_exclude = set(('_id', 'index_', 'clinical_domain'))

for i in range(len(train_tuple)):
    index_int = int(str({v for k,v in train_tuple[i].items() if k in 'index_'})[2:-2])
    note_text_str = str({v for k,v in train_tuple[i].items() if k not in keys_to_exclude})[1:-1]
    section_headers_str = str({k for k,v in train_tuple[i].items() if k not in keys_to_exclude})[1:-1]
    clinical_domain_str = str({v for k,v in train_tuple[i].items() if k in 'clinical_domain'})[2:-2]

    train_df = train_df.append(
        {'index_': index_int,
         'clinical_domain': clinical_domain_str,
         'note_text': note_text_str,
         'section_headers': section_headers_str}, ignore_index=True)
    
train_df = train_df.sort_values('index_').reset_index(drop=True) # Sort values to ensure data is in the same order

# Drop missing values
train_df.section_headers = train_df.section_headers.replace('et(', np.nan)
train_df.note_text = train_df.note_text.replace('et(', np.nan)
train_df = train_df.dropna().reset_index(drop=True)

train_df['text_length'] = train_df.note_text.apply(len) # Add text_length column
train_df['section_headers_count'] = train_df.section_headers.apply(lambda x: x.count(',') +1) # Count section headers

In [5]:
train_df.head() # View data

Unnamed: 0,index_,note_text,section_headers,clinical_domain,text_length,section_headers_count
0,1001,"'Difficulty with word finding.', 'She did not ...","'CC', 'EXAM', 'FHX', 'HX', 'COURSE', 'SHX', 'M...",Neurology,5433,8
1,1002,"'General.', 'Ganglion of the left wrist.', 'Le...","'PREOPERATIVE DIAGNOSIS', 'OPERATION', 'ESTIMA...",Orthopedic,1432,5
2,1003,"'Less than 100 mL.', 'None.', ""The patient was...","'PREOPERATIVE DIAGNOSIS', 'CONDITION', 'INDICA...",Orthopedic,6452,13
3,1004,"'MRI LEFT SHOULDER', 'There is extensive supra...","'EXAM', 'CLINICAL', 'IMPRESSION', 'FINDINGS'",Radiology,2765,4
4,1005,"'1. Recurrent bunion deformity, right forefoo...","'ASSESSMENT', 'PLAN/TREATMENT', 'PHYSICAL EXAM...",Orthopedic,3119,8


In [26]:
re.findall(r"with", train_df.note_text[0])

['with',
 'with',
 'with',
 'with',
 'with',
 'with',
 'with',
 'with',
 'with',
 'with',
 'with',
 'with',
 'with',
 'with',
 'with',
 'with',
 'with',
 'with']