In [None]:
import numpy as np
import pandas as pd
from fastai import *
from fastai.text import *

In [None]:
# Directory that holds the CSV read in the next cell.
# NOTE(review): this is an absolute path under one user's home directory, so it
# must be edited before the notebook can run on another machine.
data_pth = "/home/littlefield/MIMIC-NLP/readmission-prediction/data/"

In [None]:
# Build the labelled text dataset from the CSV in three chained steps:
#   1. read the note text from the 'TEXT' column,
#   2. split into train/validation using column 10,
#   3. attach labels taken from column 9.
# NOTE(review): 9 and 10 look like positional column references — confirm they
# match the layout of train_valid_fastai.csv.
clinical_data = (
    TextList.from_csv(data_pth, "train_valid_fastai.csv", cols='TEXT')
    .split_from_df(col=10)
    .label_from_df(cols=9)
)

In [None]:
# Report how many entries the vocabulary's string-to-index mapping (`vocab.stoi`) holds.
print("Number of processed tokens:", len(clinical_data.vocab.stoi))

In [None]:
# Spot check: display the training item at index 13.
clinical_data.train.x[13]

#### Map Clinical Notes to Vector Space

In [None]:
# Dimensions of the document-term matrix built below:
# n_terms = length of the vocabulary's index-to-string list (columns),
# n_docs  = number of items in the training split (rows).
n_terms = len(clinical_data.vocab.itos)
n_docs = len(clinical_data.train.x)

print("There are", n_terms, "terms")
print("There are", n_docs, "documents")

In [None]:
def clinical_token_counter(clinical_index):
    """Count how often each value occurs in one training note's `.data`.

    `clinical_index` selects the item from `clinical_data.train.x`; the result
    is a Counter keyed by the entries of that item's `.data`.
    """
    note = clinical_data.train.x[clinical_index]
    return Counter(note.data)

In [None]:
def count_vectorizer(clinical_index, n_terms, make_token_counter):
    """Turn one clinical note into a bag-of-words count vector.

    Parameters
    ----------
    clinical_index : index of the note, passed straight to `make_token_counter`.
    n_terms : length of the returned vector (vocabulary size).
    make_token_counter : callable returning a mapping {token id -> count}
        for the note at `clinical_index`.

    Returns
    -------
    numpy.ndarray of length `n_terms`; position i holds the count for token
    id i and every token id absent from the note stays at zero.
    """
    bag_of_words = np.zeros(n_terms)
    token_counts = make_token_counter(clinical_index)
    token_ids = list(token_counts.keys())
    frequencies = list(token_counts.values())
    # Fancy-index assignment writes all counts in one step.
    bag_of_words[token_ids] = frequencies
    return bag_of_words

# Vectorize the first training note (index 0) as a worked example.
embedding_vector = count_vectorizer(0, n_terms, clinical_token_counter)

In [None]:
# Display the count vector computed for the first note.
embedding_vector

In [None]:
# Report the vector's length (one slot per vocabulary term), then display it.
print(f'The clinical note is embedded in a {len(embedding_vector)} dimensional vector')
embedding_vector

In [None]:
# Define a function to build the full document-term matrix.
# First report its dimensions: n_docs rows by n_terms columns.
# NOTE(review): the message says "reviews", but the documents here are clinical notes.
print(f'there are {n_docs} reviews, and {n_terms} unique tokens in the vocabulary')
def make_full_doc_term_matrix(count_vectorizer, n_terms, n_docs, make_token_counter=None):
    """Build the dense document-term matrix, one row per clinical note.

    Parameters
    ----------
    count_vectorizer : callable
        Called as ``count_vectorizer(doc_index, n_terms, make_token_counter)``;
        must return a length-``n_terms`` count vector for that document.
    n_terms : int
        Number of columns (vocabulary size).
    n_docs : int
        Number of rows; documents ``0 .. n_docs - 1`` are vectorized in order.
    make_token_counter : callable, optional
        Token-counting function forwarded to ``count_vectorizer``.  When
        omitted, the notebook-level ``clinical_token_counter`` is used, which
        is what this function always did before the parameter existed.

    Returns
    -------
    numpy.ndarray
        Always two-dimensional, shape ``(n_docs, n_terms)`` — including the
        ``n_docs == 1`` and ``n_docs == 0`` cases.
    """
    if make_token_counter is None:
        # Backward-compatible default: fall back to the notebook global.
        make_token_counter = clinical_token_counter

    # Vectorize every clinical note first ...
    rows = [
        count_vectorizer(doc_index, n_terms, make_token_counter)
        for doc_index in range(n_docs)
    ]

    if not rows:
        # Nothing to stack: return an empty matrix with the right width.
        return np.zeros((0, n_terms))

    # ... then stack once.  Calling np.vstack inside the loop re-copies the
    # whole matrix on every iteration, which is quadratic in n_docs.
    return np.vstack(rows)

# Build the full (dense) document-term matrix for the clinical-notes training set.
# NOTE(review): this allocates n_docs x n_terms values in memory; the sparse
# construction later in the notebook avoids that.
A = make_full_doc_term_matrix(count_vectorizer, n_terms, n_docs)

#### Sparse Matrix Representation

In [None]:
# Measure how sparse the dense document-term matrix A is:
# NNZ is the number of nonzero entries, sparsity the fraction of entries that are zero.
NNZ = np.count_nonzero(A)
sparsity = (A.size-NNZ)/A.size
print(f'Only {NNZ} of the {A.size} elements in the document-term matrix are nonzero')
print(f'The sparsity of the document-term matrix is {sparsity}')

In [None]:
# construct the document-term matrix in CSR format
# i.e. build (values, column_indices, row_pointer) and wrap them in a csr_matrix
def get_doc_term_matrix(text_list, n_terms):
    """Build the sparse (CSR) document-term matrix for a collection of notes.

    Parameters
    ----------
    text_list : iterable
        Documents to vectorize (e.g. a fastai TextList).  Each item must expose
        a ``.data`` sequence of integer token ids.
    n_terms : int
        Number of tokens in the vocabulary, i.e. the number of columns.

    Returns
    -------
    scipy.sparse.csr_matrix
        Integer matrix of shape ``(number of documents, n_terms)`` whose entry
        ``[d, t]`` is how many times token id ``t`` occurs in document ``d``.
    """
    # Import explicitly instead of relying on these names leaking in through
    # `from fastai.text import *`; `import scipy` alone does not guarantee that
    # the `scipy.sparse` submodule is loaded.
    from collections import Counter
    from scipy.sparse import csr_matrix

    # The three CSR arrays.  row_pointer[i] is the offset in `values` /
    # `column_indices` where row i starts, so it begins with 0.
    values = []
    column_indices = []
    row_pointer = [0]

    for doc in text_list:
        feature_counter = Counter(doc.data)
        column_indices.extend(feature_counter.keys())
        values.extend(feature_counter.values())
        # Close the row: the running number of stored entries is both the end
        # of this row and the start of the next one.
        row_pointer.append(len(values))

    return csr_matrix((values, column_indices, row_pointer),
                      shape=(len(row_pointer) - 1, n_terms),
                      dtype=int)

In [None]:
# Sparse document-term matrix for the training notes, plus the matching labels.
# NOTE(review): `.y.items` is presumably the array of encoded label values — confirm.
train_x = get_doc_term_matrix(clinical_data.train.x, n_terms)
train_y = clinical_data.train.y.items

In [None]:
# Same construction for the validation split; it reuses n_terms so both
# matrices have the same number of columns.
valid_x = get_doc_term_matrix(clinical_data.valid.x, n_terms)
valid_y = clinical_data.valid.y.items

#### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# L2-penalised logistic regression on the token counts.  C is the inverse of
# the regularization strength, so C=0.0001 regularizes heavily; the fixed
# random_state keeps the fit repeatable.
clf = LogisticRegression(penalty='l2', C=0.0001, random_state=999)
clf.fit(train_x, train_y)

In [None]:
# Hard class predictions for the validation notes.
preds = clf.predict(valid_x)

In [None]:
# Mean accuracy of the classifier on the validation split.
clf.score(valid_x, valid_y)

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix for the validation split: rows are true labels, columns are predicted labels.
confusion_matrix(valid_y, preds)

In [None]:
# Compare training and validation accuracy side by side to gauge over- or under-fitting.
print("Training Score:", clf.score(train_x, train_y))
print("Validation Score:", clf.score(valid_x, valid_y))

In [None]:
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score

# Per-class precision, recall, F-score and support, computed from the hard predictions.
print(precision_recall_fscore_support(valid_y, preds))

# ROC AUC measures how well the model ranks positives above negatives, so it
# needs a continuous score.  Feeding it the 0/1 predictions collapses the ROC
# curve to a single operating point and misreports the AUC.  Column 1 of
# predict_proba is the probability of the greater class label (clf.classes_[1]),
# which roc_auc_score treats as the positive class in the binary case.
valid_scores = clf.predict_proba(valid_x)[:, 1]
print(roc_auc_score(valid_y, valid_scores))