# Text Vectorization

In [1]:
# Import libraries
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import random

from sklearn.model_selection import train_test_split

from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import LatentDirichletAllocation
import dill

In [2]:
# Detect whether the notebook runs on Google Colab: the import below only
# succeeds inside a Colab runtime. Only ImportError is caught so that any
# other failure is not silently mistaken for "not on Colab".
try:
    from google.colab import drive
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

if IN_COLAB:
    print("We're running Colab")

    # Mount the Google Drive at mount
    mount = '/content/gdrive'
    print("Colab: mounting Google drive on ", mount)
    # connect your colab with the drive
    drive.mount(mount)

    # Directory on the Google Drive that holds the repository
    path_to_repo = mount + "/My Drive/MIMIC-III Text Mining/mimim_iii_readmission"
else:
    # Outside Colab the repository root is the parent of the current working directory
    path_to_repo = os.path.dirname(os.getcwd())

print(path_to_repo)

C:\Users\luca9\Documents\MIMIC-III Text Mining\mimim_iii_readmission


In [3]:
# PARAMETERS

# --- reproducibility and split sizes ---
session_seed = 42  # seed used for every seeded step of this session
include_val = False  # True -> a validation set is also created
test_proportion = 0.2
val_proportion = 0.1
train_proportion = (1 - test_proportion - val_proportion) if include_val else (1 - test_proportion)

random.seed(session_seed)

# --- bag-of-words settings (forwarded to the vectorizers as-is) ---
MAX_FEATURES = 10000  # maximum number of features
min_df = 5  # minimum document frequency
max_df = 0.8  # maximum document frequency
N_GRAM = (1, 2)  # n-gram range

# --- preprocessing variants; in this notebook they only pick file names / folders ---
icu_stays = True  # True -> only ICU stays
lemmatize = True  # False -> stemming instead of lemmatization
heavier_proc = True  # True -> the more heavily processed text
spacy = True
med_7 = False  # True -> use the Med7 preprocessing
expanded_def = True  # True -> consider future readmissions and avoid using CMS

# File-name suffixes derived from the flags above
if lemmatize:
    lemma_tag = "_lemma_spacy" if spacy else "_lemma"
else:
    lemma_tag = ""
heavier_tag = '_heavier' if heavier_proc else ''
med_tag = "_med7" if med_7 else ''
seed_tag = f'_{session_seed}'

# Data sub-folder; the expanded definition only applies to the ICU cohort
if not icu_stays:
    icu_folder = 'all_hosp'
elif expanded_def:
    icu_folder = 'expanded'
else:
    icu_folder = 'icu_only'

In [4]:
# Data folder for the selected cohort; the trailing "" makes the joined path end with a separator,
# so file names can later be appended by plain string concatenation.
path_to_data = os.path.join(path_to_repo, "data", icu_folder,"")
print(path_to_data)

C:\Users\luca9\Documents\MIMIC-III Text Mining\mimim_iii_readmission\data\expanded\


In [5]:
# Folder that receives the processed train/test(/val) files written by this notebook
path_to_processed = os.path.join(path_to_data,"processed","")
os.makedirs(path_to_processed, exist_ok=True) # create the directory if missing; exist_ok keeps re-runs from failing
print(path_to_processed)

C:\Users\luca9\Documents\MIMIC-III Text Mining\mimim_iii_readmission\data\expanded\processed\


### Train/Test Split

In [24]:
# Load the cleaned dataframe; the file name is assembled from the tags set in the parameters cell
df = pd.read_feather(os.path.join(path_to_data,f"df_cleaned{lemma_tag}{med_tag}{heavier_tag}"))

In [17]:
# With the Med7 preprocessing, `text_def` holds several elements per row
# (presumably an iterable of strings -- confirm), so rebuild a single text per row.
if med_7:
    df['clean'] = df['text_def'].apply(lambda tokens: ' '.join(tokens))

In [18]:
# split into train and test
%time train, test = train_test_split(df, test_size = test_proportion, random_state = session_seed, stratify = df.target)
if include_val == True:
    # furtherly split into validation and train
    %time train, val = train_test_split(train, test_size = val_proportion, random_state = session_seed, stratify = train.target)
else:
    val = ''

CPU times: total: 391 ms
Wall time: 425 ms


In [19]:
# Report the shape of every split that was produced
print(f'Test:{test.shape}')
if include_val:
    print(f'Val:{val.shape}')
print(f'Train:{train.shape}')

Test:(8565, 16)
Train:(34260, 16)


In [8]:
def vectorize_to_dataframe(df, vectorizer_obj):
    """
    Build a dense dataframe from the output of a fitted vectorizer.

    df: matrix returned by the vectorizer; it must expose `.toarray()`
    vectorizer_obj: fitted vectorizer that provides the feature names
    Returns a pandas DataFrame with one column per feature name.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
    # and only fall back to the old name on versions that lack it.
    if hasattr(vectorizer_obj, "get_feature_names_out"):
        feature_names = vectorizer_obj.get_feature_names_out()
    else:
        feature_names = vectorizer_obj.get_feature_names()
    # plain list so the column labels are the same whichever API supplied them
    return pd.DataFrame(data = df.toarray(), columns = list(feature_names))

In [6]:
def vectorize_features(X_train, X_test, method = 'frequency', include_val = False, X_val = ''):
    """
    Vectorize the `clean` text column of our train/test(/validation) sets with a bag-of-words model.

    The vectorizer is fitted on the training set only and then applied to the other sets.

    X_train, X_test, X_val: our dataframes, each with a `clean` text column
    method: either 'frequency', 'tf_idf', 'onehot' to employ a different BoW technique
    include_val: set to True if we also have a validation dataset
    Returns (X_train, X_test, X_val) as dataframes; X_val is returned unchanged when include_val is False.
    Raises ValueError if `method` is not one of the supported names.
    """
    # Validate first: an unknown method used to leave `vectorizer` unbound and
    # only failed later with a confusing UnboundLocalError.
    if method not in ('tf_idf', 'frequency', 'onehot'):
        raise ValueError("method must be one of 'frequency', 'tf_idf', 'onehot', got {!r}".format(method))

    # settings shared by every vectorizer variant
    shared_params = dict(ngram_range=N_GRAM, min_df=min_df, max_df=max_df, max_features=MAX_FEATURES)
    if method == 'tf_idf':
        vectorizer = TfidfVectorizer(**shared_params)
    elif method == 'frequency':
        vectorizer = CountVectorizer(**shared_params)
    else:  # 'onehot': binary presence/absence instead of counts
        vectorizer = CountVectorizer(binary = True, **shared_params)

    # fit on train only, then reuse the fitted vocabulary for the other sets
    X_train = vectorize_to_dataframe(vectorizer.fit_transform(X_train.clean), vectorizer)
    X_test = vectorize_to_dataframe(vectorizer.transform(X_test.clean), vectorizer)
    if include_val:
        X_val = vectorize_to_dataframe(vectorizer.transform(X_val.clean), vectorizer)
    return X_train, X_test, X_val

In [7]:
def save_dataframes(train, test, method, include_val = False, val = '', target = False):
    """
    Function to save our dataframes in feather format.

    Each file is written to `path_to_processed`, with a name built from the
    optional 'y_' prefix, the split name, `method`, and the seed / lemma /
    med7 / heavier tags.

    train: train set to be saved
    test: test set to be saved
    method: method through which we have processed the dataframes, needed as save keyword
    include_val: True if we also want to save the validation set
    val: validation set to be saved (only used when include_val is True)
    target: True if the dataframes hold the target variable (adds the 'y_' prefix)
    Raises ValueError if include_val is True but `val` is not a dataframe.
    """
    prefix = 'y_' if target == True else ''

    splits = [('train', train), ('test', test)]
    if include_val:
        # fail before writing anything rather than half-way through with an AttributeError
        if not isinstance(val, pd.DataFrame):
            raise ValueError("include_val is True but val is not a dataframe")
        splits.append(('val', val))

    for split_name, frame in splits:
        # the index is reset before saving, on a copy so that the caller's
        # dataframe is left untouched
        frame.reset_index(drop = True).to_feather(
            '{}{}{}_{}{}{}{}{}'.format(path_to_processed, prefix, split_name, method, seed_tag, lemma_tag, med_tag, heavier_tag)
        )

In [23]:
# perform vectorization: build one bag-of-words representation per encoding and save each one
method_list = ['frequency', 'onehot','tf_idf']

for method in method_list:
    print(method)
    # for each method we perform vectorization (timed with %time)
    %time x_train, x_test, x_val = vectorize_features(train, test, method = method, include_val = include_val, X_val = val)
    # and save the dataframes as feather files named after the method
    save_dataframes(x_train, x_test, method = method, include_val = include_val, val = x_val)

frequency
CPU times: total: 1min 11s
Wall time: 1min 11s
onehot
CPU times: total: 1min 29s
Wall time: 1min 30s
tf_idf
CPU times: total: 1min 19s
Wall time: 1min 20s


In [24]:
# finally we also save our target variables (files get the 'y_' prefix and an empty method name)
val_target = pd.DataFrame(val.target) if include_val else ''
save_dataframes(
    pd.DataFrame(train.target),
    pd.DataFrame(test.target),
    method = '',
    include_val = include_val,
    val = val_target,
    target = True,
)

### Dimensionality Reduction

In [9]:
def load_datasets(method, include_val = True, target = False):
    """
    Read back the feather files of the train, test and (optionally) validation set for a given method.

    method: string for the processing method we want to load
    include_val: if we want to load also the validation set, default True
    target: if we are importing our target variables (files prefixed with 'y_')
    Returns (train, test, val); val is an empty list when include_val is not True.
    """
    prefix = 'y_' if target == True else ''
    # part of the file name shared by all splits
    suffix = f'_{method}{seed_tag}{lemma_tag}{med_tag}{heavier_tag}'

    def _read(split_name):
        # one feather file per split inside path_to_processed
        return pd.read_feather(f'{path_to_processed}{prefix}{split_name}{suffix}')

    train = _read('train')
    test = _read('test')
    val = _read('val') if include_val == True else []
    return train, test, val

Before applying any dimensionality reduction technique, we reload our dataset. In particular, we will apply dimensionality reduction to our standard BoW dataframe (*i.e. with frequency encoding*).

In [10]:
# NOTE: this rebinds train/test/val to the frequency-encoded frames loaded from disk,
# replacing whatever those names held before this cell.
train, test, val = load_datasets('frequency', include_val = include_val)

We first perform truncated SVD, i.e. LSA.

In [27]:
# Truncated SVD reducing the BoW matrix to 300 components; seeded with the session seed
svd = TruncatedSVD(n_components = 300, random_state = session_seed)

In [28]:
%time train_svd = svd.fit_transform(train)
train_svd = pd.DataFrame(train_svd)
train_svd.columns=["F"+str(i) for i in range(0, len(train_svd.columns))] # we need to have column names otherwise feather can't be used
%time test_svd = svd.transform(test)
test_svd = pd.DataFrame(test_svd)
test_svd.columns=["F"+str(i) for i in range(0, len(test_svd.columns))]
if include_val:
    %time val_svd = svd.transform(val)
    val_svd = pd.DataFrame(val_svd)
    val_svd.columns=["F"+str(i) for i in range(0, len(val_svd.columns))]
else:
    val_svd = ''
save_dataframes(train_svd, test_svd, method = 'svd', include_val = include_val, val = val_svd)

CPU times: total: 4min 6s
Wall time: 1min 44s
CPU times: total: 4.06 s
Wall time: 1.88 s


We then perform LDA for topic allocation, with the same number of topics as the number of components used for LSA.

In [11]:
from sklearn.decomposition import LatentDirichletAllocation

In [12]:
# LDA with 300 topics (same number as the SVD components above); seeded with the session seed
lda = LatentDirichletAllocation(n_components = 300, random_state = session_seed, n_jobs = -1)

In [13]:
%time train_lda = lda.fit_transform(train)
train_lda = pd.DataFrame(train_lda)
train_lda.columns=["F"+str(i) for i in range(0, len(train_lda.columns))]
%time test_lda = lda.transform(test)
test_lda = pd.DataFrame(test_lda)
test_lda.columns=["F"+str(i) for i in range(0, len(test_lda.columns))]
if include_val:
    %time val_lda = lda.transform(val)
    val_lda = pd.DataFrame(val_lda)
    val_lda.columns=["F"+str(i) for i in range(0, len(val_lda.columns))]
else:
    val_lda = ''
save_dataframes(train_lda, test_lda, method = 'lda', include_val = include_val, val = val_lda)

CPU times: total: 4min 59s
Wall time: 33min 32s
CPU times: total: 3.42 s
Wall time: 25.9 s


In [33]:
# Save the fitted LDA model; note it is written to path_to_data, not path_to_processed
lda_model_path = f'{path_to_data}lda{seed_tag}{lemma_tag}{med_tag}{heavier_tag}'
with open(lda_model_path, 'wb') as file:
    dill.dump(lda, file)