# Text Vectorization

In [19]:
# Import libraries
import numpy as np
import pandas as pd
import os
import psycopg2
import matplotlib.pyplot as plt

%matplotlib inline

import string
import random
import re

import spacy
import gensim
import pprint
from gensim import corpora
from gensim.utils import simple_preprocess
from gensim import utils
from gensim.models import KeyedVectors
from gensim.test.utils import datapath
import gensim.downloader
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

In [20]:
# PARAMETERS

# Keep only ICU stays when True; every hospital stay otherwise.
icu_stays = True
# Set to False to use the data produced without the Med7 preprocessing.
med_7 = True

# Suffix appended to file names so Med7 / no-Med7 artefacts do not overwrite each other.
tag_med7 = '_nomed7' if med_7 == False else ''

# File-name suffix and data sub-folder matching the selected cohort.
tag_icu, icu_folder = ('_icu', 'icu_only') if icu_stays == True else ('', 'all_hosp')

In [21]:
# Detect whether the notebook runs on Google Colab: importing `google.colab`
# only succeeds there.
try:
    from google.colab import drive
    IN_COLAB = True
except ImportError:
    # Only a failed import means "not on Colab"; any other error should surface.
    IN_COLAB = False

if IN_COLAB:
    print("We're running Colab")
    # Mount the Google Drive at mount
    mount = '/content/gdrive'
    print("Colab: mounting Google drive on ", mount)
    # connect your colab with the drive
    drive.mount(mount)
    # Repository folder on the mounted Google Drive
    path_to_repo = mount + "/My Drive/MIMIC-III Text Mining"
else:
    # Outside Colab the parent of the current working directory is used as repository root
    path_to_repo = os.path.dirname(os.getcwd())

print(path_to_repo)

C:\Users\luca9\Documents\MIMIC-III Text Mining


In [22]:
# Data folder of the selected cohort inside the repository.
path_to_data = os.path.join(
    path_to_repo,
    "Readmission",
    "data",
    icu_folder,
    "",  # empty last component -> the path ends with a separator
)
print(path_to_data)

C:\Users\luca9\Documents\MIMIC-III Text Mining\Readmission\data\icu_only\


In [23]:
# Output folder for the vectorized datasets; the empty last component keeps a trailing separator.
path_to_processed = os.path.join(path_to_data, "processed", "")
# Create the folder if it does not exist yet.
os.makedirs(name = path_to_processed, exist_ok = True)
print(path_to_processed)

C:\Users\luca9\Documents\MIMIC-III Text Mining\Readmission\data\icu_only\processed\


In [6]:
# Reload the preprocessed dataframe from its feather file in the data folder.
df_final = pd.read_feather(path = f'{path_to_data}df_stem{tag_med7}')

## Processing

First we need to split our dataset into training, validation and test sets.

In [24]:
# PARAMETERS

session_seed = 42 # set seed for our session
include_diag = True # set to True if we want to also process the diagnosis column
include_test = True # set to True if we want to also process the test set
test_proportion = 0.2 # share of the full dataset held out as test set
val_proportion = 0.1 # share of the remaining (non-test) rows held out as validation set
# The validation set is split off from the rows left after removing the test set,
# so the effective training share is (1 - test) * (1 - val), not 1 - test - val.
train_proportion = (1 - test_proportion) * (1 - val_proportion)

# Seeds Python's `random` module only.
random.seed(session_seed)

In [8]:
# Hold out the test set first, stratifying on the target column
%time train, test = train_test_split(df_final, test_size = test_proportion, random_state = session_seed, stratify = df_final.target)
# Then split the remaining rows into train and validation.
# NOTE: val_proportion is applied to that remainder, not to the full dataset.
%time train, val = train_test_split(train, test_size = val_proportion, random_state = session_seed, stratify = train.target)

Wall time: 33.7 ms
Wall time: 18 ms


In [9]:
# Report the size of every split, one per line.
print(
    'Test:{}'.format(test.shape),
    'Val:{}'.format(val.shape),
    'Train:{}'.format(train.shape),
    sep = '\n',
)

Test:(6662, 21)
Val:(2665, 21)
Train:(23982, 21)


In [10]:
def corpus_to_df(corpus, dictionary):
    """
    Utility to transform a gensim corpus into a dense dataframe
    corpus: documents in sparse format, i.e. lists of (token_id, value) pairs
    dictionary: mapping token_id -> token (e.g. a gensim Dictionary); ids are
    expected to be the consecutive integers 0 .. len(dictionary) - 1
    returns: dataframe with one row per document and one column per token
    """
    num_terms = len(dictionary)
    # corpus2dense yields a (terms x documents) matrix - WE NEED TO TRANSPOSE IT
    corpus_matrix = gensim.matutils.corpus2dense(corpus, num_terms = num_terms).T
    print(corpus_matrix.shape)

    # Column j of the matrix belongs to token id j, so the labels are looked up
    # by id instead of relying on the iteration order of dictionary.values()
    column_names = [dictionary[token_id] for token_id in range(num_terms)]
    df_corpus = pd.DataFrame(corpus_matrix, columns = column_names)
    return df_corpus
    
    
def gensim_vectorizer(df_clean, id2word, method = 'frequency'):
    """
    Create our dataframe through the usage of gensim
    df_clean: iterable of tokenized documents (each document is a list of tokens)
    id2word : dictionary from our training dataset
    method : additional argument to perform one hot or TF-IDF encoding instead of frequency encoding,
    arguments are 'frequency', 'one_hot', 'tf_idf'
    returns: (dataframe of the vectorized documents, gensim sparse corpus)
    raises: ValueError if method is not one of the supported encodings
    """
    if method == 'frequency':
        # Term Document Frequency
        corpus = [id2word.doc2bow(text) for text in df_clean]
    elif method == 'one_hot':
        # One Hot Encoding: keep the token ids, replace every count with 1
        corpus = [[(token_id, 1) for token_id, _ in id2word.doc2bow(doc)] for doc in df_clean]
    elif method == 'tf_idf':
        # The TF-IDF model is built from the statistics stored in the dictionary
        tfidf = gensim.models.TfidfModel(dictionary=id2word, normalize=True)
        corpus = [tfidf[id2word.doc2bow(doc)] for doc in df_clean]
    else:
        # Fail with a clear message instead of an UnboundLocalError on `corpus`
        raise ValueError("Unknown method '{}': expected 'frequency', 'one_hot' or 'tf_idf'".format(method))

    # We transform our corpus to a dense dataframe (documents x tokens)
    df_corpus = corpus_to_df(corpus, dictionary = id2word)
    return df_corpus, corpus

def gensim_vectorize_diagnosis(df, id2word, include_diag = True, id2word_diag = '', method = 'frequency'):
    """ 
    Function to perform vectorization of our dataframe
    df: our dataframe (reads the text_def column and, if requested, diagnosis_def)
    id2word: main dictionary from the full text
    include_diag: set to True if we want to include vectorization of our diagnosis column
    id2word_diag: dictionary of the diagnosis column
    method: see gensim_vectorizer
    returns: dataframe of text features, followed by the diagnosis features
    (every diagnosis column carries the "_diag" suffix) when include_diag is True
    """
    df_vectorized, _ = gensim_vectorizer(df.text_def, id2word, method = method)
    if include_diag == True:
        df_vectorized_diagnosis, _ = gensim_vectorizer(df.diagnosis_def, id2word_diag, method = method)
        # Tag EVERY diagnosis column: merge suffixes alone only rename the tokens
        # shared by both vocabularies, leaving diagnosis-only tokens unmarked
        df_vectorized_diagnosis = df_vectorized_diagnosis.add_suffix("_diag")
        # then merge the two datasets row by row (suffixes still resolve a rare name clash)
        df_vectorized = df_vectorized.merge(df_vectorized_diagnosis, left_index = True, right_index = True, suffixes = ("","_diag"))
    return df_vectorized


def gensim_traintest_vectorizer(train, val, id2word, include_diag = True, id2word_diag = '', method = 'frequency', include_test = True, test = ''):
    """
    Vectorize the train, validation and (optionally) test set with the same settings
    train: our training dataset
    val: our validation dataset
    id2word: dictionary of our main final text column
    include_diag: set to True if we want to include vectorization of our diagnosis column
    id2word_diag: dictionary of the diagnosis column
    method: see gensim_vectorizer
    include_test: set to True if we want to also perform vectorization of our test set
    test: our test set
    returns: (train, validation, test) vectorized dataframes; test is an empty
    list when include_test is not True
    """
    # Options shared by every split
    shared_options = dict(include_diag = include_diag, id2word_diag = id2word_diag, method = method)

    def vectorize(split):
        return gensim_vectorize_diagnosis(split, id2word, **shared_options)

    train_vectorized = vectorize(train)
    val_vectorized = vectorize(val)
    # an empty list stands in for the test set when it is not requested
    test_vectorized = vectorize(test) if include_test == True else []
    return train_vectorized, val_vectorized, test_vectorized

In [16]:
def save_dataframes(train_processed, val_processed, method, include_test = True, include_diag = True, test_processed = ''):
    """
    Save the processed splits in feather format inside path_to_processed
    train_processed: train set to be saved
    val_processed: validation set to be saved
    method: method through which we have processed the dataframes, needed as save keyword
    include_test: True if we want to save also the test set
    include_diag: True if we have included the diagnosis
    test_processed: test set to be saved
    Note: the index of every saved dataframe is reset in place.
    """
    diag_tag = '_diag' if include_diag == True else ''
    # (file-name template, dataframe) pairs, in the order they are written
    to_save = [('{}train_{}{}{}', train_processed), ('{}val_{}{}{}', val_processed)]
    if include_test:
        to_save.append(('{}test_{}{}{}', test_processed))
    for name_template, frame in to_save:
        # need to reset the index
        frame.reset_index(inplace=True, drop = True)
        # save our dataset up to now in feather format
        frame.to_feather(name_template.format(path_to_processed, method, diag_tag, tag_med7))

In [12]:
# Build both vocabularies from the training split only:
# first the main text, then the diagnosis column.
id2word, id2word_diag = (
    corpora.Dictionary(train[column]) for column in ("text_def", "diagnosis_def")
)

In [13]:
# perform vectorization: one pass per bag-of-words encoding, each saved to its own set of feather files
method_list = ['frequency', 'one_hot','tf_idf']

for i in method_list:
    # `method` is also the keyword used in the saved file names
    method = i
    print(i)
    # vectorize train / validation / test with the dictionaries built on the training split
    %time train_processed, val_processed, test_processed = gensim_traintest_vectorizer(train, val, id2word, include_diag = include_diag, id2word_diag = id2word_diag, method = method, include_test = include_test, test = test)
    save_dataframes(train_processed, val_processed, method = method, include_test = include_test, include_diag = include_diag, test_processed = test_processed)

frequency
(23982, 58456)
(23982, 1704)
(2665, 58456)
(2665, 1704)
(6662, 58456)
(6662, 1704)
Wall time: 58.8 s
one_hot
(23982, 58456)
(23982, 1704)
(2665, 58456)
(2665, 1704)
(6662, 58456)
(6662, 1704)
Wall time: 1min 5s
tf_idf
(23982, 58456)
(23982, 1704)
(2665, 58456)
(2665, 1704)
(6662, 58456)
(6662, 1704)
Wall time: 1min 47s


In [14]:
# Store the target column of every split; the index is reset before writing to feather.
y_train = pd.DataFrame(train.target).reset_index(drop = True)
y_train.to_feather('{}y_train{}'.format(path_to_processed, tag_med7))

y_val = pd.DataFrame(val.target).reset_index(drop = True)
y_val.to_feather('{}y_val{}'.format(path_to_processed, tag_med7))

if include_test:
    y_test = pd.DataFrame(test.target).reset_index(drop = True)
    y_test.to_feather('{}y_test{}'.format(path_to_processed, tag_med7))

## Word Embeddings

http://nadbordrozd.github.io/blog/2016/05/20/text-classification-with-word2vec/

In [15]:
# Quick look at the tokenized training texts
train["text_def"]

12933    [family, patient, increase, short, temper, orn...
1098     [surgery, patient, record, know, allergy, drug...
9062     [recurrent, right, pleural, effusion, trap, ri...
26494    [percocet, severe, number, vessel, coronary, a...
27062    [medicine, penicillin, chest, pain, transfer, ...
                               ...                        
4180     [medicine, patient, record, know, allergy, dru...
12541    [codeine, atenolol, time, day, chest, pain, lo...
15585    [medicine, penicillin, withdrawl, rhabdomyolys...
19714    [penicillin, chest, pain, transfer, cardiac, c...
26578    [medicine, patient, record, know, allergy, dru...
Name: text_def, Length: 23982, dtype: object

In [16]:
# Names of the pre-trained models offered by the gensim downloader
print([*gensim.downloader.info()['models']])

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [17]:
# path for the model (local copy stored in the repository)
glove_path = os.path.join(path_to_repo, "Readmission", "data", "word_embeddings", "glove-wiki-gigaword-300", "glove-wiki-gigaword-300.txt")
try:
    # load the model
    glove_wiki = KeyedVectors.load_word2vec_format(datapath(glove_path), binary = False)
    print("Loaded from repository")
except Exception as load_error:
    # `except Exception` rather than a bare except: interrupting the kernel no longer
    # triggers the fallback download, and the real reason of the failure is shown
    print("Could not load from repository ({}), using the gensim downloader".format(load_error))
    # if the code above gives permission denied error, simply load the model (or download it) from the default directory
    glove_wiki = gensim.downloader.load("glove-wiki-gigaword-300")

Loaded from repository


In [18]:
# path for the model (local copy stored in the repository)
word2vec_path = os.path.join(path_to_repo, "Readmission", "data", "word_embeddings", "word2vec_google_news_300", "GoogleNews_vectors_negative300.bin")
try:
    # load the model
    w2v_google_news = KeyedVectors.load_word2vec_format(datapath(word2vec_path), binary = True)
    print("Loaded from repository")
except Exception as load_error:
    # `except Exception` rather than a bare except: interrupting the kernel no longer
    # triggers the fallback download, and the real reason of the failure is shown
    print("Could not load from repository ({}), using the gensim downloader".format(load_error))
    # if the code above gives permission denied error, simply load the model (or download it) from the default directory
    w2v_google_news = gensim.downloader.load("word2vec-google-news-300")

Loaded from repository


In [19]:
# path for the model (local copy stored in the repository)
word2vec_med_path = os.path.join(path_to_repo, "Readmission", "data", "word_embeddings", "wikipedia_pubmed_and_PMC_w2v.bin")
try:
    # load the model
    w2v_med = KeyedVectors.load_word2vec_format(datapath(word2vec_med_path), binary = True)
    print("Loaded from repository")
except Exception as load_error:
    # No fallback exists for this model: report why the load failed instead of hiding it.
    # NOTE: w2v_med stays undefined in this case, so the cells using it will fail.
    print("No Embeddings Found!")
    print("Reason: {}".format(load_error))

Loaded from repository


In [20]:
# path for the model (local copy stored in the repository)
bio_vec_path = os.path.join(path_to_repo, "Readmission", "data", "word_embeddings", "BioWordVec_PubMed_MIMICIII_d200.vec.bin")
try:
    # load the model
    bio_w2v = KeyedVectors.load_word2vec_format(datapath(bio_vec_path), binary = True)
    print("Loaded from repository")
except Exception as load_error:
    # No fallback exists for this model: report why the load failed instead of hiding it.
    # NOTE: bio_w2v stays undefined in this case, so the cells using it will fail.
    print("No Embeddings Found!")
    print("Reason: {}".format(load_error))

Loaded from repository


In [21]:
def embedding_feats(list_of_lists, DIMENSION, w2v_model):
    """
    Function that turns every sentence into the average of the embeddings of its words.
    The input text dataset is a list of lists (or pandas Series) where each sentence is a
    list of words and all the sentences are inside a list
    list_of_lists: our list of sentences
    DIMENSION: the dimension of the word embeddings
    w2v_model: our word embedding model (supports `token in model` and `model[token]`)
    returns: (dataframe with one row per sentence and DIMENSION columns,
    set of words not found in the model, set of sentences with no word in the model)
    credits - https://medium.com/analytics-vidhya/text-classification-from-bag-of-words-to-bert-part-2-word2vec-35c8c3b34ee3
    """
    feats = []
    missing = set()
    missing_sentences = set()
    #Traverse over each sentence
    for tokens in tqdm(list_of_lists):
        # A NEW zero vector for every sentence: reusing one shared array would make
        # the in-place "+=" below accumulate the vectors of all previous sentences
        feat_for_this = np.zeros(DIMENSION)
        #Count the number of words in the embedding for this sentence
        count_for_this = 0
        #Traverse over each word of a sentence
        for token in tokens:
            #Check if the word is in the embedding vector
            if token in w2v_model:
                #Add the vector of the word to vector for the sentence
                feat_for_this += w2v_model[token]
                count_for_this += 1
            #Else assign the missing word to missing set just to have a look at it
            else:
                missing.add(token)
        #If no words are found in the embedding for the sentence
        if count_for_this == 0:
            #Assign all zeroes vector for that sentence
            feats.append(feat_for_this)
            #Assign the missing sentence to missing_sentences just to have a look at it
            missing_sentences.add(' '.join(tokens))
        #Else take average of the values of the embedding for each word to get the embedding of the sentence
        else:
            feats.append(feat_for_this / count_for_this)
    print("Total missing words: {}".format(len(missing)))
    print("Total missing sentences: {}".format(len(missing_sentences)))
    # convert our list of arrays to a DataFrame
    feats = pd.DataFrame(feats)
    return feats, missing, missing_sentences

def embedding_diag(list_of_lists, DIMENSION, w2v_model, include_diag = True, lemmatization = True):
    """
    Function to apply word embeddings to both final text and diagnosis
    list_of_lists: dataframe holding the tokenized columns (text_def / diagnosis_def,
    or their *_nolemma versions)
    DIMENSION: the dimension of the word embeddings
    w2v_model: our word embedding model
    include_diag: True if we want to apply it also to diagnosis embeddings
    lemmatization: True if we want to use the lemmatized text
    returns: (dataframe of sentence embeddings, list of missing words,
    list of sentences without any known word)
    """
    print("\nFinal Text:")
    text_column = list_of_lists.text_def if lemmatization == True else list_of_lists.text_def_nolemma
    df_vectors, missing, missing_sentences = embedding_feats(text_column, DIMENSION, w2v_model)
    missing = list(missing)
    missing_sentences = list(missing_sentences)
    if include_diag == True:
        print("\nDiagnosis:")
        diag_column = list_of_lists.diagnosis_def if lemmatization == True else list_of_lists.diagnosis_def_nolemma
        df_vectors_diag, missing_diag, missing_sentences_diag = embedding_feats(diag_column, DIMENSION, w2v_model)
        df_final = pd.merge(df_vectors, df_vectors_diag, left_index = True, right_index = True, suffixes = ("","_diag"))
        # extend (not append) so both results stay flat lists of strings
        missing.extend(missing_diag)
        missing_sentences.extend(missing_sentences_diag)
    else:
        # text-only result (this branch used to reference an undefined name)
        df_final = df_vectors.copy()
        # string column labels, as feather only accepts string column names
        df_final.columns = df_final.columns.astype(str)
    return df_final, missing, missing_sentences

In [22]:
# perform vectorization WITHOUT LEMMAS
embedding_dict = {'word2vec': w2v_google_news, 'GloVe': glove_wiki, 'W2V_Med': w2v_med, 'Bio_W2V': bio_w2v}


for method, embedding in embedding_dict.items():
  if method == 'W2V_Med' or method == 'Bio_W2V':
    vector_dim = 200
  else:
    vector_dim = 300      
  print(method)
  print("Train Set:")
  %time train_processed, missing_train, missing_sentences_train = embedding_diag(train, vector_dim, embedding, include_diag = include_diag, lemmatization = False)
  print("Validation Set:")
  %time val_processed, missing_val, missing_sentences_val = embedding_diag(val, vector_dim, embedding, include_diag = include_diag, lemmatization = False)
  if include_test == True:
    print("Test Set:")
    %time test_processed, missing_test, missing_sentences_test = embedding_diag(test, vector_dim, embedding, include_diag = include_diag, lemmatization = False)
  else:
    test_processed = []
  save_dataframes(train_processed, val_processed, method = method+'_nolemma', include_test = include_test, include_diag = include_diag, test_processed = test_processed)

  0%|                                                                                        | 0/23982 [00:00<?, ?it/s]

word2vec
Train Set:

Final Text:


100%|███████████████████████████████████████████████████████████████████████████| 23982/23982 [01:46<00:00, 225.18it/s]


Total missing words: 55656
Total missing sentences: 0


 18%|█████████████▍                                                            | 4371/23982 [00:00<00:00, 42125.28it/s]


Diagnosis:


100%|█████████████████████████████████████████████████████████████████████████| 23982/23982 [00:00<00:00, 51598.91it/s]


Total missing words: 1082
Total missing sentences: 158


  1%|▊                                                                              | 29/2665 [00:00<00:09, 274.71it/s]

Wall time: 1min 52s
Validation Set:

Final Text:


100%|█████████████████████████████████████████████████████████████████████████████| 2665/2665 [00:10<00:00, 254.79it/s]


Total missing words: 12589
Total missing sentences: 0


100%|███████████████████████████████████████████████████████████████████████████| 2665/2665 [00:00<00:00, 48846.85it/s]


Diagnosis:
Total missing words: 172
Total missing sentences: 27



  0%|▍                                                                              | 32/6662 [00:00<00:22, 293.96it/s]

Wall time: 11 s
Test Set:

Final Text:


100%|█████████████████████████████████████████████████████████████████████████████| 6662/6662 [00:24<00:00, 268.99it/s]


Total missing words: 24078
Total missing sentences: 0


100%|███████████████████████████████████████████████████████████████████████████| 6662/6662 [00:00<00:00, 72683.33it/s]


Diagnosis:
Total missing words: 419
Total missing sentences: 59





Wall time: 25.8 s


  0%|                                                                                        | 0/23982 [00:00<?, ?it/s]

GloVe
Train Set:

Final Text:


100%|███████████████████████████████████████████████████████████████████████████| 23982/23982 [01:32<00:00, 257.98it/s]


Total missing words: 56450
Total missing sentences: 0


 12%|█████████▏                                                                | 2987/23982 [00:00<00:00, 29607.29it/s]


Diagnosis:


100%|█████████████████████████████████████████████████████████████████████████| 23982/23982 [00:00<00:00, 34606.72it/s]


Total missing words: 1069
Total missing sentences: 141


  1%|▉                                                                              | 32/2665 [00:00<00:09, 286.78it/s]

Wall time: 1min 37s
Validation Set:

Final Text:


100%|█████████████████████████████████████████████████████████████████████████████| 2665/2665 [00:09<00:00, 283.65it/s]
100%|███████████████████████████████████████████████████████████████████████████| 2665/2665 [00:00<00:00, 52913.01it/s]

Total missing words: 12894
Total missing sentences: 0

Diagnosis:
Total missing words: 161
Total missing sentences: 24



  0%|▎                                                                              | 24/6662 [00:00<00:28, 235.09it/s]

Wall time: 9.74 s
Test Set:

Final Text:


100%|█████████████████████████████████████████████████████████████████████████████| 6662/6662 [00:23<00:00, 281.70it/s]
  0%|                                                                                         | 0/6662 [00:00<?, ?it/s]

Total missing words: 24649
Total missing sentences: 0

Diagnosis:


100%|███████████████████████████████████████████████████████████████████████████| 6662/6662 [00:00<00:00, 53119.81it/s]


Total missing words: 407
Total missing sentences: 51
Wall time: 24.5 s


  0%|                                                                              | 1/23982 [00:00<1:04:22,  6.21it/s]

W2V_Med
Train Set:

Final Text:


100%|███████████████████████████████████████████████████████████████████████████| 23982/23982 [01:47<00:00, 224.08it/s]


Total missing words: 39758
Total missing sentences: 0


 22%|████████████████                                                          | 5200/23982 [00:00<00:00, 51279.45it/s]


Diagnosis:


100%|█████████████████████████████████████████████████████████████████████████| 23982/23982 [00:00<00:00, 56519.62it/s]


Total missing words: 571
Total missing sentences: 55


  1%|▉                                                                              | 32/2665 [00:00<00:09, 277.94it/s]

Wall time: 1min 49s
Validation Set:

Final Text:


100%|█████████████████████████████████████████████████████████████████████████████| 2665/2665 [00:10<00:00, 264.87it/s]
100%|███████████████████████████████████████████████████████████████████████████| 2665/2665 [00:00<00:00, 65557.52it/s]


Total missing words: 7136
Total missing sentences: 0

Diagnosis:
Total missing words: 69
Total missing sentences: 9


  0%|▎                                                                              | 26/6662 [00:00<00:28, 231.83it/s]

Wall time: 10.3 s
Test Set:

Final Text:


100%|█████████████████████████████████████████████████████████████████████████████| 6662/6662 [00:25<00:00, 258.21it/s]
  0%|                                                                                         | 0/6662 [00:00<?, ?it/s]

Total missing words: 15231
Total missing sentences: 0

Diagnosis:


100%|███████████████████████████████████████████████████████████████████████████| 6662/6662 [00:00<00:00, 55782.04it/s]


Total missing words: 185
Total missing sentences: 24


  0%|                                                                                        | 0/23982 [00:00<?, ?it/s]

Wall time: 26.5 s
Bio_W2V
Train Set:

Final Text:


100%|███████████████████████████████████████████████████████████████████████████| 23982/23982 [01:54<00:00, 209.24it/s]


Total missing words: 1234
Total missing sentences: 0


 22%|████████████████▍                                                         | 5333/23982 [00:00<00:00, 47242.37it/s]


Diagnosis:


100%|█████████████████████████████████████████████████████████████████████████| 23982/23982 [00:00<00:00, 49509.89it/s]


Total missing words: 71
Total missing sentences: 5


  2%|█▏                                                                             | 42/2665 [00:00<00:12, 206.27it/s]

Wall time: 1min 57s
Validation Set:

Final Text:


100%|█████████████████████████████████████████████████████████████████████████████| 2665/2665 [00:11<00:00, 239.89it/s]
100%|███████████████████████████████████████████████████████████████████████████| 2665/2665 [00:00<00:00, 48325.03it/s]
  0%|                                                                                         | 0/6662 [00:00<?, ?it/s]

Total missing words: 179
Total missing sentences: 0

Diagnosis:
Total missing words: 13
Total missing sentences: 1
Wall time: 11.4 s
Test Set:

Final Text:


100%|█████████████████████████████████████████████████████████████████████████████| 6662/6662 [00:27<00:00, 242.72it/s]
100%|███████████████████████████████████████████████████████████████████████████| 6662/6662 [00:00<00:00, 50824.79it/s]

Total missing words: 435
Total missing sentences: 0

Diagnosis:





Total missing words: 21
Total missing sentences: 3
Wall time: 28.1 s


In [23]:
# perform vectorization
embedding_dict = {'word2vec': w2v_google_news, 'GloVe': glove_wiki, 'W2V_Med': w2v_med, 'Bio_W2V': bio_w2v}

for method, embedding in embedding_dict.items():
    if method == 'W2V_Med' or method == 'Bio_W2V':
      vector_dim = 200
    else:
      vector_dim = 300   
    print(method)
    print("Train Set:")
    %time train_processed, missing_train, missing_sentences_train = embedding_diag(train, vector_dim, embedding, include_diag = include_diag)
    print("Validation Set:")
    %time val_processed, missing_val, missing_sentences_val = embedding_diag(val, vector_dim, embedding, include_diag = include_diag)
    if include_test == True:
        print("Test Set:")
        %time test_processed, missing_test, missing_sentences_test = embedding_diag(test, vector_dim, embedding, include_diag = include_diag)
    else:
        test_processed = []
    save_dataframes(train_processed, val_processed, method = method, include_test = include_test, include_diag = include_diag, test_processed = test_processed)

  0%|                                                                              | 30/23982 [00:00<01:26, 276.01it/s]

word2vec
Train Set:

Final Text:


100%|███████████████████████████████████████████████████████████████████████████| 23982/23982 [01:12<00:00, 330.87it/s]


Total missing words: 35868
Total missing sentences: 0


 38%|████████████████████████████▎                                             | 9182/23982 [00:00<00:00, 82372.83it/s]


Diagnosis:


100%|█████████████████████████████████████████████████████████████████████████| 23982/23982 [00:00<00:00, 85874.05it/s]


Total missing words: 447
Total missing sentences: 96


  1%|█▏                                                                             | 39/2665 [00:00<00:06, 381.82it/s]

Wall time: 1min 16s
Validation Set:

Final Text:


100%|█████████████████████████████████████████████████████████████████████████████| 2665/2665 [00:06<00:00, 399.97it/s]
100%|███████████████████████████████████████████████████████████████████████████| 2665/2665 [00:00<00:00, 88835.54it/s]


Total missing words: 8173
Total missing sentences: 0

Diagnosis:
Total missing words: 81
Total missing sentences: 16


  1%|▌                                                                              | 44/6662 [00:00<00:15, 428.56it/s]

Wall time: 7 s
Test Set:

Final Text:


100%|█████████████████████████████████████████████████████████████████████████████| 6662/6662 [00:16<00:00, 405.63it/s]


Total missing words: 15568
Total missing sentences: 0

Diagnosis:


100%|███████████████████████████████████████████████████████████████████████████| 6662/6662 [00:00<00:00, 88842.99it/s]


Total missing words: 177
Total missing sentences: 39
Wall time: 17.2 s


  0%|▏                                                                             | 41/23982 [00:00<01:05, 364.68it/s]

GloVe
Train Set:

Final Text:


100%|███████████████████████████████████████████████████████████████████████████| 23982/23982 [00:59<00:00, 404.32it/s]


Total missing words: 36883
Total missing sentences: 0


 44%|███████████████████████████████▌                                        | 10521/23982 [00:00<00:00, 105137.66it/s]


Diagnosis:


100%|████████████████████████████████████████████████████████████████████████| 23982/23982 [00:00<00:00, 105885.35it/s]


Total missing words: 478
Total missing sentences: 106


  2%|█▍                                                                             | 47/2665 [00:00<00:06, 428.65it/s]

Wall time: 1min 2s
Validation Set:

Final Text:


100%|█████████████████████████████████████████████████████████████████████████████| 2665/2665 [00:06<00:00, 425.37it/s]
100%|███████████████████████████████████████████████████████████████████████████| 2665/2665 [00:00<00:00, 84250.27it/s]


Total missing words: 8632
Total missing sentences: 0

Diagnosis:
Total missing words: 82
Total missing sentences: 15


  1%|▌                                                                              | 52/6662 [00:00<00:14, 453.36it/s]

Wall time: 6.61 s
Test Set:

Final Text:


100%|█████████████████████████████████████████████████████████████████████████████| 6662/6662 [00:15<00:00, 422.27it/s]
  0%|                                                                                         | 0/6662 [00:00<?, ?it/s]

Total missing words: 16246
Total missing sentences: 0

Diagnosis:


100%|███████████████████████████████████████████████████████████████████████████| 6662/6662 [00:00<00:00, 82152.52it/s]


Total missing words: 182
Total missing sentences: 39
Wall time: 16.6 s


  0%|                                                                              | 36/23982 [00:00<01:15, 316.98it/s]

W2V_Med
Train Set:

Final Text:


100%|███████████████████████████████████████████████████████████████████████████| 23982/23982 [01:01<00:00, 390.68it/s]


Total missing words: 25255
Total missing sentences: 0


 85%|█████████████████████████████████████████████████████████████▍          | 20469/23982 [00:00<00:00, 100249.19it/s]


Diagnosis:


100%|████████████████████████████████████████████████████████████████████████| 23982/23982 [00:00<00:00, 100407.87it/s]


Total missing words: 222
Total missing sentences: 42


  2%|█▎                                                                             | 43/2665 [00:00<00:06, 413.15it/s]

Wall time: 1min 3s
Validation Set:

Final Text:


100%|█████████████████████████████████████████████████████████████████████████████| 2665/2665 [00:07<00:00, 380.00it/s]
100%|███████████████████████████████████████████████████████████████████████████| 2665/2665 [00:00<00:00, 68364.62it/s]


Total missing words: 4641
Total missing sentences: 0

Diagnosis:
Total missing words: 34
Total missing sentences: 4


  0%|▎                                                                              | 29/6662 [00:00<00:25, 260.12it/s]

Wall time: 7.31 s
Test Set:

Final Text:


100%|█████████████████████████████████████████████████████████████████████████████| 6662/6662 [00:16<00:00, 397.76it/s]


Total missing words: 9738
Total missing sentences: 0


100%|██████████████████████████████████████████████████████████████████████████| 6662/6662 [00:00<00:00, 111034.32it/s]


Diagnosis:
Total missing words: 82
Total missing sentences: 14



  0%|                                                                                        | 0/23982 [00:00<?, ?it/s]

Wall time: 17.3 s
Bio_W2V
Train Set:

Final Text:


100%|███████████████████████████████████████████████████████████████████████████| 23982/23982 [01:02<00:00, 382.53it/s]


Total missing words: 4559
Total missing sentences: 0


 43%|██████████████████████████████▉                                         | 10303/23982 [00:00<00:00, 100615.17it/s]


Diagnosis:


100%|████████████████████████████████████████████████████████████████████████| 23982/23982 [00:00<00:00, 101600.86it/s]


Total missing words: 42
Total missing sentences: 8


  2%|█▏                                                                             | 42/2665 [00:00<00:07, 362.74it/s]

Wall time: 1min 4s
Validation Set:

Final Text:


100%|█████████████████████████████████████████████████████████████████████████████| 2665/2665 [00:06<00:00, 396.79it/s]
100%|███████████████████████████████████████████████████████████████████████████| 2665/2665 [00:00<00:00, 91894.15it/s]


Total missing words: 813
Total missing sentences: 0

Diagnosis:
Total missing words: 9
Total missing sentences: 1


  1%|▌                                                                              | 44/6662 [00:00<00:16, 407.46it/s]

Wall time: 6.94 s
Test Set:

Final Text:


100%|█████████████████████████████████████████████████████████████████████████████| 6662/6662 [00:16<00:00, 402.22it/s]


Total missing words: 1754
Total missing sentences: 0


100%|███████████████████████████████████████████████████████████████████████████| 6662/6662 [00:00<00:00, 89776.10it/s]


Diagnosis:
Total missing words: 18
Total missing sentences: 3





Wall time: 17.1 s


# Create DFs without diagnosis

In [17]:
def load_datasets(method, include_diag = True, include_test = True):
    """
    Load back the train, validation and test set saved for the chosen method
    method: string for the processing method we want to load
    include_diag: if we want to load the dataframes with the diagnosis text, default True
    include_test: if we want to load also the test set, default True
    returns: (train, val, test); test is an empty list when include_test is not True
    """
    # same file-name convention used when the dataframes were saved
    diag_tag = '_diag' if include_diag == True else ''
    train = pd.read_feather(f'{path_to_processed}train_{method}{diag_tag}{tag_med7}')
    val = pd.read_feather(f'{path_to_processed}val_{method}{diag_tag}{tag_med7}')
    test = pd.read_feather(f'{path_to_processed}test_{method}{diag_tag}{tag_med7}') if include_test == True else []
    return train, val, test

In [25]:
method_list = ['frequency', 'one_hot','tf_idf', 'word2vec', 'GloVe', 'W2V_Med', 'Bio_W2V']

for method in method_list:
    print(method)
    # Dedicated names, so the raw train / val / test splits created earlier are not overwritten
    train_diag, val_diag, test_diag = load_datasets(method, include_diag = True, include_test = include_test)
    # Diagnosis features are the columns whose name ENDS with the "_diag" suffix
    text_columns = [x for x in train_diag.columns if not x.endswith("_diag")]
    train_text = train_diag[text_columns]
    val_text = val_diag[text_columns]
    # without a test set load_datasets returns an empty list, passed through untouched
    test_text = test_diag[text_columns] if include_test == True else test_diag
    save_dataframes(train_text, val_text, method = method, include_test = include_test, include_diag = False, test_processed = test_text)
    print("Dataframes saved")

frequency
Dataframes saved
one_hot
Dataframes saved
tf_idf
Dataframes saved
word2vec
Dataframes saved
GloVe
Dataframes saved
W2V_Med
Dataframes saved
Bio_W2V
Dataframes saved
