# Project | Data Preprocessing
Author: Jeremy Gygi



## Load the data

In [7]:
# Mount Google Drive at /content/gdrive so the project CSV stored there can be read.
# (google.colab is only importable inside a Colab runtime.)
from google.colab import drive
drive.mount('/content/gdrive')


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [8]:
# Import pandas (as pd):
import pandas as pd

# Read the message table from the mounted Drive folder into a DataFrame:
csv_path = '/content/gdrive/MyDrive/cbb750/project/messages_Data_class.csv'
data = pd.read_csv(csv_path)

In [None]:
# NOTE(review): in the saved run this cell raised FileNotFoundError, and `data2`
# is not referenced anywhere else in the visible notebook — confirm whether this
# second export is still needed before keeping the cell.
data2 = pd.read_csv('/content/sample_data/messages_Data_class-2.csv', encoding = "ISO-8859-1")

FileNotFoundError: ignored

## Import libraries and modules
Import the Word2Vec module from gensim built-in models as well as any other libraries/packages:

In [9]:
# Import the Word2Vec and KeyedVectors modules and the nltk library:
from gensim.models import Word2Vec, KeyedVectors
import nltk

## Initial Dataset Analysis:

In [None]:
# Number of samples (rows) in the table:
len(data)

38743

In [None]:
# Number of features (columns) in the table:
len(data.columns)

10

In [None]:
# List the column labels of the loaded table:
data.columns 

Index(['ENC_PROV_ID', 'PROV_TYPE', 'ENC_TYPE_DISPLAY', 'MYC_MSG_TYP_DISPLAY',
       'REPLY_YN', 'TO_PAT_YN', 'MESSAGE_ID', 'MESSAGE_SUB_ID', 'INDEX',
       'MESSAGE_CL'],
      dtype='object')

In [None]:
# Peek at the first record to see what a single sample looks like:
data.iloc[0]

ENC_PROV_ID                                                         5741
PROV_TYPE                                                      Physician
ENC_TYPE_DISPLAY                                         Patient Message
MYC_MSG_TYP_DISPLAY                       Patient Medical Advice Request
REPLY_YN                                                               N
TO_PAT_YN                                                              N
MESSAGE_ID                                                       2415358
MESSAGE_SUB_ID                                                         1
INDEX                                                                  1
MESSAGE_CL             I just refilled my last refill of thyroid med ...
Name: 0, dtype: object

In [None]:
# Count the rows whose message text is missing (NaN):
int(data['MESSAGE_CL'].isna().sum())

44

In [10]:
# Remove only the rows whose message text is missing. A bare dropna() would also
# discard any row with a NaN in one of the other nine columns, silently shrinking
# the corpus for reasons unrelated to the message text.
data = data.dropna(subset=['MESSAGE_CL'])
data.shape

(38699, 10)

## Data Overview:

- **ENC_PROV_ID** - patient identifier (data are deidentified)
- **PROV_TYPE** - "Fellow", "Nurse Practitioner", "Physician", "Physician Assistant", "Registered Nurse"
- **ENC_TYPE_DISPLAY** - always "Patient Message"
- **MYC_MSG_TYP_DISPLAY** - "Case Reminder Message", "General Questionnaire Submission", "Patient Medical Advice Request", "User Message"
- **REPLY_YN** - "Y" or "N" (is this message a reply?)
- **TO_PAT_YN** - "Y" or "N" (was this sent to the patient?)
- **MESSAGE_ID** - identifier for message
- **MESSAGE_SUB_ID** - subidentifier (always 1)
- **INDEX** - index (all unique)
- **MESSAGE_CL** - message (text, free form)

## Column Extraction
Extract the **MESSAGE_CL** column from the data and print the values in that column

In [11]:
# Pull the free-text message column out as a NumPy array
messages = data["MESSAGE_CL"].to_numpy()

# Show the (truncated) array of message strings
print(messages)

# Total number of messages
print(messages.shape[0])

# Number of distinct ENC_PROV_ID values
# NOTE(review): the notebook describes ENC_PROV_ID as a patient identifier, but the
# neighbouring PROV_TYPE column holds provider roles — confirm whether this counts
# patients or providers.
print(len(data['ENC_PROV_ID'].unique()))

['I just refilled my last refill of thyroid med 100 mcg'
 'Has the fleet enema and magnesium citrate been ordered so I can pick them up from the Yale Health Plan Pharmacy Please Let me know Thanks Sent from iPhone'
 'Dr Is there any chance I could be given a summary of the procedure and what was done without having to wait until March 24th as the wait is driving me nuts with the thought of what could be It would just greatly put my mind at ease and thank you for a great procedure as there was absolutely no pain or discomfort once I awoke in recovery'
 ... 'We have scheduled you for April 14th at 9:30am'
 'I am still waiting to hear about my new appointment'
 'Good morning quick question are you talking about an appointment for your hepatitis C We can not make another appointment until you see them']
38699
202


## Tokenize the sentences

In [12]:
# Download the 'punkt' tokenizer data into nltk_data; nltk.word_tokenize (used in the
# next cell) relies on it:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [13]:
# Split every message into its word tokens (one token list per message):
tokenized_messages = list(map(nltk.word_tokenize, messages))

In [None]:
# Preview only the first few tokenized messages. Echoing the whole list would write
# every message in the corpus into the saved notebook output.
tokenized_messages[:5]

## Word2Vec model parameter experimentation (based on HW 4)
Define the Word2Vec model parameters and experiment with every combination of the following parameters (12 combinations in total): min_count (3 or 5), size (100, 200 or 300) and window (3 or 5).

**Make the models:**

I iterate through all possible (12) combinations to make different models, which are stored in a list called "models".

In [None]:
# Make the models:
# Parallel lists: position i holds the hyper-parameters of model i
# (2 min_count values x 3 sizes x 2 windows = 12 combinations).
min_counts = [3, 5, 3, 5, 3, 5, 3, 5, 3, 5, 3, 5]
sizes = [100, 100, 200, 200, 300, 300, 100, 100, 200, 200, 300, 300]
windows = [3, 3, 3, 3, 3, 3, 5, 5, 5, 5, 5, 5]
models = list()

# Build one CBOW model (the Word2Vec default) per combination.
# Word2Vec builds its vocabulary and trains during construction when it is given a
# corpus, so no explicit train() call is required afterwards.
# NOTE(review): `size` is the gensim 3.x keyword; gensim >= 4.0 renamed it to
# `vector_size` — confirm against the installed gensim version.
for min_count, size, window in zip(min_counts, sizes, windows):
  model = Word2Vec(tokenized_messages, min_count = min_count, size = size, window = window)
  models.append(model)

In [None]:
# Report the 10 nearest neighbours of 'depression' for each model in `models`:
for i, model in enumerate(models):
  print(f"min_counts = {min_counts[i]} | size = {sizes[i]} | window = {windows[i]}")
  print([term for term, _ in model.wv.most_similar('depression', topn = 10)])
  print()

min_counts = 3 | size = 100 | window = 3
['hair', 'neuropathy', 'edema', 'bloating', 'numbness', 'anxiety', 'inflammation', 'memory', 'fingertips', 'effusion']

min_counts = 5 | size = 100 | window = 3
['anxiety', 'hair', 'neuropathy', 'balance', 'itching', 'joints', 'edema', 'role', 'bloating', 'mental']

min_counts = 3 | size = 200 | window = 3
['anxiety', 'anemia', 'edema', 'bloating', 'neuropathy', 'inflammation', 'limited', 'dizziness', 'hair', 'balance']

min_counts = 5 | size = 200 | window = 3
['neuropathy', 'edema', 'itch', 'anemia', 'balance', 'joints', 'itching', 'dizziness', 'tenderness', 'memory']

min_counts = 3 | size = 300 | window = 3
['anxiety', 'inflammation', 'edema', 'neuropathy', 'hair', 'memory', 'balance', 'bloating', 'dizziness', 'headaches']

min_counts = 5 | size = 300 | window = 3
['edema', 'numbness', 'neuropathy', 'anxiety', 'sensation', 'hair', 'bloating', 'dizziness', 'anemia', 'survival']

min_counts = 3 | size = 100 | window = 5
['anxiety', 'balance', 

## Skip-Gram approach
* Repeat the same models as above using the Skip-Gram approach.
* Compare the words from corresponding models and determine based on your best judgement which model is better: continuous bag of words (CBOW) or the skip-gram model.

**Make the models:**

Same as above, I iterate through all possible (12) combinations to make different models, which are stored in a list called "sg_models".

In [None]:
# Make the models:
# Parallel lists: position i holds the hyper-parameters of model i
# (2 min_count values x 3 sizes x 2 windows = 12 combinations).
min_counts = [3, 5, 3, 5, 3, 5, 3, 5, 3, 5, 3, 5]
sizes = [100, 100, 200, 200, 300, 300, 100, 100, 200, 200, 300, 300]
windows = [3, 3, 3, 3, 3, 3, 5, 5, 5, 5, 5, 5]
sg_models = list()

# Build one skip-gram model (sg = 1) per combination.
# Word2Vec builds its vocabulary and trains during construction when it is given a
# corpus, so no explicit train() call is required afterwards.
# NOTE(review): `size` is the gensim 3.x keyword; gensim >= 4.0 renamed it to
# `vector_size` — confirm against the installed gensim version.
for min_count, size, window in zip(min_counts, sizes, windows):
  model = Word2Vec(tokenized_messages, sg = 1, min_count = min_count, size = size, window = window)
  sg_models.append(model)

In [None]:
# Report the 10 nearest neighbours of 'depression' for each model in `sg_models`:
for i, model in enumerate(sg_models):
  print(f"min_counts = {min_counts[i]} | size = {sizes[i]} | window = {windows[i]}")
  print([term for term, _ in model.wv.most_similar('depression', topn = 10)])
  print()

min_counts = 3 | size = 100 | window = 3
['constant', 'odor', 'anxiety', 'role', 'memory', 'fatigue', 'incontinence', 'controlling', 'itch', 'extreme']

min_counts = 5 | size = 100 | window = 3
['seizures', 'memory', 'constant', 'controlling', 'overwhelming', 'odor', 'lingering', 'thinning', 'heartburn', 'lack']

min_counts = 3 | size = 200 | window = 3
['controlling', 'overwhelming', 'heartburn', 'seizures', 'memory', 'nerves', 'exhaustion', 'fatigue', 'anxiety', 'balance']

min_counts = 5 | size = 200 | window = 3
['anxiety', 'itch', 'memory', 'success', 'odor', 'toxicity', 'controlling', 'overwhelming', 'constant', 'nerves']

min_counts = 3 | size = 300 | window = 3
['controlling', 'constant', 'thinning', 'memory', 'anxiety', 'overwhelming', 'bruising', 'seizures', 'odor', 'urgency']

min_counts = 5 | size = 300 | window = 3
['controlling', 'exhaustion', 'constant', 'balance', 'anxiety', 'debilitating', 'odor', 'seizures', 'memory', 'lumps']

min_counts = 3 | size = 100 | window = 5

## Printing a word vector
Print the word vector of the word **depression**

In [None]:
# Look up the embedding of 'depression' in the first skip-gram model:
sg_models[0].wv['depression']

array([-0.24142617, -0.09168992,  0.1969962 , -0.84428513,  0.10786492,
        0.06604993, -0.4666908 , -0.02788314,  0.05880716, -0.16011088,
       -0.27510494,  0.25662747,  0.14882384, -0.11448403, -0.2427383 ,
       -0.5336986 ,  0.05724718, -0.06443694, -0.02665311,  0.38244545,
       -0.18953815, -0.11412697,  0.27794892, -0.07679817,  0.07283618,
       -0.11890692, -0.20509028, -0.34019482, -0.14106245, -0.11510722,
       -0.18644409, -0.09260967,  0.3227264 , -0.19595759,  0.23746783,
       -0.2060384 ,  0.02293366, -0.08148553,  0.04159328, -0.19038856,
       -0.3778743 ,  0.18160646, -0.14174743, -0.11083119,  0.21741325,
        0.29977205, -0.24806644, -0.20463176,  0.7196312 , -0.62994254,
       -0.13098384, -0.28923067,  0.31648716,  0.44224384,  0.03229413,
       -0.04759492,  0.34190816,  0.2544161 ,  0.11031759, -0.08683079,
       -0.06736122, -0.28802243, -0.08172561, -0.12040719, -0.08176079,
       -0.16858752,  0.03612233,  0.06022159, -0.19626412,  0.18

## A word graph:

Let's look at a t-SNE projection of the word vectors most similar to "depression":

In [None]:
# Adapted from https://www.kaggle.com/pierremegret/gensim-word2vec-tutorial
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
 
import seaborn as sns
sns.set_style("darkgrid")

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

def tsnescatterplot(model, word, list_names, perplexity=15):
    """Scatter-plot a 2-D t-SNE projection of a query word and a list of other words.

    The query word is drawn in red and every word from ``list_names`` in green;
    each point is annotated with its title-cased word. The figure is drawn on a
    new 9x9 inch matplotlib figure; nothing is returned.

    Parameters
    ----------
    model : object exposing ``model.wv[...]`` vector lookup (e.g. a trained Word2Vec)
    word : str
        Query word whose vector is plotted first.
    list_names : iterable of str
        Additional words to plot alongside the query word.
    perplexity : float, default 15
        Requested t-SNE perplexity. It is lowered to ``n_points - 1`` when fewer
        points are plotted, because t-SNE requires perplexity < number of samples.

    Notes
    -----
    Every word is looked up through ``model.wv``; presumably a word missing from
    the vocabulary raises ``KeyError`` there — confirm against the gensim version.
    """
    # Materialise once so a generator argument is not consumed twice.
    names = list(list_names)
    word_labels = [word] + names
    color_list = ['red'] + ['green'] * len(names)

    # A single batched lookup stacks one row per word, replacing the repeated
    # np.append calls that re-copied the whole array for every word.
    arrays = np.asarray(model.wv[word_labels], dtype='f')

    # Kept from the original implementation: switches NumPy printing to
    # non-scientific notation (global side effect callers may rely on).
    np.set_printoptions(suppress=True)

    # Finds t-SNE coordinates for 2 dimensions
    effective_perplexity = min(perplexity, max(len(word_labels) - 1, 1))
    Y = TSNE(n_components=2, random_state=0,
             perplexity=effective_perplexity).fit_transform(arrays)

    # Sets everything up to plot
    df = pd.DataFrame({'x': Y[:, 0],
                       'y': Y[:, 1],
                       'words': word_labels,
                       'color': color_list})

    fig, ax = plt.subplots()
    fig.set_size_inches(9, 9)

    # Basic plot (scatter only; the regression line is disabled)
    sns.regplot(data=df,
                x="x",
                y="y",
                fit_reg=False,
                marker="o",
                scatter_kws={'s': 40,
                             'facecolors': df['color']},
                ax=ax)

    # Adds one text annotation per point, coloured like its marker
    for x_pos, y_pos, label, color in zip(df['x'], df['y'], df['words'], df['color']):
        ax.text(x_pos,
                y_pos,
                '  ' + label.title(),
                horizontalalignment='left',
                verticalalignment='bottom',
                size=15,
                color=color,
                weight='normal')

    # Pad the limits so labels near the edges stay inside the axes
    ax.set_xlim(Y[:, 0].min()-50, Y[:, 0].max()+50)
    ax.set_ylim(Y[:, 1].min()-50, Y[:, 1].max()+50)

    ax.set_title('t-SNE visualization for {}'.format(word.title()))

In [None]:
# Plot 'depression' together with its 25 nearest neighbours in the CBOW model at index 11.
current_model = models[11]
# `terms` is not defined anywhere in the visible notebook, so the query word is
# named directly, matching the skip-gram cell that follows.
word = 'depression'
similar_words = [x[0] for x in current_model.wv.most_similar(word, topn = 25)]

tsnescatterplot(current_model, word, similar_words)

NameError: ignored

In [None]:
# Plot 'depression' together with its 25 nearest neighbours in the skip-gram model at index 11.
current_model = sg_models[11]
word = 'depression'
similar_words = [neighbour for neighbour, _ in current_model.wv.most_similar(word, topn = 25)]

tsnescatterplot(current_model, word, similar_words)

NameError: ignored