In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


Load the data / packages:

In [2]:
# Import pandas (as pd):
import pandas as pd
# itertools
import itertools
# Import the Word2Vec and KeyedVectors modules and the nltk library:
from gensim.models import Word2Vec, KeyedVectors
import nltk
nltk.download('punkt')

# load the data file with date, deidentified PID, conversation ID:
data = pd.read_csv('/content/gdrive/MyDrive/cbb750/project/118106-Messages-class_DEIDENTIFIED.csv', 
                   encoding = "ISO-8859-1",
                   na_values = ['--', 'N/A', 'na','NaN'])

# convert date type to correct type
data['DATE_OF_MESSAGE'] = pd.to_datetime(data['DATE_OF_MESSAGE'])

data['PROV_TYPE'] = data['PROV_TYPE'].astype('category') 

data['ENC_TYPE_DISPLAY'] = data['ENC_TYPE_DISPLAY'].astype('category') 

data['MYC_MSG_TYP_DISPLAY'] = data['MYC_MSG_TYP_DISPLAY'].astype('category')

data['YEAR'] = data['DATE_OF_MESSAGE'].dt.year

data = data.dropna()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## Frequencies of Emotional Terms Overtime

In [3]:
data_drop_duplicated = data.drop([2307,3328,16192,19534,101,3478,9745,10279,15806,16535,18087,19392,30377,30715,31267,33824])
provider = data_drop_duplicated[data_drop_duplicated['TO_PAT_YN']=="Y"]

# Split into Dr. and Resident Nurse:
provider_phys = provider[provider['PROV_TYPE'] == "Physician"]
provider_nurse = provider[provider['PROV_TYPE'] == "Nurse Practitioner"]
provider_phys.shape
provider_nurse.shape

(2495, 14)

In [4]:
def get_lem_list(provider):
  prov_2012 = provider[provider['YEAR']==2012]
  prov_2013 = provider[provider['YEAR']==2013]
  prov_2014 = provider[provider['YEAR']==2014]
  prov_2015 = provider[provider['YEAR']==2015]
  prov_2016 = provider[provider['YEAR']==2016]
  prov_2017 = provider[provider['YEAR']==2017]
  prov_2018 = provider[provider['YEAR']==2018]
  prov_2019 = provider[provider['YEAR']==2019]
  prov_2020 = provider[provider['YEAR']==2020]

  df_12 = prov_2012['MESSAGE_CL'].values
  df_13 = prov_2013['MESSAGE_CL'].values
  df_14 = prov_2014['MESSAGE_CL'].values
  df_15 = prov_2015['MESSAGE_CL'].values
  df_16 = prov_2016['MESSAGE_CL'].values
  df_17 = prov_2017['MESSAGE_CL'].values
  df_18 = prov_2018['MESSAGE_CL'].values
  df_19 = prov_2019['MESSAGE_CL'].values
  df_20 = prov_2020['MESSAGE_CL'].values

  token_12=[nltk.word_tokenize(row) for row in df_12]
  token_13=[nltk.word_tokenize(row) for row in df_13]
  token_14=[nltk.word_tokenize(row) for row in df_14]
  token_15=[nltk.word_tokenize(row) for row in df_15]
  token_16=[nltk.word_tokenize(row) for row in df_16]
  token_17=[nltk.word_tokenize(row) for row in df_17]
  token_18=[nltk.word_tokenize(row) for row in df_18]
  token_19=[nltk.word_tokenize(row) for row in df_19]
  token_20=[nltk.word_tokenize(row) for row in df_20]

  def make_lower(token):
    for i in range(len(token)):
      token[i] = [t.lower() for t in token[i]]
    return(token)

  final_12 = make_lower(token_12)
  final_13 = make_lower(token_13)
  final_14 = make_lower(token_14)
  final_15 = make_lower(token_15)
  final_16 = make_lower(token_16)
  final_17 = make_lower(token_17)
  final_18 = make_lower(token_18)
  final_19 = make_lower(token_19)
  final_20 = make_lower(token_20)

  def make_stemmed(token):
    for i in range(len(token)):
      token[i] = [nltk.PorterStemmer().stem(t) for t in token[i]]
    return(token)

  stemmed_12 = make_stemmed(final_12)
  stemmed_13 = make_stemmed(final_13)
  stemmed_14 = make_stemmed(final_14)
  stemmed_15 = make_stemmed(final_15)
  stemmed_16 = make_stemmed(final_16)
  stemmed_17 = make_stemmed(final_17)
  stemmed_18 = make_stemmed(final_18)
  stemmed_19 = make_stemmed(final_19)
  stemmed_20 = make_stemmed(final_20)

  stem_list =[stemmed_12,
              stemmed_13,
              stemmed_14,
              stemmed_15,
              stemmed_16,
              stemmed_17,
              stemmed_18,
              stemmed_19,
              stemmed_20]

  nltk.download('wordnet')
  from nltk.stem import WordNetLemmatizer
  lemmatizer = WordNetLemmatizer()

  def make_lemm(token):
    for i in range(len(token)):
      token[i] = [WordNetLemmatizer().lemmatize(t,pos='v') for t in token[i]]
    return(token)

  lem_12 = make_lemm(stemmed_12)
  lem_13 = make_lemm(stemmed_13)
  lem_14 = make_lemm(stemmed_14)
  lem_15 = make_lemm(stemmed_15)
  lem_16 = make_lemm(stemmed_16)
  lem_17 = make_lemm(stemmed_17)
  lem_18 = make_lemm(stemmed_18)
  lem_19 = make_lemm(stemmed_19)
  lem_20 = make_lemm(stemmed_20)

  lem_list = [lem_12,
              lem_13,
              lem_14,
              lem_15,
              lem_16,
              lem_17,
              lem_18,
              lem_19,
              lem_20]

  return lem_list



In [5]:
prov_lem_list = get_lem_list(provider_phys)
nurse_lem_list = get_lem_list(provider_nurse)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


**Question 2:** what is the frequency distribution of the Physician vs. Nurse Practitioner's use of the terms we are interested in over time?

In [None]:
# Make a function to get a W2V model and a CBOW model:
def get_w2v_models(tokenized_messages, min_count = 3, size = 300, window = 7):
  # CBOW:
  print(tokenized_messages)
  model = Word2Vec(tokenized_messages, min_count = min_count, size = size, window = window)
  model.train
  # Skip-Gram:
  sg_model = Word2Vec(tokenized_messages, min_count = min_count, size = size, window = window)
  sg_model.train
  return (model, sg_model)

models_per_year_phys = []
for i in range(len(prov_lem_list)):
  lem = prov_lem_list[i]
  models_per_year_phys.append(get_w2v_models(lem))

models_per_year_nurse = []
for i in range(len(nurse_lem_list)):
  lem = nurse_lem_list[i]
  if(len(lem)) > 0:
    models_per_year_nurse.append(get_w2v_models(lem))
  else:
    models_per_year_nurse.append("None...")

Output hidden; open in https://colab.research.google.com to view.

In [None]:
print(models_per_year[0][0].wv.vocab)

{'mr': <gensim.models.keyedvectors.Vocab object at 0x7f0287ed1290>, 'your': <gensim.models.keyedvectors.Vocab object at 0x7f0287ed1450>, 'to': <gensim.models.keyedvectors.Vocab object at 0x7f0287ed1710>, 'you': <gensim.models.keyedvectors.Vocab object at 0x7f0287ed1f50>, 'and': <gensim.models.keyedvectors.Vocab object at 0x7f0287ebfcd0>, 'dr': <gensim.models.keyedvectors.Vocab object at 0x7f0287ebf590>, 'have': <gensim.models.keyedvectors.Vocab object at 0x7f0287ebf710>, 'i': <gensim.models.keyedvectors.Vocab object at 0x7f0287ebc050>, 'she': <gensim.models.keyedvectors.Vocab object at 0x7f0287ebf150>, 'the': <gensim.models.keyedvectors.Vocab object at 0x7f0287ebf050>, 'be': <gensim.models.keyedvectors.Vocab object at 0x7f0287ebc190>, 'not': <gensim.models.keyedvectors.Vocab object at 0x7f0287ec0310>, 'for': <gensim.models.keyedvectors.Vocab object at 0x7f0287ec06d0>, 'at': <gensim.models.keyedvectors.Vocab object at 0x7f0287ec0050>}


Code to plot the most similar words:

In [None]:
# Adapted from https://www.kaggle.com/pierremegret/gensim-word2vec-tutorial
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
 
import seaborn as sns
sns.set_style("darkgrid")

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

def tsnescatterplot(model, word, list_names, phys_type, model_type, year):
    """ Plot in seaborn the results from the t-SNE dimensionality reduction algorithm of the vectors of a query word,
    its list of most similar words, and a list of words.
    """
    arrays = np.empty((0, len(model.wv.__getitem__([word])[0])), dtype='f')
    word_labels = [word]
    color_list  = ['red']

    # adds the vector of the query word
    arrays = np.append(arrays, model.wv.__getitem__([word]), axis=0)
    
    # gets list of most similar words
    close_words = model.wv.most_similar([word])
    
    # adds the vector for each of the closest words to the array
    #for wrd_score in close_words:
    #    wrd_vector = model.wv.__getitem__([wrd_score[0]])
    #    word_labels.append(wrd_score[0])
    #    color_list.append('blue')
    #    arrays = np.append(arrays, wrd_vector, axis=0)
    
    # adds the vector for each of the words from list_names to the array
    for wrd in list_names:
        wrd_vector = model.wv.__getitem__([wrd])
        word_labels.append(wrd)
        color_list.append('green')
        arrays = np.append(arrays, wrd_vector, axis=0)
    
    # Finds t-SNE coordinates for 2 dimensions
    np.set_printoptions(suppress=True)
    
    Y = TSNE(n_components=2, random_state=0, perplexity=15).fit_transform(arrays)
    
    # Sets everything up to plot
    df = pd.DataFrame({'x': [x for x in Y[:, 0]],
                       'y': [y for y in Y[:, 1]],
                       'words': word_labels,
                       'color': color_list})
    
    fig, _ = plt.subplots()
    fig.set_size_inches(9, 9)
    
    # Basic plot
    p1 = sns.regplot(data=df,
                     x="x",
                     y="y",
                     fit_reg=False,
                     marker="o",
                     scatter_kws={'s': 40,
                                  'facecolors': df['color']
                                 }
                    )
    
    # Adds annotations one by one with a loop
    for line in range(0, df.shape[0]):
         p1.text(df["x"][line],
                 df['y'][line],
                 '  ' + df["words"][line].title(),
                 horizontalalignment='left',
                 verticalalignment='bottom', size='medium',
                 color=df['color'][line],
                 weight='normal'
                ).set_size(15)

    
    plt.xlim(Y[:, 0].min()-50, Y[:, 0].max()+50)
    plt.ylim(Y[:, 1].min()-50, Y[:, 1].max()+50)
            
    plt.title(year + " | " + model_type + ' t-SNE visualization for {}'.format(word.title()))

    plt.savefig(fname = "/content/gdrive/MyDrive/cbb750/project/figs/" + phys_type + "_top_20_" + word + "_" + model_type +  "_" + year + ".png")

In [None]:
terms = ['happy', 'glad', ':) ', ':(', '!', 'sorry', 'sad', 'delight', 'joy', 'pleased', 'thrilled', 'terrible', 'frustrate', 'good', 'great']


Make plots for the words we are interested in:

In [None]:
phys_type = "Physician"

models_per_year = models_per_year_phys

# Years:
years = ["2012",
         "2013",
         "2014",
         "2015",
         "2016",
         "2017",
         "2018",
         "2019",
         "2020"]

terms_modified = ['hello', 'glad', 'sorri', 'joy', 'thank', ':', ')', '(', 'frustrat', 'good', 'great', 'enjoy', 'hope']

for i in range(len(years)):
  year = years[i]
  models = models_per_year[i]

  # CBOW:
  for term in terms_modified:
    current_model = models[0]
    try:
      similar_words = [x[0] for x in current_model.wv.most_similar(term, topn = 25)]
    except:
      print(term + " is not in vocabulary for year " + year + "...")
      continue
    model_type = "CBOW"
    tsnescatterplot(current_model, term, similar_words, phys_type, model_type, year)

  # Skip-Gram:
  for term in terms_modified:
    current_model = models[1]
    try:
      similar_words = [x[0] for x in current_model.wv.most_similar(term, topn = 25)]
    except:
      print(term + " is not in vocabulary for year " + year + "...")
      continue
    model_type = "SG"
    tsnescatterplot(current_model, term, similar_words, phys_type, model_type, year)

Output hidden; open in https://colab.research.google.com to view.

In [None]:
phys_type = "Nurse"

models_per_year = models_per_year_nurse

# Years:
years = ["2012",
         "2013",
         "2014",
         "2015",
         "2016",
         "2017",
         "2018",
         "2019",
         "2020"]

terms_total = ['hello', 'glad', 'sorri', 'joy', 'thank', ':', ')', '(', 'frustrat', 'good', 'great', 'enjoy', 'hope']

for i in range(len(years)):
  year = years[i]
  models = models_per_year[i]

  # CBOW:
  for term in terms_modified:
    current_model = models[0]
    try:
      similar_words = [x[0] for x in current_model.wv.most_similar(term, topn = 25)]
    except:
      print(term + " is not in vocabulary for year " + year + "...")
      continue
    model_type = "CBOW"
    tsnescatterplot(current_model, term, similar_words, phys_type, model_type, year)

  # Skip-Gram:
  for term in terms_modified:
    current_model = models[1]
    try:
      similar_words = [x[0] for x in current_model.wv.most_similar(term, topn = 25)]
    except:
      print(term + " is not in vocabulary for year " + year + "...")
      continue
    model_type = "SG"
    tsnescatterplot(current_model, term, similar_words, phys_type, model_type, year)

Output hidden; open in https://colab.research.google.com to view.

Frequencies:

In [None]:
years = ["2012",
         "2013",
         "2014",
         "2015",
         "2016",
         "2017",
         "2018",
         "2019",
         "2020"]

terms_total = ['hello', 'glad', 'sorri', 'joy', 'thank', ':', ')', '(', 'frustrat', 'good', 'great', 'enjoy', 'hope', 'total']

# CBOW = 0, SG = 1
for model_type in range(2):
  # Rows = years
  # Cols = terms + "total"
  mat = np.zeros(shape = (len(years), len(terms_total)))
  models_per_year = [models_per_year_phys, models_per_year_nurse][model_type]
  for i, year in enumerate(years):
    current_model = models_per_year[i][0]
    for j, word in enumerate(terms_total):
      if(isinstance(current_model, str)):
        mat[i, j] = 0
      else:
        if word != "total":
          try:
            mat[i, j] = current_model.wv.vocab[word].count
          except:
            mat[i, j] = 0
        else:
          total_freq = 0
          for word, vocab_obj in current_model.wv.vocab.items():
            total_freq = total_freq + current_model.wv.vocab[word].count
          mat[i, j] = total_freq

  df = pd.DataFrame(mat, index = years, columns = terms_total)
  df.to_csv("/content/gdrive/MyDrive/cbb750/project/tables/Q2_" + ['Physician', 'Nurse'][model_type] + ".csv")

In [7]:
years = ["2012",
         "2013",
         "2014",
         "2015",
         "2016",
         "2017",
         "2018",
         "2019",
         "2020"]

terms_modified = ['hello', 'glad', 'sorri', 'joy', 'thank', ':', ')', '(', 'frustrat', 'good', 'great', 'enjoy', 'hope']

for model_type in range(2):
  totalmeans = {}
  if model_type == 0:
    lem_list = prov_lem_list
  else:
    lem_list = nurse_lem_list
  for i, lem in enumerate(lem_list):
    meanvec = []
    for message in lem:
      m = {}
      totalindices = []
      for term in terms_modified:
        indices = []
        for ii, j in enumerate(message):
            if j == term:
                indices.append(ii)
        [totalindices.append(x) for x in indices]
        m[term] = len(indices)/len(message)
      m["total"] = (len(totalindices) / len(message))
      meanvec.append(m)
    print(years[i])  
    totalmeans[years[i]] = meanvec

  import json
  with open('/content/gdrive/MyDrive/cbb750/project/Q2_' + ['Physician', 'Nurse'][model_type] + '_json.txt', 'w') as outfile:
      json.dump(totalmeans, outfile)

2012
2013
2014
2015
2016
2017
2018
2019
2020
2012
2013
2014
2015
2016
2017
2018
2019
2020
