In [15]:
import pandas as pd
import numpy as np
import pickle

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

## Oct. 2, 2024

This notebook iterates through all possible pairings of note categories, subsets the data to hospital admissions (matched on `HADM_ID`) that have clinical notes in both categories of the pair, and finds the most common words in the concatenated text of the two notes as well as in each note category on its own.

Total folders:
- ECG-Echo
- ECG-Nursing
- ECG-Radiology
- Echo-Nursing
- Echo-Radiology
- Nursing-Radiology

In [None]:
# Read the diagnoses table from disk; it is joined on HADM_ID further down.
diagnoses_df = pd.read_csv(filepath_or_buffer="../diagnoses_df.csv")
diagnoses_df

In [3]:
# Read the output_A table from disk; its HADM_ID, gender and age columns are
# the ones used further down.
output_A = pd.read_csv(filepath_or_buffer="../output_A.csv")

In [None]:
# Keep only the join key plus the gender and age columns, then retain a
# single row per HADM_ID (the first occurrence wins).
output_A = (
    output_A
    .loc[:, ['HADM_ID', 'gender', 'age']]
    .drop_duplicates(subset=['HADM_ID'])
)
output_A

In [None]:
# Attach the gender and age columns of output_A to diagnoses_df. The inner
# join on HADM_ID keeps only rows whose HADM_ID is present in both frames.
UandC_data = pd.merge(diagnoses_df, output_A, on='HADM_ID', how='inner')
UandC_data

In [None]:
# All pairings of the four note categories; each pair is processed independently.
pairs = [('ECG', 'Echo'), ('ECG', 'Nursing'), ('ECG', 'Radiology'), ('Echo', 'Nursing'), ('Echo', 'Radiology'), ('Nursing', 'Radiology')]

# Maps each (first category, second category) tuple to a dictionary holding
# everything we want to plot for that pair:
#   "combined_text_words"  - vocabulary learned from the concatenated notes
#   "T_1_column_averages"  - per-word fraction of first-category notes containing the word
#   "T_2_column_averages"  - same, for the second category
#   "T_1_words"            - vocabulary learned from the first category alone
#   "T_2_words"            - vocabulary learned from the second category alone
table_data = dict()

for pair in pairs:
    first_note, second_note = pair

    # this dictionary will contain all of the data we want to plot for this pair of notes data
    data = dict()

    text_data_T_1 = pd.read_csv('../text_csv_files/text_data_' + first_note + '.csv')[['HADM_ID', first_note]]
    text_data_T_2 = pd.read_csv('../text_csv_files/text_data_' + second_note + '.csv')[['HADM_ID', second_note]]

    # combine the Us, Cs, and T_1 T_2 text data into one dataframe; the inner
    # joins keep only HADM_IDs that have a note in both categories
    combined_data = UandC_data.merge(text_data_T_1, how='inner', on='HADM_ID')
    combined_data = combined_data.merge(text_data_T_2, how='inner', on='HADM_ID')

    # Document-frequency cut-offs: ignore words found in fewer than ~10% of the
    # notes or in more than ~90% of them. Integer min_df / max_df are read by
    # scikit-learn as absolute document counts and must be at least 1, so clamp
    # them; for cohorts of 10 or more rows this is identical to int(0.1 * n) /
    # int(0.9 * n), and for tiny cohorts it avoids passing an invalid 0.
    n_docs = len(combined_data)
    min_doc_count = max(1, int(0.1 * n_docs))
    max_doc_count = max(1, int(0.9 * n_docs))

    # binary=False and use_idf=True by default; max_features=5 keeps only the
    # five surviving words with the highest term frequency across the corpus
    vectorizer = TfidfVectorizer(max_features=5, min_df=min_doc_count, max_df=max_doc_count)

    # create a new column that represents the concatenation of the two columns
    # of text data. The explicit space keeps the last word of the first note
    # and the first word of the second note from fusing into a single token.
    combined_data["concated_text"] = combined_data[first_note] + " " + combined_data[second_note]

    # train the vectorizer on the concatenation of the two notes data
    vectorizer.fit(combined_data["concated_text"])
    combined_words = vectorizer.get_feature_names_out()
    print(first_note)
    print(second_note)
    print("combined text", combined_words)
    data["combined_text_words"] = list(combined_words)

    # create the bag of words matrices using the overlapping vocabulary:
    # 1 if the word occurs in the note at all, 0 otherwise
    T_1 = (vectorizer.transform(combined_data[first_note]) > 0).astype(int)
    T_2 = (vectorizer.transform(combined_data[second_note]) > 0).astype(int)

    # positivity ratio of each word: the fraction of notes in which it occurs
    # (densify and average once, then reuse for both the print and the table)
    T_1_averages = np.mean(T_1.toarray(), axis=0)
    T_2_averages = np.mean(T_2.toarray(), axis=0)
    print("T_1 column averages", T_1_averages)
    data["T_1_column_averages"] = list(T_1_averages)
    print("T_2 column averages", T_2_averages)
    data["T_2_column_averages"] = list(T_2_averages)

    # re-train the vectorizer on the first category's notes alone
    vectorizer.fit(combined_data[first_note])
    T_1_words = vectorizer.get_feature_names_out()
    print(first_note, T_1_words)
    data["T_1_words"] = list(T_1_words)

    # re-train the vectorizer on the second category's notes alone
    vectorizer.fit(combined_data[second_note])
    T_2_words = vectorizer.get_feature_names_out()
    print(second_note, T_2_words)
    data["T_2_words"] = list(T_2_words)

    table_data[pair] = data

# Persist the results; the context manager guarantees the file is flushed and
# closed even if pickling raises.
with open("table_data.p", "wb") as pickle_file:
    pickle.dump(table_data, pickle_file)