This repository displays code from my research during the Vanderbilt Biomedical Informatics Summer Program (2021) [https://t.e2ma.net/message/t444jf/12cgjjq]. Please view the attached abstract for a more concise summary.
- More information on the program [https://www.vumc.org/dbmi/summer-research-internship-program-biomedical-informatics]. 

For privacy reasons, I cannot share the dataset (a collection of clinical notes) or the notebooks used to preprocess it; however, I’ve selected a few functions that were particularly useful. 
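- The following functions were utilized during the initial preprocessing of the data, prior to model selection. They read the annotation file names from a notebook-level variable `ann` that is defined outside these excerpts, and `create_label_dict` additionally needs `defaultdict` from `collections`.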
- Major (Python) steps included: collecting frequencies of various properties associated with the notes, automatically detecting misannotated notes based on developed guidelines, and labeling them.

In [1]:
# helped with collecting frequencies

def create_T_dict_list(folder):
    """Build one dict of "T" annotation lines per file named in ``ann``.

    For each file, every line that starts with "T" and contains at least
    one digit is split on tabs; the first field becomes the key and the
    second field the value. When a line has exactly two fields, the value
    keeps the line's trailing newline.

    Lines that pass the filter but contain no tab have no second field and
    are skipped rather than raising ``IndexError``.

    Args:
        folder: Path prefix prepended verbatim to each file name, so it
            must already end with a path separator.

    Returns:
        A list with one dict per entry of ``ann``, in the same order.

    Note:
        ``ann`` is a name read from the enclosing module/notebook scope;
        it is presumably the list of annotation file names (confirm
        against the notebook that defines it).
    """
    T_dict_list = []

    for file in ann:
        T_dict = {}
        with open(f"{folder}{file}") as file_encoded:
            for line in file_encoded:
                if not line.startswith("T"):
                    continue
                if not any(char.isdigit() for char in line):
                    continue
                T_line = line.split("\t")
                if len(T_line) < 2:
                    # Malformed line: no tab, hence no value field to store.
                    continue
                T_dict[T_line[0]] = T_line[1]
        T_dict_list.append(T_dict)

    return T_dict_list

In [7]:
# helped with identifying misannotated files

def find_drugs(folder):
    """Return the names from ``ann`` whose file contents mention "Drugs".

    Args:
        folder: Path prefix prepended verbatim to each file name.

    Returns:
        A sorted list of the unique file names whose text contains the
        substring "Drugs" at least once.

    Note:
        ``ann`` is read from the enclosing module/notebook scope.
    """
    flagged = set()

    for name in ann:
        with open(f"{folder}{name}") as handle:
            contents = handle.read()
        # "Drugs" contains no newline, so searching the whole text matches
        # exactly when at least one individual line would match.
        if "Drugs" in contents:
            flagged.add(name)

    return sorted(flagged)

In [5]:
# helped with identifying misannotated files 

def find_multis(folder):
    """Return the names from ``ann`` whose files repeat a substance keyword.

    A file is reported when the substring "Smoker", "Drinker" or "DUser"
    occurs more than once anywhere in its text (occurrences are summed
    across all lines, and several on one line all count).

    Args:
        folder: Path prefix prepended verbatim to each file name.

    Returns:
        A sorted list of the unique file names meeting the condition.

    Note:
        ``ann`` is read from the enclosing module/notebook scope.
    """
    keywords = ("Smoker", "Drinker", "DUser")
    repeated = set()

    for name in ann:
        with open(f"{folder}{name}") as handle:
            contents = handle.read()
        # None of the keywords contains a newline, so counting over the
        # whole text equals the sum of the per-line counts.
        if any(contents.count(keyword) > 1 for keyword in keywords):
            repeated.add(name)

    return sorted(repeated)

In [6]:
# helped with labeling files 

def create_label_dict(folder):
    """Map every file named in ``ann`` to a " | "-joined label string.

    For each of the keywords "Smoker", "Drinker" and "DUser", every line
    containing the keyword is split on single spaces; the pieces that
    contain the keyword are concatenated and the text after the last tab
    is kept as that file's label. A later matching line overwrites an
    earlier one for the same file and keyword.

    The labels found for a file are joined with " | " in keyword order.
    Then, for each keyword that does not occur in the joined string,
    " | Unknown<keyword>" is appended. A file with no labels at all
    therefore gets a string that starts with " | ".

    The number of files with a label for each keyword is printed as three
    space-separated integers.

    Args:
        folder: Path prefix prepended verbatim to each file name.

    Returns:
        A ``defaultdict(list)`` whose values are the label strings, keyed
        by file name in the order the names first appear in ``ann``.

    Note:
        ``ann`` and ``defaultdict`` are read from the enclosing
        module/notebook scope.
    """
    substance_ants = ("Smoker", "Drinker", "DUser")
    found_labels = {ant: {} for ant in substance_ants}

    # ---- pass 1: pull the label for each keyword out of every file ---- #
    for name in ann:
        with open(f"{folder}{name}") as handle:
            for row in handle:
                for ant in substance_ants:
                    if ant not in row:
                        continue
                    pieces = "".join(tok for tok in row.split(" ") if ant in tok)
                    # Keep only what follows the last tab (all of it if no tab).
                    found_labels[ant][name] = pieces.rpartition("\t")[2]

    print(*(len(found_labels[ant]) for ant in substance_ants))

    # ---- pass 2: one summary string per file, padded with Unknown* ---- #
    dd = defaultdict(list)
    for name in ann:
        summary = " | ".join(
            found_labels[ant][name]
            for ant in substance_ants
            if name in found_labels[ant]
        )
        for ant in substance_ants:
            if ant not in summary:
                summary += f" | Unknown{ant}"
        dd[name] = summary

    return dd