# Transform the Downloaded DATASET

Take the datasets from the n2c2 challenges and merge them into one coherent dataset.

The years that have .txt files are: 2009, 2010, 2011, 2012, 2018-ADE

In [1]:
import os
import sys
import pandas as pd

In [2]:
# Root folder of each challenge year listed above as containing .txt files.
# The paths are relative, so they resolve against the current working directory.
r_2009 = "DataSet (Medecine)/2009 - Medication"
r_2010 = "DataSet (Medecine)/2010 - Relations"
r_2011 = "DataSet (Medecine)/2011 - Conference"
r_2012 = "DataSet (Medecine)/2012 - Temporal"
r_2018_ADE = "DataSet (Medecine)/2018 - ADE"

In [3]:
#    if file.endwith(.txt):
def txt_to_dataset(file) -> str:
    """Return the content of a text file as one single-line string.

    The file is decoded with the ``utf-8-sig`` codec, so a leading
    byte-order mark is dropped if present. Consecutive lines are joined
    with one space and every newline character is removed.

    Args:
        file: Path of the text file to read.

    Returns:
        The whole file as a single string containing no newline characters.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(file, encoding="utf-8-sig") as f:
        # Join first, strip newlines afterwards: this leaves exactly one
        # space between the end of a line and the start of the next one.
        return " ".join(f.readlines()).replace("\n", "")

In [4]:
def search_for_files(root):
    """
    Walk the directory tree under ``root`` and return a list with the path of
    every file whose name ends in .txt (directory path joined with file name).
    """
    txt_paths = []
    for folder, _subfolders, filenames in os.walk(root):
        txt_paths.extend(
            os.path.join(folder, filename)
            for filename in filenames
            if filename.endswith(".txt")
        )
    return txt_paths

In [5]:
def make_dict(ls):
    """Map every file path in ``ls`` to the text of that file.

    Args:
        ls: Iterable of paths to text files.

    Returns:
        A dict whose keys are the paths taken from ``ls`` and whose values
        are the strings returned by ``txt_to_dataset`` for those paths.
    """
    # Each file is read exactly once and no extra copy of the texts is kept.
    return {file: txt_to_dataset(file) for file in ls}

In [6]:
# Build one DataFrame per challenge year: the index holds the file path and
# the single column (labelled 0) holds the flattened text of that file.
# The root folders come from the r_* constants defined in an earlier cell so
# that each path is written in one place only.
df_2009 = pd.DataFrame.from_dict(
    make_dict(search_for_files(r_2009)), orient="index"
)
df_2010 = pd.DataFrame.from_dict(
    make_dict(search_for_files(r_2010)), orient="index"
)
df_2011 = pd.DataFrame.from_dict(
    make_dict(search_for_files(r_2011)), orient="index"
)
df_2012 = pd.DataFrame.from_dict(
    make_dict(search_for_files(r_2012)), orient="index"
)
df_2018_ADE = pd.DataFrame.from_dict(
    make_dict(search_for_files(r_2018_ADE)), orient="index"
)

In [12]:
# Stack the per-year frames into a single frame; the file-path index of each
# frame is kept as is.
df = pd.concat([df_2009, df_2010, df_2011, df_2012, df_2018_ADE])

In [13]:
# Bare expression: in a notebook cell this renders the combined frame.
df

Unnamed: 0,0
DataSet (Medecine)/2009 - Medication/779976.txt,RECORD #779976 079220999 | LMC | 92547244 | | ...
DataSet (Medecine)/2009 - Medication/896721.txt,RECORD #896721 010732991 | HMC | 51210886 | ...
DataSet (Medecine)/2009 - Medication/639827.txt,RECORD #639827 152184884 | SAMC | 03895233 | |...
DataSet (Medecine)/2009 - Medication/818884.txt,RECORD #818884 571742072 | MMC | 97450845 | | ...
DataSet (Medecine)/2009 - Medication/321886.txt,RECORD #321886 179879476 | CGH | 74875587 | | ...
...,...
DataSet (Medecine)/2018 - ADE/103074.txt,Admission Date: [**2125-10-10**] Discha...
DataSet (Medecine)/2018 - ADE/101739.txt,"Name: [**Known lastname **],[**Known firstnam..."
DataSet (Medecine)/2018 - ADE/101276.txt,Admission Date: [**2120-10-4**] ...
DataSet (Medecine)/2018 - ADE/112226.txt,Admission Date: [**2175-6-10**] ...
