In [8]:
import pandas as pd
import numpy as np
from subprocess import run
import os
import shutil

#Initial clone: fetch the Hannover image repository once; later runs reuse the checkout.

HANNOVER_REPO_DIR = "covid-19-image-repository"
HANNOVER_REPO_URL = "https://github.com/ml-workgroup/covid-19-image-repository.git"

if not os.path.exists(HANNOVER_REPO_DIR):
    #check=True: stop here with a CalledProcessError when the clone fails,
    #instead of carrying on without the data and failing later.
    #The target directory is passed explicitly so it is the same path the guard tests.
    run(["git", "clone", HANNOVER_REPO_URL, HANNOVER_REPO_DIR], check=True)


In [13]:
def where_offset_zero(val, offset):
    """Return a copy of ``val`` with every entry whose ``offset`` is not 0 set to ``pd.NA``.

    ``val`` itself is left unmodified. Entries whose offset is missing also
    compare as "not equal to 0" and are therefore blanked.
    """
    not_at_zero = offset != 0
    kept = val.copy()
    kept[not_at_zero] = pd.NA
    return kept

def tf_to_yn(arg):
    """Translate a truthy/falsy flag to "Y"/"N"; missing values are returned unchanged."""
    missing = pd.isna(arg)
    if missing:
        return arg
    #bool(arg) is 0 or 1, which selects "N" or "Y" respectively
    return ("N", "Y")[bool(arg)]

def coerce_numeric(row):
    """Convert a Series to numbers, turning unparseable and infinite entries into NaN.

    Parameters
    ----------
    row : pandas.Series
        Values to convert; may hold numbers, numeric strings or arbitrary text.

    Returns
    -------
    pandas.Series
        Numeric Series in which anything that cannot be parsed, and any
        positive or negative infinity, is NaN.
    """
    #Convert first: pd.to_numeric parses text such as "inf" into a real infinity,
    #so the infinities can only be removed reliably after the conversion.
    numeric = pd.to_numeric(row, errors="coerce")
    return numeric.replace([np.inf, -np.inf], np.nan)
    
def convert_metadata(data, starting_ptid):
    """Translate Hannover metadata rows into the column layout appended to metadata.csv.

    Parameters
    ----------
    data : pandas.DataFrame
        Hannover table. The columns read here are ``patient_id``, ``image_id``,
        ``admission_offset``, ``icu_admission_offset``, ``icu_release_offset``,
        ``death_offset``, ``sex``, ``projection`` and the ``*_val`` /
        ``*_offset`` pairs for ``lymphocytes``, ``po2`` and ``neutrophils``.
    starting_ptid : int
        Highest patient id already in use; new ids start at ``starting_ptid + 1``.

    Returns
    -------
    pandas.DataFrame
        One row per input row (keeping the input's index labels), sorted by
        ``patientid`` and then ``offset``.
    """
    #Id: Count up from highest in metadata.csv.
    #cat.codes uses the smallest integer dtype that fits the number of categories
    #(int8 for a few dozen patients); widen it first so that adding the offset
    #cannot overflow once the ids pass 127.
    patient_codes = data["patient_id"].astype("category").cat.codes.astype("int64")
    id_ = patient_codes + (starting_ptid + 1)

    #Use Admission Offset as offset (sign flipped)
    admission_offset = pd.to_numeric(data["admission_offset"], errors="coerce")
    offset = -admission_offset

    #Image names from image ids
    file_col = data["image_id"] + ".png"

    #ICU data
    #A patient "went to ICU" when any of their rows has an ICU admission offset.
    #transform keeps the index of `data`, so the result lines up with the other
    #columns whatever index `data` carries.
    has_icu_admission = data["icu_admission_offset"].notna()
    went_icu = has_icu_admission.groupby(data["patient_id"]).transform("any")

    #Comparisons against a missing offset are False, so rows without an
    #admission or release offset are reported as not in ICU.
    in_icu = (
        went_icu
        & (data["icu_admission_offset"] <= 0)
        & (data["icu_release_offset"] > 0)
    )

    went_icu = went_icu.apply(tf_to_yn)
    in_icu = in_icu.apply(tf_to_yn)

    #COVID-19 or No Finding
    #Uses the coerced offsets so a non-numeric entry does not raise; a missing
    #admission offset compares False and is labelled "No Finding", as before.
    finding = admission_offset.le(0).map({True: "COVID-19", False: "No Finding"})

    #Survival data: "N" when a death offset is recorded, otherwise "Y" when an
    #ICU release offset is recorded, otherwise unknown.
    survival = pd.Series(pd.NA, index=data.index, dtype=object)
    survival[data["icu_release_offset"].notna()] = "Y"
    survival[data["death_offset"].notna()] = "N"  #death takes precedence

    #Take clinical data where the offset is zero
    lymph = coerce_numeric(
        where_offset_zero(data["lymphocytes_val"], data["lymphocytes_offset"])
    )
    po2 = coerce_numeric(
        where_offset_zero(data["po2_val"], data["po2_offset"])
    )
    neutro = coerce_numeric(
        where_offset_zero(data["neutrophils_val"], data["neutrophils_offset"])
    )

    #Uppercase
    sex = data["sex"].str.upper()
    projection = data["projection"].str.upper()

    new_data = pd.DataFrame({
                         "patientid":id_,
                         "sex":sex,
                         "view":projection,
                         "offset":offset,
                         "lymphocyte_count":lymph,
                         "pO2_saturation":po2,
                         "neutrophil_count":neutro,
                         "in_icu":in_icu,
                         "went_icu":went_icu,
                         "survival":survival,
                         "url":"https://github.com/ml-workgroup/covid-19-image-repository",
                         "license":"CC BY 3.0",
                         "location":"Hannover Medical School, Hannover, Germany",
                         "doi":"10.6084/m9.figshare.12275009",
                         "finding":finding,
                         "filename":file_col,
                         "folder":"images",
                         "modality":"X-ray",
                         "date":2020
    })

    #sort_values returns a new frame; its result has to be kept for the sort to apply
    return new_data.sort_values(["patientid", "offset"])


def pull_repo(repo):
    """Run ``git pull origin master`` inside the checkout at ``repo``.

    The command is started with ``cwd=repo`` rather than by changing the
    process working directory, so the caller's working directory is untouched
    even when starting git raises (for example when git is not installed).
    The exit status of git is not checked.
    """
    run(["git", "pull", "origin", "master"], cwd=repo)

def add_hannover(hannover_repo,
                 mila_repo,
                 filename=None):
    """Import Hannover images and metadata rows that the MILA checkout does not have yet.

    Pulls ``hannover_repo``, converts its ``data.csv`` with ``convert_metadata``,
    copies every image whose filename is not yet listed in
    ``<mila_repo>/metadata.csv`` from ``<hannover_repo>/png`` to
    ``<mila_repo>/images`` and appends the matching rows to that metadata file.

    Parameters
    ----------
    hannover_repo : str
        Path of the Hannover repository checkout.
    mila_repo : str
        Path of the checkout that holds ``metadata.csv`` and ``images``.
    filename : optional
        Unused; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        The converted rows that were appended.
    """
    pull_repo(hannover_repo)

    #Open hannover data
    hannover_csv_path = os.path.join(hannover_repo, "data.csv")
    data = pd.read_csv(hannover_csv_path)

    #Open mila data
    mila_csv_path = os.path.join(mila_repo, "metadata.csv")
    metadata = pd.read_csv(mila_csv_path)

    #Highest patient id in use. Series.max skips missing values, whereas the
    #builtin max() gives order-dependent results with NaN and fails on no rows.
    existing_ids = pd.to_numeric(metadata["patientid"], errors="coerce")
    starting_ptid = int(existing_ids.max()) if existing_ids.notna().any() else 0

    #Convert hannover data and keep only images not listed in metadata.csv yet.
    #NOTE(review): ids are numbered from the current maximum on every run, so an
    #image added later for an already imported patient gets a different patientid
    #than that patient's earlier rows - confirm whether that is acceptable.
    new_data = convert_metadata(data, starting_ptid=starting_ptid)
    already_present = new_data["filename"].isin(metadata["filename"])
    new_data = new_data[~already_present]

    #Copy images
    mila_img_path = os.path.join(mila_repo, "images")
    hannover_img_path = os.path.join(hannover_repo, "png")
    for image in new_data["filename"]:
        print(image)
        shutil.copyfile(os.path.join(hannover_img_path, image),
                        os.path.join(mila_img_path, image))

    #Write new metadata
    #The rows are appended without a header, so they must have exactly the
    #columns of metadata.csv in the same order: columns missing from new_data
    #become blank, and columns metadata.csv does not have are left out.
    rows_to_append = new_data.reindex(columns=metadata.columns)
    rows_to_append.to_csv(mila_csv_path, index=False, mode="a", header=False)

    return new_data

In [14]:
#Append the new Hannover rows straight to ../metadata.csv; the returned frame is the cell output
add_hannover(hannover_repo="covid-19-image-repository", mila_repo="..")

b343e657.png
c9280a30.png
53c9be49.png
6b5af975.png
a4318ac9.png
bd3ceeb6.png
9f3f2d91.png
b6e58409.png
2168a917.png
61bc50d1.png
bd85e252.png
87d50e40.png
cae2c0a9.png
5f619d7e.png
a38e1877.png
4d98e1de.png
80b5f00f.png
3c8a0876.png
7a030330.png
00870a9c.png
d15bf071.png
35b446ce.png
00d96e05.png
90070cbc.png
f567c33c.png
c0f74558.png
f6575117.png
ac4f6e4e.png
2cd63b76.png
ffe8b4cb.png
c08a4f41.png
6c5b3802.png
d22964a4.png
262a70ca.png
19abe1f3.png
9a9b2393.png
b39206a9.png
bb0e626a.png
6f7008af.png
441c9cdd.png
add529f3.png
f64132c8.png
bfefde5d.png
bace1e45.png
9d36404d.png
3dedeb92.png
bb4c4038.png
563118e4.png
5f7a99b2.png
a132d8b6.png
88267e40.png
59cb1744.png
fce2b5d4.png
7a2d2695.png
b10c49ca.png
0578e08b.png
3161e216.png
1930e42f.png
1bc3008e.png
93fd0adb.png
483f2ad8.png
bd10d5e2.png
cf35d0c4.png
a7abee59.png
3d388a98.png
8e438fce.png
17ad0a56.png
a2eba651.png
f8335316.png
4fed5061.png
3b46de94.png
d9d6ca9a.png
a361d7b7.png
ac00512e.png
d6b8d378.png
b606e1d0.png
b4e9a53a.png

Unnamed: 0,patientid,sex,view,offset,lymphocyte_count,pO2_saturation,neutrophil_count,in_icu,went_icu,survival,url,license,location,doi,finding,filename,folder,modality,date
0,219,M,AP,0,,55.0,,Y,Y,Y,https://github.com/ml-workgroup/covid-19-image...,CC BY 3.0,"Hannover Medical School, Hannover, Germany",10.6084/m9.figshare.12275009,COVID-19,b343e657.png,images,X-ray,2020
1,219,M,AP,2,,80.0,,Y,Y,Y,https://github.com/ml-workgroup/covid-19-image...,CC BY 3.0,"Hannover Medical School, Hannover, Germany",10.6084/m9.figshare.12275009,COVID-19,c9280a30.png,images,X-ray,2020
2,219,M,AP,5,,105.0,,N,Y,Y,https://github.com/ml-workgroup/covid-19-image...,CC BY 3.0,"Hannover Medical School, Hannover, Germany",10.6084/m9.figshare.12275009,COVID-19,53c9be49.png,images,X-ray,2020
3,220,M,AP,0,2.7,165.0,6.2,N,Y,Y,https://github.com/ml-workgroup/covid-19-image...,CC BY 3.0,"Hannover Medical School, Hannover, Germany",10.6084/m9.figshare.12275009,COVID-19,6b5af975.png,images,X-ray,2020
4,221,M,AP,-10,,,,N,Y,Y,https://github.com/ml-workgroup/covid-19-image...,CC BY 3.0,"Hannover Medical School, Hannover, Germany",10.6084/m9.figshare.12275009,No Finding,a4318ac9.png,images,X-ray,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
167,269,M,AP,18,,45.0,,Y,Y,Y,https://github.com/ml-workgroup/covid-19-image...,CC BY 3.0,"Hannover Medical School, Hannover, Germany",10.6084/m9.figshare.12275009,COVID-19,76093afc.png,images,X-ray,2020
168,270,M,PA,-23,,,,N,N,,https://github.com/ml-workgroup/covid-19-image...,CC BY 3.0,"Hannover Medical School, Hannover, Germany",10.6084/m9.figshare.12275009,No Finding,00cfd1c1.png,images,X-ray,2020
169,270,M,PA,-12,,,,N,N,,https://github.com/ml-workgroup/covid-19-image...,CC BY 3.0,"Hannover Medical School, Hannover, Germany",10.6084/m9.figshare.12275009,No Finding,eb8e8a5b.png,images,X-ray,2020
170,270,M,PA,-1,,,,N,N,,https://github.com/ml-workgroup/covid-19-image...,CC BY 3.0,"Hannover Medical School, Hannover, Germany",10.6084/m9.figshare.12275009,No Finding,9eca9ade.png,images,X-ray,2020
