In [None]:
# Load Data
import pandas as pd
# Image metadata: the "Atlas 1" sheet of the muscle atlas workbook.
df = pd.read_excel("HistoQuant/muscle_atlas_2_7.xlsx", sheet_name="Atlas 1")
# Text reports table; it is joined to the image metadata further down
# through its "biopsie_id" column.
df_text = pd.read_csv("EHRoes/text_reports.csv")

# Add a "biopsy_ID" column derived from the "File name" column.
def _biopsy_id_from_filename(file_name, first_col_value):
    """Return the biopsy ID token contained in ``file_name``.

    The name is split on single spaces. When its first token equals
    ``first_col_value`` (compared as text), the ID is the second token;
    otherwise the first token is the ID. Returns "" when ``file_name`` is not
    a string or when no ID token follows the first-column prefix.
    """
    if not isinstance(file_name, str):
        return ""
    tokens = file_name.split(" ")
    if tokens[0] != str(first_col_value):
        return tokens[0]
    # The name starts with the first-column value, so the ID is the next token.
    # A name with nothing after that prefix used to raise IndexError here.
    return tokens[1] if len(tokens) > 1 else ""


filename_list = df["File name"].to_list()
# Pair each file name with the value of the sheet's first column on the same row.
patient_ID = [
    _biopsy_id_from_filename(file_name, first_col_value)
    for file_name, first_col_value in zip(filename_list, df.iloc[:, 0])
]
df["biopsy_ID"] = patient_ID

# Rows whose organism is Homo sapiens.
is_human = df["Organism"].eq("Homo sapiens")
df_homo = df.loc[is_human]

# Rows whose staining method is one of the selected stains.
# Earlier version of the list, which also contained "TEM":
#coloration = ["ATP 9.4", "COX", "COX/SDH", "HE","TEM","TG", "NADH", "PAS", "SDH"]
coloration = ["ATP 9.4", "COX", "COX/SDH", "HE", "TG", "NADH", "PAS", "SDH"]
has_selected_stain = df["Staining method"].isin(coloration)
df_color = df.loc[has_selected_stain]

# Rows in the "Congenital Myopathies" disease group.
is_congenital_myopathy = df["Disease Group"].eq("Congenital Myopathies")
df_myocon = df.loc[is_congenital_myopathy]

# Venn diagram: congenital myopathy + selected stains + Homo sapiens

# Overlap between report biopsy IDs and atlas images.
# Each report ID is reduced to the part before the first "-" and then before
# the first "/"; non-string IDs become "".
biopid_list = df_text["biopsie_id"].to_list()
biopid = [
    value.split("-")[0].split("/")[0] if isinstance(value, str) else ""
    for value in biopid_list
]
df_text["biopsie_id"] = biopid

# "" is the "no ID" placeholder on both sides. Leaving such images in the left
# frame would join every image without an ID to every report without an ID,
# so they are removed before merging. (Null keys are excluded as well, because
# pandas also matches null merge keys with each other.)
images_with_id = df_myocon[df_myocon["biopsy_ID"].fillna("").ne("")]

# Right join: every report is kept, with image columns filled where an image
# of the congenital-myopathy subset shares its biopsy ID.
merged_df = pd.merge(
    images_with_id,
    df_text,
    how="right",
    left_on="biopsy_ID",
    right_on="biopsie_id",
)
# Keep only the rows where "ordered" is present.
# NOTE(review): this presumably keeps reports that matched an image; confirm
# that "ordered" is a column of the image table and not of the report table.
couple_txt_img = merged_df.dropna(subset=["ordered"])
# Summary counts of the images, the reports and the overlaps between the
# three image selections (congenital myopathy, selected stain, Homo sapiens).
def _shared_rows(frame, *others):
    """Return the rows of ``frame`` whose index label occurs in every frame of ``others``.

    The selections are boolean-mask subsets of the same table, so they keep
    that table's row labels and a shared label means "same image row". An
    inner merge on all columns gives the same rows only while no two rows are
    identical; identical rows are multiplied by the merge and inflate the count.
    """
    for other in others:
        frame = frame[frame.index.isin(other.index)]
    return frame


print("Total number of images: ", len(df))
print("#####")
print("Total number of myo.con img: ", len(df_myocon))
print("Total number of stain img: ", len(df_color))
print("Total number of homo sap: ", len(df_homo))
print("#####")
print("Total number of reports: ", len(df_text))
print("Total number of images of reports: ", len(couple_txt_img["patient_id"]))
print("Total number of report with images: ", len(couple_txt_img["patient_id"].unique()))
print("#####")
print("Total number of myo.con+stain: ", len(_shared_rows(df_myocon, df_color)))
print("Total number of myo.con+homo: ", len(_shared_rows(df_myocon, df_homo)))
print("Total number of stain+homo: ", len(_shared_rows(df_color, df_homo)))
# Kept under its original name for the cells below; the index is reset so the
# frame has a fresh 0..n-1 index like the merge result it replaces.
df_merge_color_homo = _shared_rows(df_color, df_homo).reset_index(drop=True)
print("#####")
print("Total number of myo.con+stain+homo: ", len(_shared_rows(df_myocon, df_color, df_homo)))

In [None]:
import matplotlib.pyplot as plt
from upsetplot import from_memberships, UpSet

# Count the images in each exclusive region of the three-way overlap
# (congenital myopathy / selected stain / Homo sapiens) and draw an UpSet plot.
# Membership is tested on df's row labels, which df_color, df_homo and
# df_myocon keep because they are boolean-mask selections of df.
in_color = df.index.isin(df_color.index)
in_homo = df.index.isin(df_homo.index)
in_myocon = df.index.isin(df_myocon.index)

# Every count below is exclusive: the image is in exactly the named group(s).
triple_count = int((in_myocon & in_color & in_homo).sum())
color_homo = int((in_color & in_homo & ~in_myocon).sum())
myocon_homo = int((in_myocon & in_homo & ~in_color).sum())
myocon_color = int((in_myocon & in_color & ~in_homo).sum())
homo = int((in_homo & ~in_color & ~in_myocon).sum())
myocon = int((in_myocon & ~in_color & ~in_homo).sum())
color = int((in_color & ~in_myocon & ~in_homo).sum())
# Images in none of the three groups. The earlier subtraction-based formula
# left myocon_color out of the sum, so this bar was too large by that amount.
no_cat = int((~in_color & ~in_homo & ~in_myocon).sum())

plt.rcParams["figure.figsize"] = (10,10)
# One membership list per region, in the same order as the counts in `data`.
example = from_memberships(
    [[],
    ["Myo. Congénitale"],
    ["Homo sapiens"],
    ["Coloration dans Rapport"],
    ["Myo. Congénitale", "Homo sapiens"],
    ["Coloration dans Rapport", "Homo sapiens"],
    ["Myo. Congénitale", "Coloration dans Rapport"],
    ["Myo. Congénitale", "Coloration dans Rapport", "Homo sapiens"]
    ],
    data=[no_cat, myocon, homo, color, myocon_homo, color_homo, myocon_color, triple_count]
    )
upset = UpSet(example, show_counts='%d')
# Highlight the subsets containing both congenital myopathy and Homo sapiens
# in red, then the subset containing all three categories in blue.
upset.style_subsets(present=["Myo. Congénitale", "Homo sapiens"],
                    facecolor="red",
                    label="90% EM")
upset.style_subsets(present=["Myo. Congénitale", "Coloration dans Rapport", "Homo sapiens"],
                    facecolor="blue")
upset.plot()
# The total in the title is taken from the data instead of being hard-coded.
plt.suptitle(f"Muscle Atlas Image Count (n={len(df)})")
plt.savefig("muscle_atlas_summary.jpg",dpi=300)
plt.show()

In [None]:
# Rows present in all three selections (selected stain, Homo sapiens,
# congenital myopathy), obtained with an inner merge on the shared columns.
muscle_triple_filt = df_merge_color_homo.merge(right=df_myocon, how="inner")
# Optional export of the filtered table (disabled):
#muscle_triple_filt.to_csv("data/muscle_atlas_2_7_filt_triple.csv", index=False)

In [None]:
import os
import shutil
# Copy every HE-stained image of the triple selection (selected stain,
# Homo sapiens, congenital myopathy) from the raw folder into "he_only".
# Rows are selected by shared row label rather than by an inner merge on all
# columns, which would multiply fully identical rows. The index is reset so
# the frame has a fresh 0..n-1 index like a merge result.
in_all_three = df_color.index.isin(df_homo.index) & df_color.index.isin(df_myocon.index)
muscle_triple_filt = df_color[in_all_three].reset_index(drop=True)

# NOTE(review): the "Number" column is used as the image file name here;
# confirm that it holds file names including their extension.
he_files = muscle_triple_filt[muscle_triple_filt["Staining method"]=="HE"]["Number"].to_list()

he_only_dir = os.path.join("HistoQuant","he_only")
# copyfile does not create missing folders; make sure the destination exists.
os.makedirs(he_only_dir, exist_ok=True)
for file in he_files:
    shutil.copyfile(os.path.join("HistoQuant","raw",file), os.path.join(he_only_dir,file))


In [None]:
import random
# Draw 100 HE images at random and copy the first 50 to the train folder and
# the remaining 50 to the test folder.
SPLIT_SEED = 42
# A dedicated, seeded generator makes the split identical on every run.
# Unseeded, each re-run drew a different sample and copied it on top of the
# previous one, so the two folders could grow and end up sharing images.
rng = random.Random(SPLIT_SEED)
# Sample from distinct names so the same file cannot land in both halves.
candidates = list(dict.fromkeys(he_files))
# random.sample raises ValueError when fewer than 100 distinct files exist.
random_subset = rng.sample(candidates, 100)
train_subset = random_subset[:50]
test_subset = random_subset[50:]
for folder, subset in (("he_train", train_subset), ("he_test", test_subset)):
    target_dir = os.path.join("HistoQuant", folder)
    # copyfile does not create missing folders; make sure the destination exists.
    os.makedirs(target_dir, exist_ok=True)
    for file in subset:
        shutil.copyfile(os.path.join("HistoQuant","he_only",file), os.path.join(target_dir,file))
