In [None]:
import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Load the dataset metadata table; later cells read its "Filename", comment, demographic and symptom columns.
data = pd.read_csv("../../data/dataset_metadata.csv")

In [None]:
def filter_fn(fn):
    """Map a filename prefix to its lower-case class key.

    Parameters
    ----------
    fn : str
        Prefix to classify (the caller passes the first three characters
        of a filename).

    Returns
    -------
    str
        ``fn`` in lower case if that is one of "reg", "cov", "pne" or "vir".
        Any other prefix is printed and mapped to "reg", because all
        unrecognised files are currently treated as regular.
    """
    prefix = fn.lower()
    if prefix not in ("reg", "cov", "pne", "vir"):
        # Unknown prefix: report it and fall back to the regular class.
        print(fn)
        return "reg"
    return prefix
# Class key of every row, derived from the first three characters of its filename.
label = [filter_fn(filename[:3]) for filename in data["Filename"].values]

In [None]:
# Inspect which metadata columns are available.
data.columns

In [None]:
# Human-readable name for each class key; printed per row as a sanity check of `label`.
lab_dict = {
    "reg": "regular",
    "cov": "COVID-19",
    "pne": "Bacterial pneumonia",
    "vir": "Viral pneumonia",
}
for class_key in label:
    print(lab_dict[class_key])

In [None]:
# Keyword search over the free-text expert comments: pathologies[i, k] is set to 1
# when any keyword of pattern k occurs in row i's combined comments and that keyword
# is not directly preceded by "no " or "not " anywhere in the text.
# NOTE(review): matching is plain substring search, so e.g. the keyword "normal"
# also matches inside "abnormal" — confirm this is acceptable.
found_pattern = np.array([
    "(sub)pleural effusion",
    "consolidation",
    "B-line(s)",
    "A-lines",
    "irregular pleural line",
    "air bronchogram",
    "normal",
])
# pat_dict[k] lists the lower-case search keywords for found_pattern[k].
pat_dict = [
    ["effusion"],
    ["consol"],
    ["b line", "b-line", "b - line"],
    ["a line", "a-line", "a - line"],
    ["pleural irregular", "irregular pleural", "pleural thickening"],
    ["bronchogram"],
    ["normal", "healthy"],
]
pathologies = np.zeros((len(data), 7))

for i, row in data.iterrows():
    location = row["Current location"]
    # Rows flagged through their "Current location" entry are skipped and stay all-zero.
    if "artifacts" in location or "uncertain" in location or "not" in location:
        print("ueberspringe", row["Filename"])
        continue
    all_comments = " ".join([
        str(row['Comments first medical doctor (MD1)']),
        str(row['MD2']),
        str(row['Comments from web site']),
    ]).lower()
    for pat_ind, keywords in enumerate(pat_dict):
        for pat in keywords:
            negated = ("no " + pat) in all_comments or ("not " + pat) in all_comments
            if pat in all_comments and not negated:
                pathologies[i, pat_ind] = 1
    one_pathologies = np.where(pathologies[i]>0)[0]
    # Report files named as "reg" whose comments nevertheless mention a finding keyword.
    is_regular = row["Filename"].lower().startswith("reg")
    if is_regular and any(key in all_comments for key in ("effusion", "consol", "pleural irregular")):
        print(row["Filename"].lower())
    

In [None]:
# Wrap the 0/1 indicator matrix in a DataFrame with one column per pattern name.
mddf = pd.DataFrame(pathologies.astype(int), columns = found_pattern)

In [None]:
# Persist the indicator table; at this point it holds only the pattern columns (no "label" column yet).
mddf.to_csv("../../data/pathologies.csv")

In [None]:
# Attach each row's class key so the indicators can be grouped by class.
mddf["label"] = label

In [None]:
# Per-class mean of every 0/1 indicator, i.e. the fraction of rows of that class in which the pattern was found.
# NOTE(review): rows skipped during extraction stay all-zero in `pathologies` and still count in the denominator — confirm intended.
overview = mddf.groupby("label").aggregate("mean")

In [None]:
# Peek at the second class key in the overview index.
overview.index[1]

## Main plot

In [None]:
# Main plot: how often each pathology pattern is found, per class
# Grouped bars: one group per class, one bar per pathology pattern.
width = 0.7  # total width of one group of bars
num = len(found_pattern)
x = np.arange(4)  # one group position per class

# np.unique returns the class keys sorted; the hard-coded tick labels below are
# written in that order (cov, pne, reg, vir) and assume all four classes occur.
uni, counts = np.unique(mddf["label"].values, return_counts=True)
xtick_labs = ["COVID-19", "Bacterial pneu.", "Healthy", "Viral pneu."]
new_xtick_labs = []
for i in range(4):
    new_xtick_labs.append(xtick_labs[i]+" (n="+str(int(counts[i]))+")")

fig, ax = plt.subplots(figsize=(15,8))

rects = list()
for i in range(num):
    rect = ax.bar(x - width/2 + (i+1)*width/num, overview[found_pattern[i]].values, width/num, label=found_pattern[i])
    rects.append(rect)

# Axis labels, custom x-axis tick labels and legend.
ax.set_ylabel('Percentage of pathology occurence', fontsize=20)
ax.set_xticks(x)
ax.set_xticklabels(new_xtick_labs, fontsize=20)
ax.legend(fontsize=20, loc="upper center", ncol=4)
ax.set_ylim(0,1)
# Fix the tick positions before labelling them: set_yticklabels assigns labels by
# position, so without explicit ticks the labels depend on whatever the automatic
# locator picks (and the 1.0 tick would get no label).
ax.set_yticks(np.linspace(0, 1, 6))
ax.set_yticklabels([str(pct) for pct in range(0, 101, 20)], fontsize=18)


fig.tight_layout()
plt.savefig("../../pocovidnet/results_oct/plots/pathologies.pdf")
plt.show()

### Plot the other way round

In [None]:
# Transposed view: one group of bars per pathology pattern, one bar per class.
width = 0.35  # total width shared by the four class bars of a group
x = np.arange(len(overview.columns))  # one group position per pattern column

fig, ax = plt.subplots(figsize=(10,8))

bar_width = width/4
rects = list()
for i in range(4):
    class_values = overview.iloc[i].values
    rect = ax.bar(x - width/2 + i*width/4, class_values, bar_width, label=overview.index[i])
    rects.append(rect)

# Axis range, axis label, pattern names on the x axis, and the class legend.
ax.set_ylim(0,0.8)
ax.set_ylabel('Occurences in comments of medical experts', fontsize=20)
ax.set_xticks(x)
ax.set_xticklabels(found_pattern, fontsize=20)
ax.legend(fontsize=20)

fig.tight_layout()

plt.show()

## Further analysis

In [None]:
# Re-list the metadata columns before the demographic analysis.
data.columns

In [None]:
# Mean and standard deviation of the frame count over video entries only; NaN lengths are ignored.
is_video = data["Type"]=="video"
data_vids = data[is_video]
frame_counts = data_vids["Length (frames)"]
print("Average number of frames and std")
np.nanmean(frame_counts), np.nanstd(frame_counts)

In [None]:
def not_nan(data):
    """Return the usable entries of ``data`` as a NumPy array.

    An entry is dropped when it is null according to ``pd.isnull`` or when it
    equals one of the placeholder strings "n/A" or "nd". The order of the
    remaining entries is preserved.
    """
    kept = []
    for entry in data:
        if pd.isnull(entry):
            continue
        if entry != "n/A" and entry != "nd":
            kept.append(entry)
    return np.array(kept)
        

In [None]:
# Age distribution over all rows that have a usable "Age" entry.
notnan_ages = not_nan(data["Age"]).astype(int)
# Report the coverage as an actual percentage (the share is multiplied by 100).
age_coverage = 100 * len(notnan_ages) / len(data)
print("Age filled in for {:.1f}% (len data: {})".format(age_coverage, len(data)))
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; switching to
# sns.histplot(..., kde=True) depends on the minimum supported seaborn version — confirm.
sns.distplot(notnan_ages)
plt.xlabel("Patient age", fontsize=15)
plt.yticks([])
plt.savefig("../results_oct/plots/age_dist.pdf")

In [None]:
# Median, mean and standard deviation of the known ages.
np.median(notnan_ages), np.mean(notnan_ages), np.std(notnan_ages)

In [None]:
# Gender statistics over all rows that have a usable "Gender" entry.
gender = not_nan(data["Gender"])
# Report the coverage as an actual percentage (the share is multiplied by 100).
print("Gender filled in for", round(100 * len(gender) / len(data), 2), "%")
print(np.unique(gender, return_counts=True))
# Fraction (0..1) of the known entries that equal "m".
print(np.sum(gender=="m") / len(gender))

## Symptoms

In [None]:
# Build an integer symptom table: one row per usable metadata row, one 0/1 column
# per symptom, plus the integer class index in "label".
symptom_cols = ['Current location', 'Fever','Cough', 'Respiratory problems', 'Headache', 'Fatigue', 'Asymptomatic']
# Take an explicit copy so that adding the "label" column below modifies an
# independent frame instead of a slice of `data`.
symptoms = data[symptom_cols].copy()
# Convert the class keys to integer indices into the sorted unique keys.
labs_uni = list(np.unique(label))
label_int = [labs_uni.index(l) for l in label]
symptoms["label"] = label_int
symptoms = symptoms.dropna()
# Drop rows without symptom information ("Fever" is used as the marker column).
symptoms = symptoms[symptoms["Fever"]!="n/A"]
# Drop rows whose "Current location" entry contains "not" or "artifacts".
symptoms = symptoms[~symptoms["Current location"].str.contains("not")]
symptoms = symptoms[~symptoms["Current location"].str.contains("artifacts")]
symptoms = symptoms.drop(columns=["Current location"])
symptom_cols = symptom_cols[1:]
# Remaining "n/A" entries in the symptom columns are counted as "not reported" (0).
for col in symptom_cols:
    symptoms.loc[symptoms[col]=="n/A", col] = 0
symptoms = symptoms.astype(int)


In [None]:
# Fraction (0..1) of all metadata rows that remain in the symptom table.
len(symptoms) / len(data)

In [None]:
# Fraction of rows per class that report each symptom, drawn as grouped bars.
symptoms_grouped = symptoms.groupby("label").aggregate("mean")

# "label" holds indices into the sorted class keys; the hard-coded tick labels
# below are written in that order and assume all four classes are still present
# after filtering — TODO confirm.
uni, counts = np.unique(symptoms["label"].values, return_counts=True)
xtick_labs = ["Covid-19", "Bacterial pneumonia", "Healthy lung", "Viral pneumonia"]
new_xtick_labs = []
for i in range(4):
    new_xtick_labs.append(xtick_labs[i]+" ("+str(int(counts[i]))+")")

width = 0.7  # total width of one group of bars
num = len(symptom_cols)
x = np.arange(4)  # one group position per class

fig, ax = plt.subplots(figsize=(15,8))

rects = list()
for i in range(num):
    rect = ax.bar(x - width/2 + (i+1)*width/num, symptoms_grouped[symptom_cols[i]].values, width/num, label=symptom_cols[i])
    rects.append(rect)

# Axis labels, custom x-axis tick labels and legend.
ax.set_ylabel('Percentage of reported symptoms', fontsize=20)
ax.set_xticks(x)
ax.set_xticklabels(new_xtick_labs, fontsize=20)
ax.legend(fontsize=20, loc="upper left", ncol=3)
ax.set_ylim(0,1)
# Fix the tick positions before labelling them: set_yticklabels assigns labels by
# position, so without explicit ticks the labels depend on whatever the automatic
# locator picks (and the 1.0 tick would get no label).
ax.set_yticks(np.linspace(0, 1, 6))
ax.set_yticklabels([str(pct) for pct in range(0, 101, 20)], fontsize=18)


fig.tight_layout()
plt.savefig("../../pocovidnet/results_oct/plots/symptoms.pdf")
plt.show()

In [None]:
# Select the rows with class index 2 whose "Fever" entry is the string "1".
has_fever = data["Fever"]=="1"
is_class_two = np.array(label_int)==2
data_fever = has_fever & is_class_two
print(np.any(data_fever))
data[data_fever]

### Avi's data:

In [None]:
# Re-list the metadata columns before looking at the filename-based subset.
data.columns

In [None]:
# Rows whose filename contains "Avi" or "orthumbria", then the gender counts among
# those of them that have a non-null "Gender" entry.
filenames = data["Filename"]
from_avi = filenames.str.contains("Avi") | filenames.str.contains("orthumbria")
avi = data[from_avi]
avi_gender = avi[avi["Gender"].notnull()]
np.unique(avi_gender["Gender"], return_counts= True)

## Debug ICLUS stuff

In [None]:
# Load the ICLUS cropping information from its JSON file.
with open("../../data/ICLUS/ICLUS_cropping.json", "r") as infile:
    frame_cut = json.load(infile)
# Directory path for crop results; not used in the visible cells — presumably consumed elsewhere, TODO confirm.
best_crop_dir = "../results_oct/iclus/best_of_both_crops/"

In [None]:
# Scratch check of np.quantile: 0.9 quantile of 20 uniform random numbers.
# NOTE(review): no seed is set, so the printed values differ on every run.
arr = np.random.rand(20)
print(arr)
np.quantile(arr, 0.9)