In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the per-file metadata table that every later cell reads from.
data = pd.read_csv(filepath_or_buffer="../../data/dataset_metadata.csv")

In [None]:
# Display the metadata column names; the cells below access rows by these names.
data.columns

In [None]:
# Keyword search over the free-text comments of the two medical doctors.
# Result: `pathologies`, a (rows of `data`) x (patterns) 0/1 matrix whose
# columns follow the order of `found_pattern`.
found_pattern = np.array(["(sub)pleural effusion", "consolidation", "B-line(s)", "A-lines", "irregular pleural line", "air bronchogram", "normal"])
# Lower-case keyword variants per pattern, aligned index-by-index with `found_pattern`.
pat_dict = [["effusion"], ["consol"], ["b line", "b-line", "b - line"], ["a line", "a-line", "a - line"],
            ["pleural irregular", "irregular pleural", "pleural thickening"], ["bronchogram"],
            ["normal", "healthy"]]
assert len(pat_dict) == len(found_pattern), "each pattern needs exactly one keyword list"
# Size the matrix from the pattern list instead of repeating a magic number.
pathologies = np.zeros((len(data), len(found_pattern)))

# NOTE(review): rows skipped below stay all-zero in `pathologies`, so they are
# still written to the CSV and still count in the per-label means computed
# later -- confirm this is intended.
# NOTE(review): plain substring matching means "normal" also fires on
# "abnormal"; only "no normal" / "not normal" are excluded.

# `i` is the row *position* (via enumerate), not the index label, so the matrix
# stays aligned with `data` even if `data` was filtered or re-indexed.
for i, (_, row) in enumerate(data.iterrows()):
    # str(): an empty cell is read as NaN (a float), on which `in` would raise.
    location = str(row["Current location"])
    if "artifacts" in location or "uncertain" in location:
        print("ueberspringe", row["Filename"])
        continue
    all_comments = (str(row['Comments first medical doctor (MD1)']) + " " +  str(row['MD2'])).lower()
    for pat_ind, keywords in enumerate(pat_dict):
        for pat in keywords:
            # A keyword counts only if it is never preceded by "no " / "not "
            # anywhere in the combined comments.
            negated = ("no " + pat) in all_comments or ("not " + pat) in all_comments
            if pat in all_comments and not negated:
                pathologies[i, pat_ind] = 1
    one_pathologies = np.where(pathologies[i] > 0)[0]
    print(found_pattern[one_pathologies])
    # Sanity check: list files named "reg..." whose comments nevertheless
    # mention one of these findings.
    filename = str(row["Filename"]).lower()
    if filename[:3] == "reg" and ("effusion" in all_comments or "consol" in all_comments or "pleural irregular" in all_comments):
        print(filename)
    

In [None]:
# Wrap the indicator matrix in a DataFrame with one integer column per pattern.
mddf = pd.DataFrame(pathologies, columns=found_pattern).astype(int)

In [None]:
# Persist the pattern indicators (row index included) next to the metadata file.
mddf.to_csv(path_or_buf="../../data/pathologies.csv")

In [None]:
def filter_fn(fn):
    """Map a filename prefix to its lower-case class code.

    `fn` is compared case-insensitively against "reg", "cov", "pne" and "vir".
    A match is returned in lower case. Anything else is printed unchanged and
    mapped to "reg".
    """
    code = fn.lower()
    if code not in ("reg", "cov", "pne", "vir"):
        print(fn)
        return "reg"  # all wrong ones are reg right now
    return code
# Class code per file, derived from the first three characters of its name.
label = [filter_fn(filename[:3]) for filename in data["Filename"]]

In [None]:
# Attach the class code of each file as a column of `mddf` (modified in place);
# `label` was built from `data["Filename"]`, so it has one entry per row.
mddf["label"] = label

In [None]:
# Per-class mean of each 0/1 pattern column, i.e. the share of rows in that
# class whose comments mention the pattern.
overview = mddf.groupby(by="label").agg("mean")

In [None]:
# Peek at the second group key of `overview`; its index holds the values of the "label" column.
overview.index[1]

## Main plot

In [None]:
  # the label locations
width = 0.7  # total width of one group of bars
# Grouped bar chart: one group per class on the x axis, one bar per pattern.
num = len(found_pattern)
# Display names for the class codes that index `overview`. Looking them up by
# code (not by position) keeps ticks correct if a class is absent.
class_names = {"cov": "Covid-19", "pne": "Bacterial pneumonia", "reg": "Healthy", "vir": "Viral pneumonia"}
x = np.arange(len(overview))  # the label locations, one per class in `overview`

fig, ax = plt.subplots(figsize=(15,8))

rects = list()
for i in range(num):
    # (i + 0.5) places each bar at the middle of its slot, so the whole group
    # is centred on its tick (ax.bar centres bars on the given x by default).
    rect = ax.bar(x - width/2 + (i + 0.5)*width/num, overview[found_pattern[i]].values, width/num, label=found_pattern[i])
    rects.append(rect)

# Add some text for labels and custom x-axis tick labels.
# NOTE(review): the plotted values are per-class means of 0/1 flags, i.e.
# fractions in [0, 1] rather than percentages -- consider rewording the label.
ax.set_ylabel('Percent of pathology occurrence (MD comments)', fontsize=20)
ax.set_xticks(x)
ax.set_xticklabels([class_names.get(code, code) for code in overview.index], fontsize=20)
ax.legend(fontsize=20, loc="upper center", ncol=4)
ax.set_ylim(0,0.65)


fig.tight_layout()
plt.savefig("../../data/pathologies.pdf")
plt.show()

### Plot the other way round: one group per pattern, one bar per class

In [None]:
# Transposed view of the main plot: one group per pattern on the x axis,
# one bar per class inside each group.
x = np.arange(len(overview.columns))  # the label locations
width = 0.35  # total width of one group of bars
num_classes = len(overview)  # derived from the data instead of a hard-coded 4

fig, ax = plt.subplots(figsize=(10,8))

rects = list()
for i in range(num_classes):
    # (i + 0.5) places each bar at the middle of its slot, so the whole group
    # is centred on its tick (ax.bar centres bars on the given x by default).
    rect = ax.bar(x - width/2 + (i + 0.5)*width/num_classes, overview.iloc[i].values, width/num_classes, label=overview.index[i])
    rects.append(rect)

# Add some text for labels and custom x-axis tick labels.
ax.set_ylabel('Occurrences in comments of medical experts', fontsize=20)
ax.set_xticks(x)
# Label from the plotted columns themselves so ticks and bars cannot drift apart.
ax.set_xticklabels(overview.columns, fontsize=20)
ax.legend(fontsize=20)
ax.set_ylim(0,0.8)

fig.tight_layout()

plt.show()

## Further analysis

In [None]:
# Display the metadata column names again as a reference for the statistics below.
data.columns

In [None]:
# Restrict to rows whose "Type" is "video", then report the NaN-ignoring mean
# and standard deviation of their frame counts.
data_vids = data.loc[data["Type"] == "video"]
print("Average number of frames and std")
tuple(stat(data_vids["Length (frames)"]) for stat in (np.nanmean, np.nanstd))

In [None]:
# Age distribution over the rows that have an age recorded.
notnan_ages = data.loc[np.logical_not(np.isnan(data["Age"]))]
# NOTE(review): distplot is deprecated in recent seaborn releases; consider
# histplot/displot once the installed version is confirmed.
sns.distplot(notnan_ages["Age"]).set_xlabel("Patient age", fontsize=15)