In [None]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

### Report the number of samples from each source

In [None]:
# Load the dataset metadata table; the cells below filter and aggregate this frame.
data = pd.read_csv("../../data/dataset_metadata.csv")

In [None]:
# Rows whose "DOI" cell is filled in.
pubs = data.loc[data["DOI"].notna()]

In [None]:
# Rows without a DOI, then two subsets picked by a substring of the filename.
online1 = data.loc[data["DOI"].isna()]
avi_mask = online1["Filename"].str.contains("Avi")
orthumbr_mask = online1["Filename"].str.contains("orthumbr")
online = online1.loc[avi_mask]
online2 = online1.loc[orthumbr_mask]
# Alternative filters, currently disabled:
# online = online[~online["Filename"].str.contains("olzano")]
# online = online[~online["URL (Video Name)"].str.contains("lotte")]

In [None]:
# Lower-cased three-character filename prefixes of both subsets, with their counts.
# NOTE: `label` is assigned again further down, from the full metadata table.
label = online["Filename"].str[:3].str.lower().tolist()
label2 = online2["Filename"].str[:3].str.lower().tolist()
tuple(np.unique(prefixes, return_counts=True) for prefixes in (label, label2))

In [None]:
# Number of rows with a DOI, plus the count of each distinct "Type" value among them.
len(pubs), np.unique(pubs["Type"], return_counts = True)

### Pathologies plot

In [None]:
def filter_fn(fn):
    """Map a filename prefix to one of the class codes "reg", "cov", "pne", "vir".

    The prefix is lower-cased and returned when it is one of the known codes.
    Any other prefix is printed and mapped to "reg".
    """
    known_codes = ("reg", "cov", "pne", "vir")
    code = fn.lower()
    if code not in known_codes:
        print(fn)
        return "reg"  # every unrecognised prefix is treated as "reg" for now
    return code
# Class code of every metadata row, derived from the first three filename characters.
label = [filter_fn(name[:3]) for name in data["Filename"].values]

In [None]:
# Show the column names of the metadata table.
data.columns

In [None]:
# Build a 0/1 indicator matrix: one row per metadata entry, one column per finding.
# A finding is set when one of its keywords occurs in the concatenated, lower-cased
# comment fields and is not directly preceded by "no " or "not " anywhere in that text.

# Display names of the findings, in the column order of `pathologies`.
found_pattern = np.array(["(sub)pleural effusion", "consolidation", "B-line(s)", "A-lines", "irregular pleural line", "air bronchogram", "normal"])
# Keyword variants searched for in the comments; entry k belongs to found_pattern[k].
pat_dict = [["effusion"], ["consol"], ["b line", "b-line", "b - line"], ["a line", "a-line", "a - line"],
            ["pleural irregular", "irregular pleural", "pleural thickening"], ["bronchogram"],
            ["normal", "healthy"]]
# Size the matrix from the pattern list instead of a hard-coded 7.
pathologies = np.zeros((len(data), len(found_pattern)))

# Row positions that are excluded from the analysis (removed from the matrix later on).
skipped = []
# Use the row position, not the index label, so that the writes into `pathologies`
# and the positions in `skipped` stay valid for any index of `data`.
for pos, (_, row) in enumerate(data.iterrows()):
    location = row["Current location"]
    if "artifacts" in location or "uncertain" in location or "not" in location.lower():
        skipped.append(pos)
        continue
    all_comments = (str(row['Comments first medical doctor (MD1)']) + " " +  str(row['MD2']) + " " +  str(row['Comments from web site'])).lower()
    for pat_ind, keywords in enumerate(pat_dict):
        for pat in keywords:
            negated = ("no " + pat in all_comments) or ("not " + pat in all_comments)
            if pat in all_comments and not negated:
                pathologies[pos, pat_ind] = 1
    

In [None]:
# Drop the row positions recorded in `skipped` from the indicator matrix and the labels.
keep_rows = np.ones(len(pathologies), dtype=bool)
keep_rows[skipped] = False
filtered_pathologies = pathologies[keep_rows]
filtered_labels = np.asarray(label)[keep_rows]

In [None]:
# Wrap the filtered indicator matrix in a DataFrame with one integer column per finding.
mddf = pd.DataFrame(filtered_pathologies.astype(int), columns = found_pattern)

In [None]:
# Write the indicator table to disk. The "label" column is only added in the next cell,
# so it is not part of this file.
mddf.to_csv("../../data/pathologies.csv")

In [None]:
# Attach the class code of each remaining row.
mddf["label"] = filtered_labels

In [None]:
# Mean of every finding indicator per class code, i.e. the fraction of rows showing it.
overview = mddf.groupby("label").mean()

In [None]:
# Inspect the second class code in the grouped table.
overview.index[1]

## Main plot

In [None]:
  # the label locations
width = 0.7  # the width of the bars
num = len(found_pattern)
x = np.arange(4)

uni, counts = np.unique(mddf["label"].values, return_counts=True)
xtick_labs = ["COVID-19", "Bacterial pneu.", "Healthy", "Viral pneu."]
new_xtick_labs = []
for i in range(4):
    new_xtick_labs.append(xtick_labs[i]+"\n (n="+str(int(counts[i]))+")")
    
fig, ax = plt.subplots(figsize=(15,8))

rects = list()
for i in range(num):
    rect = ax.bar(x - width/2 + (i+1)*width/num, overview[found_pattern[i]].values, width/num, label=found_pattern[i])
    rects.append(rect)
# rects2 = ax.bar(x + width/4, women_means, width, label='Women')

# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel('Pathology occurence (%)', fontsize=30)
ax.set_xticks(x)
ax.set_xticklabels(new_xtick_labs, fontsize=26,) #  rotation=5)
ax.legend(fontsize=22.2,loc='upper right', bbox_to_anchor=(1, 1.02), ncol=3, framealpha=0.5) # bbox_to_anchor=(0.25, 0.16, 0.81, 0.88)
ax.set_ylim(0,1.02)
ax.set_yticklabels(range(0,101, 20),fontsize=23)


fig.tight_layout()
plt.savefig("../../pocovidnet/results_oct/plots/pathologies.pdf")
plt.show()

##### 79/261

### Plot the other way round

In [None]:
# Transposed view of the grouped chart: one group per finding, one bar per class.
x = np.arange(len(overview.columns))  # the label locations
width = 0.35  # total width of one group of bars
n_classes = len(overview)  # number of classes actually present, instead of a fixed 4

fig, ax = plt.subplots(figsize=(10,8))

rects = list()
for i in range(n_classes):
    rect = ax.bar(x - width/2 + i*width/n_classes, overview.iloc[i].values, width/n_classes, label=overview.index[i])
    rects.append(rect)

# Axis labels, tick labels and legend.
ax.set_ylabel('Occurrences in comments of medical experts', fontsize=20)
ax.set_xticks(x)
ax.set_xticklabels(found_pattern, fontsize=20)
ax.legend(fontsize=20)
ax.set_ylim(0,0.8)

fig.tight_layout()

plt.show()

## Further analysis

In [None]:
# Show the column names of the metadata table.
data.columns

In [None]:
# Frame-rate statistics over the video entries.
# `data_vids` is built here because the cell that defines it comes later in the
# notebook; on a fresh kernel "Run All" would otherwise stop with a NameError.
data_vids = data[data["Type"]=="video"]
framerates = data_vids["Framerate"].values
# Skip missing entries; the remaining values must be convertible to float.
fr = [float(v) for v in framerates if v is not None and not np.isnan(float(v))]
print("avg framerate", np.mean(fr), np.std(fr), np.min(fr), np.max(fr))

In [None]:
# Video entries only, then mean and standard deviation of their frame counts (NaN ignored).
data_vids = data.loc[data["Type"].eq("video")]
print("Average number of frames and std")
frame_counts = data_vids["Length (frames)"]
np.nanmean(frame_counts), np.nanstd(frame_counts)

In [None]:
def not_nan(data):
    """Return the entries of ``data`` that are neither null nor a placeholder string.

    Entries for which ``pd.isnull`` is true and entries equal to "n/A" or "nd"
    are dropped. The remaining entries are returned as a NumPy array in their
    original order.
    """
    kept = []
    for d in data:
        if pd.isnull(d):
            continue
        if d == "n/A" or d == "nd":
            continue
        kept.append(d)
    return np.array(kept)
        

In [None]:
# Age distribution over all rows that have a usable "Age" entry.
notnan_ages = not_nan(data["Age"]).astype(int)
# Report an actual percentage: the ratio was previously printed unscaled next to a "%" sign.
age_fill_pct = 100 * len(notnan_ages) / len(data)
print("Age filled in for", round(age_fill_pct, 2), "% (len data:", len(data), ")")
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; it is kept here
# so the figure does not change. Consider sns.histplot(..., kde=True) when upgrading.
sns.distplot(notnan_ages)
plt.xlabel("Patient age", fontsize=15)
plt.yticks([])
plt.savefig("../results_oct/plots/age_dist.pdf")

In [None]:
# Median, mean and standard deviation of the available ages.
np.median(notnan_ages), np.mean(notnan_ages), np.std(notnan_ages)

In [None]:
# Gender coverage and distribution.
gender = not_nan(data["Gender"])
# Scale to a percentage: the ratio was previously printed unscaled next to a "%" sign.
print("Gender filled in for", round(100 * len(gender) / len(data), 2), "%")
print(np.unique(gender, return_counts=True))
# Fraction (0..1) of the available entries that equal "m".
print(np.sum(gender=="m") / len(gender))

## Symptoms

In [None]:
# Build an integer symptom table: one row per usable entry, one column per symptom,
# plus an integer class label.
symptom_cols = ['Current location', 'Fever','Cough', 'Respiratory problems', 'Headache', 'Fatigue', 'Asymptomatic']
# 'Sore throat', , 'Loss of smell/taste',
# Work on an explicit copy: adding a column to a slice of `data` triggers pandas'
# SettingWithCopyWarning and is not guaranteed to behave consistently.
symptoms = data[symptom_cols].copy()
# Convert class codes to integers. `return_inverse` yields, for every label, its
# position in the sorted unique codes, the same value labs_uni.index(l) would give.
labs_uni, label_int = np.unique(label, return_inverse=True)
labs_uni = list(labs_uni)
label_int = label_int.tolist()
symptoms["label"] = label_int
symptoms = symptoms.dropna()
# Drop all rows that have no symptom information at all.
symptoms = symptoms[symptoms["Fever"]!="n/A"]
# Drop rows whose location contains "not", "Not" or "artifacts".
excluded_location = symptoms["Current location"].str.contains("not|Not|artifacts")
symptoms = symptoms[~excluded_location]
symptoms = symptoms.drop(columns=["Current location"])
symptom_cols = symptom_cols[1:]
# Remaining "n/A" cells count as "symptom not reported".
for col in symptom_cols:
    symptoms.loc[symptoms[col]=="n/A", col] = 0
symptoms = symptoms.astype(int)


In [None]:
# Ratio of rows with symptom information to rows kept for the pathology analysis.
len(symptoms) / len(filtered_pathologies)

In [None]:
# Grouped bar chart: one group per class, one bar per symptom.
symptoms_grouped = symptoms.groupby("label").aggregate("mean")

# The integer labels index into `labs_uni`. Names and counts are looked up per class
# instead of by position, so the chart stays correct if a class has no rows left.
class_ids = list(symptoms_grouped.index)
uni, counts = np.unique(symptoms["label"].values, return_counts=True)
count_by_id = dict(zip(uni, counts))
display_names = {"cov": "COVID-19", "pne": "Bacterial pneu.", "reg": "Healthy", "vir": "Viral pneu."}
xtick_labs = [display_names.get(labs_uni[k], labs_uni[k]) for k in class_ids]
new_xtick_labs = [
    name + "\n (n=" + str(int(count_by_id[k])) + ")"
    for name, k in zip(xtick_labs, class_ids)
]

width = 0.7  # total width of one group of bars
num = len(symptom_cols)
x = np.arange(len(class_ids))  # the label locations

fig, ax = plt.subplots(figsize=(15,8))

rects = list()
for i in range(num):
    rect = ax.bar(x - width/2 + (i+1)*width/num, symptoms_grouped[symptom_cols[i]].values, width/num, label=symptom_cols[i])
    rects.append(rect)

# Axis labels, tick labels and legend.
ax.set_ylabel('Reported symptoms (%)', fontsize=30)
ax.set_xticks(x)
ax.set_xticklabels(new_xtick_labs, fontsize=26)
ax.legend(fontsize=24, loc="upper left", bbox_to_anchor=(.05, 1.02), ncol=2)
ax.set_ylim(0,1.02)
# Pin the tick positions before relabelling them: the values are fractions in [0, 1],
# the labels show them as percentages.
ax.set_yticks(np.arange(0, 101, 20) / 100)
ax.set_yticklabels(range(0,101, 20),fontsize=23)


fig.tight_layout()
plt.savefig("../../pocovidnet/results_oct/plots/symptoms.pdf")
plt.show()

In [None]:
# Rows whose "Fever" entry is the string "1" and whose integer class label is 2.
has_fever = data["Fever"] == "1"
is_class_two = np.array(label_int) == 2
data_fever = has_fever & is_class_two
print(np.any(data_fever))
data[data_fever]
# np.logical_and(np.array(label_int)==2,  data["Respiratory problems"]==1)

### Avi's data:

In [None]:
# Show the column names of the metadata table.
data.columns

In [None]:
# Rows whose URL contains "humbria", restricted to those with a gender entry and a
# usable location; the result is counted per "Type".
# na=False: the URL column has missing values (a later cell filters them out explicitly);
# without it the mask contains NaN and cannot be used for boolean indexing.
avi = data[data["URL (Video Name)"].str.contains("humbria", na=False)]
# data[np.logical_or(data["Filename"].str.contains("Avi"), data["Filename"].str.contains("orthumbria"))]
avi_gender = avi[~pd.isnull(avi["Gender"])]
# Drop rows whose location contains "not", "Not" or "artifacts".
avi_gender = avi_gender[~avi_gender["Current location"].str.contains("not|Not|artifacts")]
np.unique(avi_gender["Type"], return_counts= True)

## Get numbers of data

In [None]:
# Show the column names of the metadata table.
data.columns

In [None]:
# Sorted distinct titles of all rows that carry a DOI.
dois = data["DOI"]
titles = data.loc[dois.notna()]
distinct_titles = np.unique(titles["Title"])
list(distinct_titles)

In [None]:
# Show all rows that belong to this one publication title.
data[data["Title"]=="Usefulness of lung ultrasound in diagnosing causes of exacerbation in patients with chronic dyspnea"]

In [None]:
# Image rows whose URL contains "humbria" and whose location is usable.
data_image = data[data["Type"]=="image"]
# na=False: rows with a missing URL count as "does not contain"; without it the mask
# contains NaN and cannot be used for boolean indexing.
data_image = data_image[data_image["URL (Video Name)"].str.contains("humbria", na=False)]
# Drop rows whose location contains "not", "Not" or "artifacts".
data_image = data_image[~data_image["Current location"].str.contains("not|Not|artifacts")]
data_image

In [None]:
# Image rows recorded with a convex probe.
data_image = data[data["Type"]=="image"]
# Build the mask from the frame being indexed. A mask taken from `data` has a different
# index and makes pandas warn that the boolean key is reindexed.
data_image[data_image["Probe"]=="convex"]

In [None]:
# GET WEB VIDEOS
# NOTE(review): the heading says videos, but the filter below selects Type == "image".
# Rows without a DOI, minus every row matching one of the listed filename, location
# or URL fragments.
data_image = data[data["Type"]=="image"]
data_image = data_image[pd.isnull(data_image["DOI"])]
for fragment in ["Avi", "olzano", "orthumbria"]:
    data_image = data_image[~data_image["Filename"].str.contains(fragment)]
for fragment in ["utterfly", "not", "Not", "artifacts"]:
    data_image = data_image[~data_image["Current location"].str.contains(fragment)]
# na=False: rows with a missing URL cannot match a fragment and are kept. Without it
# the mask contains NaN, which `~` and boolean indexing do not accept.
for fragment in ["atlas", "grepmed", "litfl", "charlotte"]:
    data_image = data_image[~data_image["URL (Video Name)"].str.contains(fragment, na=False)]
data_image
# data_image[~pd.isnull(data_image["DOI"])]

In [None]:
# Video rows recorded with a linear probe whose filename does not contain "olzano".
linear_videos = data.loc[data["Type"].eq("video")]
linear_videos = linear_videos.loc[linear_videos["Probe"].eq("linear")]
data_image = linear_videos.loc[~linear_videos["Filename"].str.contains("olzano")]
data_image

In [None]:
# GET publications videos and images
# NOTE(review): only rows with Type == "video" are selected below.
pub_videos = data.loc[data["Type"].eq("video")]
pub_videos = pub_videos.loc[pub_videos["DOI"].notna()]
# One pattern covers the three excluded location substrings.
excluded = pub_videos["Current location"].str.contains("not|Not|artifacts")
data_image = pub_videos.loc[~excluded]
data_image

In [None]:
# Rows that have a URL and whose URL contains "grep".
data_notnan = data.loc[data["URL (Video Name)"].notna()]
grep_mask = data_notnan["URL (Video Name)"].str.contains("grep")
data_notnan.loc[grep_mask]

In [None]:
# Print an `rm` command for every row whose "License" entry is the string "None".
# Nothing is deleted here; the commands are only printed.
rm_files = data.loc[data["License"] == "None"]
for fname, location in zip(rm_files["Filename"], rm_files["Current location"]):
    # Strip the first five characters when the location starts with "data".
    rel_dir = location[5:] if location.startswith("data") else location
    if not ("butterfly" in rel_dir or "not" in rel_dir):
        print(f'rm "{rel_dir.lower()}/{fname}"')

In [None]:
import json

# Print every key of the crop file, prefixed with "data/".
with open("../../data/crop.json", "r") as infile:
    crop = json.load(infile)
for key in crop:
    print(f"data/{key}")

In [None]:
# Shell commands kept for reference; run them manually (presumably from the repository
# root) if the files still need to be removed.
# They are commented out because bare shell lines are a SyntaxError in a Python cell
# and abort "Run All", and because running them automatically would delete tracked files.
# git rm data/pocus_images/convex/Cov_ablines_covidmanifestations_paper1.png
# git rm data/pocus_images/convex/Cov_blines_covidmanifestation_paper2.png
# git rm data/pocus_images/linear/Cov_irregularpleural_covidmanifestations_paper3.png
# git rm data/pocus_videos/convex/Reg-nephropocus.mp4
# git rm data/pocus_videos/linear/Reg-NormalLung.mp4
# git rm data/pocus_images/linear/Cov_blines_acutemedicine.png
# git rm data/pocus_images/convex/Reg_bikus.png
# git rm data/pocus_images/convex/Pneu_bikus2.png
# git rm data/pocus_images/convex/Pneu_bikus3.png
# git rm data/pocus_images/convex/Reg_acutemedicine.png

In [None]:
import os

# Compare the current data folders with a backup copy and print the files that exist
# on one side only. Files that differ only by a known video extension count as the same.
path1 = "../../data/"
# The backup location can be overridden with the POCUS_BACKUP_DATA_DIR environment
# variable; the previously hard-coded path remains the default.
path2 = os.environ.get(
    "POCUS_BACKUP_DATA_DIR",
    "/Users/ninawiedemann/Projects/backup_covid19_pocus_ultrasound/data/",
)
for modality in ["convex", "linear"]:
    for datatype in ["videos", "images"]:
        path_new = os.path.join(path1, "pocus_" + datatype, modality)
        path_old = os.path.join(path2, "pocus_" + datatype, modality)
        print("---------", modality, datatype)
        new_files = os.listdir(path_new)
        old_files = os.listdir(path_old)
        # Sets give constant-time membership tests instead of scanning the lists per file.
        new_lookup = set(new_files)
        old_lookup = set(old_files)
        print("\n not in old \n")
        for f in new_files:
            stem = f.split(".")[0]
            candidates = [f, stem + ".gif", stem + ".avi", stem + ".mov"]
            if not any(candidate in old_lookup for candidate in candidates):
                print(f)
        print("\n not in new \n")
        for f in old_files:
            fn_mp4 = f.split(".")[0]+".mp4"
            if f not in new_lookup and fn_mp4 not in new_lookup and "utterfly" not in f:
                print(f)

### Print the frame count of every file (values only, without filenames)

In [None]:
# Reload the metadata table from disk and import OpenCV for reading video properties.
data = pd.read_csv("../../data/dataset_metadata.csv")
import cv2

In [None]:
# Print one value per metadata row: the frame count read from the file on disk, or,
# when the file cannot be found, the value stored in the metadata column ("n/a" if empty).
column = "Length (frames)"

for i, row in data.iterrows():
    loc = row["Current location"].lower()
    if loc.startswith("data"):
        loc = loc[5:]
    filename_without = row["Filename"].split(".")[0]

    # Try the recorded filename first, then the same name with an .mp4 extension.
    candidates = [
        os.path.join("../../data/", loc, row["Filename"]),
        os.path.join("../../data/", loc, filename_without+".mp4"),
    ]
    path = next((p for p in candidates if os.path.exists(p)), None)
    if path is None:
        if pd.isnull(row[column]) or row[column]=="-":
            print("n/a")
        else:
            print(row[column])
        continue
    cap = cv2.VideoCapture(path)
    try:
        # CAP_PROP_FRAME_COUNT is the named constant for property id 7.
        print(int(cap.get(cv2.CAP_PROP_FRAME_COUNT)))
    finally:
        # Release the capture handle; one was previously left open for every file.
        cap.release()