In [1]:
import os
import pandas as pd
from IPython.display import display_html, HTML

In [2]:
prefixes_path = "../server/embeddings/analysis/prefixes.txt"
data_path = "../data"

In [3]:
with open(prefixes_path, "r") as f:
    prefixes = [line.rstrip() for line in f.readlines()]

In [4]:
index_set_single = [file.split(".")[0] for file in os.listdir(os.path.join(data_path, "indexed/single_chart"))]
index_set_multiple = [file.split(".")[0] for file in os.listdir(os.path.join(data_path, "indexed/multiple_chart/imgs"))]

index_set = []
index_set.extend(index_set_single)
index_set.extend(index_set_multiple)
index_set.sort()

len(index_set)

161

In [5]:
all_data = [file.split(".")[0] for file in os.listdir(os.path.join(data_path, "unified/specs"))]
all_data.sort()

In [6]:
df = pd.DataFrame(all_data, columns=["file_name"])

In [7]:
df['class'] = df['file_name'].apply(lambda x: next((pre for pre in prefixes if x.startswith(pre)), None))
df['index'] = df['file_name'].apply(lambda x: True if x in index_set else False)

In [8]:
chromoscope = ["BOCA", "BRCA", "BTCA", "GBM", "LIRI", "PBCA"]
chromoscope2 = ["breast-", "gastric", "kidney", "ovarian serous cystadenocarcinoma", "ovarian", "prostate adenocarcinoma", "sarcoma"]
missed = [] #temp

def get_original(file_name, prefix):
    if file_name in index_set:
        return file_name
    if prefix.split("-")[0] in chromoscope or prefix.split("-")[0] in chromoscope2:
        return 'BOCA-UK-f83fc777-5416-c3e9-e040-11ac0d482c8e'
    if "_oc" or "_cc_" in file_name:
        root = file_name.split("_oc")[0] if "_oc" in file_name else file_name.split("_cc_")[0]
        matches = [index for index in index_set if prefix in index]
        if len(matches) == 0: 
            # print("error on no matches at all?", file_name)
            missed.append(file_name)
            return None
        elif len(matches) == 1: 
            return matches[0]
        
        if "_oc" in file_name:
            filtered_matches = [match for match in matches if "_oc" in match]
            if len(filtered_matches) == 0: 
                print("error on _oc?", file_name)
            return filtered_matches[0]
        else:
            color = file_name.split("_cc_")[-1]
            filtered_matches = [match for match in matches if f"_cc_{color}" in match] 
            if len(filtered_matches) == 0:
                return matches[0]
            return filtered_matches[0]
    missed.append(file_name)
    return None

In [9]:
df['original'] = df.apply(lambda row: get_original(row['file_name'], row['class']), axis=1)

In [10]:
len(missed)

666

In [11]:
with open("missed.txt", "w") as f:
    for m in missed: 
        f.write(m)
        f.write("\n")

In [12]:
df.to_csv("siblings.csv")

In [13]:
result = df.groupby('original')['file_name'].apply(list).to_dict()
result = {key : [v for v in value if v != key] for key, value in result.items()}
import json

with open("siblings.json", "w") as f:
    f.write(json.dumps(result))

In [14]:
df["image"] = df["file_name"].apply(lambda x: f"{data_path}/unified/imgs/{x}.png")
df["image_original"] = df["original"].apply(lambda x: f"{data_path}/unified/imgs/{x}.png")

def display_images_in_table(df):
    html = """<table border='1' style='border-collapse: collapse; text-align: left;'>
              <tr>"""  

    html += "".join(f"<th style='padding: 2px; text-align: left;'>{col}</th>" for col in df.columns if (col != "image" and col != "image_original"))
    html += "<th style='padding: 2px;'>Image</th></tr>"

    for _, row in df.iterrows():
        html += "<tr>"
        html += "".join(f"<td style='padding: 2px; text-align: left; white-space: nowrap;'>{row[col]}</td>" for col in df.columns if (col != "image" and col != "image_original"))
        html += f"<td style='padding: 2px;'><img src='{row['image']}' style='width:100px;'></td>"
        html += f"<td style='padding: 2px;'><img src='{row['image_original']}' style='width:100px;'></td>"
        html += "</tr>"

    html += "</table>"

    display_html(HTML(html))

# display_images_in_table(df)