In [1]:
import os
import pandas as pd
from IPython.display import display_html, HTML

In [2]:
# Relative locations of the notebook's inputs: the data directory that the
# listings below are joined onto, and the text file the class prefixes are read from.
data_path = "../data"
prefixes_path = "../server/embeddings/analysis/prefixes.txt"

In [3]:
# Read the class prefixes, one per line, with trailing whitespace removed.
# Blank / whitespace-only lines are skipped: they would become an empty prefix,
# and str.startswith("") is true for every file name, so a single stray blank
# line would claim every file that reaches it in the prefix scan.
with open(prefixes_path, "r") as f:
    prefixes = [line.rstrip() for line in f if line.strip()]

In [4]:
# Names (text before the first ".") of every entry in the two indexed folders.
index_set_single = [
    entry.split(".")[0]
    for entry in os.listdir(os.path.join(data_path, "indexed/single_chart"))
]
index_set_multiple = [
    entry.split(".")[0]
    for entry in os.listdir(os.path.join(data_path, "indexed/multiple_chart/imgs"))
]

# Single-chart names followed by multiple-chart names, then sorted.
# This stays a list (not a set) on purpose: later code picks the first match in this order.
index_set = sorted(index_set_single + index_set_multiple)

In [5]:
# Sorted names (text before the first ".") of every entry under unified/specs.
all_data = sorted(
    entry.split(".")[0]
    for entry in os.listdir(os.path.join(data_path, "unified/specs"))
)

In [6]:
# One row per spec name; further columns are attached in the cells below.
df = pd.DataFrame({"file_name": all_data})

In [7]:
# 'class' is the first entry of `prefixes` (in file order) that the name starts
# with, or None when no prefix matches.
df['class'] = df['file_name'].apply(lambda x: next((pre for pre in prefixes if x.startswith(pre)), None))
# 'index' flags the names that are present in index_set. Series.isin replaces the
# per-row `True if x in index_set else False` scan of the list with one hashed lookup.
df['index'] = df['file_name'].isin(index_set)

In [8]:
# First pass at 'original': a name that is itself indexed maps to itself, anything
# else to None. This column is overwritten further down using get_original.
df['original'] = df['file_name'].map(lambda name: name if name in index_set else None)

In [9]:
# Preview the first ten rows of the table built so far.
df.head(10)

Unnamed: 0,file_name,class,index,original
0,AREA_sw_0_7_s_0_7_cc_0,AREA,True,AREA_sw_0_7_s_0_7_cc_0
1,AREA_sw_0_7_s_0_7_cc_1,AREA,False,
2,AREA_sw_0_7_s_0_7_cc_2,AREA,False,
3,AREA_sw_0_7_s_0_7_cc_3,AREA,False,
4,AREA_sw_0_7_s_0_7_oc,AREA,False,
5,AREA_sw_0_7_s_1_0_cc_0,AREA,False,
6,AREA_sw_0_7_s_1_0_cc_1,AREA,True,AREA_sw_0_7_s_1_0_cc_1
7,AREA_sw_0_7_s_1_0_cc_2,AREA,False,
8,AREA_sw_0_7_s_1_0_cc_3,AREA,False,
9,AREA_sw_0_7_s_1_0_oc,AREA,False,


In [10]:
# Leading components (text before the first "-") of class prefixes that are all
# mapped to one fixed indexed entry by get_original.
chromoscope = ["BOCA", "BRCA", "BTCA", "GBM", "LIRI", "PBCA"]
# File names for which get_original could not find an indexed counterpart.
missed = [] #temp

def get_original(file_name, prefix):
    """Pick the indexed entry that stands in as the "original" of `file_name`.

    Resolution order:
      1. `file_name` is itself in `index_set`: it is returned unchanged.
      2. The part of `prefix` before the first "-" is listed in `chromoscope`:
         a single hard-coded indexed name is returned.
      3. `file_name` contains "_oc" or "_cc_": candidates are the entries of
         `index_set` that contain `prefix`.
           * no candidate: recorded in `missed`, returns None
           * exactly one candidate: that candidate
           * "_oc" name: the first candidate that also contains "_oc"; when
             there is none the diagnostic is printed, the name is recorded in
             `missed` and None is returned
           * "_cc_<color>" name: the first candidate containing the same
             "_cc_<color>" suffix text, else the first candidate
      4. Anything else is recorded in `missed` and None is returned.

    Args:
        file_name: name to resolve.
        prefix: class prefix of the name, or None when it has no known class.

    Returns:
        The chosen name from `index_set` (or the hard-coded name of step 2),
        or None when nothing suitable was found.

    Side effects:
        Appends to the module-level `missed` list and may print a diagnostic.
    """
    if file_name in index_set:
        return file_name

    # A row without a class carries None here; there is no prefix to search
    # with, so record the name instead of failing on `None.split`.
    if not isinstance(prefix, str):
        missed.append(file_name)
        return None

    if prefix.split("-")[0] in chromoscope:
        return 'BOCA-UK-f83fc777-5416-c3e9-e040-11ac0d482c8e'

    # The previous test `"_oc" or "_cc_" in file_name` was always true because a
    # non-empty string literal is truthy; both markers are now really checked.
    is_oc = "_oc" in file_name
    if not is_oc and "_cc_" not in file_name:
        missed.append(file_name)
        return None

    # NOTE(review): this is a substring test, whereas the 'class' column is
    # assigned with startswith - confirm that substring matching is intended.
    matches = [index for index in index_set if prefix in index]
    if not matches:
        missed.append(file_name)
        return None
    if len(matches) == 1:
        return matches[0]

    if is_oc:
        oc_matches = [match for match in matches if "_oc" in match]
        if not oc_matches:
            # Previously this fell through to `filtered_matches[0]` and raised IndexError.
            print("error on _oc?", file_name)
            missed.append(file_name)
            return None
        return oc_matches[0]

    color = file_name.split("_cc_")[-1]
    color_matches = [match for match in matches if f"_cc_{color}" in match]
    # No candidate with the same color suffix: fall back to the first candidate.
    return color_matches[0] if color_matches else matches[0]

In [11]:
# row = df.iloc[2]
# print(row['file_name'])
# get_original(row['file_name'], row['class'])

In [12]:
# get_original appends to the global `missed` list; empty it first so that
# re-running this cell does not accumulate duplicate entries (which would inflate
# len(missed) and the missed.txt dump below).
missed.clear()
# Resolve the 'original' of every row from its file name and class, in row order.
df['original'] = [
    get_original(file_name, prefix)
    for file_name, prefix in zip(df['file_name'], df['class'])
]

In [13]:
# Number of file names recorded in `missed`.
len(missed)

675

In [14]:
# Dump the names collected in `missed` to missed.txt, one name per line.
with open("missed.txt", "w") as out_file:
    out_file.writelines(name + "\n" for name in missed)

In [15]:
# Save the table to siblings.csv in the current working directory.
df.to_csv("siblings.csv")

In [16]:
# Path of the PNG under unified/imgs that belongs to each file name.
df["image"] = f"{data_path}/unified/imgs/" + df["file_name"] + ".png"

def display_images_in_table(df):
    """Display `df` as an HTML table with a thumbnail in the last column.

    Every column except "image" becomes a left-aligned text column, in the
    frame's column order. The "image" value of each row is used as the `src`
    of a 100px-wide <img> in a final column titled "Image". Cell values are
    interpolated into the markup as-is (no HTML escaping).

    Args:
        df: DataFrame that contains an "image" column.

    Returns:
        None; the markup is handed to display_html wrapped in HTML.
    """
    text_columns = [col for col in df.columns if col != "image"]

    # Table opening and header row.
    parts = [
        "<table border='1' style='border-collapse: collapse; text-align: left;'>\n"
        "              <tr>"
    ]
    for col in text_columns:
        parts.append(f"<th style='padding: 2px; text-align: left;'>{col}</th>")
    parts.append("<th style='padding: 2px;'>Image</th></tr>")

    # One table row per frame row: text cells first, thumbnail last.
    for _, row in df.iterrows():
        parts.append("<tr>")
        for col in text_columns:
            parts.append(
                f"<td style='padding: 2px; text-align: left; white-space: nowrap;'>{row[col]}</td>"
            )
        parts.append(f"<td style='padding: 2px;'><img src='{row['image']}' style='width:100px;'></td>")
        parts.append("</tr>")

    parts.append("</table>")

    display_html(HTML("".join(parts)))

# Render every row of df with the HTML table helper defined just above.
display_images_in_table(df)

file_name,class,index,original,Image
AREA_sw_0_7_s_0_7_cc_0,AREA,True,AREA_sw_0_7_s_0_7_cc_0,
AREA_sw_0_7_s_0_7_cc_1,AREA,False,AREA_sw_0_7_s_1_0_cc_1,
AREA_sw_0_7_s_0_7_cc_2,AREA,False,AREA_sw_1_2_s_0_7_cc_2,
AREA_sw_0_7_s_0_7_cc_3,AREA,False,AREA_sw_0_7_s_1_2_cc_3,
AREA_sw_0_7_s_0_7_oc,AREA,False,AREA_sw_1_0_s_0_7_oc,
AREA_sw_0_7_s_1_0_cc_0,AREA,False,AREA_sw_0_7_s_0_7_cc_0,
AREA_sw_0_7_s_1_0_cc_1,AREA,True,AREA_sw_0_7_s_1_0_cc_1,
AREA_sw_0_7_s_1_0_cc_2,AREA,False,AREA_sw_1_2_s_0_7_cc_2,
AREA_sw_0_7_s_1_0_cc_3,AREA,False,AREA_sw_0_7_s_1_2_cc_3,
AREA_sw_0_7_s_1_0_oc,AREA,False,AREA_sw_1_0_s_0_7_oc,


In [17]:
# df_index = df[df["original"].notna()]
# display_images_in_table(df_index)