In [1]:
import os
import pandas as pd

In [2]:
# actual data repo
data_repo = "../../../data"

In [3]:
with open("prefixes.txt", "r") as f:
    prefixes = [line.rstrip() for line in f]

In [4]:
pre_simple = ['AREA', 'BAR', 'BRUSH', 'POINT', 'RECT', 'TEXT', 'TRIANGLE', 'arcs', 'LINE', "basic-link", "VCF_POINT_MUTATIONS", "GFF_DEMO"]

pre_pre_chromoscope = ["BOCA", "BRCA", "BTCA", "GBM", "LIRI", "PBCA"]
pre_pre_chromoscope2 = ["breast", "gastric", "kidney", "ovarian serous cystadenocarcinoma", "ovarian", "prostate adenocarcinoma", "sarcoma"]
pre_chromoscope = [pre for pre in prefixes if any(pre.startswith(chromo) for chromo in pre_pre_chromoscope + pre_pre_chromoscope2)] + ["EX_SPEC_CANCER_VARIANT_PROTOTYPE"]

pre_circular = [pre for pre in prefixes if pre.startswith("circular")] + ["EX_SPEC_CIRCOS_BETWEEN_LINK", "EX_SPEC_CIRCULR_RANGE", "simple_circular", "multi_layer_circular", "multi_view_circular_ideograms", "responsive-circular", "EX_SPEC_CIRCOS"]

pre_pre_linear = ["stratified", "stacked-bar", "overview-landing", "combination", "example", "LINKING_TRACKS", "OVERLAY_TRACKS_LINE_POINT", "SEMANTIC_ZOOM_SEQUENCE", "scatterplot", "viridis-heatmap", "VCF_INDELS"]
pre_linear = [pre for pre in prefixes if any(pre.startswith(lin) for lin in pre_pre_linear)]

pre_gene_annotation = [pre for pre in prefixes if pre.startswith("gene_annotation")] + ["DUMMY_TRACK", "EX_SPEC_GENE_ANNOTATION"]

pre_ideograms = ["EX_SPEC_CYTOBANDS", "BED_DEMO", "SEMANTIC_ZOOM_CYTO", "responsive-ideogram"]

pre_matrix = ["hi-c-matrix-alt", "hi-c-matrix", "hic", 'comparative-matrix', 'gray_heatmap', "heatmap", "EX_SPEC_RESPONSIVE_COMPARATIVE_MATRICES", "EX_SPEC_MATRIX_HFFC6", "EX_SPEC_MATRIX"]

In [5]:
#others
[pre for pre in prefixes if pre not in pre_simple + pre_chromoscope + pre_circular + pre_linear + pre_gene_annotation + pre_ideograms + pre_matrix]

['EX_SPEC_ALIGNMENT_CHART',
 'EX_SPEC_CIRCULAR_OVERVIEW_LINEAR_DETAIL',
 'EX_SPEC_CLINVAR_LOLLIPOP',
 'EX_SPEC_GIVE',
 'EX_SPEC_GREMLIN',
 'EX_SPEC_LINKING',
 'EX_SPEC_MARK_DISPLACEMENT',
 'EX_SPEC_MOUSE_EVENT',
 'EX_SPEC_PERF_ALIGNMENT',
 'EX_SPEC_PILEUP',
 'EX_SPEC_RESPONSIVE_COMPARATIVE_VIEWS',
 'EX_SPEC_RESPONSIVE_TRACK_WISE_COMPARISON',
 'EX_SPEC_SARS_COV_2',
 'EX_SPEC_SEQUENCE_TRACK',
 'EX_SPEC_TEMPLATE',
 'OVERLAY_TRACKS_BAR_POINT',
 'band_connection',
 'multiple_view',
 'responsive-multivec',
 'rule-mark',
 'single-cell-epi-corces']

In [6]:
files = [f.split(".json")[0] for f in os.listdir(os.path.join(data_repo, "unified", "specs"))]
len(files)

3200

In [7]:
df = pd.DataFrame(files, columns=["file_name"])
df['class'] = df['file_name'].apply(lambda x: next((pre for pre in prefixes if x.startswith(pre)), None))
df['image'] = df['file_name'].apply(lambda x: f"{data_repo}/unified/imgs/{x}.png")

df_1_class = df.groupby('class').first()
df_1_class = df_1_class.reset_index()
df_1_class.head()

Unnamed: 0,class,file_name,image
0,AREA,AREA_sw_1_2_s_1_2_oc,../../../data/unified/imgs/AREA_sw_1_2_s_1_2_o...
1,BAR,BAR_sw_0_7_s_1_2_cc_3,../../../data/unified/imgs/BAR_sw_0_7_s_1_2_cc...
2,BED_DEMO,BED_DEMO_sw_0_7_s_0_7_oc,../../../data/unified/imgs/BED_DEMO_sw_0_7_s_0...
3,BOCA-UK,BOCA-UK-f86e2d80-911b-7a19-e040-11ac0d486900,../../../data/unified/imgs/BOCA-UK-f86e2d80-91...
4,BRCA-EU,BRCA-EU-fc8130df-897d-5404-e040-11ac0d485e0a,../../../data/unified/imgs/BRCA-EU-fc8130df-89...


In [8]:
def class_to_group(class_name):
    if class_name in pre_simple:
        return "simple"
    elif class_name in pre_chromoscope:
        return "chromoscope"
    elif class_name in pre_circular:
        return "circular"
    elif class_name in pre_linear:
        return "linear"
    elif class_name in pre_gene_annotation:
        return "gene_annotation"
    elif class_name in pre_ideograms:
        return "ideogram"
    elif class_name in pre_matrix:
        return "matrix"
    else:
        return "complex"

df_1_class['group'] = df_1_class['class'].apply(class_to_group)
df_1_class.sort_values(by='group', inplace=True)

In [9]:
df_1_class[['class', 'group']].to_csv("class_group.csv", index=False)

In [10]:
from IPython.display import display_html, HTML
def display_images_in_table(df):
    html = """<table border='1' style='border-collapse: collapse; text-align: left;'>
              <tr>"""  

    html += "".join(f"<th style='padding: 4px; text-align: left; white-space: normal;'>{col}</th>" for col in df.columns if (col != "image" and col != "image_original"))
    html += "<th style='padding: 4px;'>Image</th>"

    for _, row in df.iterrows():
        html += "<tr>"
        html += "".join(f"<td style='padding: 4px; text-align: left; white-space: normal; word-break: break-word;'>{row[col]}</td>" for col in df.columns if (col != "image" and col != "image_original"))
        html += f"<td style='padding: 4px;'><img src='{row['image']}' style='width:300px;'></td>"
        html += "</tr>"

    html += "</table>"

    display_html(HTML(html))
    
display_images_in_table(df_1_class)


class,file_name,group,Image
gastric,gastric-bc0dee07-de20-44d6-be65-05af7e63ac96,chromoscope,
LIRI-JP,LIRI-JP-bcc74a5e-c622-11e3-bf01-24c6515278c0,chromoscope,
sarcoma,sarcoma-7d332cb1-ba25-47e4-8bf8-d25e14f40d59,chromoscope,
PBCA-DE,PBCA-DE-2009e5e7-1796-445b-8677-46b3804fe0bf,chromoscope,
prostate adenocarcinoma,prostate adenocarcinoma-0bfd1043-816e-e3e4-e050-11ac0c4860c5,chromoscope,
ovarian serous cystadenocarcinoma,ovarian serous cystadenocarcinoma-b243adb4-b3e7-4e0e-bc0d-625aa8dbb1be,chromoscope,
ovarian,ovarian-7a921087-8e62-4a93-a757-fd8cdbe1eb8f,chromoscope,
breast,breast-b27d75ba-5989-4200-bfe9-f1b7d7cf8008,chromoscope,
breast_cancer,breast_cancer_sw_0_5_s_2_0_cc_2,chromoscope,
breast_cancer_circular,breast_cancer_circular_s_1_0_cc_2,chromoscope,
