In [None]:
import os
import glob

# Root directory that is searched recursively for score files.
base_path = '/Volumes/ag-heidel/wolffjoa/hyperparamter_score/scored_100kb'  # change this to your target path

# Collect every path below base_path whose basename starts with "final_scores".
# The list is sorted so that the processing order (and the row order of
# anything later built from `results`) does not depend on the order in which
# the filesystem happens to enumerate entries.
pattern = os.path.join(base_path, '**', 'final_scores*')
files = sorted(glob.glob(pattern, recursive=True))

# Maps a per-file name key to that file's {first column: second column} mapping.
results = {}

for file in files:
    # The glob pattern also matches directories named "final_scores*";
    # open() would raise on those, so only regular files are parsed.
    if not os.path.isfile(file):
        continue

    filename = os.path.basename(file)
    # Use the 10th '_'-separated field (index 9) of the file name as the key;
    # fall back to the whole file name when it has fewer than 10 fields.
    parts = filename.split('_')
    file_name_key = parts[9] if len(parts) >= 10 else filename

    # Parse whitespace-separated lines: first column is the key, second the value.
    mapping = {}
    with open(file, 'r') as f:
        for line in f:
            # Lines that start with 'Matrix' are skipped.
            if line.lstrip().startswith('Matrix'):
                continue
            columns = line.split()
            if len(columns) >= 2:
                # Values are kept as strings; later occurrences of a key win.
                mapping[columns[0]] = columns[1]

    # Two files can yield the same key; make the overwrite visible.
    if file_name_key in results:
        print(f"Warning: duplicate key {file_name_key!r}; {file} replaces an earlier file")
    results[file_name_key] = mapping

print(results)

In [None]:
# Number of distinct name keys collected in `results`.
len(results)

In [None]:
# Display the full nested mapping: {name key: {first column: second column}}.
results

In [None]:
import pandas as pd

In [None]:
# Build one record per entry of `results`: the name key goes into 'Name',
# followed by that entry's own key/value pairs as further columns.
records = []
for name, inner_dict in results.items():
    record = {'Name': name}
    record.update(inner_dict)
    records.append(record)

df_results = pd.DataFrame(records)
print(df_results)


In [None]:
# Display the table built from `results` (one row per name key).
df_results

In [None]:
# Discard every row that has a missing value in any column, then show the result.
df_results = df_results.dropna(axis='index', how='any')
df_results

In [None]:
# Replace this with the actual path to your ELO file
elo_file_path = '/Users/wolffjoa/src/image-ranker/image_rankings_100kb.csv'
df_elo = pd.read_csv(elo_file_path)

# Reduce each 'Image' entry to the part of its file name before the first '_'.
df_elo['Image'] = [
    os.path.basename(image_path).split('_')[0]
    for image_path in df_elo['Image']
]

In [None]:
# Display the ELO table; 'Image' now holds the file-name part before the first '_'.
df_elo

In [None]:
# Join the score table with the ELO table where 'Name' equals 'Image'.
merged_df = df_results.merge(df_elo, left_on='Name', right_on='Image')
print(merged_df)

In [None]:
# Cast every column of merged_df to float, except the two identifier
# columns 'Name' and 'Image'.
id_columns = ['Name', 'Image']
cols_to_float = merged_df.columns.drop(id_columns)
merged_df[cols_to_float] = merged_df[cols_to_float].astype('float64')

In [None]:
import math
import numpy as np
from adjustText import adjust_text

import matplotlib.pyplot as plt

import matplotlib.patheffects as path_effects  # hoisted out of the plotting loop

# Scores compared against the ELO ranking: the fixed score columns that are
# present in merged_df, followed by every column starting with 'best_model'.
fixed_score_columns = [
    'pearson_AUC:',
    'hicrep:',
    'TAD_score_MSE:',
    'TAD_fraction:',
    'TAD_fraction_exact_match:',
]
y_columns = [col for col in fixed_score_columns if col in merged_df.columns]
y_columns.extend([col for col in merged_df.columns if col.startswith('best_model')])

# Filled per score column inside the loop:
#   wrong_scoring[y] -> {'high_x_low_y': [names], 'low_x_high_y': [names]}
#   MSE_all[y]       -> mean squared difference between normalised x and y
wrong_scoring = {}
MSE_all = {}

# Grid dimensions for the subplots (4 per row). At least one row is requested
# so that plt.subplots does not fail when no score column is available.
n_plots = len(y_columns)
n_cols = 4
n_rows = max(1, math.ceil(n_plots / n_cols))

fig, axes = plt.subplots(n_rows, n_cols, figsize=(n_cols*5, n_rows*4), sharex=True)
# Flat 1-D view of the axes, whether the grid has one row or several.
ax_list = np.atleast_1d(axes).ravel()

# NOTE: merged_df is modified in place. The x column and every plotted score
# column are converted to numeric values and min-max scaled to [0, 1].
# The x column is the same for every subplot, so it is normalised once here.
x_column = 'ELO'
merged_df[x_column] = pd.to_numeric(merged_df[x_column], errors='coerce')
x_min = merged_df[x_column].min()
x_max = merged_df[x_column].max()
merged_df[x_column] = (merged_df[x_column] - x_min) / (x_max - x_min)

for idx, y in enumerate(y_columns):
    ax = ax_list[idx]
    print(y)

    # Normalize the current y column for the y-axis
    merged_df[y] = pd.to_numeric(merged_df[y], errors='coerce')
    y_min = merged_df[y].min()
    y_max = merged_df[y].max()
    merged_df[y] = (merged_df[y] - y_min) / (y_max - y_min)

    ax.scatter(merged_df[x_column], merged_df[y])
    mse = np.mean((merged_df[x_column] - merged_df[y]) ** 2)
    MSE_all[y] = mse
    ax.text(0.95, 0.05, f'MSE: {mse:.4f}', transform=ax.transAxes, fontsize=12,
            verticalalignment='bottom', horizontalalignment='right', bbox=dict(boxstyle='round', facecolor='white', alpha=0.5))
    ax.set_xlabel(x_column.replace(":", ""))
    # Human-readable subplot title derived from the column name.
    title_text = y.split(".degree")[0].replace('-', ' ').replace('fraction_exact_match', 'FEM').replace('_', ' ').replace("best model", "").replace('.pkl', '').replace("pearson", 'Pearson').replace("hicrep", 'HiCRep').strip(".").strip(":")
    ax.set_title(f'{title_text}')

    # Points where the two normalised values disagree strongly:
    # x >= 0.7 with y <= 0.2, or x <= 0.2 with y >= 0.7.
    cond1 = (merged_df[x_column] >= 0.7) & (merged_df[y] <= 0.2)
    cond2 = (merged_df[x_column] <= 0.2) & (merged_df[y] >= 0.7)
    mask = cond1 | cond2

    # Collect matching IDs for the current y column in a dictionary
    match_ids = {
        'high_x_low_y': merged_df.loc[cond1, 'Name'].tolist(),
        'low_x_high_y': merged_df.loc[cond2, 'Name'].tolist(),
    }
    wrong_scoring[y] = match_ids
    print(f"{title_text} matching IDs:", match_ids)

    # Label every disagreeing point with its 'Name'. The loop variable has its
    # own name so it does not shadow the subplot index `idx`.
    texts = []
    for row_label in merged_df.index[mask]:
        txt = ax.text(
            merged_df.loc[row_label, x_column],
            merged_df.loc[row_label, y],
            merged_df.loc[row_label, 'Name'],
            fontsize=8,
            ha='center',
            va='center',
            color='black',
            bbox=dict(facecolor='white', edgecolor='none', alpha=0.7)
        )
        txt.set_path_effects([
            path_effects.Stroke(linewidth=1, foreground='white'),
            path_effects.Normal()
        ])
        texts.append(txt)

    # Adjust labels to avoid overlaps
    adjust_text(
        texts,
        ax=ax,
        arrowprops=dict(arrowstyle='-', color='grey', lw=0.5),
        expand_points=(1.2, 1.2),  # Controls how far labels can move
        expand_text=(1.2, 1.2),
        force_points=0.2,          # Adjust force parameters as needed
        force_text=0.2,
        lim=100                     # Limit iterations to improve performance
    )
    # Add a red dashed line representing x = y
    ax.plot([0, 1], [0, 1], color='red', linestyle='--', linewidth=1)

# Turn off any unused subplots
for ax in ax_list[n_plots:]:
    ax.axis('off')

# Address the grid figure explicitly instead of relying on pyplot's notion of
# the current figure.
fig.tight_layout()
fig.savefig(os.path.join("/Users/wolffjoa/data_local", 'prediction_vs_human_100kb_per_score.pdf'), dpi=300)

plt.show()

In [None]:
# Per score column: how many names were flagged under each condition.
for scoring, score_dict in wrong_scoring.items():
    print(scoring)
    for condition in score_dict:
        print(f"  {condition}: {len(score_dict[condition])}")

# Histogram over score columns: total number of flagged names (both
# conditions together) -> number of score columns with that total.
binning = {}
for score_dict in wrong_scoring.values():
    count = 0
    for flagged_names in score_dict.values():
        count += len(flagged_names)
    binning.setdefault(count, 0)
    binning[count] += 1

print("Binning of number of elements:")
for num_elements, n_scores in sorted(binning.items()):
    print(f"{num_elements} elements: {n_scores}")

In [None]:
# Substitutions applied, in this order, to turn a column name into a label.
label_substitutions = (
    ('-', ' '),
    ('fraction_exact_match', 'FEM'),
    ('_', ' '),
    ("best model", ""),
    ('.pkl', ''),
    ("pearson", 'Pearson'),
    ("hicrep", 'HiCRep'),
)

# Re-key MSE_all by readable label. Everything from ".degree" onwards is cut
# off first, and leading/trailing '.' and then ':' are stripped last.
renamed_MSE = {}
for key, mse in MSE_all.items():
    label = key.split(".degree")[0]
    for old, new in label_substitutions:
        label = label.replace(old, new)
    renamed_MSE[label.strip(".").strip(":")] = mse

print(renamed_MSE)

# Same entries, ordered by ascending MSE value.
sorted_MSE = {
    label: renamed_MSE[label]
    for label in sorted(renamed_MSE, key=renamed_MSE.get)
}
print(sorted_MSE)

In [None]:
# Display the relabelled MSE values, ordered from smallest to largest.
sorted_MSE

In [None]:
import math
import numpy as np
from adjustText import adjust_text
from matplotlib.text import Annotation

import matplotlib.pyplot as plt

import matplotlib.patheffects as path_effects  # used below; not otherwise imported in this cell


def _annotate_names(target_ax, frame, names, x_col, y_col, color):
    """Label the rows of `frame` whose 'Name' is in `names` on `target_ax`.

    Each label is placed at the row's (x_col, y_col) values, drawn as bold
    `color` text on a translucent white box with a white outline. Names that
    do not occur in `frame` are skipped; if a name occurs more than once only
    its first row is labelled.
    """
    for name in names:
        row = frame[frame['Name'] == name]
        if row.empty:
            continue
        txt = target_ax.annotate(
            name,
            (row[x_col].values[0], row[y_col].values[0]),
            color=color,
            fontsize=10,
            fontweight='bold',
            bbox=dict(facecolor='white', alpha=0.5, edgecolor='none')
        )
        txt.set_path_effects([
            path_effects.Stroke(linewidth=1, foreground='white'),
            path_effects.Normal()
        ])


# Scores compared against the ELO ranking: the fixed score columns that are
# present in merged_df, followed by every column starting with 'best_model'.
fixed_score_columns = [
    'pearson_AUC:',
    'hicrep:',
    'TAD_score_MSE:',
    'TAD_fraction:',
    'TAD_fraction_exact_match:',
]
y_columns = [col for col in fixed_score_columns if col in merged_df.columns]
y_columns.extend([col for col in merged_df.columns if col.startswith('best_model')])

# Score columns that additionally get their own single-panel PDF.
individual_scores = (
    'hicrep:',
    'best_model.pearson_AUC-TAD_fraction_exact_match-TAD_score_MSE.degree_1.pkl:',
)

# texts_all[title] -> list of Annotation objects drawn for that score
# MSE_all[y]       -> mean squared difference between normalised x and y
texts_all = {}
MSE_all = {}
texts = []

# The names flagged for the 'hicrep:' score are labelled on every subplot:
# red for 'high_x_low_y', blue for 'low_x_high_y'.
wrong = wrong_scoring.get('hicrep:', {})
high = wrong.get('high_x_low_y', [])
low = wrong.get('low_x_high_y', [])

# Grid dimensions for the subplots (4 per row). At least one row is requested
# so that plt.subplots does not fail when no score column is available.
n_plots = len(y_columns)
n_cols = 4
n_rows = max(1, math.ceil(n_plots / n_cols))

fig, axes = plt.subplots(n_rows, n_cols, figsize=(n_cols*5, n_rows*4), sharex=True)
# Flat 1-D view of the axes, whether the grid has one row or several.
ax_list = np.atleast_1d(axes).ravel()

# NOTE: merged_df is modified in place. The x column and every plotted score
# column are converted to numeric values and min-max scaled to [0, 1].
# The x column is the same for every subplot, so it is normalised once here.
x_column = 'ELO'
merged_df[x_column] = pd.to_numeric(merged_df[x_column], errors='coerce')
x_min = merged_df[x_column].min()
x_max = merged_df[x_column].max()
merged_df[x_column] = (merged_df[x_column] - x_min) / (x_max - x_min)

for idx, y in enumerate(y_columns):
    ax = ax_list[idx]
    print(y)

    # Normalize the current y column for the y-axis
    merged_df[y] = pd.to_numeric(merged_df[y], errors='coerce')
    y_min = merged_df[y].min()
    y_max = merged_df[y].max()
    merged_df[y] = (merged_df[y] - y_min) / (y_max - y_min)

    ax.scatter(merged_df[x_column], merged_df[y])
    mse = np.mean((merged_df[x_column] - merged_df[y]) ** 2)
    MSE_all[y] = mse
    ax.text(0.95, 0.05, f'MSE: {mse:.4f}', transform=ax.transAxes, fontsize=12,
            verticalalignment='bottom', horizontalalignment='right', bbox=dict(boxstyle='round', facecolor='white', alpha=0.5))
    ax.set_xlabel(x_column.replace(":", ""))
    # Human-readable subplot title derived from the column name.
    title_text = y.split(".degree")[0].replace('-', ' ').replace('fraction_exact_match', 'FEM').replace('_', ' ').replace("best model", "").replace('.pkl', '').replace("pearson", 'Pearson').replace("hicrep", 'HiCRep').strip(".").strip(":")
    ax.set_title(f'{title_text}')

    # Label the flagged names on this subplot and draw the x = y diagonal.
    _annotate_names(ax, merged_df, high, x_column, y, 'red')
    _annotate_names(ax, merged_df, low, x_column, y, 'blue')
    ax.plot([0, 1], [0, 1], color='red', linestyle='--', linewidth=1)

    # Gather the annotations just drawn on this axes.
    texts = [child for child in ax.get_children() if isinstance(child, Annotation)]
    texts_all[title_text] = texts

    # Let adjustText reposition the labels; arrows are requested via arrowprops.
    adjust_text(
        texts,
        ax=ax,
        arrowprops=dict(arrowstyle='->', color='grey'),
        expand_points=(2, 2),  # Increase expansion around points to consider points as obstacles
        force_points=0.5,      # Adjust force of points to encourage arrow drawing
        force_text=0.7,        # Adjust force of texts to push them away slightly
        only_move={'points': 'xy', 'texts': 'xy'},  # Allow labels and points movement in xy directions
    )
    print('y: {}'.format(y))

    if y in individual_scores:
        # Same scatter again as a stand-alone figure that is saved and closed.
        fig_ind, ax_ind = plt.subplots(figsize=(5, 4))
        ax_ind.scatter(merged_df[x_column], merged_df[y])
        mse_ind = np.mean((merged_df[x_column] - merged_df[y]) ** 2)
        ax_ind.text(0.95, 0.05, f'MSE: {mse_ind:.4f}', transform=ax_ind.transAxes, fontsize=12,
                verticalalignment='bottom', horizontalalignment='right',
                bbox=dict(boxstyle='round', facecolor='white', alpha=0.5))
        # The diagonal is drawn once; a second identical line adds nothing.
        ax_ind.plot([0, 1], [0, 1], color='red', linestyle='--', linewidth=1)
        ax_ind.set_xlabel(x_column.replace(":", ""))
        ax_ind.set_ylabel(title_text)

        _annotate_names(ax_ind, merged_df, high, x_column, y, 'red')
        _annotate_names(ax_ind, merged_df, low, x_column, y, 'blue')

        # For these scores the stored annotations are those of the individual
        # figure; they replace the grid-subplot entry recorded above.
        texts = [child for child in ax_ind.get_children() if isinstance(child, Annotation)]
        texts_all[title_text] = texts

        # Adjust labels to avoid overlaps
        adjust_text(
            texts,
            ax=ax_ind,
            arrowprops=dict(arrowstyle='-', color='grey', lw=0.5),
            expand_points=(1.2, 1.2),  # Controls how far labels can move
            expand_text=(1.2, 1.2),
            force_points=0.2,          # Adjust force parameters as needed
            force_text=0.2,
            lim=100                    # Limit iterations to improve performance
        )
        fig_ind.tight_layout()
        fig_ind.savefig(os.path.join("/Users/wolffjoa/data_local", f"subplot_{title_text.replace(' ', '_')}.pdf"), dpi=300)
        plt.close(fig_ind)

# Turn off any unused subplots
for ax in ax_list[n_plots:]:
    ax.axis('off')

# Address the grid figure explicitly: individual figures were created and
# closed inside the loop, so pyplot's "current figure" is not relied upon.
fig.tight_layout()
fig.savefig(os.path.join("/Users/wolffjoa/data_local", 'prediction_vs_human_100kb.pdf'), dpi=300)
plt.show()

In [None]:
# Label strings of the annotations currently held in `texts`.
ids = []
for annotation in texts:
    ids.append(annotation.get_text())

In [None]:
import os
import glob
import shutil

# Folder holding the PDFs, and a sub-folder inside it that receives the copies.
source_folder = '/Users/wolffjoa/src/image-ranker/static/100kb_pdf'
dest_folder = os.path.join(source_folder, 'selected_pdfs')
os.makedirs(dest_folder, exist_ok=True)

# A PDF is selected when its file name starts with one of these IDs.
pdf_ids = [
    '571e8570',
    '94f3fed9',
    '31b4ff17',
    '5be98a17',
    'eef49770',
    'e79e9b25',
    '6b74fb7d',
    '3583c727',
    'e52b6646',
    'd7172994',
]

# Copy every matching PDF found directly in source_folder.
for pdf_id in pdf_ids:
    pattern = os.path.join(source_folder, f"{pdf_id}*.pdf")
    matching_pdfs = glob.glob(pattern)
    for pdf_file in matching_pdfs:
        shutil.copy(pdf_file, dest_folder)
        print(f"Copied {pdf_file} to {dest_folder}")

In [None]:
import subprocess
import glob
import os

# IDs to render: the 'low_x_high_y' names stored in `wrong`, or the pdf_ids
# list when `wrong` has no such entry.
ids_to_plot = wrong.get('low_x_high_y', pdf_ids)

# Settings that are identical for every sample.
region = "chr1:18000000-22000000"   # genomic region to plot
track_config = "tracks_config.ini"  # rewritten in the working directory for each sample

# Samples for which pyGenomeTracks returned a non-zero exit status.
failed_ids = []

for sample_id in ids_to_plot:
    pattern = os.path.join("/Users/wolffjoa/data_local/hicgan/hyperparameter/100kb", f"{sample_id}*.cool")
    # Sorted so the chosen file is reproducible when several files match.
    matrix_files = sorted(glob.glob(pattern))
    if not matrix_files:
        print(f"Matrix file not found for {sample_id}")
        continue
    path_matrix = matrix_files[0]

    # Track configuration for this sample (file, title and depth are filled in).
    config_content = """[hic matrix]
    file = {0}
    title = {1}
    depth = {2}
    transform = log1p
    file_type = hic_matrix
    show_masked_bins = false
    """.format(path_matrix, sample_id, 2000000,)

    with open(track_config, "w") as f:
        f.write(config_content)

    output_file = f"100kb_{sample_id}_chr1_18-22Mb.pdf"
    cmd = [
       "pyGenomeTracks",
       "--tracks", track_config,
       "--region", region,
       "--out", output_file
    ]
    print("Running command:", " ".join(cmd))
    completed = subprocess.run(cmd)
    # Report failures instead of ignoring the exit status; the loop still
    # continues with the remaining samples.
    if completed.returncode != 0:
        print(f"pyGenomeTracks failed for {sample_id} (exit code {completed.returncode})")
        failed_ids.append(sample_id)

if failed_ids:
    print(f"{len(failed_ids)} sample(s) failed: {failed_ids}")

In [None]:
# Mean (x, y) position of the annotations stored for each group in texts_all.
averages_dict = {}

for group, annotations in texts_all.items():
    # A group without annotations has no mean position; record NaN for it
    # instead of dividing by zero.
    if not annotations:
        averages_dict[group] = (float('nan'), float('nan'))
        continue

    # Read each annotation's position once, then average the coordinates.
    positions = [annot.get_position() for annot in annotations]
    x_avg = sum(position[0] for position in positions) / len(positions)
    y_avg = sum(position[1] for position in positions) / len(positions)
    averages = (x_avg, y_avg)
    averages_dict[group] = averages

print(averages_dict)