## Set Up the Notebook:

In [2]:
#####################################
#
#   Load libraries
#
#####################################

import os
from pathlib import Path
from typing import Iterable

import numpy as np
import pandas as pd
import pyhere
import rfmix_reader
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import display, HTML

#####################################
#
#   Set Screen Attributes
#
#####################################

# Echo the value of every expression statement in a cell, not only the last
# one (IPython's documented default for this setting is "last_expr").
InteractiveShell.ast_node_interactivity = "all"

# set cell width: force elements with class `container` to full width.
# NOTE(review): `.container` and `div.output_scroll` look like classic-Notebook
# class names — confirm both rules still apply in the front end in use.
display(HTML("<style>.container { width:100% !important; }</style>"))

# set cell output window height: scrollable output areas are sized to 160em.
display(HTML("<style>div.output_scroll { height: 160em;} </style>"))



In [8]:
# Sanity check: show the kernel's working directory next to the pyhere project
# root (once as returned, once wrapped in Path), one value per line.
print(os.getcwd(), pyhere.here(), Path(pyhere.here()), sep="\n")

/gpfs/projects/p32505/users/manuel/rfmix_reader-benchmarking/visualization/bivariate_tests/_h
/gpfs/projects/p32505/users/manuel/rfmix_reader-benchmarking
/gpfs/projects/p32505/users/manuel/rfmix_reader-benchmarking


## Load the Data:

### Ground-Truth Data:

In [4]:
def load_groundtruth_data(gt_dir: str, chromosomes: Iterable[int] = range(1, 23)) -> pd.DataFrame:
    """Load per-chromosome ground-truth ancestry tables into one wide frame.

    For every requested chromosome the file ``global_ancestry_chr{chrom}.tsv``
    is read from ``gt_dir``. The columns ``CEU`` and ``YRI`` are renamed to
    ``EUR`` and ``AFR``, every column except ``Sample`` gets a ``_chr{chrom}``
    suffix, and the per-chromosome tables are outer-merged on ``Sample``.

    Parameters
    ----------
    gt_dir : str
        Directory holding the TSV files. It is passed to ``pyhere.here``, so a
        relative path is resolved against the project root, not the kernel's
        working directory.
    chromosomes : iterable of int, optional
        Chromosome labels to load; defaults to the autosomes 1-22. Files that
        do not exist are reported with a printed message and skipped.

    Returns
    -------
    pd.DataFrame
        One row per sample, ordered by the first run of digits in ``Sample``
        (``Sample_2`` before ``Sample_10``). Sample IDs without digits are
        placed last. The row index is the one produced by the merge; it is
        not reset.

    Raises
    ------
    FileNotFoundError
        If none of the requested chromosome files exist.
    KeyError
        If a file that was found has no ``Sample`` column.
    """
    gt_dir = pyhere.here(gt_dir)
    per_chrom_frames = []

    for chrom in chromosomes:
        fname = gt_dir / f"global_ancestry_chr{chrom}.tsv"
        if not fname.exists():
            print(f"Missing file: {fname}")
            continue

        df = pd.read_csv(fname, sep="\t", header=0)
        if "Sample" not in df.columns:
            # Fail here with the offending file name instead of deep inside merge().
            raise KeyError(f"'Sample' column not found in {fname}")

        # Rename ancestry columns to the labels used in the rest of the notebook
        df = df.rename(columns={"CEU": "EUR", "YRI": "AFR"})

        # Add chromosome suffix to every column except the merge key
        df = df.rename(
            columns={col: f"{col}_chr{chrom}" for col in df.columns if col != "Sample"}
        )
        per_chrom_frames.append(df)

    if not per_chrom_frames:
        raise FileNotFoundError(f"No global_ancestry_chr*.tsv files found in {gt_dir}")

    # Merge across chromosomes on 'Sample'; outer join keeps samples that are
    # missing from some chromosomes.
    merged_df = per_chrom_frames[0]
    for df in per_chrom_frames[1:]:
        merged_df = merged_df.merge(df, on="Sample", how="outer")

    def _sample_number(samples: pd.Series) -> pd.Series:
        # First run of digits as a number; NaN when the ID has no digits, so
        # such rows sort last instead of crashing an integer cast.
        digits = samples.astype(str).str.extract(r"(\d+)", expand=False)
        return pd.to_numeric(digits, errors="coerce")

    # Single numeric sort (a lexicographic pre-sort would be overridden anyway);
    # mergesort is stable, so ties keep their merge order.
    return merged_df.sort_values(by="Sample", key=_sample_number, kind="mergesort")

In [5]:
# Load the two-population simulation ground truth. The loader passes this path
# to pyhere.here(), so it is relative to the project root.
gt_data = load_groundtruth_data("input/simulations/two_populations/ground_truth/_m/")

# Bare expression so the notebook renders the merged table below.
gt_data

Unnamed: 0,Sample,EUR_chr1,AFR_chr1,EUR_chr2,AFR_chr2,EUR_chr3,AFR_chr3,EUR_chr4,AFR_chr4,EUR_chr5,...,EUR_chr18,AFR_chr18,EUR_chr19,AFR_chr19,EUR_chr20,AFR_chr20,EUR_chr21,AFR_chr21,EUR_chr22,AFR_chr22
0,Sample_1,0.314362,0.685638,0.213771,0.786229,0.280459,0.719541,0.370560,0.629440,0.205906,...,0.244261,0.755739,0.485886,0.514114,0.197688,0.802312,0.000000,1.000000,0.110419,0.889581
111,Sample_2,0.409872,0.590128,0.321084,0.678916,0.160311,0.839689,0.232940,0.767060,0.044281,...,0.317566,0.682434,0.000000,1.000000,0.085948,0.914052,0.170278,0.829722,0.343881,0.656119
222,Sample_3,0.223355,0.776645,0.007728,0.992272,0.205811,0.794189,0.464478,0.535522,0.201636,...,0.172986,0.827014,0.161128,0.838872,0.000000,1.000000,0.140969,0.859031,0.000000,1.000000
333,Sample_4,0.161029,0.838971,0.162617,0.837383,0.018236,0.981764,0.185059,0.814941,0.233355,...,0.064111,0.935889,0.176425,0.823575,0.107004,0.892996,0.272772,0.727228,0.293038,0.706962
444,Sample_5,0.028305,0.971695,0.008504,0.991496,0.234501,0.765499,0.176433,0.823567,0.120807,...,0.216797,0.783203,0.084953,0.915047,0.442491,0.557509,0.257424,0.742576,0.443771,0.556229
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
440,Sample_496,0.017629,0.982371,0.156928,0.843072,0.193409,0.806591,0.073246,0.926754,0.119868,...,0.212505,0.787495,0.031453,0.968547,0.109911,0.890089,0.475538,0.524462,0.226782,0.773218
441,Sample_497,0.275983,0.724017,0.272697,0.727303,0.121946,0.878054,0.088806,0.911194,0.257655,...,0.391670,0.608330,0.143568,0.856432,0.075388,0.924612,0.309671,0.690329,0.170085,0.829915
442,Sample_498,0.400779,0.599221,0.204194,0.795806,0.124295,0.875705,0.362392,0.637608,0.556769,...,0.285743,0.714257,0.127441,0.872559,0.079572,0.920428,0.000000,1.000000,0.000000,1.000000
443,Sample_499,0.170368,0.829632,0.109040,0.890960,0.098155,0.901845,0.362214,0.637786,0.182283,...,0.151625,0.848375,0.056333,0.943667,0.431893,0.568107,0.292890,0.707110,0.004775,0.995225


### RFMix Data:

In [12]:
# Anchor the RFMix output directory to the project root via pyhere, as the
# ground-truth loader does, instead of hard-coding an absolute user path.
# NOTE(review): assumes the pyhere root is the same tree as the previous
# /projects/... path — confirm on this cluster.
rfmix_path = pyhere.here("input/simulations/two_populations/_m/rfmix-out")

# NOTE(review): this is a relative path, so it resolves against the kernel's
# working directory, and it points at `real_data` while `rfmix_path` points at
# the two-population simulation — confirm this is the intended binary directory.
binary_path = 'rfmix_reader-benchmarking/input/real_data/rfmix-version/_m/binary_files/'

# Per the rfmix_reader documentation, read_rfmix returns three objects (loci
# table, global ancestry table, local ancestry array). Unpacking into two
# names would raise ValueError only after the long read has finished.
loci, rf_q, admix = rfmix_reader.read_rfmix(file_prefix = rfmix_path,
                                            binary_dir = binary_path)

Multiple RFMIX file sets read in this order: ['chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21', 'chr22', 'chr7', 'chr8', 'chr9']



Mapping loci information:   0%|                                                                                            | 0/16 [00:00<?, ?it/s][A

KeyboardInterrupt: 

### FLARE Data: