# 1. set up

## 1.1 library

In [None]:
import sys
print("print version")
print(sys.version)

import os
import time

from collections import Counter

import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt

# for plotting
import matplotlib.patches as patches # for plotting figure
from matplotlib.patches import Patch
import gc # for collecting garbage
import seaborn as sns

import matplotlib.font_manager as fm

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

import cvxpy as cp

from helper_plot import *

## 1.2 data read

### 1.2.1. read variables

In [None]:
# Read the population metadata table (tab-separated). The 'SUB' and 'POP'
# columns are used below to build dict_super.
path_population = '../data/population/'
file_population = path_population + 'meta_sub.tsv'
fn_file_exists(file_population)  # helper from helper_plot; presumably validates the path
df_meta_population = pd.read_csv(file_population, sep = '\t')

# Sub-population codes; the position of a code in this list is its index
# in dict_sub_index.
vec_sub = ['GBR', 'FIN', 'CHS', 'PUR', 'CDX',
           'CLM', 'IBS', 'PEL', 'PJL', 'KHV',
           'ACB', 'GWD', 'ESN', 'BEB', 'MSL',
           'STU', 'ITU', 'CEU', 'YRI', 'CHB',
           'JPT', 'LWK', 'ASW', 'MXL', 'TSI',
           'GIH']

# code -> position in vec_sub (derived from the list itself rather than a
# hard-coded length, so the two can never drift apart)
dict_sub_index = {name_sub: i_sub for i_sub, name_sub in enumerate(vec_sub)}

#df_meta_population['list_indices']
# One integer per sub-population.
# NOTE(review): these look like sample/column indices into the genotype
# data - confirm against the code that consumes dict_vec_GT.
dict_vec_GT = {
    'GBR': 13, 'FIN': 65, 'CHS': 193, 'PUR': 315, 'CDX': 380,
    'CLM': 452, 'IBS': 639, 'PEL': 679, 'PJL': 1219, 'KHV': 819,
    'ACB': 859, 'GWD': 1248, 'ESN': 1463, 'BEB': 1769, 'MSL': 1526,
    'STU': 1817, 'ITU': 1857, 'CEU': 2164, 'YRI': 2343, 'CHB': 2372,
    'JPT': 2522, 'LWK': 2592, 'ASW': 2860, 'MXL': 2828, 'TSI': 2998,
    'GIH': 3105
}


# 'SUB' value -> 'POP' value for every row of the metadata table
# (if a 'SUB' value repeats, the last row wins)
dict_super = dict(zip(df_meta_population['SUB'], df_meta_population['POP']))

# Names of the synthetic admixed samples, written as '<SUB1>_<SUB2>'
vec_mixed = [
    'ACB_BEB', 'CHS_PJL', 'CLM_YRI', 'FIN_CHS', 'GBR_GWD', 'IBS_YRI',
    'PUR_CEU', 'CHS_KHV', 'CLM_CLM', 'ESN_MXL', 'GBR_GIH', 'IBS_ESN',
    'PJL_ITU', 'YRI_YRI']

# vec_pop_ordered is not defined in this notebook; it presumably comes from
# `from helper_plot import *`.
vec_sup_full = vec_pop_ordered

# 2. bone marrow

## 2.1. functions

In [None]:
# function to get the % of present genotypes
def fn_get_prop(input_temp, n_site=None):
    """Return the fraction of sites whose value is 0.0, 0.5 or 1.0.

    The result is a proportion (count / total), not a percentage.

    :param input_temp: array-like of values; entries equal to 0.0, 0.5 or
        1.0 are counted, everything else (e.g. NaN) is ignored.
    :param n_site: denominator of the proportion. When omitted, the
        module-level ``n_mut`` is used, as in the original implementation.
    :return: (number of 0.0 / 0.5 / 1.0 entries) / n_site
    """
    if n_site is None:
        # Backward-compatible default: fall back to the global total.
        n_site = n_mut

    array_values, counts = np.unique(input_temp, return_counts=True)
    value_counts_dict = dict(zip(array_values, counts))

    # .get(..., 0): a value that never occurs in the input contributes
    # zero instead of raising KeyError.
    n_present = sum(value_counts_dict.get(value_gt, 0) for value_gt in (0.0, 0.5, 1.0))
    return n_present / n_site

def plot_bar_bm(df_prob, label_sample = "", loc_legend = "left",
                bool_save_plot = False, path_plot = "", name_plot = "", size_plot_x = 3.0,
                size_plot_y = 2.0, pad_x_tick = 0.035, total_width = 1):
    """Draw a grouped vertical bar chart: one group per row, one bar per column.

    Relies on names that are not defined in this notebook cell
    (``vec_pop_ordered``, ``size_font``, ``dict_color_super``,
    ``df_meta_unique``, ``fn_path_exists``, ``fn_ensure_slash``); they are
    presumably provided by ``from helper_plot import *``.

    :param df_prob: DataFrame whose rows are selected and ordered with
        ``df_prob.loc[vec_pop_ordered]`` and whose columns are the samples.
    :param label_sample: legend labels, one per column; ``""`` (default)
        generates ``sample1``, ``sample2``, ...
    :param loc_legend: ``"left"`` or ``"right"``; the legend is placed at
        ``'upper ' + loc_legend``.
    :param bool_save_plot: when true, save the figure as PDF and JPEG.
    :param path_plot: output directory used when saving.
    :param name_plot: file stem; files are named ``<name_plot>_vertical.<ext>``.
    :param size_plot_x: figure width passed to ``figsize``.
    :param size_plot_y: figure height passed to ``figsize``.
    :param pad_x_tick: amount added to the y position of each x tick label.
    :param total_width: width shared by all bars of one group.
    """
    n_sample = df_prob.shape[1]
    df_prob = df_prob.loc[vec_pop_ordered]
    n_pop = len(df_prob)
    fig, ax = plt.subplots(figsize=(size_plot_x, size_plot_y))

    # White background for both the figure and the axes
    fig.set_facecolor('white')
    ax.set_facecolor('white')

    bar_width = total_width / n_sample
    ind = np.arange(n_pop)

    # Build the palette once instead of twice per bar.
    palette = plt.cm.viridis(np.linspace(0, 1, n_sample + 3))

    for i, col in enumerate(df_prob.columns):
        # Palette positions used: 0, 3, 5, 6, 7, ... (positions 1, 2 and 4
        # are skipped, exactly as in the original branching).
        if i == 0:
            i_color = 0
        elif i == 1:
            i_color = 3
        else:
            i_color = i + 3

        offset = (i + 1/2) * bar_width - total_width / 2  # centre the group on its tick
        ax.bar(ind + offset, df_prob[col],
               bar_width,
               color = palette[i_color],
               edgecolor = palette[i_color],
               label = col, linewidth = 0)

    # One tick per group, labelled with the row labels
    ax.set_xticks(ind)
    ax.set_xticklabels(df_prob.index, rotation='vertical', fontsize=size_font)

    # Frame exactly the plotted groups (identical to (-0.5, 25.5) for 26 rows)
    plt.xlim(-0.5, n_pop - 0.5)
    plt.xlabel('Populations', fontsize = size_font)
    plt.ylabel('Predicted Probabilities', fontsize = size_font)
    # The y ticks are removed again further down; this call is kept because
    # setting ticks may also widen the y-limits to include 0 and 1
    # (see Matplotlib's Axis.set_ticks) - TODO confirm before dropping it.
    plt.yticks(np.arange(0, 1.1, 0.2))

    plt.tick_params(axis = 'both', which = 'major', labelsize = size_font)

    # Legend: one entry per sample bar. Only the bar handles are passed, so
    # the number of handles matches the number of labels (the original
    # appended extra patches that were silently dropped; mismatched lengths
    # trigger a warning in newer Matplotlib releases).
    handles, _ = ax.get_legend_handles_labels()

    if isinstance(label_sample, str) and label_sample == "":
        label_sample = ["sample" + str(i) for i in range(1, n_sample + 1)]
    labels = label_sample[:n_sample]

    legend_bar = ax.legend(handles, labels, loc = 'upper ' + loc_legend, fontsize = size_font,
                           frameon=True, facecolor='white', edgecolor = "black")
    ax.add_artist(legend_bar)

    # Tint each x tick label: look up the row of df_meta_unique whose 'SUP'
    # equals the label text and use its 'POP' value as the colour key.
    for label in ax.get_xticklabels():
        pop_temp = label.get_text()
        color_temp = dict_color_super[df_meta_unique[df_meta_unique['SUP'] == pop_temp]['POP'].values[0]]
        label.set_bbox(dict(facecolor = color_temp, edgecolor = 'None', alpha = 0.5, pad = 0.5))

    # Bring the x tick labels closer to the axis ...
    ax.tick_params(axis = 'x', which = 'major', pad = 0.001)

    # ... and then shift them by pad_x_tick
    for label in ax.get_xticklabels():
        label.set_y(label.get_position()[1] + pad_x_tick)

    # Reduce empty space inside the plot area
    ax.margins(x = 0.001)

    # Full black frame, 1 pt wide
    for side in ('top', 'right', 'bottom', 'left'):
        ax.spines[side].set_visible(True)
        ax.spines[side].set_color('black')
        ax.spines[side].set_linewidth(1)

    # No y ticks or y tick labels in the final figure
    ax.yaxis.set_ticks([])
    ax.yaxis.set_ticklabels([])

    plt.tight_layout(pad=0.4, w_pad=0.3, h_pad=0.3)
    plt.subplots_adjust(left=0.05, right=0.975, top=0.975, bottom=0.10)

    if bool_save_plot:
        fn_path_exists(path_plot)
        print(f'<plot_bar_bm> saving at path_plot: {path_plot}')
        path_base = f'{fn_ensure_slash(path_plot)}{name_plot}_vertical'
        for ext in ('pdf', 'jpeg'):
            plt.savefig(f'{path_base}.{ext}',
                        format = ext, dpi = 1200, bbox_inches = 'tight')
    plt.show()

## 2.2. data process

In [None]:
# One number per bone-marrow sample; it becomes the 'GT.read' column of
# df_meta below. The trailing comments are the annotations from the
# original notebook.
# NOTE(review): values presumably are percentages of sites with a genotype
# call - confirm.
dict_p_site_present = {
    'BM_1': 5.87, # white
    'BM_2': 6.01, # black
    'BM_3': 3.14, # asian
    'BM_4': 2.57, # indian
    'BM_5': 8.97, # asian
    'BM_6': 13.0, # white
    'BM_7': 7.43, # white
    'BM_WGS_6': 99.3, # white
    'BM_WGS_7': 97.6 # white
}

# Predicted probabilities table (tab-separated, first column as index)
file_df_prob_input = '../data/results_BM/df_prob_BM_240429.tsv'
fn_file_exists(file_df_prob_input)
df_prob_input = pd.read_csv(file_df_prob_input, sep = '\t', index_col = 0)

# Per-sample metadata: 'GT.read' from the dict above, plus two columns
# ('ID_ind', 'origin') that both repeat the sample name.
df_meta = pd.DataFrame.from_dict(
    dict_p_site_present, orient='index', columns=['GT.read']
).assign(
    ID_ind = lambda df_temp: list(df_temp.index),
    origin = lambda df_temp: list(df_temp.index),
)

## 2.3. vertical plots

In [None]:
# Vertical bar plot for the first five columns of df_prob_input
plot_bar_bm(
    df_prob_input.iloc[:, :5],
    label_sample = [
        'BM_ID 1, White',
        'BM_ID 2, Black',
        'BM_ID 3, Asian',
        'BM_ID 4, Indian',
        'BM_ID 5, Asian',
    ],
    loc_legend = "right",
    bool_save_plot = True,
    path_plot = "../figure/figure3/",
    name_plot = "figure3_BM_1to5",
    size_plot_x = 3.9,
    size_plot_y = 1.9,
    pad_x_tick = 0.035,
    total_width = 0.8,
)

# Vertical bar plot for columns 5, 7, 6, 8 (in that order), so each
# scRNA-seq column is followed by its WGS counterpart
plot_bar_bm(
    df_prob_input.iloc[:, [5, 7, 6, 8]],
    label_sample = [
        'ET1 (scRNA-seq)',
        'ET1 (WGS)',
        'ET2 (scRNA-seq)',
        'ET2 (WGS)',
    ],
    loc_legend = "left",
    bool_save_plot = True,
    path_plot = "../figure/figure3/",
    name_plot = "figure3_BM_6and7",
    size_plot_x = 3.25,
    pad_x_tick = 0.035,
    total_width = 0.9,
)

## 2.4. horizontal plots

### 2.4.1. BM1 ~ 5

In [None]:
# Horizontal plot: first five columns of df_prob_input (transposed so that
# samples are rows) with the matching first five rows of df_meta
get_barhplot(
    df_prob_input.iloc[:, :5].T,
    df_meta.iloc[:5],
    label_tick = 'ID_ind',
    bool_save_plot = True,
    path_plot = "../figure/figure3/",
    name_plot = "figure3_BM1to5",
)

### 2.4.2. BM6

In [None]:
# Horizontal plot: columns 5 and 7 of df_prob_input (transposed) with the
# matching rows 5 and 7 of df_meta
get_barhplot(
    df_prob_input.iloc[:, [5, 7]].T,
    df_meta.iloc[[5, 7]],
    label_tick = 'ID_ind',
    bool_save_plot = True,
    path_plot = "../figure/figure3/",
    name_plot = "figure3_BM6",
)

### 2.4.3. BM7

In [None]:
# Horizontal plot: columns 6 and 8 of df_prob_input (transposed) with the
# matching rows 6 and 8 of df_meta
get_barhplot(
    df_prob_input.iloc[:, [6, 8]].T,
    df_meta.iloc[[6, 8]],
    label_tick = 'ID_ind',
    bool_save_plot = True,
    path_plot = "../figure/figure3/",
    name_plot = "figure3_BM7",
)

# 3. admixtures

## 3.1 functions

In [None]:
# horizontal stacked plots
def get_barhplot_admixed(df_prob_input, df_meta_input, label_tick = 'GT.read',
                         bool_save_plot = False, path_plot = "", labelpad_y = 50,
                 size_plot_x = 6, size_plot_y = 1.5, name_plot = "", min_prob_label = 0.09):
    """Draw one horizontal stacked bar per row of ``df_prob_input``.

    Each column of ``df_prob_input`` contributes one segment to every bar;
    the segment colour is ``dict_color_super[get_super(column)]``. Segments
    wider than ``min_prob_label`` are annotated with the column name.

    Relies on names that are not defined in this notebook cell
    (``adjust_left_margin``, ``calculate_scale_factor``, ``get_super``,
    ``dict_color_super``, ``size_font``, ``fn_path_exists``); they are
    presumably provided by ``from helper_plot import *``.

    :param df_prob_input: DataFrame, rows = samples, columns = populations.
    :param df_meta_input: DataFrame expected to share the index of
        ``df_prob_input``; supplies the y tick labels.
    :param label_tick: column of ``df_meta_input`` used as y tick labels.
    :param bool_save_plot: when true, save the figure as PDF and JPEG.
    :param path_plot: output directory prefix; it is concatenated as-is, so
        it must end with a path separator.
    :param labelpad_y: currently unused; kept for interface compatibility.
    :param size_plot_x: figure width passed to ``figsize``.
    :param size_plot_y: figure height passed to ``figsize``.
    :param name_plot: suffix of the output file name.
    :param min_prob_label: minimum segment width that gets a text label.
    """
    # Index.equals also copes with indices of different length, where the
    # element-wise `==` comparison would raise instead of warning.
    if not df_prob_input.index.equals(df_meta_input.index):
        print('<get_barhplot_admixed> WARNING: indices of df_prob_input and df_meta_input are not equal')

    # Drop rows of df_prob_input that are entirely NA (and the matching
    # metadata rows)
    vec_bool_row_not_NA = ~df_prob_input.isna().all(axis = 1)
    n_row_NA = df_prob_input.shape[0] - sum(vec_bool_row_not_NA)
    if n_row_NA != 0:
        print(f'<get_barhplot_admixed> WARNING: number of NA rows: {n_row_NA}')

    df_prob_input = df_prob_input[vec_bool_row_not_NA]
    df_meta_input = df_meta_input[vec_bool_row_not_NA]

    # Reverse the row order of both tables (bars are drawn bottom-up, so
    # the first input row ends up at the top)
    df_meta_input = df_meta_input.iloc[::-1]
    df_prob_input = df_prob_input.iloc[::-1]

    n_sample = df_prob_input.shape[0]

    fig, ax = plt.subplots(figsize=(size_plot_x, size_plot_y))
    adjust_left_margin(fig, ax, fixed_margin=0.2)

    # White background for both the figure and the axes
    fig.set_facecolor('white')
    ax.set_facecolor('white')

    ax.set_xlim(0, 1)
    ax.set_ylim(0 - 0.5, n_sample - 0.5)

    scale_factor = calculate_scale_factor(fig, ax)
    print(f"scale_factor {scale_factor}")

    # Bar height and vertical spacing both follow the scale factor
    height_bar = 0.12 * scale_factor
    vec_coord_y = np.arange(n_sample) * height_bar * 1.25

    # Running left edge of the next segment, one entry per bar
    lefts = np.zeros(n_sample)

    for column in df_prob_input.columns:
        widths = df_prob_input[column]
        temp_super = get_super(column)
        rects = ax.barh(vec_coord_y, widths, left = lefts, height = height_bar, label = column,
                        color = dict_color_super[temp_super], edgecolor = 'black')
        # The next segment starts where this one ends
        lefts = lefts + widths.to_numpy()

        for rect in rects:
            width = rect.get_width()
            if width > min_prob_label:  # label only segments wide enough to hold text
                ax.text(rect.get_x() + width/2, rect.get_y() + rect.get_height()/2,
                        column, ha = 'center', va = 'center', fontsize = size_font,
                        color = 'white')

    ax.set_yticks(vec_coord_y)
    ax.set_yticklabels(list(df_meta_input[label_tick]), fontsize = size_font)
    ax.set_xticks([])

    ax.tick_params(axis = 'y', which = 'major', pad = 0.01)
    ax.tick_params(axis = 'x', which = 'major', pad = 0.01)

    # Nudge the y tick labels towards the bars
    for label in ax.get_yticklabels():
        label.set_x(label.get_position()[0] + 0.03)

    # No frame around the plot
    for side in ('top', 'right', 'bottom', 'left'):
        ax.spines[side].set_visible(False)

    plt.tight_layout()

    if bool_save_plot:
        fn_path_exists(path_plot)
        print(f'<get_barhplot_admixed> saving at path_plot: {path_plot}')
        # NOTE(review): the 'bone-marrow_' prefix is kept unchanged so that
        # existing output file names do not move; confirm it is intended
        # for the admixture figures.
        for ext in ('pdf', 'jpeg'):
            plt.savefig(f'{path_plot}bone-marrow_horizontal_{name_plot}.{ext}',
                        format = ext, dpi = 1200, bbox_inches = 'tight')
        
# vertical plots
def plot_bar_admixed(df_prob, label_sample = "", loc_legend = "left",
                     bool_save_plot = False, path_plot = "", name_plot = "", size_plot_x = 3.0,
                     size_plot_y = 2.0, pad_x_tick = 0.035, total_width = 1):
    """Draw a grouped vertical bar chart: one group per row, one bar per column.

    Relies on names that are not defined in this notebook cell
    (``vec_pop_ordered``, ``size_font``, ``dict_color_super``,
    ``df_meta_unique``, ``fn_path_exists``); they are presumably provided by
    ``from helper_plot import *``.

    :param df_prob: DataFrame whose rows are selected and ordered with
        ``df_prob.loc[vec_pop_ordered]`` and whose columns are the samples.
    :param label_sample: legend labels, one per column; ``""`` (default)
        generates ``sample1``, ``sample2``, ...
    :param loc_legend: ``"left"`` or ``"right"``; the legend is placed at
        ``'upper ' + loc_legend``.
    :param bool_save_plot: when true, save the figure as PDF and JPEG.
    :param path_plot: output directory prefix; it is concatenated as-is, so
        it must end with a path separator.
    :param name_plot: suffix of the output file name.
    :param size_plot_x: figure width passed to ``figsize``.
    :param size_plot_y: figure height passed to ``figsize``.
    :param pad_x_tick: amount added to the y position of each x tick label.
    :param total_width: width shared by all bars of one group.
    """
    n_sample = df_prob.shape[1]
    df_prob = df_prob.loc[vec_pop_ordered]
    n_pop = len(df_prob)
    fig, ax = plt.subplots(figsize=(size_plot_x, size_plot_y))

    # White background for both the figure and the axes
    fig.set_facecolor('white')
    ax.set_facecolor('white')

    bar_width = total_width / n_sample
    ind = np.arange(n_pop)

    # Build the palette once instead of twice per bar.
    palette = plt.cm.viridis(np.linspace(0, 1, n_sample + 3))

    for i, col in enumerate(df_prob.columns):
        # Palette positions used: 0, 3, 5, 6, 7, ... (positions 1, 2 and 4
        # are skipped, exactly as in the original branching).
        if i == 0:
            i_color = 0
        elif i == 1:
            i_color = 3
        else:
            i_color = i + 3

        offset = (i + 1/2) * bar_width - total_width / 2  # centre the group on its tick
        ax.bar(ind + offset, df_prob[col],
               bar_width,
               color = palette[i_color],
               edgecolor = palette[i_color],
               label = col, linewidth = 0)

    # One tick per group, labelled with the row labels
    ax.set_xticks(ind)
    ax.set_xticklabels(df_prob.index, rotation='vertical', fontsize=size_font)

    # Frame exactly the plotted groups (identical to (-0.5, 25.5) for 26 rows)
    plt.xlim(-0.5, n_pop - 0.5)
    plt.xlabel('Populations', fontsize = size_font)
    plt.ylabel('Predicted Probabilities', fontsize = size_font)
    # The y ticks are removed again further down; this call is kept because
    # setting ticks may also widen the y-limits to include 0 and 1
    # (see Matplotlib's Axis.set_ticks) - TODO confirm before dropping it.
    plt.yticks(np.arange(0, 1.1, 0.2))

    plt.tick_params(axis = 'both', which = 'major', labelsize = size_font)

    # Legend: one entry per sample bar. Only the bar handles are passed, so
    # the number of handles matches the number of labels (the original
    # appended extra patches that were silently dropped; mismatched lengths
    # trigger a warning in newer Matplotlib releases).
    handles, _ = ax.get_legend_handles_labels()

    if isinstance(label_sample, str) and label_sample == "":
        label_sample = ["sample" + str(i) for i in range(1, n_sample + 1)]
    labels = label_sample[:n_sample]

    legend_bar = ax.legend(handles, labels, loc = 'upper ' + loc_legend, fontsize = size_font,
                           frameon=True, facecolor='white', edgecolor = "black")
    ax.add_artist(legend_bar)

    # Tint each x tick label: look up the row of df_meta_unique whose 'SUP'
    # equals the label text and use its 'POP' value as the colour key.
    for label in ax.get_xticklabels():
        pop_temp = label.get_text()
        color_temp = dict_color_super[df_meta_unique[df_meta_unique['SUP'] == pop_temp]['POP'].values[0]]
        label.set_bbox(dict(facecolor = color_temp, edgecolor = 'None', alpha = 0.5, pad = 0.5))

    # Bring the x tick labels closer to the axis ...
    ax.tick_params(axis = 'x', which = 'major', pad = 0.001)

    # ... and then shift them by pad_x_tick
    for label in ax.get_xticklabels():
        label.set_y(label.get_position()[1] + pad_x_tick)

    # Reduce empty space inside the plot area
    ax.margins(x = 0.001)

    # Full black frame, 1 pt wide
    for side in ('top', 'right', 'bottom', 'left'):
        ax.spines[side].set_visible(True)
        ax.spines[side].set_color('black')
        ax.spines[side].set_linewidth(1)

    # No y ticks or y tick labels in the final figure
    ax.yaxis.set_ticks([])
    ax.yaxis.set_ticklabels([])

    plt.tight_layout(pad=0.4, w_pad=0.3, h_pad=0.3)
    plt.subplots_adjust(left=0.05, right=0.975, top=0.975, bottom=0.10)

    if bool_save_plot:
        fn_path_exists(path_plot)
        print(f'<plot_bar_admixed> saving at path_plot: {path_plot}')
        # NOTE(review): the 'bone-marrow_' prefix is kept unchanged so that
        # existing output file names do not move; confirm it is intended
        # for the admixture figures.
        for ext in ('pdf', 'jpeg'):
            plt.savefig(f'{path_plot}bone-marrow_vertical_{name_plot}.{ext}',
                        format = ext, dpi = 1200, bbox_inches = 'tight')
    plt.show()

## 3.2. plot admixtures

### 3.2.1. horizontal

Uncomment and run the following code block to generate the horizontal plots.

In [None]:
# path_admixed = '../data/results_admixed/'
# fn_path_exists(path_admixed)
# for name_mixed in vec_mixed:
#     file_df_prob_read = f'{path_admixed}{name_mixed}/df_probabilities.tsv'
#     fn_file_exists(file_df_prob_read)
#     df_prob_read = pd.read_csv(file_df_prob_read, 
#                                sep = '\t', index_col = 0)
#     df_prob = df_prob_read.loc[vec_pop_ordered].T
#     df_meta = pd.DataFrame({'origin': list(df_prob.index),
#                             'GT.read': ['100.0'] * 3})
#     df_meta['origin'] = df_meta['origin'].str.replace('_', ' + ')
#     df_meta.index = list(df_prob.index)

#     get_barhplot_admixed(df_prob_input = df_prob, df_meta_input = df_meta,
#                 label_tick = 'origin', bool_save_plot = True, 
#                 path_plot = '../figure/sfigure4/',
#                 name_plot = name_mixed, 
#                 size_plot_x = 2.4, size_plot_y = 1.2, min_prob_label = 0.09)

### 3.2.2. vertical

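Uncomment and run the following code block to generate the vertical plots. It uses `path_admixed`, which is defined in the code block of section 3.2.1.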

In [None]:
# for name_mixed in vec_mixed:
#     file_df_prob_read = f'{path_admixed}{name_mixed}/df_probabilities.tsv'
#     fn_file_exists(file_df_prob_read)
#     df_prob_read = pd.read_csv(file_df_prob_read, sep = '\t', index_col = 0)
#     df_prob = df_prob_read.loc[vec_pop_ordered]
#     list_col = list(df_prob.columns)
#     label_sample = [col.replace('_', ' + ') for col in list_col]

#     plot_bar_admixed(df_prob, 
#              label_sample = label_sample,
#              loc_legend = "right", bool_save_plot = True, 
#              path_plot = '../figure/sfigure4/', name_plot = name_mixed, 
#              size_plot_x = 2.2, 
#              size_plot_y = 1.2, 
#              pad_x_tick = 0.07, total_width = 0.8)

## 3.3. make synthetic admixtures

The sample code below was used to create the synthetic admixed individuals. To run it, the user first needs to download and process the 1000 Genomes Project (1kGP) data. The 1kGP data is not included in this GitLab repository because it is already publicly available and too large to host here.

### 3.3.1. functions

In [None]:
def mix_df(df1, df2, n_mut=None):
    """
    Build a 50/50 random mix of two dataframes, position by position.

    The positions ``0 .. n_mut-1`` are shuffled with the module-level
    ``random`` generator (so ``random.seed`` controls the result); the first
    half of the shuffled positions is taken from ``df1`` and the remaining
    positions from ``df2``. Every position therefore appears exactly once in
    the result. With an odd ``n_mut`` the extra row comes from ``df2``.

    :param df1: The first dataframe to mix.
    :param df2: The second dataframe to mix.
    :param n_mut: The number of positions (rows) to mix. Defaults to
        ``len(df1)``, which lets callers pass just the two dataframes.
    :return: A new dataframe with ``n_mut`` rows taken from df1 and df2,
        sorted by index label.
    """
    if n_mut is None:
        n_mut = len(df1)

    # Random permutation of the row positions
    numbers = list(range(n_mut))
    random.shuffle(numbers)

    # First half of the permutation -> df1, second half -> df2
    midpoint = len(numbers) // 2
    list1 = numbers[:midpoint]
    list2 = numbers[midpoint:]

    # Positional selection, then sort by index label to restore the
    # original row order
    return pd.concat([df1.iloc[list1], df2.iloc[list2]]).sort_index()

### 3.3.2. data creation

In [None]:
# Build the synthetic admixed genotypes and assemble the result tables.
# The outcome depends on the order in which the seeded `random` generator
# is consumed, so the statements below must not be reordered or removed
# without re-checking reproducibility.
# NOTE(review): dict_vec_GT_raw is not defined anywhere in this notebook;
# it must be created from the processed 1kGP data before this cell runs.
random.seed(52014)

# Pairs are formed element-wise: list_sub_*1[k] is mixed with list_sub_*2[k].
# NOTE(review): 'intra' / 'inter' presumably mean pairs within the same /
# across different super-populations - confirm.
list_sub_intra1 = ['ACB', 'ASW', 'CLM', 'PEL', 'CDX', 'CHS', 'CEU', 'FIN', 'BEB', 'GIH']
list_sub_intra2 = ['YRI', 'MSL', 'PUR', 'MXL', 'KHV', 'JPT', 'TSI', 'IBS', 'STU', 'PJL']

list_sub_inter1 = ['ESN', 'GWD', 'LWK', 'MSL', 'CLM', 'PEL', 'PUR', 'JPT', 'CHB', 'GBR']
list_sub_inter2 = ['MXL', 'CDX', 'CEU', 'BEB', 'CHB', 'IBS', 'GIH', 'TSI', 'PJL', 'STU']

# --- first mixing pass ---
dict_vec_GT_mixed_intra = {}
dict_df_prob_intra = {}
dict_ld_intra = {}
dict_p_site_present_intra = {}

# takes four minutes
# NOTE(review): mix_df is called with two arguments here and below;
# confirm that this matches its signature (n_mut).
for sub1, sub2 in zip(list_sub_intra1, list_sub_intra2):
    dict_vec_GT_mixed_intra[f'{sub1}_{sub2}'] = mix_df(dict_vec_GT_raw[f'{sub1}'],
                                                       dict_vec_GT_raw[f'{sub2}'])
dict_vec_GT_mixed_inter = {}
dict_df_prob_inter = {}
dict_ld_inter = {}
dict_p_site_present_inter = {}
for sub1, sub2 in zip(list_sub_inter1, list_sub_inter2):
    dict_vec_GT_mixed_inter[f'{sub1}_{sub2}'] = mix_df(dict_vec_GT_raw[f'{sub1}'],
                                                       dict_vec_GT_raw[f'{sub2}'])
    
# --- second mixing pass ---
# NOTE(review): this repeats the pass above. The dictionaries are reset and
# refilled, so only the results of this second pass survive, and they are
# drawn from a later point of the random stream than the first pass.
dict_vec_GT_mixed_intra = {}
dict_df_prob_intra = {}
dict_ld_intra = {}
dict_p_site_present_intra = {}

# takes four minutes
for sub1, sub2 in zip(list_sub_intra1, list_sub_intra2):
    dict_vec_GT_mixed_intra[f'{sub1}_{sub2}'] = mix_df(dict_vec_GT_raw[f'{sub1}'], 
                                                 dict_vec_GT_raw[f'{sub2}'])    
dict_vec_GT_mixed_inter = {}
dict_df_prob_inter = {}
dict_ld_inter = {}
dict_p_site_present_inter = {}
for sub1, sub2 in zip(list_sub_inter1, list_sub_inter2):
    dict_vec_GT_mixed_inter[f'{sub1}_{sub2}'] = mix_df(dict_vec_GT_raw[f'{sub1}'], dict_vec_GT_raw[f'{sub2}'])
# NOTE(review): dict_df_prob_* and dict_p_site_present_* are still empty at
# this point in the visible code; the step that fills them is not part of
# this cell and has to run before the tables below can be built.

# Intra tables: stack the per-sample probability frames (dropping the inner
# index level), then order the columns by vec_pop_ordered
df_prob_intra = pd.concat(dict_df_prob_intra).reset_index(level = 1, drop = True)
df_prob_intra = df_prob_intra.T.loc[vec_pop_ordered].T

# Metadata: one 'GT.read' value per sample plus a readable 'SUB1 + SUB2' label
df_meta_intra = pd.DataFrame.from_dict(dict_p_site_present_intra, orient = 'index', columns=['GT.read'])
df_meta_intra['origin'] = [f'{sub1} + {sub2}' for sub1, sub2 in zip(list_sub_intra1, list_sub_intra2)]

# Inter tables: same construction as for the intra pairs
df_prob_inter = pd.concat(dict_df_prob_inter).reset_index(level = 1, drop = True)
df_prob_inter = df_prob_inter.T.loc[vec_pop_ordered].T

df_meta_inter = pd.DataFrame.from_dict(dict_p_site_present_inter, orient='index', columns=['GT.read'])
df_meta_inter['origin'] = [f'{sub1} + {sub2}' for sub1, sub2 in zip(list_sub_inter1, list_sub_inter2)]