# Load packages

In [None]:
import session_info
import pandas as pd
import seaborn as sns
from pyhere import here
import matplotlib.pyplot as plt
from rfmix_reader import read_rfmix

# Load data

In [None]:
# Build both input locations with pyhere's here().
# NOTE(review): the two locations use different roots ("input/real_data" vs
# "real_data/gpu_version") -- confirm this is intentional.
binary_dir = here("real_data/gpu_version/_m/binary_files/")
prefix_path = here("input/real_data/_m/")

# read_rfmix is unpacked into three values; only the middle one (rf_q) is
# kept, the first and last are discarded.
_, rf_q, _ = read_rfmix(
    prefix_path,
    binary_dir=binary_dir,
)

In [None]:
# Display the dimensions of the loaded rf_q table.
rf_q.shape

In [None]:
# Preview the first rows of rf_q.
rf_q.head()

# Generate plots

## Sort chromosomes in the correct order

In [None]:
# Chromosome labels "chr1" through "chr22", listed in numeric order.
chrom_order = ["chr" + str(number) for number in range(1, 23)]

In [None]:
# Convert rf_q to a pandas DataFrame for seaborn/matplotlib.
# rf_q is expected to be a cuDF DataFrame, which exposes to_pandas(). If it is
# already a pandas-like object without that method (presumably the case when
# the reader runs without a GPU -- TODO confirm), take a copy instead so the
# column assignment below does not modify rf_q itself.
if hasattr(rf_q, "to_pandas"):
    rf_q_pandas = rf_q.to_pandas()
else:
    rf_q_pandas = rf_q.copy()

# Make 'chrom' an ordered categorical so sorting and plotting follow
# chrom_order rather than lexical order ("chr10" before "chr2").
# Labels not listed in chrom_order become NaN.
rf_q_pandas['chrom'] = pd.Categorical(rf_q_pandas['chrom'],
                                      categories=chrom_order, ordered=True)

## Sort the dataframe by chromosome

In [None]:
# Order the rows by the 'chrom' column (ascending), then preview the result.
rf_q_sorted = rf_q_pandas.sort_values(by='chrom', ascending=True)
rf_q_sorted.head()

## Create and save the plot

In [None]:
# One 15x8 inch figure; the seaborn calls below draw onto its current axes.
plt.figure(figsize=(15, 8))

# Both layers plot the 'AFR' column against 'chrom' from the sorted table.
afr_by_chrom = dict(x='chrom', y='AFR', data=rf_q_sorted)

# Box plot per chromosome, with the individual values jittered on top.
sns.boxplot(color='lightgray', width=0.6, **afr_by_chrom)
sns.stripplot(color='black', alpha=0.1, jitter=True, **afr_by_chrom)

# Dashed horizontal reference line at y = 0.5.
plt.axhline(y=0.5, linewidth=1, linestyle='--', color='black')

# Axis labels and title.
plt.xlabel('Chromosome', fontsize=14)
plt.ylabel('African Genetic Ancestry', fontsize=14)
plt.title('Global Ancestry (AFR) by Chromosome', fontsize=18)

# Tick styling: rotate the chromosome labels so they do not overlap.
plt.yticks(fontsize=12)
plt.xticks(rotation=45, fontsize=12)

# Tighten the layout and write the figure to a PDF in the working directory.
plt.tight_layout()
plt.savefig('global_ancestry_boxplot.pdf', dpi=300, bbox_inches='tight')

# Session information

In [None]:
# Display the session details reported by the session_info package.
session_info.show()