### Explore the filtered and cleaned data
- Find the most common allele worldwide by taking the highest-frequency allele reported in each of the largest studies

In [1]:
from utils import *
# Download the allele-frequency table, then pass it through the cleaning and
# filtering helpers in order (presumably supplied by the star import from
# `utils` -- confirm). Only the final frame is kept, under the name `df`.
url = "https://github.com/slowkow/allelefrequencies/raw/main/afnd.tsv"
df = (
    pd.read_csv(url, sep="\t")
    .pipe(clean_data)
    .pipe(filter_data)
)

Starting shape: (123502, 7)
After filtering for HLA: (111399, 7)
After filtering for Class I (A, B, C): (69275, 7)
After removing 495 G-group rows: (68780, 8)
Final shape after dropping columns: (68780, 5)
Starting shape: (68780, 5)
Starting number of populations: 548
After filtering for populations with >= 50 allele entries: (64437, 5)
  Populations remaining: 364
After filtering for n > 100: (47092, 5)
  Populations remaining: 235


In [28]:
import pandas as pd

def _format_allele_label(gene, allele):
    """Return ``gene*allele`` without repeating a gene prefix the allele already has."""
    allele_text = str(allele)
    gene_prefix = f"{gene}*"
    if allele_text.startswith(gene_prefix):
        return allele_text
    return f"{gene_prefix}{allele_text}"


def get_top_alleles_from_largest_studies(df_filtered, top_n=5, verbose=True):
    """
    Find the highest frequency allele in the top N largest studies by sample size.

    Parameters
    ----------
    df_filtered : pandas.DataFrame
        Must contain the columns 'population', 'n', 'gene', 'allele' and
        'alleles_over_2n'. The index may be arbitrary (including non-unique);
        it is not used to identify rows.
    top_n : int, default 5
        Number of populations, ranked by sample size, to keep.
    verbose : bool, default True
        When true, print summary statistics, the selected studies and the
        top allele of each study.

    Returns
    -------
    pandas.DataFrame
        One row per selected population that has at least one non-null
        frequency, sorted by 'sample_size' descending. The input columns are
        kept, with 'alleles_over_2n' renamed to 'frequency' and 'n' replaced
        by 'sample_size', plus 'num_alleles_reported'. When several alleles
        tie for the maximum, only the first one encountered is returned.
    """

    # Step 1: Per-population sample size and row count.
    # 'first' takes the first non-null 'n' of each population (assumed to be
    # constant within a population); 'count' is the number of rows with a
    # non-null allele, not the number of distinct alleles.
    pop_size = (
        df_filtered.groupby('population')
        .agg(
            sample_size=('n', 'first'),
            num_alleles_reported=('allele', 'count'),
        )
        .reset_index()
    )

    if verbose:
        print(f"Total populations: {len(pop_size)}")
        print("\nSample size statistics:")
        print(pop_size['sample_size'].describe())

    # Step 2: Get top N populations by sample size
    top_populations = pop_size.nlargest(top_n, 'sample_size')

    if verbose:
        print(f"\n{'='*60}")
        print(f"Top {top_n} largest studies:")
        print(f"{'='*60}")
        for _, row in top_populations.iterrows():
            print(f"{row['population']:<50} n={row['sample_size']:>6,}  ({row['num_alleles_reported']} alleles)")

    # Step 3: Keep only rows of the selected populations.
    # Rows without a frequency cannot be the maximum, and a population whose
    # frequencies are all missing would make idxmax/.loc fail, so drop them.
    # The index is rebuilt because idxmax returns index *labels*: with a
    # non-unique input index, .loc would otherwise pull in unrelated rows.
    in_top_population = df_filtered['population'].isin(top_populations['population'])
    top_studies_data = (
        df_filtered[in_top_population]
        .dropna(subset=['alleles_over_2n'])
        .reset_index(drop=True)
    )

    # Step 4: For each population, pick the row with the highest frequency
    # (idxmax keeps the first row on ties).
    highest_freq_idx = top_studies_data.groupby('population')['alleles_over_2n'].idxmax()
    highest_freq_alleles = top_studies_data.loc[highest_freq_idx]

    # Step 5: Attach the per-population summary, sort by sample size, and
    # drop 'n', which duplicates the merged 'sample_size' column.
    result = (
        highest_freq_alleles
        .merge(
            top_populations[['population', 'sample_size', 'num_alleles_reported']],
            on='population',
        )
        .sort_values('sample_size', ascending=False)
        .rename(columns={'alleles_over_2n': 'frequency'})
        .drop(columns=['n'])
    )

    if verbose:
        print(f"\n{'='*60}")
        print(f"Highest frequency allele in each of top {top_n} studies:")
        print(f"{'='*60}")
        for _, row in result.iterrows():
            print(f"\n{row['population']}")
            print(f"  Sample size: {row['sample_size']:,}")
            # The allele value may already include the gene prefix
            # (e.g. 'A*02:01'); avoid printing 'A*A*02:01'.
            print(f"  Highest freq allele: {_format_allele_label(row['gene'], row['allele'])}")
            print(f"  Frequency: {row['frequency']:.4f} ({row['frequency']*100:.2f}%)")

    return result


# Usage: rank the 50 largest studies without the verbose report, then preview
# the first rows with the gene column left out.
result = get_top_alleles_from_largest_studies(df_filtered=df, top_n=50, verbose=False)
print(result.drop("gene", axis="columns").head())


     allele                       population  frequency  sample_size  \
2   A*02:01     Germany DKMS - German donors     0.2839      3456066   
38  A*02:01      USA NMDP European Caucasian     0.2755      1242890   
31  C*04:01  USA NMDP African American pop 2     0.2037       416581   
44  A*02:01      USA NMDP Mexican or Chicano     0.2230       261235   
47  A*01:01      USA NMDP South Asian Indian     0.1545       185391   

    num_alleles_reported  
2                   2802  
38                  1015  
31                   646  
44                   590  
47                   465  


In [26]:
# Tally how many of the selected studies share each top allele; the header and
# the counts are emitted on separate lines by a single print call.
print("Most common top allele in top-n studies: ", result["allele"].value_counts(), sep="\n")

Most common top allele in top-n studies: 
allele
A*02:01    19
A*01:01     6
C*04:01     5
A*11:01     5
A*24:02     5
B*40:01     1
C*07:02     1
C*17:03     1
B*44:03     1
B*49:01     1
A*02:02     1
C*08:22     1
C*01:57     1
B*38:01     1
A*03:02     1
Name: count, dtype: Int64
