In [4]:
import pandas as pd 

# Load the allele-frequency table; the first CSV column is used as the row index.
# Later cells read the columns 'population', 'n', 'allele', 'alleles_over_2n'
# and 'gene' from this frame.
# NOTE(review): the path is relative to the notebook's working directory.
df = pd.read_csv("cleaned_data3.csv", index_col=0)

def get_top_alleles_from_largest_studies(df_filtered, top_n=5, verbose=True):
    """
    Find the highest-frequency allele in each of the top N largest studies.

    A "study" is one distinct value of the ``population`` column. Studies are
    ranked by sample size, taken as the first ``n`` value seen for each
    population (``n`` is expected to be constant within a population).

    Parameters
    ----------
    df_filtered : pandas.DataFrame
        Allele-frequency table. Must contain the columns ``population``,
        ``n``, ``allele`` and ``alleles_over_2n``; ``gene`` is additionally
        read when ``verbose`` is true. The row index may contain duplicate
        labels; it is not used.
    top_n : int, default 5
        Number of populations to keep, largest sample size first. If fewer
        populations exist, all of them are used.
    verbose : bool, default True
        When true, print sample-size statistics, the selected studies and the
        winning allele of each study.

    Returns
    -------
    pandas.DataFrame
        One row per selected population, sorted by ``sample_size`` descending.
        It carries the columns of ``df_filtered`` except ``n``, with
        ``alleles_over_2n`` renamed to ``frequency``, plus ``sample_size`` and
        ``num_alleles_reported`` (count of rows with a non-null ``allele``).

    Raises
    ------
    KeyError
        If a required column is missing from ``df_filtered``.

    Notes
    -----
    * When several alleles tie for the maximum frequency in a population,
      only the first one in row order is returned.
    * Rows with a missing frequency are ignored; a population with no
      non-missing frequency at all is omitted from the result.
    """
    required_columns = {'population', 'n', 'allele', 'alleles_over_2n'}
    missing_columns = required_columns.difference(df_filtered.columns)
    if missing_columns:
        raise KeyError(
            f"df_filtered is missing required column(s): {sorted(missing_columns)}"
        )

    # Step 1: Sample size and number of reported allele rows per population.
    # Named aggregation fixes the output column names explicitly instead of
    # relying on positional renaming.
    pop_size = (
        df_filtered.groupby('population')
        .agg(
            sample_size=('n', 'first'),               # sample size of the study
            num_alleles_reported=('allele', 'count'),  # non-null allele rows
        )
        .reset_index()
    )

    if verbose:
        print(f"Total populations: {len(pop_size)}")
        print(f"\nSample size statistics:")
        print(pop_size['sample_size'].describe())

    # Step 2: Get top N populations by sample size
    top_populations = pop_size.nlargest(top_n, 'sample_size')

    if verbose:
        print(f"\n{'='*60}")
        print(f"Top {top_n} largest studies:")
        print(f"{'='*60}")
        for _, row in top_populations.iterrows():
            print(f"{row['population']:<50} n={row['sample_size']:>6,}  ({row['num_alleles_reported']} alleles)")

    # Step 3: Keep rows of the selected populations that have a usable
    # frequency. The index is reset so that the labels returned by idxmax are
    # unique: with duplicate labels, .loc would return every row sharing the
    # label, not just the maximum row.
    in_top_population = df_filtered['population'].isin(top_populations['population'])
    has_frequency = df_filtered['alleles_over_2n'].notna()
    top_studies_data = df_filtered.loc[in_top_population & has_frequency].reset_index(drop=True)

    # Step 4: For each population, take the row holding the maximum frequency
    # (idxmax returns the first occurrence when there are ties).
    highest_freq_idx = top_studies_data.groupby('population')['alleles_over_2n'].idxmax()
    highest_freq_alleles = top_studies_data.loc[highest_freq_idx]

    # Step 5: Replace the raw 'n' column with the per-population summary and
    # order the studies from largest to smallest. A stable sort keeps
    # equal-sized studies in a deterministic (alphabetical) order.
    result = (
        highest_freq_alleles
        .drop(columns='n')
        .rename(columns={'alleles_over_2n': 'frequency'})
        .merge(top_populations, on='population')
        .sort_values('sample_size', ascending=False, kind='stable')
    )

    if verbose:
        print(f"\n{'='*60}")
        print(f"Highest frequency allele in each of top {top_n} studies:")
        print(f"{'='*60}")
        for _, row in result.iterrows():
            print(f"\n{row['population']}")
            print(f"  Sample size: {row['sample_size']:,}")
            print(f"  Highest freq allele: {row['gene']}*{row['allele']}")
            print(f"  Frequency: {row['frequency']:.4f} ({row['frequency']*100:.2f}%)")

    return result

In [5]:
# Select the 100 largest studies by sample size and keep, for each one, its
# most frequent allele; verbose=False suppresses the per-study report.
result = get_top_alleles_from_largest_studies(df_filtered=df, top_n=100, verbose=False)
print(result.head(n=5))

   gene   allele                       population  frequency resolution  \
15    A  A*02:01     Germany DKMS - German donors   0.285204    4-digit   
88    A  A*02:01      USA NMDP European Caucasian   0.275556    4-digit   
80    C  C*04:01  USA NMDP African American pop 2   0.203689    4-digit   
94    A  A*02:01      USA NMDP Mexican or Chicano   0.223083    4-digit   
97    A  A*01:01      USA NMDP South Asian Indian   0.154578    4-digit   

    sample_size  num_alleles_reported  
15      3456066                  2802  
88      1242890                  1015  
80       416581                   646  
94       261235                   590  
97       185391                   465  


In [3]:
# Tally how often each allele is the top allele among the selected studies.
# Uses `result` from the cell that calls get_top_alleles_from_largest_studies.
# NOTE(review): re-run this cell after that one so the counts match the current `result`.
print(
    "Most common top allele in top-n studies: ",
    result["allele"].value_counts(),
    sep="\n",
)

Most common top allele in top-n studies: 
allele
A*02:01    41
A*11:01    12
A*24:02    12
C*04:01     9
C*07:01     6
A*01:01     6
C*07:02     2
A*33:03     1
B*15:01     1
C*01:02     1
B*07:02     1
A*30:01     1
B*46:01     1
A*03:02     1
B*38:01     1
A*02:02     1
B*49:01     1
B*44:03     1
B*53:01     1
Name: count, dtype: int64


In [6]:
# Alternative ranking: for every allele, take the unweighted mean of its
# frequency over the rows that report it, together with the number of such rows.
# NOTE(review): the mean is not weighted by sample size, and an allele reported
# by a single study keeps that one value as its average, so rare alleles can
# rank high; num_studies equals the number of studies only if each population
# lists an allele at most once (confirm against the data).
allele_avg_freq = (
    df.groupby('allele')
    .agg(
        avg_frequency=('alleles_over_2n', 'mean'),  # mean frequency across reporting rows
        num_studies=('population', 'count'),        # rows with a non-null population
    )
    .reset_index()
    .sort_values('avg_frequency', ascending=False)  # highest average first
)

print("Top alleles by average frequency across all studies:")
print(allele_avg_freq.head(20))

Top alleles by average frequency across all studies:
       allele  avg_frequency  num_studies
105   A*02:01       0.167080          220
682   A*24:02       0.137715          219
3789  C*07:02       0.127004          186
3490  C*04:01       0.120279          186
554   A*11:01       0.115980          208
3788  C*07:01       0.094793          179
3696  C*06:02       0.079487          180
0     A*01:01       0.078450          210
3343  C*03:04       0.074929          185
3216  C*01:02       0.073399          180
417   A*03:01       0.064214          205
1333  B*07:02       0.058447          217
2821  B*51:01       0.056966          228
2063  B*35:01       0.054255          225
3105  B*56:43       0.050941            1
2376  B*40:01       0.049939          220
2729  B*46:01       0.048798          122
4090  C*08:01       0.047797          159
1499  B*08:01       0.046346          209
3342  C*03:03       0.044151          180
