### Explore the cleaned data:
- Find the most common allele in the world

In [1]:
import pandas as pd 
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Import only the helper this cell uses rather than star-importing utils,
# so it is obvious where the name comes from.
from utils import clean_data

# Tab-separated source table, fetched over the network on every run.
url = "https://github.com/slowkow/allelefrequencies/raw/main/afnd.tsv"
df = pd.read_csv(url, sep="\t")
# clean_data logs the frame shape after each cleaning step and returns the result.
df = clean_data(df)


Starting shape: (123502, 7)
After filtering for HLA: (111399, 7)
After filtering for Class I (A, B, C): (69275, 7)
After removing 495 G-group rows: (68780, 8)
Final shape after dropping columns: (68780, 5)


In [6]:
# Column dtypes, non-null counts and memory footprint of the cleaned frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 68780 entries, 0 to 69273
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   gene             68780 non-null  category
 1   allele           68780 non-null  string  
 2   population       68780 non-null  string  
 3   alleles_over_2n  68780 non-null  float64 
 4   n                68780 non-null  Int64   
dtypes: Int64(1), category(1), float64(1), string(2)
memory usage: 2.8 MB


In [7]:
# Summary statistics for the numeric columns of the cleaned frame.
df.describe()

Unnamed: 0,alleles_over_2n,n
count,68780.0,68780.0
mean,0.013275,172890.247528
std,0.037246,694580.80005
min,0.0,10.0
25%,0.0,98.0
50%,5e-06,222.0
75%,0.0074,2840.0
max,0.863,3456066.0


In [8]:
# Number of rows per gene.
df["gene"].value_counts()

gene
B    37767
A    19969
C    11044
Name: count, dtype: int64

In [9]:
# Number of rows per allele, most frequent first.
df["allele"].value_counts()

allele
B*40:01    383
B*40:02    366
B*15:01    353
B*14:02    327
A*02:01    321
          ... 
B*27:75      1
B*27:76      1
B*27:80      1
B*27:82      1
C*18:06      1
Name: count, Length: 4561, dtype: Int64

In [16]:
# Distribution of the per-population row counts (how many rows each population has).
df["population"].value_counts().describe()

count         548.0
mean     125.510949
std      226.800821
min             1.0
25%            35.0
50%            52.0
75%          128.25
max          2802.0
Name: count, dtype: Float64

In [15]:
# Show every row recorded for the "England Newcastle" population.
print(
    df.loc[df["population"].eq("England Newcastle")]
)

      gene   allele         population  alleles_over_2n     n
57402    B  B*67:01  England Newcastle              0.0  2739


In [28]:
# Tally how many rows each population contributes.
pop_counts = (
    df.groupby('population')
    .size()
    .reset_index(name='num_entries')
)
print(f"\nTotal unique populations: {len(pop_counts)}")
print(f"\nDistribution of entries per population:")
print(pop_counts['num_entries'].describe())

# Names of the populations that appear in exactly one row.
# `.values` is kept so the resulting array type is unchanged.
single_entry_pops = pop_counts.loc[
    pop_counts['num_entries'].eq(1), 'population'
].values
print(f"\nPopulations with only 1 entry: {len(single_entry_pops)}")
print(f"Percentage: {100 * len(single_entry_pops) / len(pop_counts):.2f}%")



Total unique populations: 548

Distribution of entries per population:
count     548.000000
mean      125.510949
std       226.800821
min         1.000000
25%        35.000000
50%        52.000000
75%       128.250000
max      2802.000000
Name: num_entries, dtype: float64

Populations with only 1 entry: 7
Percentage: 1.28%


In [29]:
# Print a banner, then the rows belonging to the single-entry populations.
rule = "=" * 80
print("\n" + rule)
print("SINGLE-ENTRY POPULATIONS")
print(rule)
single_entry_data = df.loc[df['population'].isin(single_entry_pops)]
print(single_entry_data)



SINGLE-ENTRY POPULATIONS
      gene   allele          population  alleles_over_2n      n
16677    A  A*34:02   England Lancaster           0.0010    545
20094    A  A*80:02  Brazil Piaui Mixed           0.0040  21943
46595    B  B*42:02        Saudi Arabia           0.0571     18
51531    B  B*51:03       Senegal Dakar           0.0050    112
57402    B  B*67:01   England Newcastle           0.0000   2739
57522    B  B*73:01   England Sheffield           0.0000   4755
57574    B  B*73:01    Mongolia Khalkha           0.0050    200


In [52]:
# Populations contributing at least 50 rows are kept as the "good" studies.
good_studies = pop_counts.loc[pop_counts["num_entries"].ge(50)]

In [53]:
# Keep only the rows of df whose population is one of the good studies
# (populations with at least 50 entries), then preview the first rows.
good_studies_data = df.loc[df['population'].isin(good_studies["population"])]
print(good_studies_data.head())

  gene   allele                           population  alleles_over_2n    n
1    A  A*01:01             Armenia combined Regions            0.125  100
4    A  A*01:01  Australia New South Wales Caucasian            0.187  134
5    A  A*01:01         Australia Yuendumu Aborigine            0.008  191
6    A  A*01:01                              Austria            0.146  200
7    A  A*01:01               Azores Central Islands            0.080   59


In [54]:
# (rows, columns) remaining after the per-population entry-count filter.
good_studies_data.shape

(64437, 5)

In [55]:
# Keep rows with n strictly greater than 100; rows with n == 100 are dropped too.
# NOTE(review): n is presumably the study's sample size — confirm against utils.
good_studies_data = good_studies_data.loc[good_studies_data["n"].gt(100)]
good_studies_data.shape

(47092, 5)

In [60]:
# value_counts() has one entry per distinct population, so its .shape is a
# 1-tuple whose single element is the number of populations left.
print("The number of good studies left after filtering: ", good_studies_data["population"].value_counts().shape)

The number of good studies left after filtering:  (235,)


### Summary 
- Filter out studies with fewer than 50 4-digit allele entries
- Filter out studies with a sample size (n) of 100 or fewer

In [1]:
from utils import *

# Reload the source table, then run the cleaning and filtering helpers in one
# chain; each helper prints its own shape log.
url = "https://github.com/slowkow/allelefrequencies/raw/main/afnd.tsv"
df = (
    pd.read_csv(url, sep="\t")
    .pipe(clean_data)
    .pipe(filter_data)
)

Starting shape: (123502, 7)
After filtering for HLA: (111399, 7)
After filtering for Class I (A, B, C): (69275, 7)
After removing 495 G-group rows: (68780, 8)
Final shape after dropping columns: (68780, 5)
Starting shape: (68780, 5)
Starting number of populations: 548
After filtering for populations with >= 50 allele entries: (64437, 5)
  Populations remaining: 364
After filtering for n > 100: (47092, 5)
  Populations remaining: 235
