### Exploratory Data Analysis:
-  Find most common class 1 HLA allele by frequency in the world
- Using tab-delimited data exported from the allele frequency net dataset website 

In [1]:
import pandas as pd 
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

In [28]:
# Download the file
url = "https://github.com/slowkow/allelefrequencies/raw/main/afnd.tsv"
df = pd.read_csv(url, sep='\t')

In [29]:
df.head()

Unnamed: 0,group,gene,allele,population,indivs_over_n,alleles_over_2n,n
0,hla,A,A*01:01,Argentina Rosario Toba,15.1,0.076,86
1,hla,A,A*01:01,Armenia combined Regions,,0.125,100
2,hla,A,A*01:01,Australia Cape York Peninsula Aborigine,,0.053,103
3,hla,A,A*01:01,Australia Groote Eylandt Aborigine,,0.027,75
4,hla,A,A*01:01,Australia New South Wales Caucasian,,0.187,134


In [30]:
df.dtypes

group              object
gene               object
allele             object
population         object
indivs_over_n      object
alleles_over_2n    object
n                  object
dtype: object

In [31]:
print("total number of rows and columns: ", df.shape)

total number of rows and columns:  (123502, 7)


In [32]:
hla_mask = df["group"]=="hla"
df = df[hla_mask]

In [33]:
print("total number of HLA: ", df.shape)

total number of HLA:  (111399, 7)


In [34]:
class1_mask = df["gene"].isin(["A", "B", "C"])
df = df[class1_mask]


In [35]:
print("total number of HLA class 1 allele: ", df.shape)

total number of HLA class 1 allele:  (69275, 7)


In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 69275 entries, 0 to 69274
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   group            69275 non-null  object
 1   gene             69275 non-null  object
 2   allele           69275 non-null  object
 3   population       69275 non-null  object
 4   indivs_over_n    20356 non-null  object
 5   alleles_over_2n  69275 non-null  object
 6   n                69275 non-null  object
dtypes: object(7)
memory usage: 4.2+ MB


In [37]:
df.describe()

Unnamed: 0,group,gene,allele,population,indivs_over_n,alleles_over_2n,n
count,69275,69275,69275,69275,20356.0,69275,69275
unique,1,3,4582,578,555.0,2049,327
top,hla,B,B*40:01,Germany DKMS - German donors,0.0,0,98
freq,69275,38030,383,2802,13689.0,31438,2902


In [38]:
df = df.drop(columns="group")

In [39]:
df.describe()

Unnamed: 0,gene,allele,population,indivs_over_n,alleles_over_2n,n
count,69275,69275,69275,20356.0,69275,69275
unique,3,4582,578,555.0,2049,327
top,B,B*40:01,Germany DKMS - German donors,0.0,0,98
freq,38030,383,2802,13689.0,31438,2902


### Summary
- nulls in "indivs_over_n" attribute, no nulls elsewhere
- 70k rows of class 1 HLA alleles 
- there is about 500 rows with G-groups, how do we deal with that? 
  - include G groups directly
  - remove them
  - Separate specific allele with G groups and analyse them separately
- all of HLA is 4-digit resolution due to the way the information was scrapped in the python script in the repo

In [52]:
url = "https://github.com/slowkow/allelefrequencies/raw/main/afnd.tsv"
# Define column data types
dtype_dict = {
    'group': 'category',        # Limited values (hla, kir, mic, cytokine)
    'gene': 'category',         # Limited values (A, B, C, DRB1, etc.)
    'allele': 'string',         # Text but many unique values
    'population': 'string',     # Text with many unique values
    'indivs_over_n': 'float64', # Percentage, can have NaN
    'alleles_over_2n': 'float64', # Frequency, can have NaN
    'n': 'Int64'                # Integer, using nullable Int64 for potential missing values
}
df = pd.read_csv(url, sep="\t" , dtype=dtype_dict)

ValueError: could not convert string to float: '2.0(*)'

In [4]:
# Step 2: Inspect the numeric columns for non-numeric values
print("\nChecking 'indivs_over_n' column:")
print(f"Unique non-numeric values: {df[pd.to_numeric(df['indivs_over_n'], errors='coerce').isna() & df['indivs_over_n'].notna()]['indivs_over_n'].unique()}")

print("\nChecking 'alleles_over_2n' column:")
print(f"Unique non-numeric values: {df[pd.to_numeric(df['alleles_over_2n'], errors='coerce').isna() & df['alleles_over_2n'].notna()]['alleles_over_2n'].unique()}")



Checking 'indivs_over_n' column:
Unique non-numeric values: ['2.0(*)' '1.6(*)' '11.8(*)' '0.4(*)' '2.8(*)' '0.6(*)' '4.8(*)' '7.1(*)'
 '40.0(*)' '50.0(*)' '6.7(*)' '21.1(*)' '46.2(*)' '20.0(*)' '10.0(*)'
 '21.4(*)' '26.7(*)' '13.3(*)' '23.5(*)' '33.3(*)' '83.3(*)' '1.0(*)'
 '1.4(*)' '5.6(*)' '5.3(*)' '6.3(*)' '0.8(*)' '5.0(*)' '3.6(*)' '4.0(*)'
 '2.4(*)' '1.8(*)' '12.2(*)' '15.8(*)' '16.6(*)' '14.3(*)' '28.6(*)'
 '30.0(*)' '10.5(*)' '60.0(*)' '9.5(*)' '2.6(*)' '5.9(*)' '65.0(*)'
 '23.8(*)' '35.7(*)' '45.0(*)' '17.6(*)' '15.0(*)' '70.6(*)' '71.4(*)'
 '37.5(*)' '36.8(*)' '55.6(*)' '25.0(*)' '35.3(*)' '42.9(*)' '64.3(*)'
 '35.0(*)' '9.1(*)' '5.4(*)' '4.6(*)' '100.0(*)' '3.0(*)' '3.2(*)'
 '6.8(*)' '3.8(*)' '36.4(*)' '8.0(*)' '9.8(*)' '10.9(*)' '47.1(*)'
 '16.7(*)' '58.3(*)' '78.9(*)' '17.4(*)' '11.1(*)' '7.8(*)' '20.9(*)'
 '40.6(*)' '30.7(*)' '31.7(*)' '13.9(*)' '22.3(*)' '5.8(*)' '9.2(*)'
 '5.2(*)' '2.2(*)' '20.6(*)' '3.4(*)' '14.5(*)' '14.6(*)' '10.4(*)'
 '12.5(*)' '52.6(*)' '55.0(*)' '

In [53]:
from utils import *
df["resolution"] = df['allele'].apply(get_allele_resolution)


In [54]:
df.head()

Unnamed: 0,gene,allele,population,indivs_over_n,alleles_over_2n,n,resolution
0,A,A*01:01,Argentina Rosario Toba,15.1,0.076,86,4-digit
1,A,A*01:01,Armenia combined Regions,,0.125,100,4-digit
2,A,A*01:01,Australia Cape York Peninsula Aborigine,,0.053,103,4-digit
3,A,A*01:01,Australia Groote Eylandt Aborigine,,0.027,75,4-digit
4,A,A*01:01,Australia New South Wales Caucasian,,0.187,134,4-digit


In [10]:
four_digit = df[df["resolution"] == "4-digit"]


In [13]:
four_digit.shape

(111563, 8)

In [12]:
# Step 2: Inspect the numeric columns for non-numeric values
print("\nChecking 'indivs_over_n' column:")
print(f"Unique non-numeric values: {four_digit[pd.to_numeric(four_digit['indivs_over_n'], errors='coerce').isna() & four_digit['indivs_over_n'].notna()]['indivs_over_n'].unique()}")

print("\nChecking 'alleles_over_2n' column:")
print(f"Unique non-numeric values: {four_digit[pd.to_numeric(four_digit['alleles_over_2n'], errors='coerce').isna() & four_digit['alleles_over_2n'].notna()]['alleles_over_2n'].unique()}")



Checking 'indivs_over_n' column:
Unique non-numeric values: ['2.0(*)' '1.6(*)' '11.8(*)' '0.4(*)' '2.8(*)' '0.6(*)' '4.8(*)' '7.1(*)'
 '40.0(*)' '50.0(*)' '6.7(*)' '21.1(*)' '46.2(*)' '20.0(*)' '10.0(*)'
 '21.4(*)' '26.7(*)' '13.3(*)' '23.5(*)' '33.3(*)' '83.3(*)' '1.0(*)'
 '1.4(*)' '5.6(*)' '5.3(*)' '6.3(*)' '0.8(*)' '5.0(*)' '3.6(*)' '4.0(*)'
 '2.4(*)' '1.8(*)' '12.2(*)' '15.8(*)' '16.6(*)' '14.3(*)' '28.6(*)'
 '30.0(*)' '10.5(*)' '60.0(*)' '9.5(*)' '2.6(*)' '5.9(*)' '65.0(*)'
 '23.8(*)' '35.7(*)' '45.0(*)' '17.6(*)' '15.0(*)' '70.6(*)' '71.4(*)'
 '37.5(*)' '36.8(*)' '55.6(*)' '25.0(*)' '35.3(*)' '42.9(*)' '64.3(*)'
 '35.0(*)' '9.1(*)' '5.4(*)' '4.6(*)' '100.0(*)' '3.0(*)' '3.2(*)'
 '6.8(*)' '3.8(*)' '36.4(*)' '8.0(*)' '9.8(*)' '10.9(*)' '47.1(*)'
 '16.7(*)' '58.3(*)' '78.9(*)' '17.4(*)' '11.1(*)' '7.8(*)' '20.9(*)'
 '40.6(*)' '30.7(*)' '31.7(*)' '13.9(*)' '22.3(*)' '5.8(*)' '9.2(*)'
 '5.2(*)' '2.2(*)' '20.6(*)' '3.4(*)' '14.5(*)' '14.6(*)' '10.4(*)'
 '12.5(*)' '52.6(*)' '55.0(*)' '

In [52]:
df["resolution"].value_counts()

resolution
4-digit    69275
Name: count, dtype: int64

### Plan
- convert to proper types 
- how many are G groups? 

In [40]:
from utils import *
df["is_ggroup_allele"] = df["alleles_over_2n"].apply(is_g_group)

In [41]:
print(df["is_ggroup_allele"].value_counts())

is_ggroup_allele
False    68780
True       495
Name: count, dtype: int64


In [42]:
# See some examples
print("\nSample G group entries:")
print(df[df['is_ggroup_allele']].head(10))


Sample G group entries:
     gene    allele                         population indivs_over_n  \
309     A   A*01:02  Costa Rica African -Caribbean (G)        2.0(*)   
310     A   A*01:02         Costa Rica Amerindians (G)        1.6(*)   
379     A   A*01:02             Saudi Arabia pop 6 (G)           NaN   
506     A   A*01:03             Saudi Arabia pop 6 (G)           NaN   
601     A   A*01:06             Saudi Arabia pop 6 (G)           NaN   
673     A   A*01:09             Saudi Arabia pop 6 (G)           NaN   
835     A   A*01:23             Saudi Arabia pop 6 (G)           NaN   
964     A   A*01:91             Saudi Arabia pop 6 (G)           NaN   
971     A  A*01:106             Saudi Arabia pop 6 (G)           NaN   
1360    A   A*02:02  Costa Rica African -Caribbean (G)       11.8(*)   

     alleles_over_2n       n  is_ggroup_allele  
309        0.0100(*)     102              True  
310        0.0080(*)     125              True  
379        0.0006(*)  28,927       

In [45]:
df_clean = df[df["is_ggroup_allele"] == False]
df_clean.shape

(68780, 7)

In [47]:
df_clean = df_clean.drop(columns="indivs_over_n")

In [48]:
df_clean.shape

(68780, 6)

In [50]:
df_clean.head()

Unnamed: 0,gene,allele,population,alleles_over_2n,n,is_ggroup_allele
0,A,A*01:01,Argentina Rosario Toba,0.076,86,False
1,A,A*01:01,Armenia combined Regions,0.125,100,False
2,A,A*01:01,Australia Cape York Peninsula Aborigine,0.053,103,False
3,A,A*01:01,Australia Groote Eylandt Aborigine,0.027,75,False
4,A,A*01:01,Australia New South Wales Caucasian,0.187,134,False


In [4]:
from utils import *
url = "https://github.com/slowkow/allelefrequencies/raw/main/afnd.tsv"
df = pd.read_csv(url, sep="\t")
df_clean = clean_data(df)


Starting shape: (123502, 7)
After filtering for HLA: (111399, 7)
After filtering for Class I (A, B, C): (69275, 7)
After removing 495 G-group rows: (68780, 8)
Final shape after dropping columns: (68780, 5)


In [5]:
# Define column data types
dtype_dict = {
    'gene': 'category',         # Limited values (A, B, C, DRB1, etc.)
    'allele': 'string',         # Text but many unique values
    'population': 'string',     # Text with many unique values
    'alleles_over_2n': 'float64', # Frequency, can have NaN
    'n': 'Int64'                # Integer, using nullable Int64 for potential missing values
}
df_clean = df_clean.astype(dtype_dict)

In [7]:
df_clean.describe()

Unnamed: 0,alleles_over_2n,n
count,68780.0,68780.0
mean,0.013275,172890.247528
std,0.037246,694580.80005
min,0.0,10.0
25%,0.0,98.0
50%,5e-06,222.0
75%,0.0074,2840.0
max,0.863,3456066.0


In [9]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 68780 entries, 0 to 69273
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   gene             68780 non-null  category
 1   allele           68780 non-null  string  
 2   population       68780 non-null  string  
 3   alleles_over_2n  68780 non-null  float64 
 4   n                68780 non-null  Int64   
dtypes: Int64(1), category(1), float64(1), string(2)
memory usage: 2.8 MB
