In [1]:
from utils import *

# Load data from local file
df = pd.read_csv("afnd.tsv", sep="\t")
df = clean_data(df)
# df = filter_data(df)

Starting shape: (152716, 7)
After filtering for HLA: (140613, 7)
After filtering for Class I (A, B, C): (98489, 7)
After removing 1529 G-group rows: (96960, 8)
Final shape after dropping columns: (96960, 5)


In [2]:
# Add resolution column to the dataframe
df['resolution'] = df['allele'].apply(get_allele_resolution)

# Check the distribution of resolutions
print("Resolution distribution:")
print(df['resolution'].value_counts())

Resolution distribution:
resolution
4-digit    68780
2-digit    22010
6-digit     5688
8-digit      482
Name: count, dtype: int64


In [3]:
print("Total number of studies:", df['population'].nunique())
# df = df[df['n'] > 100]
print("Total number of studies:", df['population'].nunique())

Total number of studies: 832
Total number of studies: 832


In [4]:
# Filter for 2-digit allele entries
df_2digit = df[df['resolution'] == '2-digit']

# Find studies (populations) that have 2-digit alleles
studies_with_2digit = df_2digit.groupby('population').agg({
    'n': 'first',  # Sample size (constant within each population)
    'alleles_over_2n': 'max',  # Highest frequency among 2-digit alleles
    'allele': 'count'  # Number of 2-digit allele entries
}).reset_index()

studies_with_2digit.columns = ['population', 'sample_size', 'highest_frequency', 'num_2digit_alleles']

# Sort by sample size descending
studies_with_2digit = studies_with_2digit.sort_values('sample_size', ascending=False)

print(f"Number of studies with 2-digit allele representations: {len(studies_with_2digit)}")
print(f"\n{'='*80}")
print(f"Summary statistics:")
print(f"{'='*80}")
print(f"Sample size - min: {studies_with_2digit['sample_size'].min():,}, max: {studies_with_2digit['sample_size'].max():,}, median: {studies_with_2digit['sample_size'].median():,.0f}")
print(f"Highest frequency - min: {studies_with_2digit['highest_frequency'].min():.4f}, max: {studies_with_2digit['highest_frequency'].max():.4f}, median: {studies_with_2digit['highest_frequency'].median():.4f}")
print(f"\n{'='*80}")
print(f"Studies with 2-digit alleles (sorted by sample size):")
print(f"{'='*80}")
studies_with_2digit

Number of studies with 2-digit allele representations: 514

Summary statistics:
Sample size - min: 10, max: 800,809, median: 149
Highest frequency - min: 0.0000, max: 0.6920, median: 0.2770

Studies with 2-digit alleles (sorted by sample size):


Unnamed: 0,population,sample_size,highest_frequency,num_2digit_alleles
70,Brazil REDOME Sao Paulo,800809,0.2563,57
61,Brazil REDOME Parana,341639,0.2666,56
65,Brazil REDOME Rio Grande do Sul,241329,0.2745,57
58,Brazil REDOME Minas Gerais,211275,0.2589,57
192,Italy,159311,0.2540,48
...,...,...,...,...
167,India Kerala Hindu Pulaya,16,0.5310,45
171,India Kerala Kuruma,15,0.4000,32
170,India Kerala Kurichiya,10,0.3500,32
173,India Kerala Malapandaram,10,0.4500,32


In [5]:
def collapse_8digit_to_6digit(df):
    """
    For each study (population), collapse 8-digit alleles into their 6-digit parents.
    
    - If a 6-digit parent exists: update it to max(parent_freq, sum of 8-digit children freq)
    - If no 6-digit parent exists: create one with freq = sum of 8-digit children freq
    - Remove all 8-digit entries after processing
    """
    df_result = df.copy()
    
    # Add parent allele column for 8-digit entries
    df_result['parent_6digit'] = df_result.apply(
        lambda row: extract_allele_parts(row['allele'])['6digit'] 
        if row['resolution'] == '8-digit' and extract_allele_parts(row['allele']) 
        else None, 
        axis=1
    )
    
    changes_log = []
    
    for pop in df_result['population'].unique():
        pop_mask = df_result['population'] == pop
        pop_df = df_result[pop_mask]
        
        # Get 8-digit entries for this population
        eight_digit_entries = pop_df[pop_df['resolution'] == '8-digit']
        
        if len(eight_digit_entries) == 0:
            continue
        
        # Group 8-digit entries by their 6-digit parent
        for parent_6d in eight_digit_entries['parent_6digit'].unique():
            if parent_6d is None:
                continue
            
            # Sum of children frequencies
            children_mask = (pop_mask) & (df_result['parent_6digit'] == parent_6d)
            children_freq_sum = df_result.loc[children_mask, 'alleles_over_2n'].sum()
            
            # Check if 6-digit parent exists
            parent_mask = (pop_mask) & (df_result['allele'] == parent_6d)
            parent_exists = parent_mask.sum() > 0
            
            if parent_exists:
                # Get current parent frequency
                parent_freq = df_result.loc[parent_mask, 'alleles_over_2n'].values[0]
                new_freq = max(parent_freq, children_freq_sum)
                
                # Update parent frequency
                df_result.loc[parent_mask, 'alleles_over_2n'] = new_freq
                
                changes_log.append({
                    'population': pop,
                    'parent_6digit': parent_6d,
                    'action': 'updated',
                    'old_freq': parent_freq,
                    'children_sum': children_freq_sum,
                    'new_freq': new_freq
                })
            else:
                # Create new 6-digit parent entry
                # Use the first child's metadata as template
                template_idx = df_result[children_mask].index[0]
                template_row = df_result.loc[template_idx].copy()
                
                template_row['allele'] = parent_6d
                template_row['resolution'] = '6-digit'
                template_row['alleles_over_2n'] = children_freq_sum
                template_row['parent_6digit'] = None
                
                df_result = pd.concat([df_result, pd.DataFrame([template_row])], ignore_index=True)
                
                changes_log.append({
                    'population': pop,
                    'parent_6digit': parent_6d,
                    'action': 'created',
                    'old_freq': None,
                    'children_sum': children_freq_sum,
                    'new_freq': children_freq_sum
                })
    
    # Remove all 8-digit entries
    df_result = df_result[df_result['resolution'] != '8-digit']
    
    # Drop the helper column
    df_result = df_result.drop(columns=['parent_6digit'])
    
    return df_result, pd.DataFrame(changes_log)


# Apply the collapse function
df_collapsed, changes_df = collapse_8digit_to_6digit(df)

print(f"Original dataframe shape: {df.shape}")
print(f"Collapsed dataframe shape: {df_collapsed.shape}")
print(f"\nResolution distribution after collapsing:")
print(df_collapsed['resolution'].value_counts())
print(f"\n{'='*80}")
print(f"Changes made:")
print(f"{'='*80}")
print(f"Total changes: {len(changes_df)}")
if len(changes_df) > 0:
    print(f"  - Updates to existing 6-digit parents: {(changes_df['action'] == 'updated').sum()}")
    print(f"  - New 6-digit parents created: {(changes_df['action'] == 'created').sum()}")
    print(f"\nSample of changes:")
    display(changes_df.head(20))

Original dataframe shape: (96960, 6)
Collapsed dataframe shape: (96798, 6)

Resolution distribution after collapsing:
resolution
4-digit    68780
2-digit    22010
6-digit     6008
Name: count, dtype: int64

Changes made:
Total changes: 413
  - Updates to existing 6-digit parents: 93
  - New 6-digit parents created: 320

Sample of changes:


Unnamed: 0,population,parent_6digit,action,old_freq,children_sum,new_freq
0,Azores Terceira Island,A*29:01:01,created,,0.004,0.004
1,China North Han,B*15:01:01,created,,0.0,0.0
2,China North Han,B*15:17:01,created,,0.0,0.0
3,China North Han,B*40:06:01,created,,0.0,0.0
4,China North Han,B*44:02:01,created,,0.01,0.01
5,China North Han,B*47:01:01,created,,0.0,0.0
6,China North Han,A*03:01:01,updated,0.029,0.0,0.029
7,China North Han,A*24:02:01,created,,0.152,0.152
8,China North Han,A*29:01:01,created,,0.0,0.0
9,China North Han,C*04:01:01,created,,0.076,0.076


In [6]:
def collapse_6digit_to_4digit(df):
    """
    For each study (population), collapse 6-digit alleles into their 4-digit parents.
    
    - If a 4-digit parent exists: update it to max(parent_freq, sum of 6-digit children freq)
    - If no 4-digit parent exists: create one with freq = sum of 6-digit children freq
    - Remove all 6-digit entries after processing
    """
    df_result = df.copy()
    
    # Add parent allele column for 6-digit entries
    df_result['parent_4digit'] = df_result.apply(
        lambda row: extract_allele_parts(row['allele'])['4digit'] 
        if row['resolution'] == '6-digit' and extract_allele_parts(row['allele']) 
        else None, 
        axis=1
    )
    
    changes_log = []
    
    for pop in df_result['population'].unique():
        pop_mask = df_result['population'] == pop
        pop_df = df_result[pop_mask]
        
        # Get 6-digit entries for this population
        six_digit_entries = pop_df[pop_df['resolution'] == '6-digit']
        
        if len(six_digit_entries) == 0:
            continue
        
        # Group 6-digit entries by their 4-digit parent
        for parent_4d in six_digit_entries['parent_4digit'].unique():
            if parent_4d is None:
                continue
            
            # Sum of children frequencies
            children_mask = (pop_mask) & (df_result['parent_4digit'] == parent_4d)
            children_freq_sum = df_result.loc[children_mask, 'alleles_over_2n'].sum()
            
            # Check if 4-digit parent exists
            parent_mask = (pop_mask) & (df_result['allele'] == parent_4d)
            parent_exists = parent_mask.sum() > 0
            
            if parent_exists:
                # Get current parent frequency
                parent_freq = df_result.loc[parent_mask, 'alleles_over_2n'].values[0]
                new_freq = max(parent_freq, children_freq_sum)
                
                # Update parent frequency
                df_result.loc[parent_mask, 'alleles_over_2n'] = new_freq
                
                changes_log.append({
                    'population': pop,
                    'parent_4digit': parent_4d,
                    'action': 'updated',
                    'old_freq': parent_freq,
                    'children_sum': children_freq_sum,
                    'new_freq': new_freq
                })
            else:
                # Create new 4-digit parent entry
                # Use the first child's metadata as template
                template_idx = df_result[children_mask].index[0]
                template_row = df_result.loc[template_idx].copy()
                
                template_row['allele'] = parent_4d
                template_row['resolution'] = '4-digit'
                template_row['alleles_over_2n'] = children_freq_sum
                template_row['parent_4digit'] = None
                
                df_result = pd.concat([df_result, pd.DataFrame([template_row])], ignore_index=True)
                
                changes_log.append({
                    'population': pop,
                    'parent_4digit': parent_4d,
                    'action': 'created',
                    'old_freq': None,
                    'children_sum': children_freq_sum,
                    'new_freq': children_freq_sum
                })
    
    # Remove all 6-digit entries
    df_result = df_result[df_result['resolution'] != '6-digit']
    
    # Drop the helper column
    df_result = df_result.drop(columns=['parent_4digit'])
    
    return df_result, pd.DataFrame(changes_log)


# Apply the collapse function (use df_collapsed from previous step to chain the collapses)
df_collapsed_4d, changes_df_4d = collapse_6digit_to_4digit(df_collapsed)

print(f"Input dataframe shape: {df_collapsed.shape}")
print(f"Collapsed dataframe shape: {df_collapsed_4d.shape}")
print(f"\nResolution distribution after collapsing 6-digit to 4-digit:")
print(df_collapsed_4d['resolution'].value_counts())
print(f"\n{'='*80}")
print(f"Changes made:")
print(f"{'='*80}")
print(f"Total changes: {len(changes_df_4d)}")
if len(changes_df_4d) > 0:
    print(f"  - Updates to existing 4-digit parents: {(changes_df_4d['action'] == 'updated').sum()}")
    print(f"  - New 4-digit parents created: {(changes_df_4d['action'] == 'created').sum()}")
    print(f"\nSample of changes:")
    display(changes_df_4d.head(20))

Input dataframe shape: (96798, 6)
Collapsed dataframe shape: (94851, 6)

Resolution distribution after collapsing 6-digit to 4-digit:
resolution
4-digit    72841
2-digit    22010
Name: count, dtype: int64

Changes made:
Total changes: 4543
  - Updates to existing 4-digit parents: 482
  - New 4-digit parents created: 4061

Sample of changes:


Unnamed: 0,population,parent_4digit,action,old_freq,children_sum,new_freq
0,Azores Terceira Island,A*31:01,created,,0.016,0.016
1,Azores Terceira Island,A*29:01,updated,0.0,0.0,0.0
2,Belgium,A*31:01,created,,0.042,0.042
3,China North Han,B*07:02,updated,0.0,0.019,0.019
4,China North Han,B*14:02,updated,0.005,0.0,0.005
5,China North Han,B*14:06,updated,0.0,0.0,0.0
6,China North Han,B*15:01,updated,0.038,0.0,0.038
7,China North Han,B*15:11,updated,0.0,0.014,0.014
8,China North Han,B*15:17,updated,0.0,0.0,0.0
9,China North Han,B*18:01,updated,0.0,0.0,0.0


In [7]:
# Find 2-digit parents that have frequency > sum of their 4-digit children

def find_2digit_larger_than_children(df, threshold=0.001):
    """
    Find 2-digit alleles where the parent frequency is greater than 
    the sum of its 4-digit children frequencies.
    """
    df_analysis = df.copy()
    
    # Get 2-digit parent for 4-digit entries
    df_analysis['parent_2digit'] = df_analysis.apply(
        lambda row: extract_allele_parts(row['allele'])['2digit'] 
        if row['resolution'] == '4-digit' and extract_allele_parts(row['allele']) 
        else None, 
        axis=1
    )
    
    results = []
    
    for pop in df_analysis['population'].unique():
        pop_mask = df_analysis['population'] == pop
        pop_df = df_analysis[pop_mask]
        
        # Get all 2-digit entries for this population
        two_digit_entries = pop_df[pop_df['resolution'] == '2-digit']
        
        for _, parent_row in two_digit_entries.iterrows():
            parent_allele = parent_row['allele']
            parent_freq = parent_row['alleles_over_2n']
            
            # Find 4-digit children of this 2-digit parent
            children_mask = (pop_mask) & (df_analysis['parent_2digit'] == parent_allele)
            children_freq_sum = df_analysis.loc[children_mask, 'alleles_over_2n'].sum()
            num_children = children_mask.sum()
            
            # Check if parent > sum of children
            if parent_freq > children_freq_sum + threshold and num_children > 0:
                results.append({
                    'population': pop,
                    'n': parent_row['n'],
                    'parent_2digit': parent_allele,
                    'parent_freq': parent_freq,
                    'children_sum': children_freq_sum,
                    'difference': parent_freq - children_freq_sum,
                    'num_4digit_children': num_children
                })
    
    return pd.DataFrame(results)


# Find cases where 2-digit > sum of 4-digit children
larger_parents_df = find_2digit_larger_than_children(df_collapsed_4d)

print(f"Number of 2-digit alleles with freq > sum of 4-digit children: {len(larger_parents_df)}")
print(f"Number of studies with such cases: {larger_parents_df['population'].nunique()}")

if len(larger_parents_df) > 0:
    # Summary by study
    study_summary = larger_parents_df.groupby('population').agg({
        'n': 'first',
        'parent_2digit': 'count',
        'difference': ['sum', 'mean', 'max']
    }).reset_index()
    study_summary.columns = ['population', 'sample_size', 'num_cases', 'total_diff', 'avg_diff', 'max_diff']
    study_summary = study_summary.sort_values('num_cases', ascending=False)
    
    print(f"\n{'='*80}")
    print(f"Studies with 2-digit parents larger than sum of 4-digit children:")
    print(f"{'='*80}")
    display(study_summary)
    
    print(f"\n{'='*80}")
    print(f"Detailed cases (sorted by difference):")
    print(f"{'='*80}")
    display(larger_parents_df.sort_values('difference', ascending=False).head(30))

Number of 2-digit alleles with freq > sum of 4-digit children: 89
Number of studies with such cases: 31

Studies with 2-digit parents larger than sum of 4-digit children:


Unnamed: 0,population,sample_size,num_cases,total_diff,avg_diff,max_diff
12,Italy Bergamo,101,34,2.02,0.059412,0.188
13,Italy North Pavia,81,9,0.756,0.084,0.247
8,France Southeast,130,5,0.024,0.0048,0.008
21,Portugal North,46,4,0.044,0.011,0.021
0,Argentina Chiriguano,54,3,0.642,0.214,0.236
3,Azores Terceira Island,130,3,0.011,0.003667,0.004
4,Burkina Faso Mossi,53,2,0.339,0.1695,0.207
11,Iran Tehran,120,2,0.179,0.0895,0.166
2,Austria,200,2,0.004,0.002,0.002
24,Senegal Niokholo Mandenka,165,2,0.046,0.023,0.033



Detailed cases (sorted by difference):


Unnamed: 0,population,n,parent_2digit,parent_freq,children_sum,difference,num_4digit_children
70,Scotland Orkney,99,C*07,0.378,0.0,0.378,1
69,Scotland Orkney,99,A*02,0.31,0.0,0.31,14
53,Italy North Pavia,81,A*02,0.253,0.006,0.247,1
1,Argentina Chiriguano,54,A*02,0.264,0.028,0.236,1
16,Guatemala Mayan,132,A*24,0.231,0.004,0.227,1
56,Italy North Pavia,81,C*07,0.241,0.025,0.216,1
4,Argentina Salta Wichi pop 2,19,A*02,0.289,0.079,0.21,1
0,Argentina Chiriguano,54,B*35,0.217,0.009,0.208,1
9,Burkina Faso Mossi,53,A*02,0.207,0.0,0.207,3
2,Argentina Chiriguano,54,A*68,0.245,0.047,0.198,1


In [8]:
# Step 1: Identify studies with total_diff > 0.005 and remove them
studies_to_remove = study_summary[study_summary['total_diff'] > 0.005]['population'].tolist()

print(f"Number of studies to remove (total_diff > 0.005): {len(studies_to_remove)}")
print(f"Studies being removed: {studies_to_remove[:10]}..." if len(studies_to_remove) > 10 else f"Studies being removed: {studies_to_remove}")

# Remove these studies from df_collapsed_4d
df_filtered = df_collapsed_4d[~df_collapsed_4d['population'].isin(studies_to_remove)]
print(f"\nAfter removing high-diff studies:")
print(f"  Original shape: {df_collapsed_4d.shape}")
print(f"  Filtered shape: {df_filtered.shape}")
print(f"  Studies remaining: {df_filtered['population'].nunique()}")

# Step 2: Remove all 2-digit entries
df_4digit_only = df_filtered[df_filtered['resolution'] == '4-digit']
print(f"\nAfter removing 2-digit entries:")
print(f"  Shape: {df_4digit_only.shape}")
print(f"  Studies remaining: {df_4digit_only['population'].nunique()}")




Number of studies to remove (total_diff > 0.005): 26
Studies being removed: ['Italy Bergamo', 'Italy North Pavia', 'France Southeast', 'Portugal North', 'Argentina Chiriguano', 'Azores Terceira Island', 'Burkina Faso Mossi', 'Iran Tehran', 'Senegal Niokholo Mandenka', 'Argentina Salta Wichi pop 2']...

After removing high-diff studies:
  Original shape: (94851, 6)
  Filtered shape: (90488, 6)
  Studies remaining: 806

After removing 2-digit entries:
  Shape: (69411, 6)
  Studies remaining: 525


In [9]:
print("Total number of studies:", df_4digit_only['population'].nunique())
# df_4digit_only = df_4digit_only[df_4digit_only['n'] > 100]
print("Total number of studies:", df_4digit_only['population'].nunique())

Total number of studies: 525
Total number of studies: 525


In [10]:
df_4digit_only[df_4digit_only['population'] == "England Newcastle"].head()

Unnamed: 0,gene,allele,population,alleles_over_2n,n,resolution
49531,B,B*67:01,England Newcastle,0.0,2739,4-digit


In [11]:
# Step 3: Check if sum of allele frequencies in each study exceeds 1 or is below 1
# Group by population and gene (since frequencies should sum to ~1 per gene per study)
freq_sums = df_4digit_only.groupby(['population', 'gene']).agg({
    'alleles_over_2n': 'sum',
    'n': 'first'
}).reset_index()
freq_sums.columns = ['population', 'gene', 'total_freq', 'n']

# Find studies where any gene has total_freq > 1 + threshold OR < 1 - threshold
threshold = 0.1
invalid_entries = freq_sums[(freq_sums['total_freq'] > 1.0 + threshold) | 
                            (freq_sums['total_freq'] < 1.0 - threshold)]

print(f"\n{'='*80}")
print(f"Validation: Checking for studies with allele frequency sum outside [1 - {threshold}, 1 + {threshold}]")
print(f"{'='*80}")
print(f"Number of (study, gene) combinations with freq sum > {1.0 + threshold}: {(freq_sums['total_freq'] > 1.0 + threshold).sum()}")
print(f"Number of (study, gene) combinations with freq sum < {1.0 - threshold}: {(freq_sums['total_freq'] < 1.0 - threshold).sum()}")
print(f"Total invalid combinations: {len(invalid_entries)}")
print(f"Number of unique studies breaking this rule: {invalid_entries['population'].nunique()}")

if len(invalid_entries) > 0:
    print(f"\n{'='*80}")
    print(f"Studies breaking the logical rule (freq sum outside [0.95, 1.05]):")
    print(f"{'='*80}")
    # Calculate deviation from 1.0
    invalid_entries['deviation'] = invalid_entries['total_freq'] - 1.0
    invalid_entries_sorted = invalid_entries.sort_values('deviation', key=abs, ascending=False)
    display(invalid_entries_sorted)
else:
    print("\n✓ All studies have valid allele frequency sums (within [0.95, 1.05])")



Validation: Checking for studies with allele frequency sum outside [1 - 0.1, 1 + 0.1]
Number of (study, gene) combinations with freq sum > 1.1: 2
Number of (study, gene) combinations with freq sum < 0.9: 230
Total invalid combinations: 232
Number of unique studies breaking this rule: 176

Studies breaking the logical rule (freq sum outside [0.95, 1.05]):


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  invalid_entries['deviation'] = invalid_entries['total_freq'] - 1.0


Unnamed: 0,population,gene,total_freq,n,deviation
232,England Leeds,A,0.0000,5024,-1.0000
64,Brazil Vale do Ribeira Quilombos,A,0.0000,144,-1.0000
65,Brazil Vale do Ribeira Quilombos,B,0.0000,144,-1.0000
66,Brazil Vale do Ribeira Quilombos,C,0.0000,144,-1.0000
233,England Leeds,B,0.0000,5024,-1.0000
...,...,...,...,...,...
310,Germany Essen,C,0.8440,174,-0.1560
494,Latvia,A,0.8697,266,-0.1303
527,Malaysia Perak Grik Jehai,A,0.8800,25,-0.1200
761,"Russia Bashkortostan, Bashkirs",C,0.8918,120,-0.1082


In [12]:
# Step 4: Remove invalid (population, gene) combinations where freq sum is outside [1-threshold, 1+threshold]
# Get the list of invalid combinations
invalid_combinations = set(zip(invalid_entries['population'], invalid_entries['gene']))

print(f"Number of invalid (population, gene) combinations to remove: {len(invalid_combinations)}")

# Create a mask to filter out invalid combinations
df_4digit_only['pop_gene'] = list(zip(df_4digit_only['population'], df_4digit_only['gene']))
df_4digit_cleaned = df_4digit_only[~df_4digit_only['pop_gene'].isin(invalid_combinations)]
df_4digit_cleaned = df_4digit_cleaned.drop(columns=['pop_gene'])

print(f"\nAfter removing invalid combinations:")
print(f"  Original shape: {df_4digit_only.shape}")
print(f"  Cleaned shape: {df_4digit_cleaned.shape}")
print(f"  Rows removed: {df_4digit_only.shape[0] - df_4digit_cleaned.shape[0]}")
print(f"  Studies remaining: {df_4digit_cleaned['population'].nunique()}")

# Verify the cleaning worked
freq_sums_cleaned = df_4digit_cleaned.groupby(['population', 'gene']).agg({
    'alleles_over_2n': 'sum'
}).reset_index()
freq_sums_cleaned.columns = ['population', 'gene', 'total_freq']

still_invalid = freq_sums_cleaned[(freq_sums_cleaned['total_freq'] > 1.0 + threshold) | 
                                   (freq_sums_cleaned['total_freq'] < 1.0 - threshold)]
print(f"\n✓ Verification: {len(still_invalid)} invalid combinations remaining (should be 0)")

Number of invalid (population, gene) combinations to remove: 232

After removing invalid combinations:
  Original shape: (69411, 7)
  Cleaned shape: (62925, 6)
  Rows removed: 6486
  Studies remaining: 381

✓ Verification: 0 invalid combinations remaining (should be 0)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_4digit_only['pop_gene'] = list(zip(df_4digit_only['population'], df_4digit_only['gene']))


In [13]:
# Show final summary
print(f"\n{'='*80}")
print(f"Final dataset summary:")
print(f"{'='*80}")
print(f"Shape: {df_4digit_cleaned.shape}")
print(f"Number of studies: {df_4digit_cleaned['population'].nunique()}")
print(f"Resolution distribution: {df_4digit_cleaned['resolution'].value_counts().to_dict()}")


Final dataset summary:
Shape: (62925, 6)
Number of studies: 381
Resolution distribution: {'4-digit': 62925}


In [14]:

df_4digit_cleaned = df_4digit_cleaned[df_4digit_cleaned['n'] >= 100]

# Show final summary
print(f"\n{'='*80}")
print(f"Final dataset summary:")
print(f"{'='*80}")
print(f"Shape: {df_4digit_cleaned.shape}")
print(f"Number of studies: {df_4digit_cleaned['population'].nunique()}")
print(f"Resolution distribution: {df_4digit_cleaned['resolution'].value_counts().to_dict()}")


Final dataset summary:
Shape: (47421, 6)
Number of studies: 250
Resolution distribution: {'4-digit': 47421}


In [22]:
df_4digit_cleaned.head()
df_4digit_cleaned[df_4digit_cleaned["population"] == "Armenia combined Regions"].head(10)

Unnamed: 0,gene,allele,population,alleles_over_2n,n,resolution
12485,B,B*07:02,Armenia combined Regions,0.03,100,4-digit
14325,B,B*08:01,Armenia combined Regions,0.01,100,4-digit
15489,B,B*13:02,Armenia combined Regions,0.035,100,4-digit
16069,B,B*14:01,Armenia combined Regions,0.005,100,4-digit
16385,B,B*14:02,Armenia combined Regions,0.06,100,4-digit
17227,B,B*15:01,Armenia combined Regions,0.015,100,4-digit
18700,B,B*15:08,Armenia combined Regions,0.005,100,4-digit
19129,B,B*15:10,Armenia combined Regions,0.005,100,4-digit
20342,B,B*15:17,Armenia combined Regions,0.02,100,4-digit
24627,B,B*18:01,Armenia combined Regions,0.045,100,4-digit


In [23]:
df_4digit_cleaned.to_csv("cleaned_data.csv")