In [None]:
from utils import *
import pandas as pd

## Load and Clean Data

In [None]:
# Read the tab-separated file afnd.tsv into a DataFrame and report its dimensions.
df_raw = pd.read_csv(filepath_or_buffer="afnd.tsv", sep="\t")
print(f"Raw data shape: {df_raw.shape}")
# Bare last expression: the notebook renders the first five rows as the cell output.
df_raw.head(5)

In [None]:
# Pass the raw frame through clean_data. Both filter flags are spelled out so the
# settings used for this run are visible in the notebook itself.
df_clean = clean_data(
    df_raw,
    class1_only=True,
    remove_g_groups=True,
    verbose=True,
)
# Show the first rows of the cleaned result.
df_clean.head()

## Test Resolution Detection

In [None]:
# Compute a resolution label for every allele via get_allele_resolution and store it
# on df_clean as the 'resolution' column.
allele_resolutions = df_clean['allele'].apply(get_allele_resolution)
df_clean['resolution'] = allele_resolutions

# Tally how many rows carry each resolution label.
print("Resolution distribution:")
resolution_counts = df_clean['resolution'].value_counts()
print(resolution_counts)

## Test Individual Collapse Functions

In [None]:
# Exercise collapse_8digit_to_6digit on the cleaned frame. The call yields two values:
# the collapsed frame and a table logging the changes.
df_collapsed_6d, changes_8to6 = collapse_8digit_to_6digit(
    df_clean,
    verbose=True,
)
print(f"\nChanges log shape: {changes_8to6.shape}")

# Render the first ten log rows, but only when the log is non-empty.
if len(changes_8to6):
    display(changes_8to6.head(10))

In [None]:
# Exercise collapse_6digit_to_4digit, feeding it the output of the 8->6 collapse.
# As with the previous step, a (frame, change log) pair comes back.
collapse_result_6to4 = collapse_6digit_to_4digit(df_collapsed_6d, verbose=True)
df_collapsed_4d, changes_6to4 = collapse_result_6to4
print(f"\nChanges log shape: {changes_6to4.shape}")

# Skip the preview entirely when nothing was logged.
if len(changes_6to4):
    display(changes_6to4.head(10))

In [None]:
# With both collapse steps applied, tally the resolution labels that remain.
print("Resolution distribution after collapsing:")
resolution_counts_collapsed = df_collapsed_4d['resolution'].value_counts()
print(resolution_counts_collapsed)

## Test 2-digit Inconsistency Detection

In [None]:
# Exercise find_2digit_larger_than_children on the collapsed frame, using a
# threshold of 0.001.
larger_parents = find_2digit_larger_than_children(
    df_collapsed_4d,
    threshold=0.001,
    verbose=True,
)
print(f"\nFound {len(larger_parents)} cases where 2-digit freq > sum of 4-digit children")

# Preview up to ten of the reported cases, if there are any.
if len(larger_parents):
    display(larger_parents.head(10))

In [None]:
# Exercise remove_inconsistent_2digit_studies on the collapsed frame with
# max_total_diff=0.005; the returned frame is kept as df_consistent.
df_consistent = remove_inconsistent_2digit_studies(
    df_collapsed_4d,
    max_total_diff=0.005,
    verbose=True,
)

## Test Frequency Validation

In [None]:
# Keep only the rows labelled '4-digit'; every other resolution (2-digit included)
# is dropped before validation.
is_4digit_row = df_consistent['resolution'].eq('4-digit')
df_4digit_only = df_consistent.loc[is_4digit_row]
print(f"Shape after removing 2-digit entries: {df_4digit_only.shape}")

# Exercise validate_frequency_sums, which hands back a (valid, invalid) pair.
valid_df, invalid_df = validate_frequency_sums(
    df_4digit_only,
    threshold=0.1,
    verbose=True,
)

# Preview the rejected combinations when there are any.
if len(invalid_df):
    print("\nInvalid combinations:")
    display(invalid_df.head(10))

In [None]:
# Exercise remove_invalid_freq_combinations on the 4-digit-only frame, using the same
# 0.1 threshold as the validate_frequency_sums call.
df_validated = remove_invalid_freq_combinations(
    df_4digit_only,
    threshold=0.1,
    verbose=True,
)

## Test Complete Pipeline (collapse_to_4digit)

In [1]:
# Reload utils to ensure we have the latest version
from importlib import reload

# Imported here explicitly: this cell is runnable as the first cell of a fresh kernel,
# in which case `pd` would otherwise only exist if `from utils import *` happened to
# leak it.
import pandas as pd

import utils
reload(utils)
from utils import *

# Load fresh data
df_raw = pd.read_csv("afnd.tsv", sep="\t")
# NOTE(review): the exploratory cell above calls clean_data with class1_only=True and
# remove_g_groups=True; here the function's defaults are used instead. Confirm the
# defaults match, otherwise the two runs clean the data differently.
df_clean = clean_data(df_raw, verbose=False)

# Run the complete pipeline
df_final = collapse_to_4digit(
    df_clean,
    remove_inconsistent_studies=True,
    max_2digit_diff=0.005,
    freq_sum_threshold=0.1,
    min_sample_size=100,
    verbose=True
)

Starting collapse_to_4digit pipeline
Input shape: (96960, 5)
Input studies: 832

Resolution distribution before collapse:
{'4-digit': 68780, '2-digit': 22010, '6-digit': 5688, '8-digit': 482}

--- Step 1: Collapse 8-digit to 6-digit ---


Collapsing 8-digit to 6-digit: 100%|██████████| 832/832 [00:12<00:00, 67.69it/s] 



Collapsed 8-digit to 6-digit: (96960, 6) -> (96798, 6)
  Updates: 93
  Created: 320

--- Step 2: Collapse 6-digit to 4-digit ---


Collapsing 6-digit to 4-digit: 100%|██████████| 832/832 [02:01<00:00,  6.85it/s] 
Collapsing 6-digit to 4-digit: 100%|██████████| 832/832 [02:01<00:00,  6.85it/s]


Collapsed 6-digit to 4-digit: (96798, 6) -> (94851, 6)
  Updates: 482
  Created: 4061

--- Step 3: Remove inconsistent 2-digit studies ---


Finding 2-digit inconsistencies: 100%|██████████| 832/832 [01:15<00:00, 10.96it/s] 

Removed 26 studies with total_diff > 0.005
  Shape: (94851, 6) -> (90488, 6)
  Studies: 832 -> 806

--- Step 4: Remove 2-digit entries ---
Removed 21077 2-digit entries
  Shape: 90488 -> 69411

--- Step 5: Validate frequency sums ---
Removed 232 invalid (population, gene) combinations
  Shape: (69411, 6) -> (62925, 6)
  Studies: 525 -> 381

--- Step 6: Normalize frequencies ---
Normalized frequencies for 958 (population, gene) combinations
  Frequency sum range after normalization: [1.000000, 1.000000]

--- Step 7: Filter by sample size >= 100 ---
Studies: 381 -> 250

Pipeline complete!
Final shape: (47421, 6)
Final studies: 250
Resolution distribution: {'4-digit': 47421}





In [2]:
# Sanity-check the pipeline output. Three invariants are asserted in turn; the first
# one that fails raises AssertionError and stops the cell.
banner = "=" * 80
print(banner)
print("Final Verification")
print(banner)

# 1) Every row must carry the '4-digit' resolution label.
print(f"\nResolution check: {df_final['resolution'].unique()}")
only_4digit = df_final['resolution'].eq('4-digit').all()
assert only_4digit, "Not all entries are 4-digit!"
print("✓ All entries are 4-digit")

# 2) Within each (population, gene) group the frequencies must add up to 1.0,
#    allowing for a small floating-point tolerance.
freq_sums = df_final.groupby(['population', 'gene'])['alleles_over_2n'].sum()
print(f"\nFrequency sum range: [{freq_sums.min():.6f}, {freq_sums.max():.6f}]")
tolerance = 1e-9
largest_deviation = (freq_sums - 1.0).abs().max()
assert largest_deviation < tolerance, f"Frequency sums not equal to 1.0! Max deviation: {abs(freq_sums - 1.0).max()}"
print(f"✓ All frequency sums equal to 1.0 (within tolerance {tolerance})")

# 3) No row may have a sample size below 100.
print(f"\nSample size range: [{df_final['n'].min()}, {df_final['n'].max()}]")
smallest_n = df_final['n'].min()
assert smallest_n >= 100, "Sample sizes below 100!"
print("✓ All sample sizes >= 100")

# Summary line: row count plus the number of distinct 'population' values.
print(f"\n✓ All validations passed!")
print(f"Final dataset: {df_final.shape[0]} rows, {df_final['population'].nunique()} studies")

Final Verification

Resolution check: ['4-digit']
✓ All entries are 4-digit

Frequency sum range: [1.000000, 1.000000]
✓ All frequency sums equal to 1.0 (within tolerance 1e-09)

Sample size range: [100, 3456066]
✓ All sample sizes >= 100

✓ All validations passed!
Final dataset: 47421 rows, 250 studies


In [3]:
# Show the first 20 rows of the pipeline output as the cell's rendered result.
df_final.head(n=20)

Unnamed: 0,gene,allele,population,alleles_over_2n,n,resolution
12485,B,B*07:02,Armenia combined Regions,0.030612,100,4-digit
12486,B,B*07:02,Australia Cape York Peninsula Aborigine,0.045,103,4-digit
12487,B,B*07:02,Australia New South Wales Caucasian,0.121827,134,4-digit
12488,B,B*07:02,Australia Yuendumu Aborigine,0.0,191,4-digit
12489,B,B*07:02,Austria,0.129,200,4-digit
12493,B,B*07:02,Brazil Puyanawa,0.06012,150,4-digit
12498,B,B*07:02,Cameroon Beti,0.083092,174,4-digit
12503,B,B*07:02,China Beijing Shijiazhuang Tianjian Han,0.033465,618,4-digit
12504,B,B*07:02,China Canton Han,0.008016,264,4-digit
12505,B,B*07:02,China Guangzhou,0.0,102,4-digit


In [4]:
# Persist the pipeline output. The index is written as well (to_csv default), so the
# file gains an unnamed leading column when it is read back with read_csv.
df_final.to_csv(path_or_buf="cleaned_data3.csv")

In [5]:
# Compare cleaned_data.csv with cleaned_data3.csv (the file written by the to_csv cell).
# The printed labels are derived from the path variables so they cannot drift away
# from the files that are actually read.
reference_path = "cleaned_data.csv"
candidate_path = "cleaned_data3.csv"
df1 = pd.read_csv(reference_path)
df2 = pd.read_csv(candidate_path)

same_shape = df1.shape == df2.shape
same_columns = list(df1.columns) == list(df2.columns)

print(f"{reference_path} shape: {df1.shape}")
print(f"{candidate_path} shape: {df2.shape}")
print(f"\nSame shape: {same_shape}")
print(f"Same columns: {same_columns}")

# Check if identical
if same_shape and same_columns:
    # DataFrame.equals is an exact comparison (values and dtypes); floats are not
    # compared with a tolerance.
    identical = df1.equals(df2)
    print(f"\nDataframes are identical: {identical}")

    # If not identical, show differences
    if not identical:
        # compare() keeps one row per index label that has at least one differing
        # cell, so len(diff) is a count of differing rows, not of individual cells.
        diff = df1.compare(df2)
        print(f"\nNumber of rows with differences: {len(diff)}")
        if len(diff) > 0:
            display(diff.head(20))
else:
    # Find rows unique to each using merge
    merged = df1.merge(df2, indicator=True, how='outer')
    print("\nRow comparison:")
    print(merged['_merge'].value_counts())

cleaned_data.csv shape: (47421, 7)
cleaned_data2.csv shape: (47421, 7)

Same shape: True
Same columns: True

Dataframes are identical: False

Number of cell differences: 30287


Unnamed: 0_level_0,alleles_over_2n,alleles_over_2n
Unnamed: 0_level_1,self,other
0,0.03,0.030612
2,0.12,0.121827
5,0.06,0.06012
6,0.086,0.083092
7,0.034,0.033465
8,0.008,0.008016
10,0.0162,0.016205
11,0.0215,0.021444
12,0.0193,0.019145
13,0.019,0.018849
