In [1]:
import pandas as pd
import numpy as np

In [53]:
# Load your cleaned dataset
df = pd.read_csv('../data/clean_access.csv')
df = df[df["scrape_status"] == "scraped"]
df.shape
df.columns


Index(['id', 'web_URL_id', 'domain_category', 'web_URL', 'scrape_status',
       'html_file_name', 'html_file_path', 'violation_count', 'violation_name',
       'violation_score', 'violation_description', 'violation_description_url',
       'affected_html_elements', 'violation_category', 'violation_impact',
       'wcag_reference', 'supplementary_information', 'high_severity'],
      dtype='object')

In [54]:
top_20 = (
    df["violation_name"]
    .value_counts()
    .head(20)
    .reset_index()
)

top_20.columns = ["violation_name", "count"]
top_20


Unnamed: 0,violation_name,count
0,color-contrast-enhanced,502
1,region,382
2,color-contrast,245
3,duplicate-id,230
4,link-name,200
5,landmark-unique,180
6,landmark-one-main,178
7,page-has-heading-one,177
8,heading-order,164
9,image-alt,115


In [55]:
impact_map = pd.read_csv("../data/User Impact Groups by Accessibility Need.csv")
impact_map

Unnamed: 0,Affected users,Violations,WCAG references
0,Visual disabilities,"color-contrast, color-contrast-enhanced, image...","1.4.3, 1.4.6, 1.1.1, 2.4.4, 2.4.6, 4.1.2"
1,Cognitive & learning disabilities,"region, page-has-heading-one, heading-order, m...","1.3.1, 2.4.6, 1.4.4"
2,Motor & mobility disabilities,"button-name, link-name, scrollable-region-focu...","2.1.1, 2.4.3, 4.1.2"
3,Language & comprehension support users,html-has-lang,3.1.1
4,All users (structural / technical),"duplicate-id, duplicate-id-aria, duplicate-id-...","1.3.1, 4.1.1, 4.1.2"


In [56]:
impact_map.columns = (
    impact_map.columns
    .str.strip()
    .str.lower()
    .str.replace(" ", "_")
)

impact_map.columns


Index(['affected_users', 'violations', 'wcag_references'], dtype='object')

In [57]:
impact_map_clean = impact_map[["affected_users", "violations"]].rename(
    columns={
        "affected_users": "disability_group",
        "violations": "violation_name"
    }
)

impact_map_clean




Unnamed: 0,disability_group,violation_name
0,Visual disabilities,"color-contrast, color-contrast-enhanced, image..."
1,Cognitive & learning disabilities,"region, page-has-heading-one, heading-order, m..."
2,Motor & mobility disabilities,"button-name, link-name, scrollable-region-focu..."
3,Language & comprehension support users,html-has-lang
4,All users (structural / technical),"duplicate-id, duplicate-id-aria, duplicate-id-..."


In [58]:
impact_map_clean["violation_name"] = (
    impact_map_clean["violation_name"]
    .str.split(",")
)

impact_map_clean = impact_map_clean.explode("violation_name")

impact_map_clean["violation_name"] = (
    impact_map_clean["violation_name"]
    .str.strip()
)

impact_map_clean.reset_index(drop=True, inplace=True)
impact_map_clean



Unnamed: 0,disability_group,violation_name
0,Visual disabilities,color-contrast
1,Visual disabilities,color-contrast-enhanced
2,Visual disabilities,image-alt
3,Visual disabilities,link-name
4,Visual disabilities,page-has-heading-one
5,Visual disabilities,empty-heading
6,Visual disabilities,button-name
7,Cognitive & learning disabilities,region
8,Cognitive & learning disabilities,page-has-heading-one
9,Cognitive & learning disabilities,heading-order


In [59]:
impact_analysis = top_20.merge(
    impact_map_clean,
    on="violation_name",
    how="inner"
)

impact_analysis



Unnamed: 0,violation_name,count,disability_group
0,color-contrast-enhanced,502,Visual disabilities
1,region,382,Cognitive & learning disabilities
2,color-contrast,245,Visual disabilities
3,duplicate-id,230,All users (structural / technical)
4,link-name,200,Visual disabilities
5,link-name,200,Motor & mobility disabilities
6,landmark-unique,180,All users (structural / technical)
7,landmark-one-main,178,All users (structural / technical)
8,page-has-heading-one,177,Visual disabilities
9,page-has-heading-one,177,Cognitive & learning disabilities


In [60]:
exclusion_summary = (
    impact_analysis
    .groupby("disability_group")["count"]
    .sum()
    .reset_index()
    .sort_values("count", ascending=False)
)

exclusion_summary


Unnamed: 0,disability_group,count
4,Visual disabilities,1376
0,All users (structural / technical),850
1,Cognitive & learning disabilities,776
3,Motor & mobility disabilities,320
2,Language & comprehension support users,48


In [64]:
disability_exclusion = exclusion_summary[
    exclusion_summary["disability_group"] != "All users (structural / technical)"
]

structural_failures = exclusion_summary[
    exclusion_summary["disability_group"] == "All users (structural / technical)"
]


In [65]:
total = exclusion_summary["count"].sum()
exclusion_summary["percentage"] = (
    exclusion_summary["count"] / total * 100
).round(2)

exclusion_summary


Unnamed: 0,disability_group,count,percentage
4,Visual disabilities,1376,40.83
0,All users (structural / technical),850,25.22
1,Cognitive & learning disabilities,776,23.03
3,Motor & mobility disabilities,320,9.5
2,Language & comprehension support users,48,1.42


In [80]:
# Disability Population Data
# Source: Cornell University Disability Statistics (2023)
# URL: https://www.disabilitystatistics.org/reports/acs.cfm
# Based on: U.S. Census Bureau American Community Survey

disability_population = {
    'Visual Disability': {
        'percentage': 2.5,
        'population': 8_301_100
    },
    'Hearing Disability': {
        'percentage': 3.7,
        'population': 12_105_000
    },
    'Ambulatory Disability': {
        'percentage': 6.6,
        'population': 20_795_100
    },
    'Cognitive Disability': {
        'percentage': 5.8,
        'population': 18_009_100
    },
    'Self-Care Disability': {
        'percentage': 2.6,
        'population': 8_026_300
    },
    'Independent Living Disability': {
        'percentage': 5.8,
        'population': 15_797_600
    },
    'Any Disability': {
        'percentage': 13.5,
        'population': 44_765_700
    }
}



In [83]:
# CORRECT mapping - match YOUR exact group names
your_group_to_cornell = {
    'Visual disabilities': 'Visual Disability',  # Match your exact name!
    'Cognitive & learning disabilities': 'Cognitive Disability',
    'Motor & mobility disabilities': 'Ambulatory Disability',
    'Deaf & hard-of-hearing users': 'Hearing Disability',
    'Language & comprehension support users': None,
    'All users (structural / technical)': 'Any Disability'
}


In [79]:
# Apply the corrected mapping
impact_analysis['cornell_category'] = impact_analysis['disability_group'].map(your_group_to_cornell)

# Add population data
impact_analysis['affected_population'] = impact_analysis['cornell_category'].apply(
    lambda x: disability_population[x]['population'] if x and x in disability_population else 0
)

impact_analysis['population_percentage'] = impact_analysis['cornell_category'].apply(
    lambda x: disability_population[x]['percentage'] if x and x in disability_population else 0
)

# Add percentage of violations
impact_analysis['violation_percentage'] = (impact_analysis['count'] / impact_analysis['count'].sum()) * 100

print(impact_analysis[['disability_group', 'count', 'cornell_category', 'affected_population']])


 Mapping complete!
                          disability_group  count       cornell_category  \
0                      Visual disabilities    502      Visual Disability   
1        Cognitive & learning disabilities    382   Cognitive Disability   
2                      Visual disabilities    245      Visual Disability   
3       All users (structural / technical)    230         Any Disability   
4                      Visual disabilities    200      Visual Disability   
5            Motor & mobility disabilities    200  Ambulatory Disability   
6       All users (structural / technical)    180         Any Disability   
7       All users (structural / technical)    178         Any Disability   
8                      Visual disabilities    177      Visual Disability   
9        Cognitive & learning disabilities    177   Cognitive Disability   
10       Cognitive & learning disabilities    164   Cognitive Disability   
11                     Visual disabilities    115      Visual Disabi

In [81]:
# Group by disability group to get totals
impact_by_group = impact_analysis.groupby('disability_group').agg({
    'count': 'sum',
    'cornell_category': 'first',
    'affected_population': 'first',
    'population_percentage': 'first'
}).reset_index()

# Add percentage
impact_by_group['violation_percentage'] = (impact_by_group['count'] / impact_by_group['count'].sum()) * 100

# Sort by count
impact_by_group = impact_by_group.sort_values('count', ascending=False)

print("\n IMPACT BY DISABILITY GROUP:")
print(impact_by_group)


 IMPACT BY DISABILITY GROUP:
                         disability_group  count       cornell_category  \
4                     Visual disabilities   1376      Visual Disability   
0      All users (structural / technical)    850         Any Disability   
1       Cognitive & learning disabilities    776   Cognitive Disability   
3           Motor & mobility disabilities    320  Ambulatory Disability   
2  Language & comprehension support users     48                   None   

   affected_population  population_percentage  violation_percentage  
4              8301100                    2.5             40.830861  
0             44765700                   13.5             25.222552  
1             18009100                    5.8             23.026706  
3             20795100                    6.6              9.495549  
2                    0                    0.0              1.424332  


In [82]:
print("\n" + "="*80)
print("REAL-WORLD IMPACT: Who Gets Excluded?")
print("="*80)

for _, row in impact_by_group.iterrows():
    group = row['disability_group']
    violations = row['count']
    violation_pct = row['violation_percentage']
    population = row['affected_population']
    pop_pct = row['population_percentage']
    
    print(f"\n{group}:")
    print(f"  {violations:,} violations ({violation_pct:.1f}% of all violations)")
    
    if population > 0:
        print(f" Affects {population:,} Americans ({pop_pct}% of US population)")
        print(f" That's {violations:,} barriers blocking {population:,} people!")
    else:
        print(f"  Affects all users (linguistic/technical barriers)")

print(f"\n" + "="*80)
print(f"TOTAL: {disability_population['Any Disability']['population']:,} Americans with disabilities")
print(f"ALL face these {impact_by_group['count'].sum():,} total violations")
print("="*80)


REAL-WORLD IMPACT: Who Gets Excluded?

Visual disabilities:
  1,376 violations (40.8% of all violations)
 Affects 8,301,100 Americans (2.5% of US population)
 That's 1,376 barriers blocking 8,301,100 people!

All users (structural / technical):
  850 violations (25.2% of all violations)
 Affects 44,765,700 Americans (13.5% of US population)
 That's 850 barriers blocking 44,765,700 people!

Cognitive & learning disabilities:
  776 violations (23.0% of all violations)
 Affects 18,009,100 Americans (5.8% of US population)
 That's 776 barriers blocking 18,009,100 people!

Motor & mobility disabilities:
  320 violations (9.5% of all violations)
 Affects 20,795,100 Americans (6.6% of US population)
 That's 320 barriers blocking 20,795,100 people!

Language & comprehension support users:
  48 violations (1.4% of all violations)
  Affects all users (linguistic/technical barriers)

TOTAL: 44,765,700 Americans with disabilities
ALL face these 3,370 total violations


In [78]:
# Save the results
impact_by_group.to_csv('../data/q1_impact_by_group.csv', index=False)
print("\n✅ Saved: q1_impact_by_group.csv")


✅ Saved: q1_impact_by_group.csv
