In [1]:
import pandas as pd

# Load and filter for North America
df = pd.read_csv("alzheimers_prediction_dataset.csv") 
country_data = df[df['Country'].isin(['USA', 'Mexico', 'Canada'])].copy()

# Prepare country-level prevalence for D3 map
prevalence = country_data.groupby('Country')['Alzheimer’s Diagnosis'].apply(lambda x: (x == 'Yes').mean()).reset_index()
prevalence.columns = ['Country', 'Prevalence']
prevalence.to_json('graphs/prevalence_by_country.json', orient='records')


In [2]:
import pandas as pd
import altair as alt

# Enable Altair for large datasets
alt.data_transformers.enable('default', max_rows=None)

# Load and filter for North America
df = pd.read_csv("alzheimers_prediction_dataset.csv")
country_data = df[df['Country'].isin(['USA', 'Canada', 'Mexico'])].copy()


In [3]:
# Create a violin plot with dropdown to filter by country (USA, Canada, Mexico)

# Use North American subset
na_data = country_data.copy()

# Create country dropdown interaction
country_dropdown = alt.binding_select(options=sorted(na_data['Country'].unique()), name='Country:')
country_selection = alt.selection_point(fields=['Country'], bind=country_dropdown, value='USA')

# Violin-style age distribution plot, colored by diagnosis, filtered by selected country
violin_country_plot = alt.Chart(na_data).transform_filter(
    country_selection
).transform_density(
    density='Age',
    as_=['Age', 'density'],
    groupby=['Alzheimer’s Diagnosis']
).mark_area(orient='vertical').encode(
    x='Age:Q',
    y='density:Q',
    color='Alzheimer’s Diagnosis:N',
    column='Alzheimer’s Diagnosis:N',
    tooltip=['Age:Q', 'density:Q', 'Alzheimer’s Diagnosis:N']
).add_params(
    country_selection
).properties(
    title='Age Distribution by Alzheimer’s Diagnosis (Select Country)',
    width=400,
    height=350
)

violin_country_plot

violin_country_plot.save('graphs/viz2.json')


In [4]:
# List of categorical variables with three levels to include in the dropdown
categorical_variables = [
    'Air Pollution Exposure', 'Dietary Habits', 'Sleep Quality', 'Depression Level', 
    'Alcohol Consumption', 'Smoking Status', 'Income Level', 'Stress Levels', 'Social Engagement Level'
]

# Create a dropdown selector for the categorical variables
input_dropdown = alt.binding_select(options=categorical_variables, name='Category ')

# Setup the selection without initialization
selection = alt.selection_single(fields=['variable'], bind=input_dropdown)

# Calculate the count per category based on the selected variable
base = alt.Chart(country_data).transform_fold(
    categorical_variables,
    as_=['variable', 'value']
).transform_filter(
    selection
).transform_aggregate(
    count='count()',
    groupby=['value', 'Alzheimer’s Diagnosis']
)

# Create the normalized stacked bar chart
normalized_stacked_bar_chart = base.mark_bar().encode(
    x=alt.X('sum(count):Q', stack='normalize', title='Percentage of Cases', axis=alt.Axis(format='%')),
    y=alt.Y('value:N', title='Category Value'),
    color=alt.Color('Alzheimer’s Diagnosis:N', legend=alt.Legend(title="Alzheimer’s Diagnosis")),
    tooltip=[alt.Tooltip('Alzheimer’s Diagnosis:N'), alt.Tooltip('sum(count):Q', title='Percentage', format='.0%')]
).properties(
    title='Impact of Various Categories on Alzheimer’s Diagnosis in USA',
    width=800,
    height=500
).add_params(
    selection
)

# Display the chart
normalized_stacked_bar_chart

#normalized_stacked_bar_chart.save('graphs/viz3.json')

Deprecated since `altair=5.0.0`. Use selection_point instead.
  selection = alt.selection_single(fields=['variable'], bind=input_dropdown)


In [9]:
# Your categorical variables
categorical_variables = [
    'Air Pollution Exposure', 'Dietary Habits', 'Sleep Quality', 'Depression Level',
    'Alcohol Consumption', 'Smoking Status', 'Income Level', 'Stress Levels', 'Social Engagement Level'
]

# Clean sort order dictionary
sort_orders = {
    'Air Pollution Exposure': ['Low', 'Medium', 'High'],
    'Dietary Habits': ['Unhealthy', 'Average', 'Healthy'],
    'Sleep Quality': ['Poor', 'Average', 'Good'],
    'Depression Level': ['Low', 'Medium', 'High'],
    'Alcohol Consumption': ['Never', 'Occasionally', 'Regularly'],
    'Smoking Status': ['Never', 'Former', 'Current'],
    'Income Level': ['Low', 'Medium', 'High'],
    'Stress Levels': ['Low', 'Medium', 'High'],
    'Social Engagement Level': ['Low', 'Medium', 'High']
}

# Create dropdown param with default value
dropdown = alt.param(name='selected_var',
                     bind=alt.binding_select(options=categorical_variables, name='Select Category:'),
                     value='Social Engagement Level')

# Fold all categorical variables
base = alt.Chart(country_data).transform_fold(
    categorical_variables,
    as_=['variable', 'value']
).transform_filter(
    alt.datum.variable == dropdown
).transform_filter(
    "datum.value != null && datum.value != 'Unknown'"
).transform_aggregate(
    count='count()',
    groupby=['value', 'Alzheimer’s Diagnosis']
)

# Fix sort dynamically using a conditional sort dictionary
# Instead of passing a dynamic expression (which was buggy), we create one fixed Y-axis sort — just for now — for the default
# You can duplicate this chart if needed for multiple variables with hardcoded sort values per one

chart = base.mark_bar().encode(
    x=alt.X('sum(count):Q', stack='normalize', title='Percentage of Cases', axis=alt.Axis(format='%')),
    y=alt.Y('value:N',
            title='Category Value',
            sort=sort_orders['Social Engagement Level']),  # manually assign the correct default sort
    color=alt.Color('Alzheimer’s Diagnosis:N', legend=alt.Legend(title="Alzheimer’s Diagnosis")),
    tooltip=[
        alt.Tooltip('value:N', title='Category'),
        alt.Tooltip('Alzheimer’s Diagnosis:N'),
        alt.Tooltip('sum(count):Q', title='Percentage', format='.0%')
    ]
).add_params(
    dropdown
).properties(
    title='Impact of Various Categories on Alzheimer’s Diagnosis in USA',
    width=800,
    height=500
)

chart
normalized_stacked_bar_chart.save('graphs/viz3.json')

In [5]:
import altair as alt

import pandas as pd

# List of risk factors to include
risk_factors = [
    'Diabetes',
    'Hypertension',
    'Genetic Risk Factor (APOE-ε4 allele)',
    'Family History of Alzheimer’s'
]

# Filter for North American countries
na_countries = ['USA', 'Mexico', 'Canada']
risk_data = []

for country in na_countries:
    country_subset = country_data[country_data['Country'] == country]
    total = len(country_subset)
    
    for factor in risk_factors:
        with_risk = len(country_subset[country_subset[factor] == 'Yes'])
        percentage = (with_risk / total) * 100
        risk_data.append({
            'Country': country,
            'Risk Factor': factor,
            'Percentage': percentage
        })

# Create the DataFrame
risk_df = pd.DataFrame(risk_data)

# Convert the percentage data into a scatterplot-ready format
# Use the same risk_df from earlier (country, risk factor, percentage)
scatter_plot = alt.Chart(risk_df).mark_circle(size=120).encode(
    x=alt.X('Country:N', title='Country'),
    y=alt.Y('Percentage:Q', title='Percentage of Population (%)'),
    color=alt.Color('Risk Factor:N', title='Risk Factor'),
    shape=alt.Shape('Risk Factor:N'),
    tooltip=['Country:N', 'Risk Factor:N', alt.Tooltip('Percentage:Q', format='.1f')]
).properties(
    width=800,
    height=400,
    title="Alzheimer's Risk Factors by Country (Scatterplot View)"
)

# Save as Vega-Lite JSON
scatter_plot.save('graphs/viz4.json')

scatter_plot


In [6]:
import pandas as pd
import altair as alt

alt.data_transformers.enable('default', max_rows=None)

# Load the dataset
df = pd.read_csv("alzheimers_prediction_dataset.csv")

# Filter for North America
df_na = df[df['Country'].isin(['USA', 'Canada', 'Mexico'])].copy()

# Create Age Group
df_na['Age Group'] = pd.cut(
    df_na['Age'],
    bins=[39, 49, 59, 69, 79, 89, 100],
    labels=['40-49', '50-59', '60-69', '70-79', '80-89', '90+']
)

# --- Left Chart: Diagnosis Rate by Age Group ---
# Calculate proportions
age_diagnosis = df_na.groupby(['Age Group', 'Alzheimer’s Diagnosis']).size().reset_index(name='Count')
total_by_age = df_na.groupby('Age Group').size().reset_index(name='Total')
age_diagnosis = age_diagnosis.merge(total_by_age, on='Age Group')
age_diagnosis['Proportion'] = age_diagnosis['Count'] / age_diagnosis['Total']
alzheimers_by_age = age_diagnosis[age_diagnosis['Alzheimer’s Diagnosis'] == 'Yes'].copy()

# Dropdown interaction
age_dropdown = alt.binding_select(options=list(alzheimers_by_age['Age Group'].unique()), name='Highlight Age Group:')
age_selection = alt.selection_point(fields=['Age Group'], bind=age_dropdown, value='60-69')

bar_chart = alt.Chart(alzheimers_by_age).mark_bar(size=60).encode(
    x='Age Group:N',
    y=alt.Y('Proportion:Q', axis=alt.Axis(format='%'), title='Diagnosis Rate (%)'),
    color=alt.condition(age_selection, alt.value('steelblue'), alt.value('lightgray')),
    tooltip=[
        alt.Tooltip('Age Group:N'),
        alt.Tooltip('Proportion:Q', format='.1%'),
        'Count:Q',
        'Total:Q'
    ]
).add_params(age_selection).properties(
    title='Percent of People Diagnosed with Alzheimer’s by Age Group',
    width=400,
    height=350
)

# Text Labels
text = alt.Chart(alzheimers_by_age).mark_text(dy=-10, fontWeight='bold').encode(
    x='Age Group:N',
    y='Proportion:Q',
    text=alt.Text('Proportion:Q', format='.0%'),
    color=alt.condition(age_selection, alt.value('black'), alt.value('gray'))
).add_params(age_selection)

# --- Right Chart: Genetic Risk by Diagnosis within Age Group (Raw Counts) ---
df_na['Age Group'] = df_na['Age Group'].astype(str)

# Right chart: Raw counts of family history by diagnosis for selected age group
family_chart = alt.Chart(df_na).mark_bar().encode(
    y=alt.Y('Family History of Alzheimer’s:N', title='Family History'),
    x=alt.X('count():Q', title='Number of People'),
    color=alt.Color("Alzheimer’s Diagnosis:N", legend=alt.Legend(title="Alzheimer’s Diagnosis")),
    tooltip=[
        'Family History of Alzheimer’s:N',
        'Alzheimer’s Diagnosis:N',
        alt.Tooltip('count():Q', title='Number of People')
    ]
).transform_filter(
    age_selection
).properties(
    title="Family History of Alzheimer’s by Diagnosis in Selected Age Group",
    width=400,
    height=350
)

# Combine left (bar + text) and right (family chart)
final_combined_family = (bar_chart + text) | family_chart
final_combined_family

final_combined_family.save('graphs/viz5.json')



  age_diagnosis = df_na.groupby(['Age Group', 'Alzheimer’s Diagnosis']).size().reset_index(name='Count')
  total_by_age = df_na.groupby('Age Group').size().reset_index(name='Total')


In [7]:
# Load the full dataset again
df = pd.read_csv("alzheimers_prediction_dataset.csv")

# Clean country-level prevalence across all countries
prevalence_all = (
    df.groupby('Country')['Alzheimer’s Diagnosis']
    .apply(lambda x: (x == 'Yes').mean())
    .reset_index(name='Prevalence')
)

# Format as list of dicts for D3
prevalence_all_d3 = prevalence_all.to_dict(orient='records')

# Save the JSON for D3 to use
import json
output_path_all = "prevalence_by_country.json"
with open(output_path_all, "w") as f:
    json.dump(prevalence_all_d3, f, indent=2)

prevalence_all_d3[:5], output_path_all  # Preview first 5 entries and file path


([{'Country': 'Argentina', 'Prevalence': 0.40203698740284105},
  {'Country': 'Australia', 'Prevalence': 0.4026934248745709},
  {'Country': 'Brazil', 'Prevalence': 0.48580359468611617},
  {'Country': 'Canada', 'Prevalence': 0.34141740770681756},
  {'Country': 'China', 'Prevalence': 0.38641425389755013}],
 'prevalence_by_country.json')