# Project/Questions for Project (Not staff)

In a perfect world, my static visualization (and I'll likely repeat this process for my interactive visualization) will answer one or more of the following questions:
* What metro area has the largest wage gap between men and women in 2023? 
    * How does that differ per industry? 
    * How does that change by race? Hispanic/Latino ethnicity? 
    * How does this differ from 2013?

* What industry has the largest wage gap between men and women in 2023? 
    * How does that differ by metro area?
    * How does that change by race? Hispanic/Latino ethnicity?  
    * How does this differ from 2013?

In [None]:
import pandas as pd
import polars as pl
import skimpy
import numpy as np
import os
import pathlib
import altair as alt
alt.data_transformers.enable('default', max_rows=None)
from altair import datum
from clean_wages_df import clean_wages
#from scipy.stats import gaussian_kde

import os



In [None]:
# Make sure the folder that receives the exported charts exists.
# exist_ok=True turns this into a no-op when the folder is already there and
# avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs("htmls/", exist_ok=True)

# Shared palette for the charts in this notebook.
color_fem = '#D03E84'   # paired with the 'Female' category in the sex colour scales
color_male = '#4269DD'  # paired with the 'Male' category in the sex colour scales
color_money = '#75BB88'
color_gold = '#C4B257'

In [None]:
# clean_wages() returns three objects, unpacked here as the combined table and
# the women-only / men-only tables (presumably 2023 wage data -- confirm in
# clean_wages_df). Only the combined table is written out.
_wage_tables = clean_wages()
all_23, women_23, men_23 = _wage_tables

all_23.write_csv("../data/metrowages23_final.csv")

In [None]:
# Preview the first rows of the combined table.
all_23.head()

# Race Box: Charts By Metric

Goal: Show differences in income metrics by race<br>
Solution: Generate charts for each metric, join together

## Data Prep

In [None]:
# One row per race/ethnicity x sex pair with three rounded `incwage`
# summaries: the mean ('avg') and the 25th / 75th percentiles ('p25', 'p75').
_wage = pl.col('incwage')
quants_avg_by_race_23_all = (
    all_23
    .group_by([pl.col('race_ethnc_gen'), pl.col('sex')])
    .agg(
        _wage.mean().round().alias('avg'),
        _wage.quantile(0.25).round().alias('p25'),
        _wage.quantile(0.75).round().alias('p75'),
    )
)
quants_avg_by_race_23_all

In [None]:
# Y-axis ordering for the race charts: race/ethnicity groups ranked by the
# men's mean income, highest first.
_male_rows = quants_avg_by_race_23_all.filter(pl.col('sex') == "Male")
race_order = (
    _male_rows
    .sort(pl.col('avg'), descending=True)
    .get_column('race_ethnc_gen')
    .to_list()
)

## Graph Creation

### 25th percentile

In [None]:
# 25th-percentile layer: one filled, right-pointing triangle per race/sex row,
# placed at that row's p25 value and coloured by sex.
_p25_tooltip = [
    {"field": "sex", "type": "nominal", "title": "Sex"},
    {"field": "race_ethnc_gen", "type": "nominal", "title": "Race/Ethnicity"},
    {"field": "p25", "type": "quantitative", "title": "25th Percentile Annual Income", "format": "$,.0f"},
    {"field": "avg", "type": "quantitative", "title": "Average Annual Income", "format": "$,.0f"},
    {"field": "p75", "type": "quantitative", "title": "75th Percentile Annual Income", "format": "$,.0f"},
]

q25_avg_by_race_23_point = (
    alt.Chart(quants_avg_by_race_23_all)
    .mark_point(filled=True, size=120, shape='triangle-right')
    .encode(
        y=alt.Y("race_ethnc_gen:N", sort=race_order),
        x=alt.X("p25:Q", title="Annual Income"),
        color=alt.Color(
            "sex:N",
            scale=alt.Scale(
                domain=['Female', 'Male'],
                range=[color_fem, color_male],
            ),
        ),
        tooltip=_p25_tooltip,
    )
)
#q25_avg_by_race_23_point

### Mean

In [None]:
# Mean layer: one filled square per race/sex row, placed at that row's mean.
_mean_tooltip = [
    {"field": "sex", "type": "nominal", "title": "Sex"},
    {"field": "race_ethnc_gen", "type": "nominal", "title": "Race/Ethnicity"},
    {"field": "p25", "type": "quantitative", "title": "25th Percentile Annual Income", "format": "$,.0f"},
    {"field": "avg", "type": "quantitative", "title": "Average Annual Income", "format": "$,.0f"},
    {"field": "p75", "type": "quantitative", "title": "75th Percentile Annual Income", "format": "$,.0f"},
]

mean_avg_by_race_23_point = (
    alt.Chart(quants_avg_by_race_23_all)
    .mark_point(filled=True, size=80, shape='square')
    .encode(
        y=alt.Y("race_ethnc_gen:N", sort=race_order),
        x=alt.X("avg:Q", title="Annual Income"),
        color=alt.Color(
            "sex:N",
            scale=alt.Scale(
                domain=['Female', 'Male'],
                range=[color_fem, color_male],
            ),
        ),
        tooltip=_mean_tooltip,
    )
)

# Connector layer: for each race/ethnicity row, a rule spanning from the
# smallest to the largest of the group means.
mean_avg_by_race_23_line = (
    alt.Chart(quants_avg_by_race_23_all)
    .mark_rule()
    .encode(
        y=alt.Y("race_ethnc_gen:N", title=None, sort=race_order),
        x="min(avg)",
        x2="max(avg)",
    )
)

# Rule underneath, squares on top.
mean_avg_by_race_23 = mean_avg_by_race_23_line + mean_avg_by_race_23_point

mean_avg_by_race_23

### 75th percentile

In [None]:
# Create points for every 75th percentile
#
# One filled, left-pointing triangle per race/sex row, placed at that row's
# p75 value and coloured by sex. The marker shape is fixed on the mark itself.
# A previous `shape=alt.Shape().legend(title="Average Salary")` encoding was
# dropped: it named no field or value, so it had nothing to map to a shape
# legend, and its title described the mean rather than the 75th percentile.

p75_avg_by_race_23_point = alt.Chart(quants_avg_by_race_23_all).mark_point(
    filled=True, size=120, shape='triangle-left').encode(
        y=alt.Y("race_ethnc_gen:N", sort=race_order),
        x=alt.X("p75:Q", title="Annual Income"),
        color=alt.Color("sex:N").scale(
            domain=['Female', 'Male'],
            range=[color_fem, color_male]
        ),
        tooltip=[
            {"field": "sex", "type": "nominal", "title": "Sex"},
            {"field": "race_ethnc_gen", "type": "nominal", "title": "Race/Ethnicity"},
            {"field": "p25", "type": "quantitative", "title": "25th Percentile Annual Income", "format": "$,.0f"},
            {"field": "avg", "type": "quantitative", "title": "Average Annual Income", "format": "$,.0f"},
            {"field": "p75", "type": "quantitative", "title": "75th Percentile Annual Income", "format": "$,.0f"},
        ]
    )

In [None]:
# Scratch preview of the "your income" reference rule.
# The one-row frame only has an `x` column, so the rule is encoded on x; the
# earlier `y='y:Q'` pointed at a field that does not exist in this frame.
# This now matches the rule that is layered into raceBoxPlot.
alt.Chart(pd.DataFrame({'x': [50000]})).mark_rule(color='black').encode(
        x='x:Q'
    )

### All together

In [None]:
# Vertical marker at a sample income of 50,000, built from its own one-row frame.
_income_marker = (
    alt.Chart(pd.DataFrame({'x': [50000]}))
    .mark_rule(color='black')
    .encode(
        x='x:Q',
        tooltip=[{"field": "x", "type": "nominal", "title": "Your annual income", "format": "$,.0f"}],
    )
)

# Layer order (left to right): p75 triangles, p25 triangles, mean squares,
# mean connector rule, then the income marker. The layers are added one at a
# time with `+`, in the same order as before.
raceBoxPlot = (
    p75_avg_by_race_23_point
    + q25_avg_by_race_23_point
    + mean_avg_by_race_23_point
    + mean_avg_by_race_23_line
    + _income_marker
)

raceBoxPlot

raceBoxPlot.save("htmls/race_fake_box.html")

#avg_by_race_23.save("svgs/race_fake_box.svg")

## Stats Check

There are some conclusions I make from the viz, and I want to ensure I'm accurate

In [None]:
# Reshape to one row per race/ethnicity with a column for every sex x metric
# combination, then keep the rows where the men's mean is above the women's
# 75th percentile.
_long_metrics = quants_avg_by_race_23_all.unpivot(
    index=['race_ethnc_gen', 'sex'],
    variable_name="metric",
    value_name="incwage",
)
_wide_metrics = _long_metrics.pivot(
    ['sex', 'metric'],
    index='race_ethnc_gen',
    values='incwage',
)
# The pivoted columns are addressed by the combined names used below.
_wide_metrics.filter(
    pl.col('{"Male","avg"}') > pl.col('{"Female","p75"}')
)


# Metro Ridge/Lines
**AI DISCLOSURE: Visuals** <br>
This series of charts was by far the most difficult to put together. I was originally trying to get a density distribution that looked like Altair's [ridgeline plot example](https://altair-viz.github.io/gallery/ridgeline_plot.html). I struggled to get that to turn out as I wanted, so I turned to ChatGPT for suggestions on parameters to add. <br>
<br>
I also attempted several times to color in between the lines, but that didn't work out as intended, and I ended up filling them in with Affinity Designer. During that process, I consulted ChatGPT and Gemini to figure out how to edit the chart accordingly. Along the way, they made some changes to font sizes and strokes that I decided to keep.
<br>
**NEW**: I also used AI to help me adapt the tooltip to display which line is larger


## Data Prep

In [None]:
# Rounded mean `incwage` for every metro x sex pair ...
_metro_sex_avg = all_23.group_by([pl.col('metro'), pl.col('sex')]).agg(
    pl.col('incwage').mean().round().alias('avg'),
)
# ... spread into one row per metro with a column per sex ...
_metro_wide = _metro_sex_avg.pivot('sex', index='metro', values='avg')
# ... and ranked by the Male-minus-Female gap, largest gap first.
metro_mean_dif = (
    _metro_wide
    .with_columns(dif=pl.col('Male') - pl.col('Female'))
    .sort(pl.col('dif'), descending=True)
)

# Metro names in gap order; used to sort the facets of the density chart.
metro_list = metro_mean_dif.get_column('metro').to_list()

metro_list

## Density Adjustment

In [None]:
# First, let Altair's density transform estimate the `incwage` distribution
# separately for every metro x sex group. No mark is attached yet.
metro_density = (
    alt.Chart(all_23)
    .transform_density(
        'incwage',
        as_=['incwage', 'density'],
        groupby=['metro', 'sex'],
    )
)

In [None]:
# Same density transform as the per-metro one, but grouped by sex only.
gen_density = (
    alt.Chart(all_23)
    .transform_density(
        'incwage',
        as_=['incwage', 'density'],
        groupby=['sex'],
    )
)

In [None]:
# Exploratory strip plot: every row of all_23 as a small circle at its income,
# split by sex, with a random vertical offset so the points do not overlap.
# The offset is Gaussian jitter generated with a Box-Muller transform.
_box_muller = "sqrt(-2*log(random()))*cos(2*PI*random())"

gaussian_jitter = (
    alt.Chart(all_23, title='Normally distributed jitter')
    .mark_circle(size=8)
    .encode(
        y="sex:N",
        x="incwage:Q",
        yOffset="jitter:Q",
        color=alt.Color('sex:N', legend=None),
    )
    .transform_calculate(jitter=_box_muller)
)

gaussian_jitter

## Facet and adjust

In [None]:
# Second, draw the per-group densities as lines (one line per sex); the
# per-metro split happens later when the chart is faceted.
_income_axis = alt.Axis(labelFontSize=10, titleFontSize=12)
_sex_legend = alt.Legend(
    title='Sex',
    orient='top',
    labelFontSize=10,
    titleFontSize=11,
)

metro_ridge_area = metro_density.mark_line(strokeWidth=1.5).encode(
    x=alt.X('incwage:Q', title='Income', axis=_income_axis),
    # axis=None removes the per-facet y-axis
    y=alt.Y('density:Q', stack=None, title=None, axis=None),
    color=alt.Color(
        'sex:N',
        scale=alt.Scale(
            domain=['Female', 'Male'],
            range=[color_fem, color_male],
        ),
        legend=_sex_legend,
    ),
)
metro_ridge_area

#metro_ridge_area.save("svgs/metro_ridge647pm.svg")

In [None]:
# Plain-text messages shown in the tooltip, depending on which density is higher.
_male_higher_msg = "The proportion of males making this income is higher than the proportion of females making this income."
_female_higher_msg = "The proportion of females making this income is higher than the proportion of males making this income."
_equal_msg = "The proportions of males and females making this amount are equal."

# Data for the tooltip layer: the same per-metro/per-sex density as the line
# chart, pivoted so each (incwage, metro) row carries a 'Female' and a 'Male'
# density column, plus a derived text field comparing the two.
base_for_tooltip = (
    alt.Chart(all_23)
    .transform_density(
        density='incwage',
        groupby=['metro', 'sex'],
        as_=['incwage', 'density'],
    )
    .transform_pivot(
        'sex',       # values of this field become the new columns
        'density',   # value placed in those columns
        # one output row per x position within each facet
        groupby=['incwage', 'metro'],
    )
    .transform_calculate(
        comparison_text=alt.expr.if_(
            alt.datum.Male > alt.datum.Female,
            _male_higher_msg,
            alt.expr.if_(
                alt.datum.Female > alt.datum.Male,
                _female_higher_msg,
                _equal_msg,
            ),
        )
    )
)

# Hover selection: picks the nearest `incwage` value under the pointer, is
# empty until the pointer is over the chart, and clears on pointer-out.
# resolve='global' is intended to make the selection span all facets.
nearest = alt.selection_point(
    fields=['incwage'],
    nearest=True,
    on='pointerover',
    empty=False,
    clear='pointerout',
    resolve='global',
)

# Tooltip layer: a gray vertical rule at every x position that is invisible
# (opacity 0) unless selected, and carries the income plus comparison text.
_rule_tooltip = [
    {"field": "incwage", "type": "quantitative", "title": "Income", "format": "$,.0f"},
    {"field": "comparison_text", "type": "nominal", "title": " "},
]
tooltip_rule = (
    base_for_tooltip
    .mark_rule(color='gray')
    .encode(
        x='incwage:Q',
        opacity=alt.condition(nearest, alt.value(0.5), alt.value(0)),
        tooltip=_rule_tooltip,
    )
    .add_params(nearest)
)

In [None]:
# Vertical black rule marking a person's income (sample value: 50,000),
# with a tooltip that labels it.
_your_income = pd.DataFrame({'x': [50000]})
line = (
    alt.Chart(_your_income)
    .mark_rule(color='black')
    .encode(
        x='x:Q',
        tooltip=[{"field": "x", "type": "nominal", "title": "Your annual income", "format": "$,.0f"}],
    )
)

In [None]:
# Density lines + hover rule + income marker, repeated once per metro in a
# two-column grid. Facets follow metro_list order and each facet gets its own
# x and y scales. No chart-level configuration is applied to this variant.
_density_layers = metro_ridge_area + tooltip_rule + line
_metro_facet = alt.Facet(
    'metro:N',
    title=None,
    sort=metro_list,
    header=alt.Header(labelOrient='top', labelAnchor='middle'),
)

densityChart_noConfigs = (
    _density_layers
    .facet(facet=_metro_facet, columns=2)
    .resolve_scale(x='independent', y='independent')
)

In [None]:
# Styled version of the faceted density chart.
# It is derived from densityChart_noConfigs instead of repeating the whole
# layer / facet / resolve_scale pipeline, so the two variants cannot drift
# apart. Each configure_* call returns a new chart, leaving
# densityChart_noConfigs (used in the combined export) without configuration.
densityChart = (
    densityChart_noConfigs
    .configure_facet(spacing=10)
    .configure_view(stroke=None)
    .configure_header(
        labelFontSize=13,
        labelFontWeight='bold',
        labelAnchor='middle',
    )
    .configure_axis(labelFontSize=10, titleFontSize=12)
    .configure_legend(titleFontSize=11, labelFontSize=10)
)

In [None]:
#densityChart

In [None]:
# Export the styled density chart as a standalone HTML page.
densityChart.save("htmls/density.html")

# Industry Heat Map

My group suggested a heat map -- that didn't quite work, so I ended up doing a bar chart.

I need to have industry on the y-axis, women's average salary as portion of 
men's on the x-axis, and heat by number of women in the industry


## Data Prep

In [None]:
# Rounded mean `incwage` and row count for every sex x industry pair.
_ind_sex_stats = all_23.group_by(["sex", "industry"]).agg(
    pl.col('incwage').mean().round(),
    pl.len(),
)

# One row per industry, with incwage_* and len_* columns for each sex.
_ind_wide = _ind_sex_stats.pivot('sex', index='industry', values=['incwage', 'len'])

_female_pay = pl.col('incwage_Female')
_male_pay = pl.col('incwage_Male')
_female_n = pl.col('len_Female')
_male_n = pl.col('len_Male')

## Last minute cleaning: two industries are excluded.
_keep_industry = (
    # throwing out due to only 26 men and 13 women
    (pl.col('industry') != 'Agriculture, Forestry, Fishing and Hunting')
    & (pl.col('industry') != 'Management of Companies and Enterprises')
)

avg_per_ind = (
    _ind_wide
    .with_columns(
        difference=_male_pay - _female_pay,
        # women's mean pay as a percentage of men's
        women_sal_as_portion_men=((_female_pay / _male_pay) * 100).round(decimals=2),
        # headcount ratio of women to men
        num_women_to_men=(_female_n / _male_n).round(decimals=2),
    )
    .filter(_keep_industry)
    .with_columns(total_employees=_female_n + _male_n)
)


# Only looking at the top 10 industries (by total row count) for cleanliness

top10_industries = avg_per_ind.sort('total_employees', descending=True).head(10)

## Graph Creation

### Bar

In [None]:
# Preview: one bar per top-10 industry, bar length = women's pay as a
# percentage of men's, colour = ratio of women to men; bars are ordered by
# the colour value, descending.
_ratio_scale = alt.Scale(
    domain=[0.5, 1, 1.5, 2, 2.5, 3],
    range=['#D1BCC7', '#D19DB6', '#D17DA6', '#D15E96', '#D03E84'],
    interpolate='hcl',
)

alt.Chart(top10_industries).mark_bar(filled=True).encode(
    alt.Y('industry', sort='-color'),
    alt.X('women_sal_as_portion_men', title="Women's salary as portion of men's (%)"),
    color=alt.Color('num_women_to_men', scale=_ratio_scale),
)

### Line

In [None]:
# Dashed vertical reference rule for the industry bar chart.
# NOTE(review): 59.46 is hard-coded; presumably the overall women-to-men pay
# percentage -- confirm and consider deriving it from the data.
# NOTE: this rebinds `line`, which the density cells above also use, so those
# cells should not be re-run after this one without re-creating their rule.
_reference_pct = pd.DataFrame({'x': [59.46]})
line = (
    alt.Chart(_reference_pct)
    .mark_rule(color='black', strokeDash=[5, 5])
    .encode(x='x:Q')
)

line

In [None]:
# Final industry chart: the top-10 bar chart with the dashed reference rule
# (`line`) layered on top, exported as HTML.
industriesBar = (alt.Chart(top10_industries).mark_bar(filled=True).encode(
    alt.Y('industry').sort('-color'),
    # Same readable axis title as the standalone preview of this chart;
    # without it the axis shows the raw column name.
    alt.X('women_sal_as_portion_men').title("Women's salary as portion of men's (%)"),
    color=alt.Color('num_women_to_men', scale=alt.Scale(
            # NOTE(review): six domain stops but only five colours -- confirm
            # whether a sixth colour or a five-stop domain was intended.
            domain=[0.5, 1, 1.5, 2, 2.5, 3],
            range=['#D1BCC7', '#D19DB6', '#D17DA6', '#D15E96', '#D03E84'],
            interpolate='hcl'
            ),
        )) + line)

industriesBar.save("htmls/industry_portions_bar.html")

# General Bar

## Data Prep

In [None]:
# Rounded mean `incwage` per sex (the aggregated column keeps the name 'incwage').
_mean_wage = pl.col('incwage').mean().round()
avg_by_sex = all_23.group_by(["sex"]).agg(_mean_wage)

avg_by_sex


## Graph Creation

In [None]:
# Horizontal black rule at a sample income of 50,000 for the vertical bar
# chart (the frame's column is `y` here, and the rule is encoded on y).
_your_income_y = pd.DataFrame({'y': [50000]})
line = (
    alt.Chart(_your_income_y)
    .mark_rule(color='black')
    .encode(
        y='y:Q',
        tooltip=[{"field": "y", "type": "nominal", "title": "Your annual income", "format": "$,.0f"}],
    )
)

### Bar

In [None]:
# One bar per sex showing the mean income, coloured with the shared palette.
_sex_bars = (
    alt.Chart(avg_by_sex)
    .mark_bar()
    .encode(
        y=alt.Y('incwage:Q', title='Income'),
        x=alt.X('sex:N', title=None),
        # yOffset="sex:N",
        color=alt.Color(
            "sex:N",
            scale=alt.Scale(
                domain=['Female', 'Male'],
                range=[color_fem, color_male],
            ),
        ),
        tooltip=[
            {"field": "incwage", "type": "quantitative", "title": "Average Income", "format": "$,.0f"},
            {"field": "sex", "type": "nominal", "title": "Sex"},
        ],
    )
)

# Bars with the horizontal income rule layered on top.
BasicBar = _sex_bars + line

BasicBar.save("htmls/basic_bar.html")

In [None]:
# Stack the four charts vertically (basic bar, industry bar, race plot, and
# the config-free density chart) and export them as one HTML page.
_rough_draft = alt.vconcat(
    BasicBar,
    industriesBar,
    raceBoxPlot,
    densityChart_noConfigs,
)
_rough_draft.save("htmls/RoughDraft.html")