# Comparing numerical data across groups

## Setup

In [1]:
import pandas as pd
import altair as alt
import warnings

# Suppress FutureWarning messages for the rest of the session.
warnings.simplefilter("ignore", category=FutureWarning)

# Switch off Altair's max-rows check on data embedded in a chart.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

## Data

### Import data

In [2]:
# Remote location of the data set: repository base URL plus file name.
ROOT = "https://raw.githubusercontent.com/kirenz/datasets/master/"
DATA = "county.csv"

# NOTE(review): the output saved with this cell is "HTTP Error 404: Not Found",
# i.e. this URL did not resolve when the cell was last executed. Confirm the
# file still exists at this path before relying on the cells below.
df = pd.read_csv(f"{ROOT}{DATA}")

HTTPError: HTTP Error 404: Not Found

In [35]:
# Columns to keep, listed by their names in the raw file.
data_selection = ["state", "name", "pop_change",
                  "population_change", "median_hh_income", "metro"]

# Raw column name -> name used in the rest of the notebook.
column_renames = {'population_change': 'change'}

# Accept each column under either its raw or its already-renamed name.
# On a first run every raw name is present, so the selection is exactly
# `data_selection`. On a re-run (where `df` is already transformed) the
# renamed column is picked instead of raising a KeyError.
present_columns = [
    col if col in df.columns else column_renames.get(col, col)
    for col in data_selection
]

# Build the cleaned frame in one non-mutating chain:
#   1. keep only the relevant variables
#   2. rename (a no-op when the column is already renamed)
#   3. drop rows with missing values (the original index labels are kept)
#   4. store `change` as a categorical
df = (
    df[present_columns]
    .rename(columns=column_renames)
    .dropna()
    .astype({'change': 'category'})
)

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3136 entries, 0 to 3138
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   state             3136 non-null   object  
 1   name              3136 non-null   object  
 2   pop_change        3136 non-null   float64 
 3   change            3136 non-null   category
 4   median_hh_income  3136 non-null   float64 
 5   metro             3136 non-null   object  
dtypes: category(1), float64(2), object(3)
memory usage: 150.2+ KB


In [36]:
# Number of rows in each `change` category.
df["change"].value_counts()

no gain    1597
gain       1539
Name: change, dtype: int64

In [37]:
# Number of rows for each `metro` value.
df["metro"].value_counts()

no     1971
yes    1165
Name: metro, dtype: int64

## Histogram for two groups

In [38]:
# Histogram of median household income (at most 50 bins), with bars
# coloured by the `change` group and drawn at 90% opacity.
alt.Chart(df).mark_bar().encode(
    x=alt.X("median_hh_income:Q", bin=alt.BinParams(maxbins=50)),
    y="count()",
    color="change:N",
    opacity=alt.value(0.9),
)

## Side-by-side box plot

In [39]:
# One box plot of median household income per `change` group; the group is
# mapped to both the y position and the colour.
alt.Chart(df).mark_boxplot().encode(
    x="median_hh_income:Q",  # float64 column, so quantitative
    y="change:N",
    color="change:N",
).properties(width=400, height=100)

## Faceting

In [41]:
# Faceting through encoding channels: `column` and `row` split the income
# histogram into one panel per metro/change combination.
alt.Chart(df).mark_bar().encode(
    x=alt.X("median_hh_income:Q", bin=alt.BinParams(maxbins=50)),
    y="count()",
    column="metro:N",  # <-- facet columns
    row="change:N",  # <-- facet rows
).properties(width=200, height=100)

In [42]:
# Faceting through the chart-level .facet() method rather than through
# encoding channels: define a single-panel histogram first, then facet it.
income_histogram = alt.Chart(df).mark_bar().encode(
    x=alt.X("median_hh_income:Q", bin=alt.BinParams(maxbins=50)),
    y="count()",
).properties(width=200, height=100)

income_histogram.facet(  # <-- facet after the chart is defined
    column="metro:N",
    row="change:N",
)

## Pair plots

In [44]:
# Template for a single scatter panel: the x and y fields are placeholders
# filled in by .repeat() below; points are coloured by `change`.
scatter_panel = alt.Chart(df).mark_circle().encode(
    x=alt.X(alt.repeat("column"), type="quantitative"),
    y=alt.Y(alt.repeat("row"), type="quantitative"),
    color="change:N",
).properties(width=150, height=150)

# Repeat the panel over every row/column field combination (2 x 2 grid)
# and enable interactive axes on the result.
scatter_panel.repeat(
    row=["pop_change", "median_hh_income"],
    column=["median_hh_income", "pop_change"],
).interactive()