# Comparing numerical data across groups

## Setup

In [1]:
import pandas as pd
import altair as alt
import warnings

# Suppress FutureWarning messages so they do not clutter the cell output.
warnings.simplefilter("ignore", FutureWarning)

# Turn off Altair's maximum-row check for chart data.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

## Data

### Import data

In [3]:
# Remote location and file name of the county data set.
ROOT = "https://raw.githubusercontent.com/kirenz/datasets/master/"
DATA = "county.csv"

# Read the CSV straight from the URL into a DataFrame.
df = pd.read_csv(f"{ROOT}{DATA}")

In [None]:
# Select only relevant variables
data_selection = [
    "state",
    "name",
    "pop_change",
    "population_change",
    "median_hh_income",
    "metro",
]

# Take an explicit copy: the following cells modify `df` in place
# (dropna/rename with inplace=True, column assignment), and working on an
# independent frame keeps pandas from treating it as a slice of the raw
# data (SettingWithCopyWarning).
df = df[data_selection].copy()

### Data corrections

In [4]:
# Drop every row that has a missing value in at least one column.
df.dropna(axis="index", how="any", inplace=True)

In [5]:
# Shorten the column name `population_change` to `change`.
df.rename({"population_change": "change"}, axis="columns", inplace=True)

In [7]:
# Convert `change` to pandas' categorical dtype
# (categories are inferred from the data, unordered).
df["change"] = df["change"].astype(pd.CategoricalDtype())

## Analysis

In [8]:
# Number of rows per `change` category.
df["change"].value_counts()

no gain    1285
gain       1275
Name: change, dtype: int64

In [9]:
# Number of rows per `metro` value.
df["metro"].value_counts()

no     1615
yes     945
Name: metro, dtype: int64

### Histogram for two groups

In [25]:
# Histogram of median household income (at most 50 bins); the bars are
# coloured by the `change` group.
alt.Chart(df).mark_bar().encode(
    alt.X("median_hh_income", bin=alt.BinParams(maxbins=50)),
    alt.Y("count()"),
    alt.Color("change"),
)

### Side-by-side box plot

In [23]:
# Box plot of median household income, one box per `change` group
# (income on the x axis, group on the y axis, coloured by group).
alt.Chart(df).mark_boxplot().encode(
    x="median_hh_income",
    y="change",
    color="change",
).properties(width=400, height=150)

## Faceting

In [29]:
# Facet the income histogram through encoding channels:
# `metro` goes on the column channel, `change` on the row channel.
alt.Chart(df).mark_bar().encode(
    x=alt.X("median_hh_income", bin=alt.BinParams(maxbins=50)),
    y=alt.Y("count()"),
    column=alt.Column("metro"),  # <--
    row=alt.Row("change"),  # <--
).properties(width=200, height=100)

In [35]:
# Facet the income histogram with the .facet() method
# (columns by `metro`, rows by `change`) rather than encoding channels.
alt.Chart(df).mark_bar().encode(
    alt.X("median_hh_income", bin=alt.BinParams(maxbins=50)),
    alt.Y("count()"),
).properties(width=200, height=100).facet(  # <--
    column="metro",
    row="change",
)

## Pair plots

In [36]:
# Pair plot: a circle-mark chart repeated over the listed row and column
# fields, coloured by `change`, with .interactive() applied to the result.
alt.Chart(df).mark_circle().encode(
    alt.X(alt.repeat("column"), type="quantitative"),
    alt.Y(alt.repeat("row"), type="quantitative"),
    alt.Color("change"),
).properties(width=150, height=150).repeat(
    row=["pop_change", "median_hh_income"],
    column=["median_hh_income", "pop_change"],
).interactive()