# ATUS Group Populations Project 
CS765

Rebecca Roembke & Katherine Heidi Fehr


## Resources:
- Helpful Pandas data encoding: https://pbpython.com/categorical-encoding.html
- Altair color scales: https://vega.github.io/vega/docs/schemes/

## Helpful Notes
**How to make a variable categorical**
```python
# Convert column to categorical data:
data["TESEX"] = data["TESEX"].astype('category')
```


## Set Up

In [1]:
import pandas as pd
import altair as alt
import numpy as np

In [2]:
# The data set is too large to embed in the notebook, so have Altair write it
# to a local JSON file and reference that file from the chart spec instead.
# More info: https://altair-viz.github.io/user_guide/faq.html
alt.data_transformers.enable('json')

# Import Data
data = pd.read_csv("Data/atussum_1121-reduced.csv")

# Single-value selection on the x encoding; it is attached to the bar chart
# below and used to filter the overlay layers of the other charts.
pts = alt.selection(type="single", encodings=['x'])

## Data Clean-up
# Replace the numeric sex codes with readable labels.
tesex_map = {1: "Male", 2: "Female"}
data["TESEX"] = data["TESEX"].map(tesex_map)

# Turn the -1 placeholder in weekly earnings into a real missing value so it
# is not binned as an earnings amount (presumably -1 marks "not available" -
# confirm against the ATUS data dictionary).
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
data["TRERNWA"] = data["TRERNWA"].replace(-1, np.nan)

# Show sample
data.head()


Unnamed: 0,TUCASEID,GEMETSTA,GTMETSTA,PEEDUCA,PEHSPNON,PTDTRACE,TEAGE,TELFS,TEMJOT,TESCHENR,...,t09,t10,t11,t12,t13,t14,t15,t16,t18,t50
0,20110101110010,-1,1,37,2,2,62,5,-1,-1,...,0,0,10,688,0,0,0,0,12,0
1,20110101110072,-1,1,39,2,1,22,1,2,2,...,0,0,95,0,0,0,0,0,25,0
2,20110101110074,-1,2,36,2,1,33,1,2,2,...,0,0,25,180,0,0,0,0,10,525
3,20110101110081,-1,1,39,1,1,45,1,2,2,...,0,0,30,868,0,0,0,0,0,0
4,20110101110082,-1,1,39,2,1,24,1,2,2,...,0,0,95,70,0,0,0,0,40,0


## Manual Binning
To avoid unexpected artifacts in our figures, we need to manually bin some categories.

## Chart Generation

### Bar Chart Filter Charts

In [19]:
# Record counts per sex. The bar chart owns the `pts` selection: the selected
# bar is drawn in dark slate blue and every other bar is greyed out.
MF_bar = (
    alt.Chart(data)
    .mark_bar()
    .encode(
        alt.X("TESEX"),
        alt.Y("count()"),
        color=alt.condition(
            pts,
            alt.ColorValue("darkslateblue"),
            alt.ColorValue("grey"),
        ),
    )
    .add_selection(pts)
)

# Display two copies side by side.
MF_bar | MF_bar

### Activities by Age

In [6]:
def _age_heatmap(activity_field, title):
    """Return a binned age-vs-activity heatmap of record counts.

    Args:
        activity_field: Altair shorthand for the activity column plotted on
            the y axis (for example ``'t05:Q'``).
        title: Title shown on the y axis.

    Returns:
        A rect-mark chart built from the module-level ``data`` frame, with
        TEAGE binned on x, the activity binned on y (axis fixed to 0-1000)
        and the record count on a log colour scale.
    """
    return alt.Chart(data).mark_rect(clip=True).encode(
        alt.X('TEAGE', bin=alt.Bin(maxbins=16)),
        alt.Y(
            activity_field,
            bin=alt.Bin(maxbins=20),
            scale=alt.Scale(domain=[0, 1000]),
            title=title,
        ),
        alt.Color(
            'count()',
            scale=alt.Scale(type='log', scheme='lighttealblue', domain=[10, 1500]),
            legend=alt.Legend(title='Total Records'),
        ),
    )


def _age_selection_overlay(heatmap):
    """Return a grey point layer sized by the record count in the selection.

    Args:
        heatmap: Chart produced by ``_age_heatmap``; its x/y encodings are
            reused so the points sit on the heatmap cells.

    Returns:
        A point-mark chart filtered by the module-level ``pts`` selection.
    """
    return heatmap.mark_point(clip=True).encode(
        alt.ColorValue('grey'),
        alt.Size(
            'count()',
            legend=alt.Legend(title='Records in Selection'),
            scale=alt.Scale(domain=[10, 1500]),
        ),
        # The tooltip must use the same binned field as the x axis: a raw,
        # unaggregated field in any encoding channel becomes an additional
        # group-by key and would split each cell's count per individual age.
        tooltip=[alt.Tooltip('TEAGE', bin=alt.Bin(maxbins=16)), 'count()'],
    ).transform_filter(
        pts
    )


colorAvg_work = _age_heatmap('t05:Q', "Work")
circ_work = _age_selection_overlay(colorAvg_work)

colorAvg_PC = _age_heatmap('t01:Q', "Personal Care")
circ_PC = _age_selection_overlay(colorAvg_PC)

colorAvg_sports = _age_heatmap('t13:Q', "Sports")
circ_sports = _age_selection_overlay(colorAvg_sports)

# Stack the selector bar chart above the three layered heatmaps; legends are
# resolved independently so each row keeps its own colour and size legend.
alt.vconcat(
    MF_bar,
    colorAvg_work + circ_work,
    colorAvg_PC + circ_PC,
    colorAvg_sports + circ_sports,
).resolve_legend(
    color="independent",
    size="independent"
)


### Activities by Weekly Earnings

In [12]:
# TRERNWA: weekly earnings column (x axis fixed to 0-300000; presumably the
# values carry two implied decimals - confirm against the ATUS data dictionary).

def _earnings_heatmap(activity_field, title):
    """Return a binned weekly-earnings-vs-activity heatmap of record counts.

    Args:
        activity_field: Altair shorthand for the activity column plotted on
            the y axis (for example ``'t05:Q'``).
        title: Title shown on the y axis.

    Returns:
        A rect-mark chart built from the module-level ``data`` frame, with
        TRERNWA binned on x (axis fixed to 0-300000), the activity binned on
        y (axis fixed to 0-1000) and the record count on a log colour scale.
    """
    return alt.Chart(data).mark_rect(clip=True).encode(
        alt.X(
            'TRERNWA',
            bin=alt.Bin(maxbins=16),
            scale=alt.Scale(domain=[0, 300000]),
        ),
        alt.Y(
            activity_field,
            bin=alt.Bin(maxbins=20),
            scale=alt.Scale(domain=[0, 1000]),
            title=title,
        ),
        alt.Color(
            'count()',
            scale=alt.Scale(type='log', scheme='lighttealblue', domain=[10, 1500]),
            legend=alt.Legend(title='Total Records'),
        ),
    )


def _earnings_selection_overlay(heatmap):
    """Return a grey point layer sized by the record count in the selection.

    Args:
        heatmap: Chart produced by ``_earnings_heatmap``; its x/y encodings
            are reused so the points sit on the heatmap cells.

    Returns:
        A point-mark chart filtered by the module-level ``pts`` selection.
    """
    return heatmap.mark_point(clip=True).encode(
        alt.ColorValue('grey'),
        alt.Size(
            'count()',
            legend=alt.Legend(title='Records in Selection'),
            scale=alt.Scale(domain=[10, 1500], range=[0, 300]),
        ),
        # The tooltip must use the same binned field as the x axis: a raw,
        # unaggregated field in any encoding channel becomes an additional
        # group-by key and would split each cell's count per distinct
        # earnings value.
        tooltip=[alt.Tooltip('TRERNWA', bin=alt.Bin(maxbins=16)), 'count()'],
    ).transform_filter(
        pts
    )


colorAvg_weekEarn_work = _earnings_heatmap('t05:Q', "Work")
circ_weekEarn_work = _earnings_selection_overlay(colorAvg_weekEarn_work)

colorAvg_weekEarn_PC = _earnings_heatmap('t01:Q', "Personal Care")
circ_weekEarn_PC = _earnings_selection_overlay(colorAvg_weekEarn_PC)

colorAvg_weekEarn_sports = _earnings_heatmap('t13:Q', "Sports")
circ_weekEarn_sports = _earnings_selection_overlay(colorAvg_weekEarn_sports)

# One row per activity: earnings view on the left, age view on the right.
# Parentheses make the "layer first, then concatenate" grouping explicit
# (`+` already binds tighter than `|`).
alt.vconcat(
    MF_bar,
    (colorAvg_weekEarn_work + circ_weekEarn_work) | (colorAvg_work + circ_work),
    (colorAvg_weekEarn_PC + circ_weekEarn_PC) | (colorAvg_PC + circ_PC),
    (colorAvg_weekEarn_sports + circ_weekEarn_sports) | (colorAvg_sports + circ_sports),
).resolve_legend(
    color="independent",
    size="independent"
)
