# ATUS Group Populations Project 
CS765

Rebecca Roembke & Katherine Heidi Fehr


## Resources:
- Helpful Pandas data encoding: https://pbpython.com/categorical-encoding.html
- Altair color scales: https://vega.github.io/vega/docs/schemes/

## Helpful Notes
**How to make variable categorical**
```python
# Convert column to categorical data:
data["TESEX"] = data["TESEX"].astype('category')
```


## Set Up

In [36]:
import pandas as pd
import altair as alt
import numpy as np

In [37]:
# Required for a data set this large: instead of embedding the data in the
# Jupyter notebook, Altair leaves it in a locally hosted file.
# More info: https://altair-viz.github.io/user_guide/faq.html
alt.data_transformers.enable('json')

# Import Data
data = pd.read_csv("Data/atussum_1121-reduced.csv")

# Set selection
# NOTE: `pts` is reassigned to an interval selection in the "Create Selector"
# cell before any chart uses it.
pts = alt.selection(type="single", encodings=['x'])

## Data Clean-up
# Replace numeric survey codes with readable labels. `Series.map` turns any
# code that is missing from the mapping into NaN.
tesex_map = {1: "Male", 2: "Female"}
data["TESEX"] = data["TESEX"].map(tesex_map)

race_map = {
    -3: "unknown",
    -2: "unknown",
    -1: "unknown",
    1: "White Only",
    2: "Black Only",
    3: "American Indian, Alaskan Native Only",
    4: "Asian Only",
    5: "Hawaiian/Pacific Islander Only",
    # Codes 6 through 25 are all collapsed into a single label.
    **{code: "more than 1 race" for code in range(6, 26)},
}
data["PTDTRACE"] = data["PTDTRACE"].map(race_map)

telfs_map = {
    1: "Employed",
    2: "Employed",
    3: "Unemployed",
    4: "Unemployed",
    5: "Not in labor force",
}
data["TELFS"] = data["TELFS"].map(telfs_map)

# Treat -1 as missing, then rescale by 1/100.
# NOTE(review): presumably the raw value carries two implied decimals - confirm.
# `np.nan` is used because the `np.NaN` alias no longer exists in NumPy 2.0.
data["TRERNWA"] = data["TRERNWA"].replace(-1, np.nan)
data["TRERNWA"] = data["TRERNWA"] / 100

# Show sample
data.head()


Unnamed: 0,TUCASEID,GEMETSTA,GTMETSTA,PEEDUCA,PEHSPNON,PTDTRACE,TEAGE,TELFS,TEMJOT,TESCHENR,...,t09,t10,t11,t12,t13,t14,t15,t16,t18,t50
0,20110101110010,-1,1,37,2,Black Only,62,Not in labor force,-1,-1,...,0,0,10,688,0,0,0,0,12,0
1,20110101110072,-1,1,39,2,White Only,22,Employed,2,2,...,0,0,95,0,0,0,0,0,25,0
2,20110101110074,-1,2,36,2,White Only,33,Employed,2,2,...,0,0,25,180,0,0,0,0,10,525
3,20110101110081,-1,1,39,1,White Only,45,Employed,2,2,...,0,0,30,868,0,0,0,0,0,0
4,20110101110082,-1,1,39,2,White Only,24,Employed,2,2,...,0,0,95,70,0,0,0,0,40,0


## Manual Binning
To avoid unexpected artifacts in our figures, we need to manually bin some categories.

In [38]:
#### Age
# Label every respondent with an age band. Apart from the first band, each
# band is right-inclusive: (lower, lower + 5].
data.loc[data['TEAGE'].between(0, 15, inclusive='both'), 'AGE'] = '0-15'
for lower in range(15, 81, 5):  # (15, 20], (20, 25], ..., (80, 85]
    band = f"{lower + 1}-{lower + 5}"
    data.loc[data['TEAGE'].between(lower, lower + 5, inclusive='right'), 'AGE'] = band
# NOTE(review): this band starts above 85, so age 85 itself is labelled
# '81-85' and ages over 100 get no label - confirm this is intended.
data.loc[data['TEAGE'].between(85, 100, inclusive='right'), 'AGE'] = '85+'

#### Weekly income 'TRERNWA'
# `orderIncome` collects the band labels in display order; the income charts
# pass it as the x-axis sort order.
orderIncome = ['Unknown', '$0-$200']
data.loc[data['TRERNWA'].between(0, 200, inclusive='both'), 'WINCOME'] = '$0-$200'

increment = 200
for lower in range(200, 2801, increment):  # (200, 400], ..., (2800, 3000]
    band = f"${lower + 1}-${lower + increment}"
    data.loc[
        data['TRERNWA'].between(lower, lower + increment, inclusive='right'),
        'WINCOME',
    ] = band
    orderIncome.append(band)

# Everything above the last band belongs here. The previous bounds
# (3001, 6000] left earnings in (3000, 3001] and above 6000 unlabelled, so
# they were silently reported as 'Unknown'.
data.loc[data['TRERNWA'] > 3000, 'WINCOME'] = '$3000+'
orderIncome.append('$3000+')

# Rows with no band (missing earnings) are shown as 'Unknown'.
data['WINCOME'] = data['WINCOME'].fillna('Unknown')


## Chart Generation

### Create Selector

In [39]:
# Interval brush along the x encoding, drawn in green. With empty="all",
# an empty brush selects every record.
_brush_mark = alt.BrushConfig(fill='green')
pts = alt.selection_interval(
    mark=_brush_mark,
    empty="all",
    resolve='intersect',
    encodings=['x'],
)

### Constants

In [40]:
## Y-axis domains shared by every chart of the same activity
yWork, yPC, ySports = [0, 1200], [0, 1400], [0, 800]

## Styling for the circle overlays
circColor = 'lightgrey'
circRange = [0, 100]  # size-scale range

## Opacity value (only referenced from a commented-out encoding)
myOP = alt.value(0.9)


### Tooltip Definitions

In [41]:
# Tooltips for the filter bar charts
tSex = alt.Tooltip('TESEX', title="Sex")
tEmploy = alt.Tooltip('TELFS', title="Employment Status")
tRace = alt.Tooltip('PTDTRACE', title="Race")

# Tooltips for the x-axis attribute of the heatmaps
tAge = alt.Tooltip('AGE:N', title="Age")
tInc = alt.Tooltip('WINCOME', title="Weekly Income")
tChiNum = alt.Tooltip('TRCHILDNUM', title="Number of Children")

# Tooltips for the activity columns. Each field matches the column plotted
# on the y-axis of the corresponding charts (t01, t05, t13).
tPC = alt.Tooltip('t01:Q', title="Personal Care (minutes spent)")
tWork = alt.Tooltip('t05:Q', title="Work (minutes spent)")
# Sports is plotted from t13; this tooltip previously pointed at t01.
tSports = alt.Tooltip('t13:Q', title="Sports (minutes spent)")

### Bar Chart Filter Charts

In [42]:
def _filter_bar(field, title, tooltip, width, **x_options):
    """Build a record-count bar chart over `field` that carries the `pts` brush.

    Bars are coloured "darkslateblue" where the `pts` condition holds and
    "grey" otherwise. Extra keyword arguments are forwarded to the x encoding.
    """
    bar_color = alt.condition(
        pts,
        alt.ColorValue("darkslateblue"),
        alt.ColorValue("grey"),
    )
    bars = alt.Chart(data).mark_bar().encode(
        alt.X(field, title=title, **x_options),
        alt.Y('count()'),
        color=bar_color,
        tooltip=[tooltip, 'count()'],
    )
    return bars.add_selection(pts).properties(width=width, height=150)


# Sex
MF_bar = _filter_bar("TESEX", "Sex", tSex, 100)

# Employment status
Employment_bar = _filter_bar("TELFS", "Employment Status", tEmploy, 150)

# Race, with the x-axis sorted by descending count
Race_bar = _filter_bar(
    "PTDTRACE",
    "Race",
    tRace,
    300,
    sort=alt.EncodingSortField(field="PTDTRACE", op="count", order='descending'),
)

# Show the three filter charts side by side
(MF_bar | Employment_bar | Race_bar)



### A: Age

In [43]:
def _age_heatmap(y_field, y_domain, y_title, **x_options):
    """Heatmap of record counts: age band on x, binned `y_field` on y.

    Colour encodes count() on a log scale. Extra keyword arguments are
    forwarded to the x encoding (title, axis, ...).
    """
    return alt.Chart(data).mark_rect(clip=True).encode(
        alt.X('AGE:N', **x_options),
        alt.Y(
            y_field,
            bin=alt.Bin(maxbins=20),
            scale=alt.Scale(domain=y_domain),
            title=y_title,
        ),
        alt.Color(
            'count()',
            scale=alt.Scale(type='log', scheme='lighttealblue', domain=[10, 1500]),
            legend=alt.Legend(title='Total Records'),
        ),
    ).properties(width=200, height=150)


def _age_circles(heatmap):
    """Point overlay on `heatmap`, filtered by `pts` and sized by count()."""
    overlay = heatmap.mark_point(clip=True).encode(
        alt.ColorValue(circColor),
        alt.Size(
            'count()',
            legend=alt.Legend(title='Records in Selection'),
            scale=alt.Scale(domain=[10, 1500], range=circRange),
        ),
        tooltip=[tAge, 'count()'],
    )
    return overlay.transform_filter(pts).interactive()


# Work and Personal Care hide their x-axis labels and title; only the
# Sports chart at the bottom shows the Age axis.
colorAvg_work = _age_heatmap(
    't05:Q', yWork, "Work",
    title="", axis=alt.Axis(labels=False),
)
circ_work = _age_circles(colorAvg_work)

colorAvg_PC = _age_heatmap(
    't01:Q', yPC, "Personal Care",
    title="", axis=alt.Axis(labels=False),
)
circ_PC = _age_circles(colorAvg_PC)

colorAvg_sports = _age_heatmap('t13:Q', ySports, "Sports", title="Age")
circ_sports = _age_circles(colorAvg_sports)

# Filter bars on top, then one heatmap + overlay layer per activity
alt.vconcat(
    (MF_bar | Employment_bar | Race_bar),
    colorAvg_work + circ_work,
    colorAvg_PC + circ_PC,
    colorAvg_sports + circ_sports,
).resolve_legend(
    color="independent",
    size="independent",
).configure_concat(
    spacing=0,
)

### A: Weekly Earnings

In [44]:
# TRERNWA (binned into WINCOME)

def _income_heatmap(y_field, y_domain, **x_options):
    """Heatmap of record counts: weekly-income band on x, binned `y_field` on y.

    The x-axis follows `orderIncome`; the y title is left empty. Extra
    keyword arguments are forwarded to the x encoding.
    """
    return alt.Chart(data).mark_rect(clip=True).encode(
        alt.X('WINCOME', sort=orderIncome, **x_options),
        alt.Y(
            y_field,
            bin=alt.Bin(maxbins=20),
            scale=alt.Scale(domain=y_domain),
            title="",
        ),
        alt.Color(
            'count()',
            scale=alt.Scale(type='log', scheme='lighttealblue', domain=[10, 1500]),
            legend=alt.Legend(title='Total Records'),
        ),
    ).properties(width=200, height=150)


def _income_circles(heatmap):
    """Point overlay on `heatmap`, filtered by `pts` and sized by count()."""
    overlay = heatmap.mark_point(clip=True).encode(
        alt.ColorValue(circColor),
        alt.Size(
            'count()',
            legend=alt.Legend(title='Records in Selection'),
            scale=alt.Scale(domain=[10, 1500], range=circRange),
        ),
        tooltip=[tInc, 'count()'],
    )
    return overlay.transform_filter(pts).interactive()


# Only the bottom (Sports) chart shows the income axis labels and title.
colorAvg_weekEarn_work = _income_heatmap(
    't05:Q', yWork,
    title="", axis=alt.Axis(labels=False),
)
circ_weekEarn_work = _income_circles(colorAvg_weekEarn_work)

colorAvg_weekEarn_PC = _income_heatmap(
    't01:Q', yPC,
    title="", axis=alt.Axis(labels=False),
)
circ_weekEarn_PC = _income_circles(colorAvg_weekEarn_PC)

colorAvg_weekEarn_sports = _income_heatmap('t13:Q', ySports, title="Weekly Income")
circ_weekEarn_sports = _income_circles(colorAvg_weekEarn_sports)

# Each row pairs the income view (left) with the age view (right)
alt.vconcat(
    (MF_bar | Employment_bar | Race_bar),
    (colorAvg_weekEarn_work + circ_weekEarn_work) | (colorAvg_work + circ_work),
    (colorAvg_weekEarn_PC + circ_weekEarn_PC) | (colorAvg_PC + circ_PC),
    (colorAvg_weekEarn_sports + circ_weekEarn_sports) | (colorAvg_sports + circ_sports),
).resolve_legend(
    color="independent",
    size="independent",
)




### A: Number of Children

In [45]:
# TRCHILDNUM

def _children_heatmap(y_field, y_domain, **x_options):
    """Heatmap of record counts: TRCHILDNUM (nominal) on x, binned `y_field` on y.

    The y title is left empty. Extra keyword arguments are forwarded to the
    x encoding.
    """
    return alt.Chart(data).mark_rect(clip=True).encode(
        alt.X('TRCHILDNUM:N', **x_options),
        alt.Y(
            y_field,
            bin=alt.Bin(maxbins=20),
            scale=alt.Scale(domain=y_domain),
            title="",
        ),
        alt.Color(
            'count()',
            scale=alt.Scale(type='log', scheme='lighttealblue', domain=[10, 1500]),
            legend=alt.Legend(title='Total Records'),
        ),
    ).properties(width=200, height=150)


def _children_circles(heatmap):
    """Point overlay on `heatmap`, filtered by `pts` and sized by count()."""
    overlay = heatmap.mark_point(clip=True).encode(
        alt.ColorValue(circColor),
        alt.Size(
            'count()',
            legend=alt.Legend(title='Records in Selection'),
            scale=alt.Scale(domain=[10, 1500], range=circRange),
        ),
        tooltip=[tChiNum, 'count()'],
    )
    return overlay.transform_filter(pts).interactive()


# Only the bottom (Sports) chart shows the x-axis labels and title.
colorAvg_childNum_work = _children_heatmap(
    't05:Q', yWork,
    title="", axis=alt.Axis(labels=False),
)
circ_childNum_work = _children_circles(colorAvg_childNum_work)

colorAvg_childNum_PC = _children_heatmap(
    't01:Q', yPC,
    title="", axis=alt.Axis(labels=False),
)
circ_childNum_PC = _children_circles(colorAvg_childNum_PC)

colorAvg_childNum_sports = _children_heatmap(
    't13:Q', ySports,
    title="Number of Children",
)
circ_childNum_sports = _children_circles(colorAvg_childNum_sports)

# Final dashboard: filter bars, then one row per activity with the
# age, number-of-children and weekly-income views side by side.
out = alt.vconcat(
    (MF_bar | Employment_bar | Race_bar),
    (colorAvg_work + circ_work)
    | (colorAvg_childNum_work + circ_childNum_work)
    | (colorAvg_weekEarn_work + circ_weekEarn_work),
    (colorAvg_PC + circ_PC)
    | (colorAvg_childNum_PC + circ_childNum_PC)
    | (colorAvg_weekEarn_PC + circ_weekEarn_PC),
    (colorAvg_sports + circ_sports)
    | (colorAvg_childNum_sports + circ_childNum_sports)
    | (colorAvg_weekEarn_sports + circ_weekEarn_sports),
).resolve_legend(
    color="independent",
    size="independent",
).configure_concat(
    spacing=15,
)

out

## Export chart

In [46]:
# Save the dashboard as both HTML and JSON, with the 'default' data
# transformer active for the duration of the saves.
# NOTE(review): presumably this embeds the data in the saved files instead of
# referencing the external JSON written by the 'json' transformer - confirm.
with alt.data_transformers.enable('default'):
    for target in ('chart.html', 'ATUS_All.json'):
        out.save(target)