# ATUS Group Populations Project 
CS765

Rebecca Roembke & Katherine Heidi Fehr


## Resources:
- Helpful Pandas data encoding: https://pbpython.com/categorical-encoding.html
- Altair color scales: https://vega.github.io/vega/docs/schemes/

## Helpful Notes
**How to make a variable categorical**
```python
# Convert column to categorical data:
data["TESEX"] = data["TESEX"].astype('category')
```


## Set Up

In [3]:
import pandas as pd
import altair as alt
import numpy as np

In [4]:
# The data set is large, so the 'json' data transformer is REQUIRED here:
# rather than storing the records inside the Jupyter notebook, it leaves
# them in a locally hosted file.
# More info: https://altair-viz.github.io/user_guide/faq.html
alt.data_transformers.enable('json')

# Import Data
data = pd.read_csv("Data/atussum_1121-reduced.csv")

# Two interval selections shared by the charts below. Each one is drawn by
# dragging with a different modifier key held down:
#   alex   -> Alt + drag
#   morgan -> Shift + drag (given its own brush colours so the two differ)
alex = alt.selection_interval(
    name='alex',
    on="[mousedown[event.altKey], mouseup] > mousemove",
)
morgan = alt.selection_interval(
    name='morgan',
    on="[mousedown[event.shiftKey], mouseup] > mousemove",
    mark=alt.BrushConfig(stroke="#e34a33", fill="#fdbb84", fillOpacity=0.5),
)

## Data Clean-up
# Replace the numeric codes with readable labels. Series.map turns any code
# that is absent from the mapping into NaN.
tesex_map = {1: "Male", 2: "Female"}
data["TESEX"] = data["TESEX"].map(tesex_map)

race_map = {
    -3: "unknown",
    -2: "unknown",
    -1: "unknown",
    1: "White Only",
    2: "Black Only",
    3: "American Indian, Alaskan Native Only",
    4: "Asian Only",
    5: "Hawaiian/Pacific Islander Only",
    # Codes 6 through 25 are all collapsed into a single bucket.
    **{code: "more than 1 race" for code in range(6, 26)},
}
data["PTDTRACE"] = data["PTDTRACE"].map(race_map)

telfs_map = {
    1: "Employed",
    2: "Employed",
    3: "Unemployed",
    4: "Unemployed",
    5: "Not in labor force",
}
data["TELFS"] = data["TELFS"].map(telfs_map)

# -1 is treated as "no value" and becomes NaN so it is not binned as a real
# amount. `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
# NOTE(review): the division by 100 presumably converts a value stored with
# two implied decimals into whole currency units - confirm against the
# ATUS data dictionary.
data["TRERNWA"] = data["TRERNWA"].replace(-1, np.nan) / 100

# Show sample
data.head()


Unnamed: 0,TUCASEID,GEMETSTA,GTMETSTA,PEEDUCA,PEHSPNON,PTDTRACE,TEAGE,TELFS,TEMJOT,TESCHENR,...,t09,t10,t11,t12,t13,t14,t15,t16,t18,t50
0,20110101110010,-1,1,37,2,Black Only,62,Not in labor force,-1,-1,...,0,0,10,688,0,0,0,0,12,0
1,20110101110072,-1,1,39,2,White Only,22,Employed,2,2,...,0,0,95,0,0,0,0,0,25,0
2,20110101110074,-1,2,36,2,White Only,33,Employed,2,2,...,0,0,25,180,0,0,0,0,10,525
3,20110101110081,-1,1,39,1,White Only,45,Employed,2,2,...,0,0,30,868,0,0,0,0,0,0
4,20110101110082,-1,1,39,2,White Only,24,Employed,2,2,...,0,0,95,70,0,0,0,0,40,0


## Manual Binning
To avoid unexpected artifacts in our figures, we need to manually bin some categories.

In [5]:
# Age
# Bin TEAGE into a string column 'AGE'. Rows whose age falls outside every
# range below keep NaN in 'AGE'.

# Ages 0 through 15 (both ends inclusive) share one bucket.
data.loc[data['TEAGE'].between(0, 15, inclusive='both'), 'AGE'] = '0-15'

# Five-year buckets (lower, lower + 5]: the lower edge is excluded, so each
# label starts at lower + 1 ('16-20', '21-25', ..., '81-85'). Labelling them
# 'lower-upper' would overstate the range by one year and make neighbouring
# buckets look like they overlap.
for lower in range(15, 85, 5):
    upper = lower + 5
    in_bucket = data['TEAGE'].between(lower, upper, inclusive='right')
    data.loc[in_bucket, 'AGE'] = f"{lower + 1}-{upper}"

# Everything above 85 (up to 100). Age 85 itself already belongs to '81-85'.
data.loc[data['TEAGE'].between(85, 100, inclusive='right'), 'AGE'] = '86+'

## Chart Generation

### Bar Chart Filter Charts

In [6]:
def _filter_bar(field):
    """Return a bar chart of record counts per value of *field*.

    Bars are coloured 'darkslateblue' where the combined `alex | morgan`
    selection predicate holds and 'grey' otherwise. Both selections are
    attached to the chart so they can be drawn on it.
    """
    highlight = alt.condition(
        alex | morgan,
        alt.ColorValue("darkslateblue"),
        alt.ColorValue("grey"),
    )
    bars = alt.Chart(data).mark_bar().encode(
        x=field,
        y='count()',
        color=highlight,
    )
    return bars.add_selection(alex, morgan)


# One filter chart each for sex, employment status and race
MF_bar = _filter_bar("TESEX")
Employment_bar = _filter_bar("TELFS")
Race_bar = _filter_bar("PTDTRACE")

# Show the three charts side by side
(MF_bar | Employment_bar | Race_bar)



  for col_name, dtype in df.dtypes.iteritems():


### Activities by Age

In [7]:
def _age_heatmap(activity, label):
    """Return a heatmap of record counts: age group (x) by binned *activity* (y).

    *activity* is an Altair shorthand such as 't05:Q'; *label* becomes the
    y-axis title. Colour encodes the record count on a log scale.
    """
    y_axis = alt.Y(
        activity,
        bin=alt.Bin(maxbins=20),
        scale=alt.Scale(domain=[0, 1000]),
        title=label,
    )
    shade = alt.Color(
        'count()',
        scale=alt.Scale(type='log', scheme='lighttealblue', domain=[10, 1500]),
        legend=alt.Legend(title='Total Records'),
    )
    return alt.Chart(data).mark_rect(clip=True).encode(alt.X('AGE:N'), y_axis, shade)


def _age_selection_points(heatmap):
    """Return a grey point layer derived from *heatmap*, sized by record count.

    Only records passing the `alex & morgan` selection filter are counted.
    """
    point_size = alt.Size(
        'count()',
        legend=alt.Legend(title='Records in Selection'),
        scale=alt.Scale(domain=[10, 1500], range=[0, 200]),
    )
    points = heatmap.mark_point(clip=True).encode(
        alt.ColorValue('grey'),
        point_size,
        tooltip=['AGE:N', 'count()'],
    )
    return points.transform_filter(alex & morgan)


# Heatmap + selection overlay for each activity column
colorAvg_work = _age_heatmap('t05:Q', "Work")
circ_work = _age_selection_points(colorAvg_work)

colorAvg_PC = _age_heatmap('t01:Q', "Personal Care")
circ_PC = _age_selection_points(colorAvg_PC)

colorAvg_sports = _age_heatmap('t13:Q', "Sports")
circ_sports = _age_selection_points(colorAvg_sports)

# Filter bars on top, then one layered heatmap per activity; every row keeps
# its own colour and size legends.
alt.vconcat(
    (MF_bar | Employment_bar | Race_bar),
    colorAvg_work + circ_work,
    colorAvg_PC + circ_PC,
    colorAvg_sports + circ_sports,
).resolve_legend(
    color="independent",
    size="independent",
)

### Activities by Weekly Earnings

In [18]:
# TRERNWA

def _earnings_heatmap(activity, label):
    """Return a heatmap of record counts: binned TRERNWA (x) by binned *activity* (y).

    *activity* is an Altair shorthand such as 't05:Q'; *label* becomes the
    y-axis title. Colour encodes the record count on a log scale.
    """
    return alt.Chart(data).mark_rect(clip=True).encode(
        alt.X('TRERNWA', bin=alt.Bin(maxbins=16)),
        alt.Y(
            activity,
            bin=alt.Bin(maxbins=20),
            scale=alt.Scale(domain=[0, 1000]),
            title=label,
        ),
        alt.Color(
            'count()',
            scale=alt.Scale(type='log', scheme='lighttealblue', domain=[10, 1500]),
            legend=alt.Legend(title='Total Records'),
        ),
    )


def _earnings_selection_points(heatmap):
    """Return a grey point layer derived from *heatmap*, sized by record count.

    Only records passing the `alex & morgan` selection filter are counted.
    """
    return heatmap.mark_point(clip=True).encode(
        alt.ColorValue('grey'),
        alt.Size(
            'count()',
            legend=alt.Legend(title='Records in Selection'),
            scale=alt.Scale(domain=[10, 1500], range=[0, 300]),
        ),
        # The tooltip must use the SAME binning as the x channel. A raw,
        # unbinned 'TRERNWA' next to count() is treated as an extra group-by
        # field, which splits every cell into one point per distinct
        # earnings value instead of one point per bin.
        tooltip=[alt.Tooltip('TRERNWA', bin=alt.Bin(maxbins=16)), 'count()'],
    ).transform_filter(alex & morgan)


# Heatmap + selection overlay for each activity column
colorAvg_weekEarn_work = _earnings_heatmap('t05:Q', "Work")
circ_weekEarn_work = _earnings_selection_points(colorAvg_weekEarn_work)

colorAvg_weekEarn_PC = _earnings_heatmap('t01:Q', "Personal Care")
circ_weekEarn_PC = _earnings_selection_points(colorAvg_weekEarn_PC)

colorAvg_weekEarn_sports = _earnings_heatmap('t13:Q', "Sports")
circ_weekEarn_sports = _earnings_selection_points(colorAvg_weekEarn_sports)

# Filter bars on top; below, each row pairs the earnings view (left) with the
# age view (right) for one activity. `+` binds tighter than `|`, so the
# parentheses only make the existing layering explicit.
alt.vconcat(
    (MF_bar | Employment_bar | Race_bar),
    (colorAvg_weekEarn_work + circ_weekEarn_work) | (colorAvg_work + circ_work),
    (colorAvg_weekEarn_PC + circ_weekEarn_PC) | (colorAvg_PC + circ_PC),
    (colorAvg_weekEarn_sports + circ_weekEarn_sports) | (colorAvg_sports + circ_sports),
).resolve_legend(
    color="independent",
    size="independent",
)
