## Compute histograms

1. activity_type (stored as an integer code)
    * elections (code 0)
    * residents (code 2)
    * registered voters (code 1)
2. precinct
    * P1-P21
3. party
    * Democrat
    * Republican
    * Unenrolled
    * Other Party
4. sex
    * Male
    * Female
    * Unknown Sex

In [None]:
from dotenv import load_dotenv, find_dotenv
# Read the local .env file (find_dotenv starts its search at the current
# working directory because usecwd=True) and let its values override any
# variables that are already set in the environment.
_ = load_dotenv(find_dotenv(usecwd=True), override=True)

from os import environ
from urllib.parse import quote

import pandas as pd
from sqlalchemy import create_engine

# Connection settings are read from the environment, falling back to
# local-development defaults when a variable is not set.
username     =  environ.get("POSTGRES_USERNAME", "postgres")
password     =  environ.get("POSTGRES_PASSWORD", "postgres")
ipaddress    =  environ.get("POSTGRES_IPADDRESS", "localhost")
port         =  environ.get("POSTGRES_PORT", "5432")
dbname       =  environ.get("POSTGRES_DBNAME", "ArlingtonMA")

# Establish the database connection for the Transform queries and Loads.
# The credentials are percent-encoded so that reserved URL characters in them
# (for example '@', ':' or '/') cannot corrupt the connection URL; plain
# alphanumeric credentials are left unchanged by the encoding.
cnx = create_engine(
    f'postgresql://{quote(username, safe="")}:{quote(password, safe="")}'
    f'@{ipaddress}:{port}/{dbname}'
)


## Transform

In [None]:
def people_expand_for_histo ( peeps, attributes , date) :
    """Resolve each person's attributes as of ``date`` and compute their age.

    Parameters
    ----------
    peeps : pandas.DataFrame
        Rows to expand; must contain a ``people_id`` column.
    attributes : pandas.DataFrame
        Joined onto ``peeps`` by ``people_id``. For each of ``name``,
        ``address_id``, ``party`` and ``precinct`` it is expected to hold two
        parallel list-like columns, ``<col>`` and ``date_<col>``, and it must
        also provide ``sex`` and ``dob``.
    date : anything accepted by ``pandas.to_datetime``
        The snapshot date; only its calendar-date part is used.

    Returns
    -------
    pandas.DataFrame
        Columns ``name``, ``precinct``, ``party``, ``sex``, ``age`` and
        ``people_id``, with at most one row per ``people_id`` that had an
        entry to pick from.
    """
    date = pd.to_datetime(date).date()

    df = peeps.merge(attributes, how='left', on='people_id')

    for col in ['name', 'address_id', 'party', 'precinct']:
        date_col = 'date_' + col
        # One row per (value, effective date) pair of this attribute.
        df = df.explode([col, date_col])

        on_or_before = df[date_col] <= date
        # Preferred pick: the last listed entry dated on or before `date`.
        # "Last" is row order after explode, i.e. list order -- presumably the
        # lists are stored chronologically; verify against the loader.
        current = df[on_or_before].drop_duplicates('people_id', keep='last')
        # Fallback for people without such an entry: the first of their
        # remaining rows (dated after `date`, or failing the comparison).
        later = df[~on_or_before].drop_duplicates('people_id', keep='first')
        df = pd.concat(
            [current, later[~later['people_id'].isin(current['people_id'])]]
        )

    # NOTE(review): the `pd.isnull` key is the function object itself, so it
    # should never equal a cell value; the `None` key is what is meant to turn
    # missing values into ''. Kept as-is -- confirm on the target pandas version.
    df = df.replace({pd.isnull: '', None: ''})

    # 365.2425 days is the mean Gregorian year, the same length numpy assigns
    # to timedelta64(1, 'Y'); spelling it as a Timedelta avoids the ambiguous
    # 'Y' unit, which pandas 2.x no longer accepts.
    years = (
        (pd.to_datetime(date) - pd.to_datetime(df['dob']))
        / pd.Timedelta(days=365.2425)
    )
    # A missing dob yields age 0.
    # NOTE(review): age is rounded to the nearest year here, whereas
    # create_histograms truncates -- confirm which one is intended.
    df['age'] = round(years, 0).astype(float).fillna(0).astype(int)

    cols = ['name', 'precinct', 'party', 'sex', 'age', 'people_id']

    return df[cols]


def get_histo_by_date(df, col, date=None):
    """Count people per (``col``, age) pair and stamp the result with ``date``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``col``, ``age`` and ``people_id``. It is not modified.
    col : str
        Column to group by together with ``age``. When it is ``'party'``,
        every party value other than 1, 2 or 4 is counted under 0.
    date : optional
        Value written to the ``date`` column of the result. When omitted, a
        module-level variable named ``date`` is used, which is the only way
        the two-argument call form could ever work.

    Returns
    -------
    pandas.DataFrame
        Columns ``col``, ``age``, ``count`` and ``date``.

    Raises
    ------
    NameError
        If ``date`` is omitted and no module-level ``date`` exists.
    """
    if date is None:
        # Legacy calling convention: fall back to a global `date`.
        try:
            date = globals()['date']
        except KeyError:
            raise NameError("name 'date' is not defined") from None

    if col == 'party':
        # Work on a copy so the caller's frame keeps its original party codes.
        df = df.copy()
        mask = df.party.isin([1, 2, 4])
        df.loc[~mask, col] = 0

    cols = [col, 'age']
    # count() tallies non-null people_id values in each (col, age) group.
    histo = df.groupby(cols).count().reset_index()[cols + ['people_id']]
    histo = histo.rename(columns={'people_id': 'count'})

    histo = histo.pivot_table(
        index=cols, values='count', fill_value=0
    ).reset_index()
    histo['date'] = date
    return histo


def _concat_frames(frames):
    """Concatenate ``frames``, returning an empty DataFrame when there are none."""
    return pd.concat(frames) if frames else pd.DataFrame()


def create_histograms(activity, attributes):
    """Build the age, precinct, party and sex histograms for all activity types.

    Parameters
    ----------
    activity : dict
        Maps ``'elections'``, ``'registered'`` and ``'residents'`` to
        DataFrames that have a ``people_id`` column and a list-like ``date``
        column (one entry per activity date).
    attributes : pandas.DataFrame
        Per-person attributes keyed by ``people_id``; must include ``dob`` and
        whatever ``people_expand_for_histo`` needs.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(age_histogram, precinct_histogram, party_histogram, sex_histogram)``.
        The age histogram has columns ``date``, ``age``, ``count`` and
        ``activity_type`` (0 = elections, 1 = registered, 2 = residents). The
        other three have columns ``<group>``, ``age``, ``count`` and ``date``
        and carry no activity type, so rows from all three activity types are
        pooled in them.
    """
    # Mean Gregorian year: the same length numpy assigns to timedelta64(1, 'Y'),
    # without relying on that ambiguous unit (and without needing the numpy
    # name, which was never imported in this scope).
    year = pd.Timedelta(days=365.2425)

    # Frames are collected and concatenated once at the end rather than
    # growing four DataFrames inside the loops.
    age_frames = []
    party_frames = []
    precinct_frames = []
    sex_frames = []

    # The position in this list is the stored activity_type code.
    for idx, ptype in enumerate(['elections', 'registered', 'residents']):

        tmp = activity[ptype]

        # One row per (person, activity date), with attributes attached.
        combo = tmp.merge(attributes, how='left', on='people_id').explode('date')
        age_years = (
            (pd.to_datetime(combo['date']) - pd.to_datetime(combo['dob'])) / year
        )
        # Unknown age (missing date or dob) is recorded as 0; astype(int)
        # truncates towards zero.
        combo['age'] = age_years.fillna(0).astype(int)

        cols = ['date', 'age']
        h_age = combo.groupby(cols).count().reset_index()[cols + ['people_id']]
        h_age = h_age.rename(columns={'people_id': 'count'})

        h_age = h_age.pivot_table(
            index=cols, values='count', fill_value=0
        ).reset_index()
        h_age['activity_type'] = idx
        age_frames.append(h_age)

        # Loop-invariant: explode the activity dates once per activity type.
        exploded = tmp.explode('date')

        for date in combo['date'].sort_values().unique():
            as_of = pd.to_datetime(date).date()
            df = people_expand_for_histo(
                exploded[exploded['date'] == as_of], attributes, date
            )

            party_frames.append(get_histo_by_date(df, 'party', as_of))
            precinct_frames.append(get_histo_by_date(df, 'precinct', as_of))
            sex_frames.append(get_histo_by_date(df, 'sex', as_of))

    return (
        _concat_frames(age_frames),
        _concat_frames(precinct_frames),
        _concat_frames(party_frames),
        _concat_frames(sex_frames),
    )

In [None]:
## Extract: read each activity table, then the shared attributes table.
activity = {}
for ptype in ('elections', 'residents', 'registered'):
    activity[ptype] = pd.read_sql_query(f'select * from people.{ptype}', con=cnx)

attributes = pd.read_sql_query('select * from people.attributes', con=cnx)

## Transform: create_histograms returns the age histogram first, followed by
## the precinct, party and sex histograms.
(
    activity_type_histogram,
    precinct_histogram,
    party_histogram,
    sex_histogram,
) = create_histograms(activity, attributes)

## Load

In [None]:
from sqlalchemy import text

# DDL template: drops and recreates people.<group>_histogram, then indexes it
# on "date". {group} is filled in from the fixed set of names below.
# NOTE(review): "count" is SMALLINT, which caps a bucket at 32767 -- confirm
# that is sufficient for the data being loaded.
table_create_query = \
    """
        DROP TABLE IF EXISTS people.{group}_histogram;
        CREATE TABLE people.{group}_histogram (
            "date"  DATE,
            "age"   SMALLINT,
            "count" SMALLINT,
            "{group}" SMALLINT
            );
        CREATE INDEX people_{group}_histogram_idx 
            ON people.{group}_histogram("date");
    """

# Explicit name -> frame mapping, replacing the eval()-based variable lookup.
histograms = {
    'activity_type': activity_type_histogram,
    'precinct': precinct_histogram,
    'party': party_histogram,
    'sex': sex_histogram,
}

for group, histogram in histograms.items():
    # Engine.execute() and raw-string statements were removed in SQLAlchemy
    # 2.0. engine.begin() + text() works on both 1.x and 2.x and commits the
    # DDL when the block exits without error.
    with cnx.begin() as conn:
        conn.execute(text(table_create_query.format(group=group)))
    histogram.to_sql(group + "_histogram",
                     schema='people',
                     con=cnx,
                     if_exists='append',
                     index=False
                     )