In [2]:
import pandas as pd
import numpy as np

In [228]:
# neighborhoods: distinct values of the NAME column in the voter/neighborhood table
nb = pd.read_csv('voter_with_nbd2.csv')['NAME'].unique()

In [229]:
# 5 age groups, described by consecutive bin edges:
# 18-26, 26-33, 33-40, 40-50 and 50-150
age_groups = [18, 26, 33, 40, 50, 150]
# one row per group: [lower edge, upper edge]; derived from age_groups so the
# two can never get out of sync (float dtype kept, as with the former np.ones array)
ages = np.array(list(zip(age_groups[:-1], age_groups[1:])), dtype=float)

In [230]:
# skeleton table: one row for every (age-group lower edge, neighborhood) pair,
# with the two index levels turned into ordinary columns 'ages' and 'nb'
idx = pd.MultiIndex.from_product(
    [age_groups[:-1], nb],
    names=['ages', 'nb'],
)
df = pd.DataFrame(index=idx).reset_index()

In [231]:
# a line plot of occupancy for age groups over time (years)

# for each neighborhood, need count of ages for each year
years = ['2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016']

for year in years:
    # load each year (tab-separated file named '<year>.csv')
    temp = pd.read_csv('{}.csv'.format(year), sep='\t')
    # get rid of dates (the exact month and date in the year that this was recorded)
    # counts was used for the bar graph of all the unique addresses the person had
    temp = temp.drop(['dates', 'counts'], axis=1)
    # remove duplicate voter ids (for the people who had address recorded multiple times that year)
    # NOTE: a person who lived in several different neighborhoods in the same year
    # is still counted once in each of those neighborhoods
    temp = temp.drop_duplicates(['VOTER_ID', 'NAME'])

    # collect the per-age-group counts in a list and concatenate once
    # (DataFrame.append was removed in pandas 2.0 and was quadratic in a loop)
    per_age = []
    for lo, hi in ages:
        # neighborhood counts for voters with lo <= curr_age < hi
        in_group = (temp.curr_age >= lo) & (temp.curr_age < hi)
        nbcounts = (
            temp.loc[in_group, 'NAME']
            .value_counts()
            # name the index explicitly: its default name differs between pandas
            # versions, so renaming a column called 'index' is not reliable
            .rename_axis('nb')
            # counts column is named after the year
            .reset_index(name=year)
        )
        nbcounts['ages'] = lo
        per_age.append(nbcounts)
    temp2 = pd.concat(per_age)

    # temp2 is fully constructed for this year; add its counts as a new column
    df = pd.merge(df, temp2, how='left', on=['ages', 'nb'])

In [251]:
# df now holds the counts per (age group, neighborhood) with one column per year;
# pull out the rows of a single neighborhood
is_sunnyside = df.nb == 'SUNNYSIDE-MULTNOMAH COUNTY'
temp = df[is_sunnyside]

In [333]:
# separate into neighborhoods with complete data and neighborhoods with missing data
# dfc: complete neighborhoods, yearly counts converted to within-neighborhood fractions
# dfm: neighborhoods with at least one missing value, left as raw counts
complete_parts = []
missing_parts = []
for n in nb:
    # create subset for each neighborhood
    temp = df[df.nb == n]
    if temp.empty:
        # nothing to add for a neighborhood without rows
        continue
    if temp.isnull().values.any():
        # any missing value -> goes to the "missing" table untouched
        missing_parts.append(temp)
    else:
        # work on a copy so the slice of df is never written to
        temp2 = temp.copy()
        # divide every year column by its column total in one step, so each
        # year's age-group values sum to 1 within the neighborhood
        temp2[years] = temp[years].div(temp[years].sum())
        complete_parts.append(temp2)

# concatenate once (DataFrame.append was removed in pandas 2.0)
dfc = pd.concat(complete_parts) if complete_parts else pd.DataFrame()
dfm = pd.concat(missing_parts) if missing_parts else pd.DataFrame()

In [403]:
# preview the first rows of the normalized complete-data table
dfc.head()

Unnamed: 0,ages,nb,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016
0,18,SUNNYSIDE-MULTNOMAH COUNTY,0.113145,0.098171,0.120214,0.085301,0.102891,0.076203,0.094595,0.068265,0.074547,0.047324,0.082575
174,26,SUNNYSIDE-MULTNOMAH COUNTY,0.273156,0.272727,0.28829,0.26428,0.284226,0.243038,0.272854,0.253618,0.254909,0.232739,0.263696
348,33,SUNNYSIDE-MULTNOMAH COUNTY,0.224348,0.221356,0.221728,0.226707,0.216199,0.210127,0.199324,0.208684,0.219447,0.211792,0.213704
522,40,SUNNYSIDE-MULTNOMAH COUNTY,0.173322,0.182625,0.167854,0.191165,0.186862,0.220759,0.204094,0.216893,0.204385,0.221102,0.191822
696,50,SUNNYSIDE-MULTNOMAH COUNTY,0.216029,0.225121,0.201915,0.232546,0.209821,0.249873,0.229134,0.252538,0.246711,0.287044,0.248203


In [404]:
# write the normalized complete-data table to a tab-separated file, without the row index
dfc.to_csv('nb_age_fraction_over_time_complete.csv', sep='\t', index=False)

In [378]:
# dfck: the neighborhoods from dfm (the ones with missing data) that are worth keeping
# at most allow 1 missing value for each age group
max_missing = 1
kept = []
for n in nb:
    temp = dfm[dfm.nb == n]
    if temp.empty:
        # neighborhood is not part of dfm; nothing to keep
        continue
    # keep the neighborhood only if every age group is within the allowed number of NaNs
    if all(temp[temp.ages == age].isnull().values.sum() <= max_missing
           for age in age_groups[:-1]):
        kept.append(temp)

# concatenate once (DataFrame.append was removed in pandas 2.0);
# fall back to an empty frame with dfm's columns when nothing qualifies
dfck = pd.concat(kept) if kept else dfm.iloc[:0]

In [406]:
# dfmk: the kept neighborhoods (dfck) with yearly counts converted to
# within-neighborhood fractions, same normalization as for the complete table.
# NOTE(review): any NaN still present in dfck is skipped by .sum() and stays NaN
# in the result; the manual fills of dfck are commented out in this notebook,
# so confirm they are applied before this cell when re-running from scratch.
normalized_parts = []
for n in dfck.nb.unique():
    # create subset for each neighborhood
    temp = dfck[dfck.nb == n]

    # work on a copy so the slice of dfck is never written to
    temp2 = temp.copy()
    # divide every year column by its column total in one step
    temp2[years] = temp[years].div(temp[years].sum())

    normalized_parts.append(temp2)

# concatenate once (DataFrame.append was removed in pandas 2.0)
dfmk = pd.concat(normalized_parts) if normalized_parts else pd.DataFrame()

In [408]:
# The following NaNs were filled in by hand:
#   - age group 26, neighborhood BRIDLEMILE/SOUTHWEST HILLS, year 2015: filled with 13 (avg of 2014 and 2016 values)
#   - age group 18, neighborhood ARDENWALD-JOHNSON CREEK, year 2015: filled with 14 (avg of 2014 and 2016 values)
# Write the normalized table to a tab-separated file, without the row index.
dfmk.to_csv('nb_age_fraction_over_time_filled.csv', sep='\t', index=False)

In [400]:
# Manual fill of two missing 2015 counts in dfck (values 13 and 14).
# NOTE(review): this cell carries execution count 400, lower than the cell that
# normalizes dfck (406), yet it is commented out here and placed after the export
# of 'nb_age_fraction_over_time_filled.csv'. A fresh top-to-bottom run will
# therefore not apply these fills -- confirm the intended order before re-running.
# dfck.loc[(dfck.ages==26) & (dfck.nb=='BRIDLEMILE/SOUTHWEST HILLS'), '2015'] = 13
# dfck.loc[(dfck.ages==18) & (dfck.nb=='ARDENWALD-JOHNSON CREEK'), '2015'] = 14

In [28]:
# reload the normalized complete-data table from disk;
# note that this rebinds df, which earlier held the raw counts
df = pd.read_csv('nb_age_fraction_over_time_complete.csv', sep='\t')

In [29]:
# preview the first rows of the reloaded table
df.head()

Unnamed: 0,ages,nb,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016
0,18,SUNNYSIDE-MULTNOMAH COUNTY,0.113145,0.098171,0.120214,0.085301,0.102891,0.076203,0.094595,0.068265,0.074547,0.047324,0.082575
1,26,SUNNYSIDE-MULTNOMAH COUNTY,0.273156,0.272727,0.28829,0.26428,0.284226,0.243038,0.272854,0.253618,0.254909,0.232739,0.263696
2,33,SUNNYSIDE-MULTNOMAH COUNTY,0.224348,0.221356,0.221728,0.226707,0.216199,0.210127,0.199324,0.208684,0.219447,0.211792,0.213704
3,40,SUNNYSIDE-MULTNOMAH COUNTY,0.173322,0.182625,0.167854,0.191165,0.186862,0.220759,0.204094,0.216893,0.204385,0.221102,0.191822
4,50,SUNNYSIDE-MULTNOMAH COUNTY,0.216029,0.225121,0.201915,0.232546,0.209821,0.249873,0.229134,0.252538,0.246711,0.287044,0.248203


In [76]:
# find, for each age group, the neighborhood with the largest change between 2006 and 2016
# dec[a]: neighborhood where the 2006 value exceeds the 2016 value by the most
# inc[a]: neighborhood where the 2016 value exceeds the 2006 value by the most
c = df.columns
ag = df.ages.unique()
dec = {}
inc = {}
for a in ag:
    temp = df.loc[df.ages == a]
    # positive = decrease from 2006 to 2016, negative = increase
    diff = temp['2006'] - temp['2016']
    # idxmax/idxmin skip NaN and return a single row label even when several
    # neighborhoods tie (the first one wins), where max(diff) + .item() would raise
    dec[a] = temp.loc[diff.idxmax(), 'nb']
    inc[a] = temp.loc[diff.idxmin(), 'nb']

In [80]:
# neighborhood with the largest 2006 -> 2016 decrease, per age group
dec

{18: 'WOODLAND PARK',
 26: 'ARDENWALD-JOHNSON CREEK/WOODSTOCK',
 33: 'GRANT PARK/HOLLYWOOD',
 40: 'BEAVER CREEK',
 50: 'GOOSE HOLLOW/SOUTHWEST HILLS'}

In [78]:
# neighborhood with the largest 2006 -> 2016 increase, per age group
inc

{18: 'HILLSIDE/NORTHWEST DISTRICT',
 26: 'GLENFAIR',
 33: 'MARSHALL PARK',
 40: 'MARKHAM',
 50: 'WOODLAND PARK'}

In [79]:
# inspect all age-group rows of one neighborhood
df[df.nb=='HILLSIDE/NORTHWEST DISTRICT']

Unnamed: 0,ages,nb,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016
85,18,HILLSIDE/NORTHWEST DISTRICT,0.05,0.071942,0.13125,0.118519,0.126582,0.090323,0.079812,0.051613,0.052632,0.027778,0.135714
86,26,HILLSIDE/NORTHWEST DISTRICT,0.125,0.158273,0.1875,0.177778,0.177215,0.187097,0.244131,0.212903,0.25731,0.166667,0.2
87,33,HILLSIDE/NORTHWEST DISTRICT,0.108333,0.122302,0.13125,0.125926,0.158228,0.154839,0.183099,0.167742,0.152047,0.111111,0.071429
88,40,HILLSIDE/NORTHWEST DISTRICT,0.208333,0.230216,0.1625,0.133333,0.126582,0.129032,0.131455,0.187097,0.157895,0.25,0.171429
89,50,HILLSIDE/NORTHWEST DISTRICT,0.508333,0.417266,0.3875,0.444444,0.411392,0.43871,0.361502,0.380645,0.380117,0.444444,0.421429


In [62]:
years = ['2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016']
age_groups = [18, 26, 33, 40, 50, 150]

# Sum every year column per age group in a single pass, instead of one boolean
# mask per (year, age) pair followed by repeated joins.
# NOTE(review): df appears to hold per-neighborhood fractions here (it was reloaded
# from 'nb_age_fraction_over_time_complete.csv'), so these sums weight every
# neighborhood equally rather than by its number of voters -- confirm this is intended.
age_totals = (
    df.groupby('ages')[years]
    .sum()
    # same row order as age_groups[:-1]; an age group absent from df sums to 0
    .reindex(age_groups[:-1], fill_value=0)
    # keep the index unnamed so the exported header stays the same
    .rename_axis(None)
)

# temp2: rows = age groups, columns = years; each column is scaled to sum to 1
temp2 = age_totals / age_totals.sum()

In [64]:
# export the per-year age-group fractions (tab-separated, age group kept as the row index)
temp2.to_csv('fraction_age_group_registered_each_year.csv', sep='\t')

In [66]:
# sanity check: the fractions of a single year should add up to 1
temp2['2006'].sum()

1.0