# Census

We will be using census data from the [United States Census Bureau](http://www.census.gov/popest/data/counties/totals/2015/CO-EST2015-alldata.html).

In [1]:
import pandas as pd
import numpy as np

# Load the census table; the path is relative to the notebook's working directory.
df = pd.read_csv("data/census.csv")
# Preview the first rows as a quick sanity check on the load.
df.head()

Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,COUNTY,STNAME,CTYNAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,...,RDOMESTICMIG2011,RDOMESTICMIG2012,RDOMESTICMIG2013,RDOMESTICMIG2014,RDOMESTICMIG2015,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015
0,40,3,6,1,0,Alabama,Alabama,4779736,4780127,4785161,...,0.002295,-0.193196,0.381066,0.582002,-0.467369,1.030015,0.826644,1.383282,1.724718,0.712594
1,50,3,6,1,1,Alabama,Autauga County,54571,54571,54660,...,7.242091,-2.915927,-3.012349,2.265971,-2.530799,7.606016,-2.626146,-2.722002,2.59227,-2.187333
2,50,3,6,1,3,Alabama,Baldwin County,182265,182265,183193,...,14.83296,17.647293,21.845705,19.243287,17.197872,15.844176,18.559627,22.727626,20.317142,18.293499
3,50,3,6,1,5,Alabama,Barbour County,27457,27457,27341,...,-4.728132,-2.50069,-7.056824,-3.904217,-10.543299,-4.874741,-2.758113,-7.167664,-3.978583,-10.543299
4,50,3,6,1,7,Alabama,Bibb County,22915,22919,22861,...,-5.527043,-5.068871,-6.201001,-0.177537,0.177258,-5.088389,-4.363636,-5.403729,0.754533,1.107861


In [2]:
df.columns

Index(['SUMLEV', 'REGION', 'DIVISION', 'STATE', 'COUNTY', 'STNAME', 'CTYNAME',
       'CENSUS2010POP', 'ESTIMATESBASE2010', 'POPESTIMATE2010',
       'POPESTIMATE2011', 'POPESTIMATE2012', 'POPESTIMATE2013',
       'POPESTIMATE2014', 'POPESTIMATE2015', 'NPOPCHG_2010', 'NPOPCHG_2011',
       'NPOPCHG_2012', 'NPOPCHG_2013', 'NPOPCHG_2014', 'NPOPCHG_2015',
       'BIRTHS2010', 'BIRTHS2011', 'BIRTHS2012', 'BIRTHS2013', 'BIRTHS2014',
       'BIRTHS2015', 'DEATHS2010', 'DEATHS2011', 'DEATHS2012', 'DEATHS2013',
       'DEATHS2014', 'DEATHS2015', 'NATURALINC2010', 'NATURALINC2011',
       'NATURALINC2012', 'NATURALINC2013', 'NATURALINC2014', 'NATURALINC2015',
       'INTERNATIONALMIG2010', 'INTERNATIONALMIG2011', 'INTERNATIONALMIG2012',
       'INTERNATIONALMIG2013', 'INTERNATIONALMIG2014', 'INTERNATIONALMIG2015',
       'DOMESTICMIG2010', 'DOMESTICMIG2011', 'DOMESTICMIG2012',
       'DOMESTICMIG2013', 'DOMESTICMIG2014', 'DOMESTICMIG2015', 'NETMIG2010',
       'NETMIG2011', 'NETMIG2012', 'NETMI

### Finding the states with the most and the fewest counties

In [3]:
def counties(df, counties):
    """Return the state with the most or the fewest counties.

    Parameters
    ----------
    df : pandas.DataFrame
        Census table with ``STNAME`` and ``CTYNAME`` columns. When a
        ``SUMLEV`` column is present, only county-level rows
        (``SUMLEV == 50``) are counted.
    counties : str
        ``"most"`` picks the state with the largest number of distinct
        county names; any other value (conventionally ``"least"``) picks
        the state with the smallest number.

    Returns
    -------
    str
        The ``STNAME`` of the selected state.
    """
    # State summary rows repeat the state name in CTYNAME.  Counting them
    # inflates every state by one, except a state whose single county shares
    # the state's name, so they must be dropped before counting.
    if 'SUMLEV' in df.columns:
        county_rows = df[df['SUMLEV'] == 50]
    else:
        # No summary-level marker: treat every row as a county row.
        county_rows = df

    counties_per_state = county_rows.groupby('STNAME')['CTYNAME'].nunique()
    if counties == "most":
        return counties_per_state.idxmax()
    return counties_per_state.idxmin()

In [4]:
counties(df, "most")

'Texas'

In [5]:
counties(df, "least")

'District of Columbia'

__Only looking at the three most populous counties for each state, what are the three most populous states (in order of highest population to lowest population)?__

In [6]:
def states(df):
    """Rank states by the combined 2010 census population of their top counties.

    For every state, the three largest county values of ``CENSUS2010POP``
    are summed; the three states with the largest sums are returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Census table with ``SUMLEV``, ``STNAME``, ``CTYNAME`` and
        ``CENSUS2010POP`` columns.

    Returns
    -------
    list of str
        Up to three state names, ordered from the highest sum to the lowest.
    """
    # SUMLEV 50 marks county-level rows; state summary rows are excluded.
    is_county = df['SUMLEV'] == 50

    # Population series keyed by (state, county).
    county_pop = (
        df.loc[is_county, ['STNAME', 'CTYNAME', 'CENSUS2010POP']]
        .set_index(['STNAME', 'CTYNAME'])['CENSUS2010POP']
        .astype(np.int64)
    )

    # Per state: add up its three most populous counties.
    top3_totals = county_pop.groupby(level=0).apply(
        lambda pops: pops.nlargest(3).sum()
    )

    return top3_totals.nlargest(3).index.tolist()

In [7]:
states(df)

['California', 'Texas', 'Illinois']

__Which county has had the largest absolute change in population within the period 2010-2015?__

In [8]:
def absolute_change_pop(df):
    """Return the county with the largest population swing over 2010-2015.

    A county's swing is the gap between its highest and its lowest value
    among the yearly ``POPESTIMATE2010`` ... ``POPESTIMATE2015`` columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Census table with ``SUMLEV``, ``CTYNAME`` and the six
        ``POPESTIMATE<year>`` columns.  It is not modified.

    Returns
    -------
    str
        ``CTYNAME`` of the county with the largest swing.  Only the county
        name is returned, so same-named counties in different states cannot
        be told apart by the caller.  On ties the first such row wins.

    Raises
    ------
    IndexError
        If ``df`` contains no county-level rows.
    """
    estimate_cols = ['POPESTIMATE' + str(year) for year in range(2010, 2016)]

    # Keep county-level rows only (SUMLEV 50).  The index is renumbered so the
    # label returned by idxmax below identifies exactly one row.
    county_rows = df.loc[df['SUMLEV'] == 50, ['CTYNAME'] + estimate_cols]
    county_rows = county_rows.reset_index(drop=True)
    if county_rows.empty:
        raise IndexError("no county-level rows (SUMLEV == 50) in df")

    # Computed as a standalone Series instead of being written back onto a
    # filtered slice, which is what triggers SettingWithCopyWarning.
    estimates = county_rows[estimate_cols]
    swing = estimates.max(axis=1) - estimates.min(axis=1)

    return county_rows.loc[swing.idxmax(), 'CTYNAME']

In [9]:
absolute_change_pop(df)

'Harris County'

__In this datafile, the United States is broken up into four regions using the "REGION" column. Create a query that finds the counties that belong to regions 1 or 2, whose name starts with 'Washington', and whose POPESTIMATE2015 was greater than their POPESTIMATE2014.__

In [10]:
def region(df, county):
    """Find growing counties in regions 1 or 2 whose name starts with a prefix.

    Parameters
    ----------
    df : pandas.DataFrame
        Census table with ``SUMLEV``, ``REGION``, ``STNAME``, ``CTYNAME``,
        ``POPESTIMATE2014`` and ``POPESTIMATE2015`` columns.  It is not
        modified.
    county : str
        Literal prefix the county name must start with, e.g. ``'Washington'``.

    Returns
    -------
    pandas.DataFrame
        The ``STNAME`` and ``CTYNAME`` columns of the matching rows, keeping
        the row labels of ``df``.
    """
    # County-level rows only (SUMLEV 50); state summary rows are excluded.
    county_rows = df[df['SUMLEV'] == 50]

    in_region = county_rows['REGION'].isin([1, 2])
    grew = county_rows['POPESTIMATE2015'] > county_rows['POPESTIMATE2014']
    # startswith, not contains: the name must begin with the prefix, and the
    # prefix is taken literally rather than as a regular expression.
    # na=False keeps rows with a missing name out instead of breaking the mask.
    name_matches = county_rows['CTYNAME'].str.startswith(county, na=False)

    return county_rows.loc[in_region & grew & name_matches, ['STNAME', 'CTYNAME']]

In [11]:
region(df, 'Washington')

Unnamed: 0,STNAME,CTYNAME
896,Iowa,Washington County
1419,Minnesota,Washington County
2345,Pennsylvania,Washington County
2355,Rhode Island,Washington County
3163,Wisconsin,Washington County
