# Import data and modules

In [29]:
import pandas as pd
import numpy as np
import seaborn as sns
from functools import reduce
import matplotlib.pyplot as plt

In [2]:
# Load the raw 2010 and 2020 table exports. Both files are read from the
# current working directory; every column is kept as delivered.
dec_2010 = pd.read_csv(
    filepath_or_buffer='DECENNIALPL2010.P2_data_with_overlays_2021-09-19T020624.csv',
)
dec_2020 = pd.read_csv(
    filepath_or_buffer='DECENNIALPL2020.P2_data_with_overlays_2021-09-19T020624.csv',
)

# Clean data

In [3]:
# Rows whose 2010 "Total" value (P002001) contains the letter 'r'. Such values
# are not plain numbers and stop clean_data from converting the column.
r = dec_2010.loc[dec_2010['P002001'].str.contains("r")]
r_lst = r.loc[:, ['NAME', 'P002001']]

# Split every total at its first '(' into at most two pieces, one per column.
new = dec_2010["P002001"].str.split(pat="(", n=1, expand=True)

# Keep only the piece in front of the '(' as the cleaned total.
dec_2010["P002001_split"] = new.iloc[:, 0]

# Show the raw and the stripped totals side by side.
dec_2010.loc[:, ['NAME', 'P002001', 'P002001_split']]

Unnamed: 0,NAME,P002001,P002001_split
0,Geographic Area Name,Total,Total
1,"Autauga County, Alabama",54571,54571
2,"Baldwin County, Alabama",182265,182265
3,"Barbour County, Alabama",27457,27457
4,"Bibb County, Alabama",22915,22915
...,...,...,...
3217,"Vega Baja Municipio, Puerto Rico",59662,59662
3218,"Vieques Municipio, Puerto Rico",9301,9301
3219,"Villalba Municipio, Puerto Rico",26073,26073
3220,"Yabucoa Municipio, Puerto Rico",37941,37941


In [4]:
def clean_data(df, df_codes, df_names_short, df_non_number_fields):
    """Select, rename and numerically coerce columns of a raw table.

    The row with index label 0 is discarded (in this notebook it holds the
    human-readable column descriptions, not data). The columns listed in
    ``df_codes`` are then extracted and renamed to the matching entries of
    ``df_names_short``. Every resulting column that is not listed in
    ``df_non_number_fields`` is converted to ``float``; known placeholder
    strings in those columns become NaN first, and numeric columns whose name
    contains "pct" (case-insensitive) are divided by 100.

    ``df`` itself is never modified, so the function can safely be called
    more than once on the same frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table whose first row (index label 0) is to be dropped.
    df_codes : list of str
        Column names to extract from ``df``.
    df_names_short : list of str
        New column names, positionally matched to ``df_codes``.
    df_non_number_fields : list of str
        Names (after renaming) of columns to leave as they are.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0-based index and the renamed columns.

    Raises
    ------
    ValueError
        If ``df_codes`` and ``df_names_short`` differ in length, or if a
        numeric column holds a value that is neither a number nor one of
        the known placeholders.
    KeyError
        If index label 0 or one of ``df_codes`` is missing from ``df``.
    """
    # Placeholder strings that stand for "no numeric value".
    non_numbers = ['N', '2,500-', '250,000+', '-', '(X)']

    if len(df_codes) != len(df_names_short):
        raise ValueError(
            'df_codes and df_names_short must have the same length '
            f'({len(df_codes)} != {len(df_names_short)})'
        )

    # Every step returns a new object, so the caller's frame stays untouched.
    df_slice = (
        df.loc[:, list(df_codes)]
        .drop([0])
        .reset_index(drop=True)
        .rename(columns=dict(zip(df_codes, df_names_short)))
    )

    for column in df_slice.columns:
        if column in df_non_number_fields:
            # Label columns are passed through verbatim; placeholders are not
            # substituted here so that a label such as 'N' is preserved.
            continue
        values = df_slice[column]
        # Blank out placeholders (-> NaN), then let astype raise on anything
        # else that is not a number.
        values = values.mask(values.isin(non_numbers)).astype(float)
        if 'pct' in column.lower():
            values = values.div(100)
        df_slice[column] = values

    return df_slice

In [5]:
# CLEAN dec_2010

#DEFINE VARIABLES AND LISTS
# Work on a copy so the raw frame stays intact and this cell can be re-run
# regardless of whether clean_data modifies the frame it is given.
df = dec_2010.copy() #assign df

# Source column codes to extract. 'P002001_split' is the total with any
# parenthesised suffix removed (created in the cell above).
df_codes = [
    'NAME',
    'P002001_split',
    'P002005',
    'P002006',
    'P002007',
    'P002008',
    'P002009',
    'P002010',
    'P002011',
    'P002002'
    ]
# Full descriptions of the codes above, in the same order. Kept for reference
# only; not passed to clean_data.
df_names_full = [
    'Geographic Area Name',
    'Total',
    'Total!!Not Hispanic or Latino!!Population of one race!!White alone',
    'Total!!Not Hispanic or Latino!!Population of one race!!Black or African American alone',
    'Total!!Not Hispanic or Latino!!Population of one race!!American Indian and Alaska Native alone',
    'Total!!Not Hispanic or Latino!!Population of one race!!Asian alone',
    'Total!!Not Hispanic or Latino!!Population of one race!!Native Hawaiian and Other Pacific Islander alone',
    'Total!!Not Hispanic or Latino!!Population of one race!!Some Other Race alone',
    'Total!!Not Hispanic or Latino!!Two or More Races',
    'Total!!Hispanic or Latino'
    ]
# Short output names, positionally matched to df_codes ("_10" = 2010).
df_names_short = [
    'NAME',
    'pop_total_10',
    'pop_white_nh_10',
    'pop_black_nh_10',
    'pop_americanindian_nh_10',
    'pop_asian_nh_10',
    'pop_hawaiianpacificislander_nh_10',
    'pop_otherrace_nh_10',
    'pop_multiracial_nh_10',
    'pop_hispanic_10'
    ]
# Columns that must not be converted to float.
df_non_number_fields = ['NAME']

race_hisp_2010 = clean_data(df, df_codes, df_names_short, df_non_number_fields)

race_hisp_2010.head()

Unnamed: 0,NAME,pop_total_10,pop_white_nh_10,pop_black_nh_10,pop_americanindian_nh_10,pop_asian_nh_10,pop_hawaiianpacificislander_nh_10,pop_otherrace_nh_10,pop_multiracial_nh_10,pop_hispanic_10
0,"Autauga County, Alabama",54571.0,42154.0,9595.0,217.0,467.0,22.0,45.0,761.0,1310.0
1,"Baldwin County, Alabama",182265.0,152200.0,16966.0,1146.0,1340.0,79.0,245.0,2297.0,7992.0
2,"Barbour County, Alabama",27457.0,12837.0,12820.0,60.0,107.0,24.0,13.0,209.0,1387.0
3,"Bibb County, Alabama",22915.0,17191.0,5024.0,64.0,22.0,7.0,20.0,181.0,406.0
4,"Blount County, Alabama",57322.0,50952.0,724.0,285.0,115.0,18.0,35.0,567.0,4626.0


In [6]:
# CLEAN dec_2020

#DEFINE VARIABLES AND LISTS
# Work on a copy so the raw frame stays intact and this cell can be re-run
# regardless of whether clean_data modifies the frame it is given.
df = dec_2020.copy() #assign df

# Source column codes to extract from the 2020 table.
df_codes = [
    'NAME',
    'P2_001N',
    'P2_005N',
    'P2_006N',
    'P2_007N',
    'P2_008N',
    'P2_009N',
    'P2_010N',
    'P2_011N',
    'P2_002N']
# Full descriptions of the codes above, in the same order. Kept for reference
# only; not passed to clean_data.
df_names_full = [
    'Geographic Area Name',
    '!!Total:',
    '!!Total:!!Not Hispanic or Latino:!!Population of one race:!!White alone',
    '!!Total:!!Not Hispanic or Latino:!!Population of one race:!!Black or African American alone',
    '!!Total:!!Not Hispanic or Latino:!!Population of one race:!!American Indian and Alaska Native alone',
    '!!Total:!!Not Hispanic or Latino:!!Population of one race:!!Asian alone',
    '!!Total:!!Not Hispanic or Latino:!!Population of one race:!!Native Hawaiian and Other Pacific Islander alone',
    '!!Total:!!Not Hispanic or Latino:!!Population of one race:!!Some Other Race alone',
    '!!Total:!!Not Hispanic or Latino:!!Population of two or more races',
    '!!Total:!!Hispanic or Latino'
    ]
# Short output names, positionally matched to df_codes ("_20" = 2020).
df_names_short = [
    'NAME',
    'pop_total_20',
    'pop_white_nh_20',
    'pop_black_nh_20',
    'pop_americanindian_nh_20',
    'pop_asian_nh_20',
    'pop_hawaiianpacificislander_nh_20',
    'pop_otherrace_nh_20',
    'pop_multiracial_nh_20',
    'pop_hispanic_20'
    ]
# Columns that must not be converted to float.
df_non_number_fields = ['NAME']


race_hisp_2020 = clean_data(df, df_codes, df_names_short, df_non_number_fields)

race_hisp_2020.head()

Unnamed: 0,NAME,pop_total_20,pop_white_nh_20,pop_black_nh_20,pop_americanindian_nh_20,pop_asian_nh_20,pop_hawaiianpacificislander_nh_20,pop_otherrace_nh_20,pop_multiracial_nh_20,pop_hispanic_20
0,"Autauga County, Alabama",58805.0,41582.0,11352.0,184.0,873.0,22.0,185.0,2490.0,2117.0
1,"Baldwin County, Alabama",231767.0,186495.0,18001.0,1291.0,2029.0,122.0,775.0,10368.0,12686.0
2,"Barbour County, Alabama",25223.0,11086.0,11850.0,58.0,103.0,0.0,63.0,553.0,1510.0
3,"Bibb County, Alabama",22293.0,16442.0,4390.0,39.0,26.0,9.0,47.0,600.0,740.0
4,"Blount County, Alabama",59134.0,49764.0,826.0,188.0,174.0,11.0,100.0,2300.0,5771.0


In [7]:
# COMBINE ALL DATAFRAMES
# Fold the per-year tables into one wide table with successive outer joins on
# the area name, so an area present in only one year is still kept (its
# columns for the other year are NaN).
# NOTE(review): the displayed result contains rows with 2020 values but no
# 2010 values (e.g. "Kusilvak Census Area, Alaska"), i.e. names that do not
# match exactly between the two files. If the raw files carry a stable
# geographic identifier, that would be a safer join key - confirm.
data_frames_key = 'NAME' #assign data_frames_key
data_frames = [race_hisp_2010, race_hisp_2020]
all_data = reduce(
    lambda merged, following: merged.merge(following, on=data_frames_key, how='outer'),
    data_frames,
)


In [8]:
# Display the merged 2010 + 2020 table (one row per area name).
all_data

Unnamed: 0,NAME,pop_total_10,pop_white_nh_10,pop_black_nh_10,pop_americanindian_nh_10,pop_asian_nh_10,pop_hawaiianpacificislander_nh_10,pop_otherrace_nh_10,pop_multiracial_nh_10,pop_hispanic_10,pop_total_20,pop_white_nh_20,pop_black_nh_20,pop_americanindian_nh_20,pop_asian_nh_20,pop_hawaiianpacificislander_nh_20,pop_otherrace_nh_20,pop_multiracial_nh_20,pop_hispanic_20
0,"Autauga County, Alabama",54571.0,42154.0,9595.0,217.0,467.0,22.0,45.0,761.0,1310.0,58805.0,41582.0,11352.0,184.0,873.0,22.0,185.0,2490.0,2117.0
1,"Baldwin County, Alabama",182265.0,152200.0,16966.0,1146.0,1340.0,79.0,245.0,2297.0,7992.0,231767.0,186495.0,18001.0,1291.0,2029.0,122.0,775.0,10368.0,12686.0
2,"Barbour County, Alabama",27457.0,12837.0,12820.0,60.0,107.0,24.0,13.0,209.0,1387.0,25223.0,11086.0,11850.0,58.0,103.0,0.0,63.0,553.0,1510.0
3,"Bibb County, Alabama",22915.0,17191.0,5024.0,64.0,22.0,7.0,20.0,181.0,406.0,22293.0,16442.0,4390.0,39.0,26.0,9.0,47.0,600.0,740.0
4,"Blount County, Alabama",57322.0,50952.0,724.0,285.0,115.0,18.0,35.0,567.0,4626.0,59134.0,49764.0,826.0,188.0,174.0,11.0,100.0,2300.0,5771.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3223,"Kusilvak Census Area, Alaska",,,,,,,,,,8368.0,173.0,16.0,7946.0,23.0,0.0,14.0,182.0,14.0
3224,"Petersburg Borough, Alaska",,,,,,,,,,3398.0,2444.0,46.0,231.0,117.0,12.0,21.0,363.0,164.0
3225,"LaSalle Parish, Louisiana",,,,,,,,,,14791.0,11263.0,1283.0,133.0,252.0,10.0,26.0,422.0,1402.0
3226,"Doña Ana County, New Mexico",,,,,,,,,,219561.0,59749.0,3172.0,1463.0,2458.0,115.0,839.0,4093.0,147672.0


In [20]:
# 2010 group counts to express as a share of the 2010 total population.
columns_2010 = [
    'pop_white_nh_10',
    'pop_black_nh_10',
    'pop_americanindian_nh_10',
    'pop_asian_nh_10',
    'pop_hawaiianpacificislander_nh_10',
    'pop_otherrace_nh_10',
    'pop_multiracial_nh_10',
    'pop_hispanic_10',
]

# Each share is stored as a fraction under the count's name prefixed with "pct_".
for count_col in columns_2010:
    all_data[f'pct_{count_col}'] = all_data[count_col].div(all_data['pop_total_10'])

In [21]:
# 2020 group counts to express as a share of the 2020 total population.
# The total itself is deliberately not listed (matching columns_2010):
# dividing it by itself would only add a constant 'pct_pop_total_20' column.
columns_2020 = [
    'pop_white_nh_20',
    'pop_black_nh_20',
    'pop_americanindian_nh_20',
    'pop_asian_nh_20',
    'pop_hawaiianpacificislander_nh_20',
    'pop_otherrace_nh_20',
    'pop_multiracial_nh_20',
    'pop_hispanic_20'
]

# Each share is stored as a fraction under the count's name prefixed with "pct_".
for x in columns_2020:
    all_data['pct_' + x] = all_data[x] / all_data['pop_total_20']

In [25]:
# Relative change in total population from 2010 to 2020, as a fraction of the
# 2010 total (NaN where either year is missing).
all_data['pop_growth_1020'] = (
    all_data['pop_total_20']
    .sub(all_data['pop_total_10'])
    .div(all_data['pop_total_10'])
)

In [35]:
# Keep only the columns needed for the export: area name, both totals, the
# growth rate, and each group's 2010/2020 share listed as adjacent pairs.
all_data_slice = all_data.loc[:, [
    'NAME',
    'pop_total_10', 'pop_total_20',
    'pop_growth_1020',
    'pct_pop_white_nh_10', 'pct_pop_white_nh_20',
    'pct_pop_black_nh_10', 'pct_pop_black_nh_20',
    'pct_pop_americanindian_nh_10', 'pct_pop_americanindian_nh_20',
    'pct_pop_asian_nh_10', 'pct_pop_asian_nh_20',
    'pct_pop_hawaiianpacificislander_nh_10', 'pct_pop_hawaiianpacificislander_nh_20',
    'pct_pop_otherrace_nh_10', 'pct_pop_otherrace_nh_20',
    'pct_pop_multiracial_nh_10', 'pct_pop_multiracial_nh_20',
    'pct_pop_hispanic_10', 'pct_pop_hispanic_20',
]]

In [36]:
# Preview the first rows of the export table.
all_data_slice.head()

Unnamed: 0,NAME,pop_total_10,pop_total_20,pop_growth_1020,pct_pop_white_nh_10,pct_pop_white_nh_20,pct_pop_black_nh_10,pct_pop_black_nh_20,pct_pop_americanindian_nh_10,pct_pop_americanindian_nh_20,pct_pop_asian_nh_10,pct_pop_asian_nh_20,pct_pop_hawaiianpacificislander_nh_10,pct_pop_hawaiianpacificislander_nh_20,pct_pop_otherrace_nh_10,pct_pop_otherrace_nh_20,pct_pop_multiracial_nh_10,pct_pop_multiracial_nh_20,pct_pop_hispanic_10,pct_pop_hispanic_20
0,"Autauga County, Alabama",54571.0,58805.0,0.077587,0.772462,0.707117,0.175826,0.193045,0.003976,0.003129,0.008558,0.014846,0.000403,0.000374,0.000825,0.003146,0.013945,0.042343,0.024005,0.036
1,"Baldwin County, Alabama",182265.0,231767.0,0.271594,0.835048,0.804666,0.093084,0.077669,0.006288,0.00557,0.007352,0.008754,0.000433,0.000526,0.001344,0.003344,0.012603,0.044735,0.043848,0.054736
2,"Barbour County, Alabama",27457.0,25223.0,-0.081364,0.467531,0.439519,0.466912,0.469809,0.002185,0.002299,0.003897,0.004084,0.000874,0.0,0.000473,0.002498,0.007612,0.021924,0.050515,0.059866
3,"Bibb County, Alabama",22915.0,22293.0,-0.027144,0.750207,0.737541,0.219245,0.196923,0.002793,0.001749,0.00096,0.001166,0.000305,0.000404,0.000873,0.002108,0.007899,0.026914,0.017718,0.033194
4,"Blount County, Alabama",57322.0,59134.0,0.031611,0.888873,0.841546,0.01263,0.013968,0.004972,0.003179,0.002006,0.002942,0.000314,0.000186,0.000611,0.001691,0.009891,0.038895,0.080702,0.097592


In [37]:
# Write the export table to a CSV file in the current working directory.
# NOTE(review): to_csv writes the row index as an unnamed first column by
# default - confirm downstream consumers expect that.
all_data_slice.to_csv('race_hispanic_origin_2010_2020.csv')