# Oakland Race and Equity - Capstone Project

### Import/Clean Data

This is the script used to prepare ACS PUMS data for analysis. For exploratory analysis of the data and the methodology behind the script, see the `ORE Wrangling` notebook.

In [2]:
#Import Libraries
import pandas as pd
pd.set_option('display.max_columns', None)
import plotly.graph_objects as go


In [3]:
def cleanData(person_df, household_df, post2018 = True):
    """Build the person-level study-area extract from statewide ACS PUMS tables.

    Keeps only the person and household columns listed below, restricts both
    tables to PUMA codes 102 through 105, derives a ``RACE`` label from
    ``RAC1P`` and ``HISP``, attaches each person's household record via
    ``SERIALNO``, and finally repeats every person row ``PWGTP`` times so that
    later row counts are weighted by ``PWGTP``.

    Parameters
    ----------
    person_df : pandas.DataFrame
        PUMS person records. Must contain every person column listed below.
    household_df : pandas.DataFrame
        PUMS housing records. Must contain every household column listed below.
    post2018 : bool, default True
        Chooses which spelling of three person columns is expected:
        ``RELSHIPP``/``JWTRNS``/``WKWN`` when True,
        ``RELP``/``JWTR``/``WKW`` when False.

    Returns
    -------
    pandas.DataFrame
        One row per weighted person with a fresh 0..n-1 index. Contains the
        person columns (minus ``PWGTP``), ``RACE``, and the household columns;
        the household copy of ``PUMA`` is dropped. The inputs are not modified.

    Raises
    ------
    KeyError
        If either input lacks one of the expected columns.
    """
    pFeatures = ["SERIALNO", "SPORDER", "PUMA", "PWGTP", "AGEP", "CIT", "COW", "ENG", "FER", "JWMNP"
               , "MAR", "MIL", "SCH", "SCHL", "SEX", "PAP", "INTP", "SSIP", "SSP", "WAGP"
               , "OIP", "RETP", "SEMP", "PERNP", "PINCP", "WKL", "DIS", "ESR", "HICOV", "HISP"
               , "PAOC", "POVPIP", "RAC1P", "RACASN", "RACBLK", "RACWHT", "RACSOR", "SCIENGP", "WKHP"
               , "SOCP"]
    if post2018:
        pFeatures = pFeatures + ["RELSHIPP", "JWTRNS", "WKWN"]
    else:
        pFeatures = pFeatures + ["RELP", "JWTR", "WKW"]
    hFeatures = ["SERIALNO", "PUMA", "NP", "ACCESS", "ACR", "BATH", "FS", "ELEP", "FULP", "GASP", "HISPEED"
              , "LAPTOP", "RNTP", "RWATPR", "TEN", "VALP", "VEH", "WATP", "HINCP", "HUPAC", "KIT", "PLM"
              , "GRPIP", "RMSP"]

    # Pick rows and columns in a single selection so that only the small PUMA
    # subset is materialised, rather than copying the statewide frame first.
    person_in_area = (person_df["PUMA"] > 101) & (person_df["PUMA"] <= 105)
    household_in_area = (household_df["PUMA"] > 101) & (household_df["PUMA"] <= 105)
    # The explicit copy makes `persons` an independent frame, so the RACE
    # assignments below neither warn nor touch the caller's data.
    persons = person_df.loc[person_in_area, pFeatures].copy()
    households = household_df.loc[household_in_area, hFeatures]

    # HISP == 1 rows take their label from RAC1P; every other HISP value is
    # labelled "Latino" regardless of RAC1P. Rows matching none of the rules
    # keep a missing RACE.
    hisp_is_one = persons["HISP"] == 1
    persons.loc[(persons["RAC1P"] == 1) & hisp_is_one, "RACE"] = "White"
    persons.loc[(persons["RAC1P"] == 2) & hisp_is_one, "RACE"] = "African American"
    persons.loc[(persons["RAC1P"] == 6) & hisp_is_one, "RACE"] = "Asian"
    persons.loc[persons["RAC1P"].isin([3, 4, 5, 7, 8, 9]) & hisp_is_one, "RACE"] = "Other"
    persons.loc[persons["HISP"] != 1, "RACE"] = "Latino"

    # PUMA exists in both tables, so the merge yields PUMA_x (person) and
    # PUMA_y (household); only the person copy is kept.
    merged = persons.merge(households, how = 'left', on = 'SERIALNO')

    # Repeat each row PWGTP times; the weight column is then no longer needed.
    expanded = (
        merged.reindex(merged.index.repeat(merged["PWGTP"]))
        .reset_index(drop=True)
        .drop(['PWGTP', 'PUMA_y'], axis=1)
        .rename(columns={"PUMA_x": "PUMA"})
    )
    return expanded

In [4]:
def getSize(df, showDim = False, name = ""):
    """Print the memory footprint of ``df`` in whole megabytes.

    When ``showDim`` is true, a line with the frame's shape is printed first;
    a non-empty ``name`` is inserted into that line, followed by a space.
    Nothing is returned.
    """
    label = name + " " if name != "" else name
    if showDim:
        print("Dimensions for the {}dataset: {}".format(label, df.shape))
    # Sum of per-column bytes (index included), truncated to whole MB (10**6 bytes).
    megabytes = int(df.memory_usage(index=True).sum() / 1000000)
    print("New DataFrame size: {} MB".format(megabytes))

#### Preparing 2016, 2018, and 2019 ACS PUMS data

Data can be found at:<br><br>
https://www2.census.gov/programs-surveys/acs/data/pums/2016/1-Year/<br>
https://www2.census.gov/programs-surveys/acs/data/pums/2018/1-Year/<br>
https://www2.census.gov/programs-surveys/acs/data/pums/2019/1-Year/<br>
<br>Datasets for California are `csv_pca.zip` and `csv_hca.zip`.

In [5]:
# Directory holding the unzipped ACS PUMS 1-year CSV files.
PUMS_DIR = "./Data/PUMS"


def _loadOaklandPums(year, post2018):
    """Read one year's statewide person and household CSVs and clean them.

    The statewide frames exist only as locals of this function, so they can be
    reclaimed as soon as the cleaned extract is returned and no explicit
    ``del`` is needed between years.

    Parameters
    ----------
    year : int
        Survey year used in the file names (``<year>pCA1.csv`` / ``<year>hCA1.csv``).
    post2018 : bool
        Forwarded to ``cleanData`` to choose the expected column names.

    Returns
    -------
    pandas.DataFrame
        Whatever ``cleanData`` returns for that year's pair of files.
    """
    persons = pd.read_csv("{}/{}pCA1.csv".format(PUMS_DIR, year))
    households = pd.read_csv("{}/{}hCA1.csv".format(PUMS_DIR, year))
    return cleanData(persons, households, post2018)


# Only the 2019 file is read with the post-2018 column names.
pca191_Ok = _loadOaklandPums(2019, True)
pca181_Ok = _loadOaklandPums(2018, False)
pca161_Ok = _loadOaklandPums(2016, False)

In [6]:
# Row order used by the charts; groupby alone would sort the labels alphabetically.
race_order = ["African American", "Asian", "Latino", "White", "Other"]

# Numerator: rows with COW == 7. Denominator: rows with a non-missing COW other than 9.
# NOTE(review): the chart titles treat these as "business owners" and "employed
# individuals"; confirm the COW code meanings against the PUMS data dictionary.
bo = pca191_Ok.loc[pca191_Ok["COW"] == 7, ["RACE"]].groupby("RACE")['RACE'].count().to_frame(name='counts')
ep = pca191_Ok.loc[(pca191_Ok["COW"] != 9) & (pca191_Ok["COW"].notna()), ["RACE"]].groupby("RACE")['RACE'].count().to_frame('counts')

# Reorder by label rather than by position so that a missing group shows up as
# a NaN row under its own name instead of silently shifting the other rows.
ind1 = (bo / ep).reindex(race_order).rename_axis("RACE").reset_index()

# Citywide row. The value is hard-coded here.
# TODO: derive it from pca191_Ok so it cannot drift from the per-group figures.
# pd.concat replaces DataFrame.append, which pandas 2.0 removed.
ind1 = pd.concat([ind1, pd.DataFrame([{'RACE': 'Oakland', 'counts': 0.025139}])], ignore_index=True)

In [7]:
# Show the shares computed from the 2019 extract; the last row is the hard-coded citywide figure.
ind1

Unnamed: 0,RACE,counts
0,African American,0.012638
1,Asian,0.016906
2,Latino,0.032385
3,White,0.032185
4,Other,0.021311
5,Oakland,0.025139


In [8]:
# Row order used by the charts; groupby alone would sort the labels alphabetically.
race_order = ["African American", "Asian", "Latino", "White", "Other"]

# Numerator: rows with COW == 7. Denominator: rows with a non-missing COW other than 9.
# NOTE(review): the chart titles treat these as "business owners" and "employed
# individuals"; confirm the COW code meanings against the PUMS data dictionary.
bo = pca181_Ok.loc[pca181_Ok["COW"] == 7, ["RACE"]].groupby("RACE")['RACE'].count().to_frame(name='counts')
ep = pca181_Ok.loc[(pca181_Ok["COW"] != 9) & (pca181_Ok["COW"].notna()), ["RACE"]].groupby("RACE")['RACE'].count().to_frame('counts')

# Reorder by label rather than by position so that a missing group shows up as
# a NaN row under its own name instead of silently shifting the other rows.
ind1p = (bo / ep).reindex(race_order).rename_axis("RACE").reset_index()

# Citywide row. The value is hard-coded here.
# TODO: derive it from pca181_Ok so it cannot drift from the per-group figures.
# pd.concat replaces DataFrame.append, which pandas 2.0 removed.
ind1p = pd.concat([ind1p, pd.DataFrame([{'RACE': 'Oakland', 'counts': 0.030133}])], ignore_index=True)

In [9]:
# Show the shares computed from the 2018 extract; the last row is the hard-coded citywide figure.
ind1p

Unnamed: 0,RACE,counts
0,African American,0.031903
1,Asian,0.026288
2,Latino,0.01413
3,White,0.038595
4,Other,0.051757
5,Oakland,0.030133


In [10]:
# Smallest share in the 2019 table; the 2019 bar chart draws its dashed reference line at this value.
ind1.counts.min()

0.01263815769218928

In [11]:
# Per-RACE row counts in the 2016 extract, restricted to rows with RELP == 0.
# NOTE(review): RELP == 0 presumably marks the householder and TEN 1/2 the
# owner-occupied tenure codes; confirm against the PUMS data dictionary.
is_relp_zero = pca161_Ok["RELP"] == 0
ten_is_1_or_2 = pca161_Ok["TEN"].isin([1, 2])

# dh: RELP == 0 rows whose TEN is 1 or 2; da: all RELP == 0 rows.
dh = pca161_Ok.loc[is_relp_zero & ten_is_1_or_2, ["RACE"]].groupby("RACE")["RACE"].count()
da = pca161_Ok.loc[is_relp_zero, ["RACE"]].groupby("RACE")["RACE"].count()

In [12]:
# NOTE(review): without parentheses this evaluates to the bound method DataFrame.count
# instead of calling it (see the output below). `ind1.count()` or `ind1.counts` was
# presumably intended; confirm, or delete this cell.
ind1.count

<bound method DataFrame.count of                RACE    counts
0  African American  0.012638
1             Asian  0.016906
2            Latino  0.032385
3             White  0.032185
4             Other  0.021311
5           Oakland  0.025139>

In [13]:
# Hard-coded table in the same RACE/counts layout as ind1 and ind1p.
# NOTE(review): the origin of these six values is not shown in this notebook; document it.
ind45_groups = ['African American', 'Asian', 'Latino', 'White', 'Other', 'Oakland']
ind45_values = [0.833, 0.855, 0.781, 0.849, 0.927, 0.867]
ind45 = pd.DataFrame(data={'RACE': ind45_groups, 'counts': ind45_values})

In [41]:
# One colour per row of ind1, in row order.
colors = ['#D55E00','#56B4E9','#40B0A6','#F0E442','#004488', '#D35FB7']

# Smallest share in the table; drawn below as a dashed reference line.
lowest_share = ind1.counts.min()

fig = go.Figure(data=[go.Bar(
    x=ind1.RACE,
    y=ind1.counts,
    marker_color=colors,
    # One label per bar, formatted like the y-axis ticks. Passing str(ind1.RACE)
    # here would stamp the printed form of the whole column onto every bar.
    text=ind1.counts.map('{:.1%}'.format),
    textposition='outside'
)])

# On a categorical x axis the bars sit at 0, 1, 2, ..., so -0.5 and
# len(ind1) - 0.5 are the outer edges of the first and last bar.
fig.add_shape(type="line",
    xref="x", yref="y",
    x0=-0.5, y0=lowest_share, x1=len(ind1) - 0.5, y1=lowest_share,
    line=dict(
        color="#e63600",
        width=3,
        dash="dash",
    ),
    opacity=1,
)
fig.update_layout(yaxis_tickformat = '.1%', template="plotly_white", title_text='<b>Percent of Employed Individuals who are Business Owners</b>')

In [20]:
# One colour per row of ind1p, in row order.
colors = ['#D55E00','#56B4E9','#40B0A6','#ded340','#004488', '#D35FB7']

# Smallest share in the 2018 table; used as the height of the dashed line.
floor_2018 = ind1p.counts.min()

bars_2018 = go.Bar(x=ind1p.RACE, y=ind1p.counts, marker_color=colors)
fig = go.Figure(data=[bars_2018])

# Horizontal dashed line spanning the six bars (category positions 0 to 5).
fig.add_shape(
    type="line",
    xref="x", yref="y",
    x0=-0.5, x1=5.5,
    y0=floor_2018, y1=floor_2018,
    line={"color": "#e63600", "width": 3, "dash": "dash"},
    opacity=1,
)

fig.update_layout(
    yaxis_tickformat='.1%',
    template="plotly_white",
    title_text='<b>Percent of Employed Individuals who are Business Owners, 2018</b>',
)

In [16]:
# Every bar gets the base colour except the one with the smallest share.
base_color = '#56B4E9'
highlight_color = '#D55E00'
colors = [base_color for _ in range(6)]

# idxmin() returns an index label. ind1p was built with ignore_index=True, so
# its labels are 0..5 and the label doubles as the position in `colors`.
lowest_row = ind1p.counts.idxmin()
colors[lowest_row] = highlight_color

floor_2018 = ind1p.counts.min()

bars_2018 = go.Bar(x=ind1p.RACE, y=ind1p.counts, marker_color=colors)
fig = go.Figure(data=[bars_2018])

# Horizontal dashed line at the smallest share, spanning the six bars.
fig.add_shape(
    type="line",
    xref="x", yref="y",
    x0=-0.5, x1=5.5,
    y0=floor_2018, y1=floor_2018,
    line={"color": "#e63600", "width": 3, "dash": "dash"},
    opacity=1,
)

fig.update_layout(
    yaxis_tickformat='.1%',
    template="plotly_white",
    title_text='<b>Percent of Employed Individuals who are Business Owners, 2018</b>',
)

In [36]:
# One colour per row, shared by both years so each group keeps its colour.
colors = ['#D55E00','#56B4E9','#40B0A6','#ded340','#004488', '#D35FB7']

# (bar label, table, offset group): one Bar trace per year, drawn side by side.
yearly_tables = [
    ('<b>2018</b>', ind1p, 0),
    ('<b>2019</b>', ind1, 1),
]

fig = go.Figure(data=[
    go.Bar(
        x=table.RACE,
        y=table.counts,
        marker_color=colors,
        text=year_label,  # a single string, so every bar of the trace carries the year
        textposition='outside',
        offsetgroup=group,
    )
    for year_label, table, group in yearly_tables
])

fig.update_layout(
    yaxis_tickformat='.1%',
    template="plotly_white",
    title_text='<b>Percent of Employed Individuals who are Business Owners</b>',
)

In [18]:
# Index label of the row with the smallest 2019 share (another cell uses the same lookup on ind1p to pick the highlighted bar).
ind1.counts.idxmin()

0

In [39]:
# Same grouped chart as the multi-colour version, with one colour for every bar.
colors = ['#56B4E9'] * 6

fig = go.Figure()
# Add the 2018 trace first (offset group 0), then 2019 (offset group 1).
for group, (year_label, table) in enumerate([('<b>2018</b>', ind1p), ('<b>2019</b>', ind1)]):
    fig.add_trace(go.Bar(
        x=table.RACE,
        y=table.counts,
        marker_color=colors,
        text=year_label,  # a single string, so every bar of the trace carries the year
        textposition='outside',
        offsetgroup=group,
    ))

fig.update_layout(
    yaxis_tickformat='.1%',
    template="plotly_white",
    title_text='<b>Percent of Employed Individuals who are Business Owners</b>',
)