In [1]:
#Import Libraries
import pandas as pd
pd.set_option('display.max_columns', None)

In [2]:
def cleanData(person_df, household_df, post2018 = True):
    person_df = person_df
    household_df = household_df
    pFeatures = ["SERIALNO", "SPORDER", "PUMA", "PWGTP", "AGEP", "CIT", "COW", "ENG", "FER", "JWMNP"
               , "MAR", "MIL", "SCH", "SCHL", "SEX", "PAP", "INTP", "SSIP", "SSP", "WAGP"
               , "OIP", "RETP", "SEMP", "PERNP", "PINCP", "WKL", "DIS", "ESR", "HICOV", "HISP"
               , "PAOC", "POVPIP", "RAC1P", "RACASN", "RACBLK", "RACWHT", "RACSOR", "SCIENGP", "WKHP"
               , "SOCP"]
    if post2018:
        pFeatures = pFeatures + ["RELSHIPP", "JWTRNS", "WKWN"]
    else:
        pFeatures = pFeatures + ["RELP", "JWTR", "WKW"]
    hFeatures = ["SERIALNO", "PUMA", "NP", "ACCESS", "ACR", "BATH", "FS", "ELEP", "FULP", "GASP", "HISPEED"
              , "LAPTOP", "RNTP", "RWATPR", "TEN", "VALP", "VEH", "WATP", "HINCP", "HUPAC", "KIT", "PLM"
              , "GRPIP", "RMSP"]
    person_df = person_df.loc[:, pFeatures].copy()
    household_df = household_df.loc[:, hFeatures].copy()
    person_df = person_df.loc[(person_df["PUMA"] > 101) & (person_df["PUMA"] <= 105)].copy()
    household_df = household_df.loc[(household_df["PUMA"] > 101) & (household_df["PUMA"] <= 105)].copy()
    person_df.loc[(person_df.RAC1P == 1) & (person_df.HISP == 1), "RACE"] = "White"
    person_df.loc[(person_df.RAC1P == 2) & (person_df.HISP == 1), "RACE"] = "African American"
    person_df.loc[(person_df.RAC1P == 6) & (person_df.HISP == 1), "RACE"] = "Asian"
    person_df.loc[(person_df.RAC1P.isin([3, 4, 5, 7, 8, 9])) & (person_df.HISP == 1), "RACE"] = "Other"
    person_df.loc[person_df.HISP != 1, "RACE"] = "Latino"
    person_df = person_df.merge(household_df, how = 'left', on = 'SERIALNO').copy()
    person_df = person_df.reindex(person_df.index.repeat(person_df.PWGTP)).reset_index(drop=True).drop(['PWGTP', 'PUMA_y'], axis=1).rename(columns={"PUMA_x": "PUMA"}).copy()
    return person_df

def getSize(df, showDim = False, name = ""):
    if name != "":
        name = name + " "
    if showDim:
        print("Dimensions for the " + name + "dataset: " + str(df.shape))
    print("New DataFrame size: " + str(int(df.memory_usage(index=True).sum()/1000000)) + " MB")

In [4]:
pca191 = pd.read_csv("./Data/PUMS/2019pCA1.csv")
hca191 = pd.read_csv("./Data/PUMS/2019hCA1.csv")
pca191_Ok = cleanData(pca191, hca191)
del pca191, hca191

pca181 = pd.read_csv("./Data/PUMS/2018pCA1.csv")
hca181 = pd.read_csv("./Data/PUMS/2018hCA1.csv")
pca181_Ok = cleanData(pca181, hca181, False)
del pca181, hca181

pca161 = pd.read_csv("./Data//PUMS/2016pCA1.csv")
hca161 = pd.read_csv("./Data/PUMS/2016hCA1.csv")
pca161_Ok = cleanData(pca161, hca161, False)
del pca161, hca161

# Indicators

## Indicator 1

In [19]:
def ind1(df):
    numerator = df.loc[df["COW"] == 7, ["RACE"]].groupby("RACE")['RACE'].count() ## .Self-employed in own incorporated business
    numerator = numerator.append(pd.Series(numerator.sum(), ['Oakland']))
    denominator = df.loc[(pca191_Ok["COW"] !=9) & (df["COW"].notna()), ["RACE"]].groupby("RACE")['RACE'].count() # not include
    # >>> 9=unemployed and NA = who aged below 16
    denominator = denominator.append(pd.Series(denominator.sum(), ["Oakland"]))
    ind1_cal = (numerator/denominator)
    table = pd.DataFrame(ind1_cal, columns = ["ind1"])
    return table

In [21]:
display(ind1(pca181_Ok))
display(ind1(pca191_Ok))

Unnamed: 0,ind1
African American,0.031427
Asian,0.026145
Latino,0.013882
Other,0.051349
White,0.038705
Oakland,0.030133


Unnamed: 0,ind1
African American,0.012638
Asian,0.016906
Latino,0.032385
Other,0.021311
White,0.032185
Oakland,0.025139


## Indicator 4

In [28]:
def ind4(df):
    filter_df = df.loc[(df['AGEP'] >= 16) & (df['AGEP'] < 25)]
    index_1 = (filter_df['ESR']==3) | (filter_df['ESR']==6) ## Unemployed
    index_2 = (filter_df['SCH']==1) ## not attended school
    numerator =filter_df.loc[index_1 & index_2, ["RACE"]].groupby("RACE")['RACE'].count()
    numerator = numerator.append(pd.Series(numerator.sum(), ['Oakland']))
    employed_df = filter_df[(filter_df['ESR']!=3) & (['ESR']!=6)] # not include "#3 Unemployed" and "#6 Not in labor force"
    denominator = employed_df.loc[(employed_df['ESR'].notna()) & (employed_df['SCH'].notna()) , ["RACE"]].groupby("RACE")['RACE'].count()
    denominator = denominator.append(pd.Series(denominator.sum(), ["Oakland"]))
    ind4_cal= numerator/denominator*100
    table = pd.DataFrame(ind4_cal, columns = ["ind4"])
    return table

In [61]:
display(ind4(pca181_Ok))
display(ind4(pca191_Ok))

Unnamed: 0,ind4
African American,16.519337
Asian,8.591808
Latino,13.547457
Other,10.876664
White,7.483856
Oakland,12.076101


Unnamed: 0,ind4
African American,16.394616
Asian,4.397843
Latino,8.33192
Other,7.808564
White,6.021306
Oakland,9.480224


## Indicator 5

In [62]:
def ind5(df):
    numerator = df.loc[df["ESR"] == 6].groupby("RACE")['RACE'].count() # 6 = Not in labor force
    numerator = numerator.append(pd.Series(numerator.sum(), ['Oakland']))
    denominator = df.loc[(df["ESR"].notna())].groupby("RACE")['RACE'].count() # not including ppl who aged <16
    denominator = denominator.append(pd.Series(denominator.sum(), ["Oakland"]))
    return pd.DataFrame((numerator/denominator), columns=['ind5'])

In [63]:
display(ind5(pca181_Ok))
display(ind5(pca191_Ok))

Unnamed: 0,ind5
African American,0.36688
Asian,0.322828
Latino,0.278825
Other,0.230472
White,0.26743
Oakland,0.298269


Unnamed: 0,ind5
African American,0.411477
Asian,0.363419
Latino,0.269747
Other,0.237155
White,0.263162
Oakland,0.314522


### Indicator 29

In [64]:
physFitYouth = pd.read_csv("./Data/Public Health/physicalFitnessYouth.csv")

In [111]:
def ind29(df):
    df = df.copy()
    df = df.iloc[:, [2, 3, 4, 7]]
    df.columns = ['race', 'year', 'score', 'ratio']
    df.loc[:, "ratio"] = df.loc[:, 'ratio'].str.rstrip('%').astype('float')/100
    df.loc[(df.race == 'Multiple Ethnicity') | (df.race == "Native American"), ['race']] = "Other" 
    df.loc[(df.race == 'Pacific Islander') | (df.race == "Filipino"), ['race']] = "Asian"

    return df

In [112]:
ind29(physFitYouth)

Unnamed: 0,race,year,score,ratio
0,African American,2017-18,0 out of 6,0.028
1,African American,2017-18,1 out of 6,0.076
2,African American,2017-18,2 out of 6,0.148
3,African American,2017-18,3 out of 6,0.195
4,African American,2017-18,4 out of 6,0.208
5,African American,2017-18,5 out of 6,0.186
6,African American,2017-18,6 out of 6,0.159
7,African American,2018-19,0 out of 6,0.042
8,African American,2018-19,1 out of 6,0.094
9,African American,2018-19,2 out of 6,0.151


In [108]:
test = physFitYouth.iloc[:, [2, 3, 4, 7]]
test.columns = ['race', 'year', 'score', 'ratio']
test.loc[:, "ratio"] = test.loc[:, 'ratio'].str.rstrip('%').astype('float')/100
test.loc[:, ['race']] == 'Multiple Ethnicity']

SyntaxError: invalid syntax (<ipython-input-108-f66f26613f04>, line 4)

In [90]:
test.loc[:, ['Select Subgroup']]

Unnamed: 0,Select Subgroup
0,African American
1,African American
2,African American
3,African American
4,African American
5,African American
6,African American
7,African American
8,African American
9,African American


In [83]:
test.loc[test.iloc[:, 2] == 'Multiple Ethnicity']

Unnamed: 0,Select District/School,Select Grade,Select Subgroup,Academic Year,Score,Total Tested,Score.1,COUNT([Academic Year]) / TOTAL(COUNT([Academic Year])) along Score,COUNT([Academic Year]) / TOTAL(COUNT([Academic Year])) along Score.1,Count of Academic Year
55,All Schools,All Grades,Multiple Ethnicity,2017-18,0 out of 6,245,0 out of 6,0.008,0.008,2
56,All Schools,All Grades,Multiple Ethnicity,2017-18,1 out of 6,245,1 out of 6,0.045,0.045,11
57,All Schools,All Grades,Multiple Ethnicity,2017-18,2 out of 6,245,2 out of 6,0.094,0.094,23
58,All Schools,All Grades,Multiple Ethnicity,2017-18,3 out of 6,245,3 out of 6,0.114,0.114,28
59,All Schools,All Grades,Multiple Ethnicity,2017-18,4 out of 6,245,4 out of 6,0.171,0.171,42
60,All Schools,All Grades,Multiple Ethnicity,2017-18,5 out of 6,245,5 out of 6,0.265,0.265,65
61,All Schools,All Grades,Multiple Ethnicity,2017-18,6 out of 6,245,6 out of 6,0.302,0.302,74
62,All Schools,All Grades,Multiple Ethnicity,2018-19,0 out of 6,298,0 out of 6,0.013,0.013,4
63,All Schools,All Grades,Multiple Ethnicity,2018-19,1 out of 6,298,1 out of 6,0.027,0.027,8
64,All Schools,All Grades,Multiple Ethnicity,2018-19,2 out of 6,298,2 out of 6,0.121,0.121,36
