In [None]:
#Install cenpy
!pip install cenpy

In [None]:
#Setup
import altair as alt
import cenpy as cen
import pandas as pd
import geopandas as gpd
import numpy as np

from google.colab import files

In [3]:
#Load county data
all = pd.read_csv("county.csv")
df = all[['GEO_ID','NAME','P1_001N','P1_006N']]

#Remove first row, which has alternative column names
df = df.iloc[1: , :]

#Convert 001N(Total) & 006(Asian) to int
df['P1_001N'] = df['P1_001N'].astype(int)
df['P1_006N'] = df['P1_006N'].astype(int)

In [4]:
#Create new variables in county
#Make percent variable
df['PCT'] = (df['P1_006N']/df['P1_001N'])*100

#Separate name into county and state variables
df['COUNTY'] = df['NAME'].str.split(',').apply(lambda x: x[0])

#Make state variable
df['STATE'] = df['NAME'].str.split(',').apply(lambda x: x[1])

#Strip extra spaces
df['STATE']=df['STATE'].str.strip()

In [5]:
#Load state data
all_state = pd.read_csv("state.csv")
state = all_state[['GEO_ID','NAME','P1_001N','P1_006N']]

#Remove first row, which has alternative column names
state = state.iloc[1: , :]

#Duplicate NAME variable as state
state['STATE'] = state['NAME']

#Strip extra spaces
state['STATE'] = state['STATE'].str.strip()

#Convert variable types
state['P1_001N'] = state['P1_001N'].astype(int)
state['P1_006N'] = state['P1_006N'].astype(int)

#Make percent variable and convert type
state['PCT_STATE'] = (state['P1_006N']/state['P1_001N'])*100
#state['PCT_STATE'] = state['PCT_STATE'].astype(int)

#Creating dataset with only state and percent for merge
state_pct = state[['STATE','PCT_STATE']]
state_pct

Unnamed: 0,STATE,PCT_STATE
1,Alabama,1.525791
2,Alaska,6.003892
3,Arizona,3.599663
4,Arkansas,1.721354
5,California,15.392566
6,Colorado,3.460978
7,Connecticut,4.78252
8,Delaware,4.313257
9,District of Columbia,4.864802
10,Florida,2.988562


In [6]:
#Test matches for merge
df['STATE'].isin(state_pct['STATE']).value_counts()

True    3221
Name: STATE, dtype: int64

In [7]:
#Merge in state averages
df_merge = pd.merge(df, state_pct, how="outer", on='STATE')
df_merge

Unnamed: 0,GEO_ID,NAME,P1_001N,P1_006N,PCT,COUNTY,STATE,PCT_STATE
0,0500000US01001,"Autauga County, Alabama",58805,881,1.498172,Autauga County,Alabama,1.525791
1,0500000US01003,"Baldwin County, Alabama",231767,2067,0.891844,Baldwin County,Alabama,1.525791
2,0500000US01005,"Barbour County, Alabama",25223,117,0.463862,Barbour County,Alabama,1.525791
3,0500000US01007,"Bibb County, Alabama",22293,32,0.143543,Bibb County,Alabama,1.525791
4,0500000US01009,"Blount County, Alabama",59134,178,0.301011,Blount County,Alabama,1.525791
...,...,...,...,...,...,...,...,...
3216,0500000US72145,"Vega Baja Municipio, Puerto Rico",54414,28,0.051457,Vega Baja Municipio,Puerto Rico,0.121764
3217,0500000US72147,"Vieques Municipio, Puerto Rico",8249,7,0.084859,Vieques Municipio,Puerto Rico,0.121764
3218,0500000US72149,"Villalba Municipio, Puerto Rico",22093,10,0.045263,Villalba Municipio,Puerto Rico,0.121764
3219,0500000US72151,"Yabucoa Municipio, Puerto Rico",30426,8,0.026293,Yabucoa Municipio,Puerto Rico,0.121764


In [8]:
#Filter by greater than national average (~7.2%)
aboveavg_natl = df_merge[df_merge['PCT']>7.2]
aboveavg_natl

Unnamed: 0,GEO_ID,NAME,P1_001N,P1_006N,PCT,COUNTY,STATE,PCT_STATE
67,0500000US02013,"Aleutians East Borough, Alaska",3420,771,22.543860,Aleutians East Borough,Alaska,6.003892
68,0500000US02016,"Aleutians West Census Area, Alaska",5232,1513,28.918196,Aleutians West Census Area,Alaska,6.003892
69,0500000US02020,"Anchorage Municipality, Alaska",291247,27646,9.492287,Anchorage Municipality,Alaska,6.003892
72,0500000US02063,"Chugach Census Area, Alaska",7102,529,7.448606,Chugach Census Area,Alaska,6.003892
81,0500000US02130,"Ketchikan Gateway Borough, Alaska",13948,1129,8.094350,Ketchikan Gateway Borough,Alaska,6.003892
...,...,...,...,...,...,...,...,...
2936,0500000US51685,"Manassas Park city, Virginia",17219,1835,10.656833,Manassas Park city,Virginia,7.130205
2950,0500000US51810,"Virginia Beach city, Virginia",459470,34305,7.466211,Virginia Beach city,Virginia,7.130205
2970,0500000US53033,"King County, Washington",2269675,452475,19.935674,King County,Washington,9.481757
2984,0500000US53061,"Snohomish County, Washington",827957,101464,12.254743,Snohomish County,Washington,9.481757


In [20]:
aboveavg_natl['PCT_DIFF'] = aboveavg_natl['PCT'] - 7.2
aboveavg_natl.sort_values('PCT_DIFF', ascending=False).head(25)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,GEO_ID,NAME,P1_001N,P1_006N,PCT,COUNTY,STATE,PCT_STATE,PCT_DIFF
548,0500000US15003,"Honolulu County, Hawaii",1016508,436853,42.975855,Honolulu County,Hawaii,37.237188,35.775855
229,0500000US06085,"Santa Clara County, California",1936259,759030,39.200851,Santa Clara County,California,15.392566,32.000851
224,0500000US06075,"San Francisco County, California",873965,296505,33.926416,San Francisco County,California,15.392566,26.726416
187,0500000US06001,"Alameda County, California",1682353,545261,32.410618,Alameda County,California,15.392566,25.210618
227,0500000US06081,"San Mateo County, California",764442,230242,30.118963,San Mateo County,California,15.392566,22.918963
68,0500000US02016,"Aleutians West Census Area, Alaska",5232,1513,28.918196,Aleutians West Census Area,Alaska,6.003892,21.718196
550,0500000US15007,"Kauai County, Hawaii",73298,21102,28.789326,Kauai County,Hawaii,37.237188,21.589326
1869,0500000US36081,"Queens County, New York",2405464,660631,27.463766,Queens County,New York,9.569344,20.263766
551,0500000US15009,"Maui County, Hawaii",164754,44328,26.905568,Maui County,Hawaii,37.237188,19.705568
1786,0500000US34023,"Middlesex County, New Jersey",863162,228813,26.508697,Middlesex County,New Jersey,10.228126,19.308697


In [10]:
#Filter by greater than their state average
aboveavg_state = df_merge.query('PCT > PCT_STATE')
aboveavg_state

Unnamed: 0,GEO_ID,NAME,P1_001N,P1_006N,PCT,COUNTY,STATE,PCT_STATE
15,0500000US01031,"Coffee County, Alabama",53465,909,1.700178,Coffee County,Alabama,1.525791
36,0500000US01073,"Jefferson County, Alabama",674721,13109,1.942877,Jefferson County,Alabama,1.525791
40,0500000US01081,"Lee County, Alabama",174241,8572,4.919623,Lee County,Alabama,1.525791
41,0500000US01083,"Limestone County, Alabama",103570,1869,1.804577,Limestone County,Alabama,1.525791
44,0500000US01089,"Madison County, Alabama",388153,10292,2.651532,Madison County,Alabama,1.525791
...,...,...,...,...,...,...,...,...
3190,0500000US72093,"Maricao Municipio, Puerto Rico",4755,9,0.189274,Maricao Municipio,Puerto Rico,0.121764
3192,0500000US72097,"Mayagüez Municipio, Puerto Rico",73077,102,0.139579,Mayagüez Municipio,Puerto Rico,0.121764
3207,0500000US72127,"San Juan Municipio, Puerto Rico",342259,1060,0.309707,San Juan Municipio,Puerto Rico,0.121764
3212,0500000US72137,"Toa Baja Municipio, Puerto Rico",75293,99,0.131486,Toa Baja Municipio,Puerto Rico,0.121764


In [16]:
aboveavg_state['PCT_DIFF'] = aboveavg_state['PCT'] - aboveavg_state['PCT_STATE']
aboveavg_state.sort_values('PCT_DIFF', ascending=False).head(25)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,GEO_ID,NAME,P1_001N,P1_006N,PCT,COUNTY,STATE,PCT_STATE,PCT_DIFF
229,0500000US06085,"Santa Clara County, California",1936259,759030,39.200851,Santa Clara County,California,15.392566,23.808285
68,0500000US02016,"Aleutians West Census Area, Alaska",5232,1513,28.918196,Aleutians West Census Area,Alaska,6.003892,22.914304
224,0500000US06075,"San Francisco County, California",873965,296505,33.926416,San Francisco County,California,15.392566,18.53385
1869,0500000US36081,"Queens County, New York",2405464,660631,27.463766,Queens County,New York,9.569344,17.894422
187,0500000US06001,"Alameda County, California",1682353,545261,32.410618,Alameda County,California,15.392566,17.018052
2602,0500000US48157,"Fort Bend County, Texas",822779,182537,22.185423,Fort Bend County,Texas,5.439878,16.745545
67,0500000US02013,"Aleutians East Borough, Alaska",3420,771,22.54386,Aleutians East Borough,Alaska,6.003892,16.539968
1786,0500000US34023,"Middlesex County, New Jersey",863162,228813,26.508697,Middlesex County,New Jersey,10.228126,16.280571
82,0500000US02150,"Kodiak Island Borough, Alaska",13101,2803,21.395313,Kodiak Island Borough,Alaska,6.003892,15.391422
227,0500000US06081,"San Mateo County, California",764442,230242,30.118963,San Mateo County,California,15.392566,14.726397


In [17]:
#Exporting files
df_merge.to_csv('df.csv') 
files.download('df.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [18]:
aboveavg_natl.to_csv('aboveavg_natl.csv') 
files.download('aboveavg_natl.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [19]:
aboveavg_state.to_csv('aboveavg_state.csv') 
files.download('aboveavg_state.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>