### POPULATION

Source of data : https://www.census.gov/data/tables/time-series/demo/popest/2020s-total-cities-and-towns.html#ds

In [1]:
# Import libraries
import pandas as pd
import numpy as  np

### We extract the data from our S3 bucket

In [2]:
from private.s3_aws import access_key, secret_access_key

In [3]:
# Read files
# Table #1
population2010 = pd.read_csv(f"s3://rawdatagrupo07/SUB-EST2020_ALL.csv",
    storage_options={
        "key": access_key,
        "secret": secret_access_key
    },encoding = "ISO-8859-1"
)

print('Shape table 2010-2020 : ' ,  population2010.shape)
#Table #2
population2021 = pd.read_csv(f"s3://rawdatagrupo07/sub-est2021_all.csv",
    storage_options={
        "key": access_key,
        "secret": secret_access_key
    },engine='python',encoding='latin1'
)
print('Shape table 2021: ', population2021.shape)

Shape table 2010-2020 :  (81415, 24)
Shape table 2021:  (81416, 13)


In [4]:
population2010.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81415 entries, 0 to 81414
Data columns (total 24 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   SUMLEV             81415 non-null  int64 
 1   STATE              81415 non-null  int64 
 2   COUNTY             81415 non-null  int64 
 3   PLACE              81415 non-null  int64 
 4   COUSUB             81415 non-null  int64 
 5   CONCIT             81415 non-null  int64 
 6   PRIMGEO_FLAG       81415 non-null  int64 
 7   FUNCSTAT           81415 non-null  object
 8   NAME               81415 non-null  object
 9   STNAME             81415 non-null  object
 10  CENSUS2010POP      81415 non-null  object
 11  ESTIMATESBASE2010  81415 non-null  int64 
 12  POPESTIMATE2010    81415 non-null  int64 
 13  POPESTIMATE2011    81415 non-null  int64 
 14  POPESTIMATE2012    81415 non-null  int64 
 15  POPESTIMATE2013    81415 non-null  int64 
 16  POPESTIMATE2014    81415 non-null  int64

In [5]:
population2010[population2010.duplicated()]

Unnamed: 0,SUMLEV,STATE,COUNTY,PLACE,COUSUB,CONCIT,PRIMGEO_FLAG,FUNCSTAT,NAME,STNAME,...,POPESTIMATE2012,POPESTIMATE2013,POPESTIMATE2014,POPESTIMATE2015,POPESTIMATE2016,POPESTIMATE2017,POPESTIMATE2018,POPESTIMATE2019,POPESTIMATE042020,POPESTIMATE2020


In [6]:
population2021[population2021.duplicated()]

Unnamed: 0,SUMLEV,STATE,COUNTY,PLACE,COUSUB,CONCIT,PRIMGEO_FLAG,FUNCSTAT,NAME,STNAME,ESTIMATESBASE2020,POPESTIMATE2020,POPESTIMATE2021


In [7]:
population2010.columns

Index(['SUMLEV', 'STATE', 'COUNTY', 'PLACE', 'COUSUB', 'CONCIT',
       'PRIMGEO_FLAG', 'FUNCSTAT', 'NAME', 'STNAME', 'CENSUS2010POP',
       'ESTIMATESBASE2010', 'POPESTIMATE2010', 'POPESTIMATE2011',
       'POPESTIMATE2012', 'POPESTIMATE2013', 'POPESTIMATE2014',
       'POPESTIMATE2015', 'POPESTIMATE2016', 'POPESTIMATE2017',
       'POPESTIMATE2018', 'POPESTIMATE2019', 'POPESTIMATE042020',
       'POPESTIMATE2020'],
      dtype='object')

In [8]:
# Dataframe population2010
population2010['SUMLEV']= population2010['SUMLEV'].astype(str)
population2010['STATE']= population2010['STATE'].astype(str)
population2010['COUNTY']= population2010['COUNTY'].astype(str)
population2010[ 'PLACE']= population2010[ 'PLACE'].astype(str)
population2010['COUSUB']= population2010['COUSUB'].astype(str)
population2010['CONCIT']= population2010['CONCIT'].astype(str)
population2010['PRIMGEO_FLAG']= population2010['PRIMGEO_FLAG'].astype(str)

#Dataframe population2021
population2021['SUMLEV']= population2021['SUMLEV'].astype(str)
population2021['STATE']= population2021['STATE'].astype(str)
population2021['COUNTY']= population2021['COUNTY'].astype(str)
population2021[ 'PLACE']= population2021[ 'PLACE'].astype(str)
population2021['COUSUB']= population2021['COUSUB'].astype(str)
population2021['CONCIT']= population2021['CONCIT'].astype(str)
population2021['PRIMGEO_FLAG']= population2021['PRIMGEO_FLAG'].astype(str)

In [9]:
# Create a new column 'Indicator' fot two tables
population2010['Indicator']= population2010[['SUMLEV','STATE','COUNTY','PLACE','COUSUB','CONCIT','PRIMGEO_FLAG','FUNCSTAT']].apply(''.join, axis = 1)

population2021['Indicator']= population2021[['SUMLEV','STATE','COUNTY','PLACE','COUSUB','CONCIT','PRIMGEO_FLAG','FUNCSTAT']].apply(''.join, axis = 1)

In [10]:
population2010.columns

Index(['SUMLEV', 'STATE', 'COUNTY', 'PLACE', 'COUSUB', 'CONCIT',
       'PRIMGEO_FLAG', 'FUNCSTAT', 'NAME', 'STNAME', 'CENSUS2010POP',
       'ESTIMATESBASE2010', 'POPESTIMATE2010', 'POPESTIMATE2011',
       'POPESTIMATE2012', 'POPESTIMATE2013', 'POPESTIMATE2014',
       'POPESTIMATE2015', 'POPESTIMATE2016', 'POPESTIMATE2017',
       'POPESTIMATE2018', 'POPESTIMATE2019', 'POPESTIMATE042020',
       'POPESTIMATE2020', 'Indicator'],
      dtype='object')

In [11]:
population= pd.merge(population2010,population2021, how = 'inner', left_on='Indicator', right_on='Indicator')
population.head(10)

Unnamed: 0,SUMLEV_x,STATE_x,COUNTY_x,PLACE_x,COUSUB_x,CONCIT_x,PRIMGEO_FLAG_x,FUNCSTAT_x,NAME_x,STNAME_x,...,PLACE_y,COUSUB_y,CONCIT_y,PRIMGEO_FLAG_y,FUNCSTAT_y,NAME_y,STNAME_y,ESTIMATESBASE2020,POPESTIMATE2020_y,POPESTIMATE2021
0,40,1,0,0,0,0,0,A,Alabama,Alabama,...,0,0,0,0,A,Alabama,Alabama,5024279,5024803,5039877
1,162,1,0,124,0,0,0,A,Abbeville city,Alabama,...,124,0,0,0,A,Abbeville city,Alabama,2368,2368,2379
2,162,1,0,460,0,0,0,A,Adamsville city,Alabama,...,460,0,0,0,A,Adamsville city,Alabama,4372,4356,4294
3,162,1,0,484,0,0,0,A,Addison town,Alabama,...,484,0,0,0,A,Addison town,Alabama,664,664,668
4,162,1,0,676,0,0,0,A,Akron town,Alabama,...,676,0,0,0,A,Akron town,Alabama,227,227,226
5,162,1,0,820,0,0,0,A,Alabaster city,Alabama,...,820,0,0,0,A,Alabaster city,Alabama,33360,33381,33676
6,162,1,0,988,0,0,0,A,Albertville city,Alabama,...,988,0,0,0,A,Albertville city,Alabama,22390,22385,22522
7,162,1,0,1132,0,0,0,A,Alexander City city,Alabama,...,1132,0,0,0,A,Alexander City city,Alabama,14807,14761,14618
8,162,1,0,1228,0,0,0,A,Aliceville city,Alabama,...,1228,0,0,0,A,Aliceville city,Alabama,2175,2166,2123
9,162,1,0,1396,0,0,0,A,Allgood town,Alabama,...,1396,0,0,0,A,Allgood town,Alabama,544,543,545


In [12]:
population.columns

Index(['SUMLEV_x', 'STATE_x', 'COUNTY_x', 'PLACE_x', 'COUSUB_x', 'CONCIT_x',
       'PRIMGEO_FLAG_x', 'FUNCSTAT_x', 'NAME_x', 'STNAME_x', 'CENSUS2010POP',
       'ESTIMATESBASE2010', 'POPESTIMATE2010', 'POPESTIMATE2011',
       'POPESTIMATE2012', 'POPESTIMATE2013', 'POPESTIMATE2014',
       'POPESTIMATE2015', 'POPESTIMATE2016', 'POPESTIMATE2017',
       'POPESTIMATE2018', 'POPESTIMATE2019', 'POPESTIMATE042020',
       'POPESTIMATE2020_x', 'Indicator', 'SUMLEV_y', 'STATE_y', 'COUNTY_y',
       'PLACE_y', 'COUSUB_y', 'CONCIT_y', 'PRIMGEO_FLAG_y', 'FUNCSTAT_y',
       'NAME_y', 'STNAME_y', 'ESTIMATESBASE2020', 'POPESTIMATE2020_y',
       'POPESTIMATE2021'],
      dtype='object')

In [13]:
population = population[['NAME_x', 'STNAME_x', 'POPESTIMATE2010', 'POPESTIMATE2011',
       'POPESTIMATE2012', 'POPESTIMATE2013', 'POPESTIMATE2014',
       'POPESTIMATE2015', 'POPESTIMATE2016', 'POPESTIMATE2017',
       'POPESTIMATE2018', 'POPESTIMATE2019',
       'POPESTIMATE2020_x', 'POPESTIMATE2021']]

In [14]:
population.head(2)

Unnamed: 0,NAME_x,STNAME_x,POPESTIMATE2010,POPESTIMATE2011,POPESTIMATE2012,POPESTIMATE2013,POPESTIMATE2014,POPESTIMATE2015,POPESTIMATE2016,POPESTIMATE2017,POPESTIMATE2018,POPESTIMATE2019,POPESTIMATE2020_x,POPESTIMATE2021
0,Alabama,Alabama,4785514,4799642,4816632,4831586,4843737,4854803,4866824,4877989,4891628,4907965,4921532,5039877
1,Abbeville city,Alabama,2699,2694,2645,2629,2610,2602,2587,2578,2565,2555,2553,2379


In [15]:
population['NAME_x']= population['NAME_x'].str.strip('city')
population['NAME_x']= population['NAME_x'].str.strip()
population['NAME_x']

0                         Alabama
1                       Abbeville
2                      Adamsville
3                    Addison town
4                      Akron town
                   ...           
81553    Balance of Washakie Coun
81554                 Weston Coun
81555                   Newcastle
81556                  Upton town
81557      Balance of Weston Coun
Name: NAME_x, Length: 81558, dtype: object

In [16]:
population['NAME_x']= population['NAME_x'].str.strip('town')
population['NAME_x']= population['NAME_x'].str.strip()
population['NAME_x']

0                        Alabama
1                      Abbeville
2                     Adamsville
3                        Addison
4                          Akron
                  ...           
81553    Balance of Washakie Cou
81554                 Weston Cou
81555                  Newcastle
81556                      Upton
81557      Balance of Weston Cou
Name: NAME_x, Length: 81558, dtype: object

In [17]:
#Change names  of colums NAME_x, STNAME_x
population = population.rename(columns= {'NAME_x':'City'})
population = population.rename(columns= {'STNAME_x':'State'})
population = population.rename(columns= {'POPESTIMATE2020_x':'POPESTIMATE2020'})

In [18]:
population.columns

Index(['City', 'State', 'POPESTIMATE2010', 'POPESTIMATE2011',
       'POPESTIMATE2012', 'POPESTIMATE2013', 'POPESTIMATE2014',
       'POPESTIMATE2015', 'POPESTIMATE2016', 'POPESTIMATE2017',
       'POPESTIMATE2018', 'POPESTIMATE2019', 'POPESTIMATE2020',
       'POPESTIMATE2021'],
      dtype='object')

In [19]:
us_state_to_abbrev = {
    "Alabama": "AL",
    "Alaska": "AK",
    "Arizona": "AZ",
    "Arkansas": "AR",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    "Delaware": "DE",
    "Florida": "FL",
    "Georgia": "GA",
    "Hawaii": "HI",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Iowa": "IA",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Maine": "ME",
    "Maryland": "MD",
    "Massachusetts": "MA",
    "Michigan": "MI",
    "Minnesota": "MN",
    "Mississippi": "MS",
    "Missouri": "MO",
    "Montana": "MT",
    "Nebraska": "NE",
    "Nevada": "NV",
    "New Hampshire": "NH",
    "New Jersey": "NJ",
    "New Mexico": "NM",
    "New York": "NY",
    "North Carolina": "NC",
    "North Dakota": "ND",
    "Ohio": "OH",
    "Oklahoma": "OK",
    "Oregon": "OR",
    "Pennsylvania": "PA",
    "Rhode Island": "RI",
    "South Carolina": "SC",
    "South Dakota": "SD",
    "Tennessee": "TN",
    "Texas": "TX",
    "Utah": "UT",
    "Vermont": "VT",
    "Virginia": "VA",
    "Washington": "WA",
    "West Virginia": "WV",
    "Wisconsin": "WI",
    "Wyoming": "WY",
    "District of Columbia": "DC",
    "American Samoa": "AS",
    "Guam": "GU",
    "Northern Mariana Islands": "MP",
    "Puerto Rico": "PR",
    "United States Minor Outlying Islands": "UM",
    "U.S. Virgin Islands": "VI",
    "United States": "US"}

In [20]:
population['State'] = population.State.astype(str).apply(lambda x: us_state_to_abbrev[x])

In [21]:
population['State']

0        AL
1        AL
2        AL
3        AL
4        AL
         ..
81553    WY
81554    WY
81555    WY
81556    WY
81557    WY
Name: State, Length: 81558, dtype: object

In [23]:
# Read cities.csv to normalize data
cities = pd.read_csv(f"s3://cleandatagrupo07/cities.csv",
        storage_options={
            "key": access_key,
            "secret": secret_access_key
        }
    )

In [24]:
cities.head(3)

Unnamed: 0,Unique_City_ID,City,County,State
0,oak_grovechristianky,Oak Grove,Christian,KY
1,jarvisburgcurritucknc,Jarvisburg,Currituck,NC
2,mcminnvilleyamhillor,McMinnville,Yamhill,OR


In [25]:
df3 = pd.merge(cities, population, how = 'inner', on =['City', 'State'])

In [26]:
df3

Unnamed: 0,Unique_City_ID,City,County,State,POPESTIMATE2010,POPESTIMATE2011,POPESTIMATE2012,POPESTIMATE2013,POPESTIMATE2014,POPESTIMATE2015,POPESTIMATE2016,POPESTIMATE2017,POPESTIMATE2018,POPESTIMATE2019,POPESTIMATE2020,POPESTIMATE2021
0,oak_grovechristianky,Oak Grove,Christian,KY,7515,7430,7629,7496,7490,7469,7372,7240,7320,7352,7369,7793
1,oak_grovechristianky,Oak Grove,Christian,KY,7515,7430,7629,7496,7490,7469,7372,7240,7320,7352,7369,7793
2,mcminnvilleyamhillor,McMinnville,Yamhill,OR,32213,32315,32415,32374,32667,33012,33854,34207,34434,34674,35185,34666
3,mcminnvilleyamhillor,McMinnville,Yamhill,OR,32213,32315,32415,32374,32667,33012,33854,34207,34434,34674,35185,34666
4,oshkoshwinnebagowi,Oshkosh,Winnebago,WI,66323,66361,66867,66951,66810,66707,66671,66736,66744,66933,66495,66607
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22830,baxtercrow_wingmn,Baxter,Crow Wing,MN,7653,7656,7692,7742,7802,7902,8178,8265,8323,8379,8483,8830
22831,baxtercrow_wingmn,Baxter,Crow Wing,MN,7653,7656,7692,7742,7802,7902,8178,8265,8323,8379,8483,8830
22832,baxtercrow_wingmn,Baxter,Crow Wing,MN,7653,7656,7692,7742,7802,7902,8178,8265,8323,8379,8483,8830
22833,benddeschutesor,Bend,Deschutes,OR,76658,77582,78637,80943,83581,86345,90739,94667,97548,100274,101886,102059


In [27]:
df3.drop_duplicates(subset = ['Unique_City_ID'], keep= 'first')

Unnamed: 0,Unique_City_ID,City,County,State,POPESTIMATE2010,POPESTIMATE2011,POPESTIMATE2012,POPESTIMATE2013,POPESTIMATE2014,POPESTIMATE2015,POPESTIMATE2016,POPESTIMATE2017,POPESTIMATE2018,POPESTIMATE2019,POPESTIMATE2020,POPESTIMATE2021
0,oak_grovechristianky,Oak Grove,Christian,KY,7515,7430,7629,7496,7490,7469,7372,7240,7320,7352,7369,7793
2,mcminnvilleyamhillor,McMinnville,Yamhill,OR,32213,32315,32415,32374,32667,33012,33854,34207,34434,34674,35185,34666
4,oshkoshwinnebagowi,Oshkosh,Winnebago,WI,66323,66361,66867,66951,66810,66707,66671,66736,66744,66933,66495,66607
9,garrettellistx,Garrett,Ellis,TX,810,814,814,821,826,839,848,853,869,880,891,854
11,mayesvillesumtersc,Mayesville,Sumter,SC,729,725,729,727,729,724,723,716,716,718,715,550
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22824,sawmillscaldwellnc,Sawmills,Caldwell,NC,5205,5170,5151,5149,5126,5118,5134,5144,5153,5164,5156,5036
22826,frederickweldco,Frederick,Weld,CO,8702,8830,8974,9121,9328,9543,9762,9908,10178,10695,11515,15761
22828,deerfieldoneidany,Deerfield,Oneida,NY,4125,4135,4135,4129,4110,4086,4067,4063,4064,4063,4050,3939
22829,baxtercrow_wingmn,Baxter,Crow Wing,MN,7653,7656,7692,7742,7802,7902,8178,8265,8323,8379,8483,8830


In [28]:
df3.drop(['City','County','State'],axis=1,inplace=True)

In [29]:
df3.drop_duplicates(subset=['Unique_City_ID'],inplace=True)

In [30]:
values = [i for i in range(2010,2022)]

In [31]:
keys = [f"POPESTIMATE{i}" for i in range(2010,2022)]

In [32]:
df3.rename(columns=dict(zip(keys, values)),inplace=True)

In [33]:
df_melt = pd.melt(df3, id_vars =['Unique_City_ID'], value_vars = values)

In [34]:
df_melt.rename(columns={'variable':'Year','value':'PopEstimate'},inplace=True)

In [35]:
df_melt.head()

Unnamed: 0,Unique_City_ID,Year,PopEstimate
0,oak_grovechristianky,2010,7515
1,mcminnvilleyamhillor,2010,32213
2,oshkoshwinnebagowi,2010,66323
3,garrettellistx,2010,810
4,mayesvillesumtersc,2010,729
