In [1]:
# Import dependencies
import pandas as pd
import numpy as np

### Read in City Data from Excel (US Census Data)

In [2]:
# Load the Census city-population workbook into a DataFrame.
# header=1 takes the column labels from the sheet's second row (0-indexed row 1).
city_info_df = pd.read_excel(io='Resources/city_data.xlsx', header=1)
city_info_df.head(n=5)

Unnamed: 0,Rank,"City, State",Est Pop 2020,Est Pop 2021,Est Pop 2022,Est Pop 2023
0,1,"New York city, New York",8740292.0,8462216.0,8335798.0,8258035.0
1,2,"Los Angeles city, California",3895848.0,3832573.0,3822782.0,3820914.0
2,3,"Chicago city, Illinois",2743329.0,2704101.0,2672660.0,2664452.0
3,4,"Houston city, Texas",2299269.0,2291020.0,2302488.0,2314157.0
4,5,"Phoenix city, Arizona",1612459.0,1625187.0,1643899.0,1650070.0


In [3]:
# Print a brief summary of the city_info DataFrame: index range, column names,
# non-null counts, and dtypes. Columns whose non-null count is below the row
# count contain missing values that need handling later.
city_info_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1915 entries, 0 to 1914
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Rank          1915 non-null   object 
 1   City, State   1909 non-null   object 
 2   Est Pop 2020  1909 non-null   float64
 3   Est Pop 2021  1909 non-null   float64
 4   Est Pop 2022  1909 non-null   float64
 5   Est Pop 2023  1909 non-null   float64
dtypes: float64(4), object(2)
memory usage: 89.9+ KB


### Clean City Data

1) Create city and state columns
2) Change Dtypes
3) Update column names
4) Delete columns not needed
5) Create city ids
6) Clean to only NFL stadium cities

In [4]:
# Create the City and State columns from the combined "City, State" label.
#
# - rsplit(..., n=1) splits on the LAST comma only, so the result always has
#   exactly two parts even if a place name itself contains a comma (an
#   unbounded split would yield extra columns and break the assignment).
# - .str.strip() removes the space that follows the comma; without it every
#   State value carries a leading blank (" New York"), which breaks later
#   comparisons/joins on state names.
# Rows whose "City, State" is missing stay NaN in both new columns.
city_state_parts = city_info_df['City, State'].str.rsplit(',', n=1, expand=True)
city_info_df['City'] = city_state_parts[0].str.strip()
city_info_df['State'] = city_state_parts[1].str.strip()
city_info_df.head()

Unnamed: 0,Rank,"City, State",Est Pop 2020,Est Pop 2021,Est Pop 2022,Est Pop 2023,City,State
0,1,"New York city, New York",8740292.0,8462216.0,8335798.0,8258035.0,New York city,New York
1,2,"Los Angeles city, California",3895848.0,3832573.0,3822782.0,3820914.0,Los Angeles city,California
2,3,"Chicago city, Illinois",2743329.0,2704101.0,2672660.0,2664452.0,Chicago city,Illinois
3,4,"Houston city, Texas",2299269.0,2291020.0,2302488.0,2314157.0,Houston city,Texas
4,5,"Phoenix city, Arizona",1612459.0,1625187.0,1643899.0,1650070.0,Phoenix city,Arizona


In [5]:
# Keep only the 2023 population (the single year used in this analysis):
#   1. drop the combined label, the 2020-2022 estimates, and Rank,
#   2. put the remaining columns in City / State / population order,
#   3. rename the 2023 estimate to "Population".
unused_columns = ['City, State', 'Est Pop 2020', 'Est Pop 2021', 'Est Pop 2022', 'Rank']
city_info_df = (
    city_info_df
    .drop(columns=unused_columns)
    .loc[:, ['City', 'State', 'Est Pop 2023']]
    .rename(columns={'Est Pop 2023': 'Population'})
)

city_info_df

Unnamed: 0,City,State,Population
0,New York city,New York,8258035.0
1,Los Angeles city,California,3820914.0
2,Chicago city,Illinois,2664452.0
3,Houston city,Texas,2314157.0
4,Phoenix city,Arizona,1650070.0
...,...,...,...
1910,,,
1911,,,
1912,,,
1913,,,


In [6]:
# Discard every row that has a missing value in any column
city_info_df = city_info_df.dropna(axis=0, how='any')
city_info_df

Unnamed: 0,City,State,Population
0,New York city,New York,8258035.0
1,Los Angeles city,California,3820914.0
2,Chicago city,Illinois,2664452.0
3,Houston city,Texas,2314157.0
4,Phoenix city,Arizona,1650070.0
...,...,...,...
1904,Plainview city,Texas,19420.0
1905,Harvey city,Illinois,19397.0
1906,Ypsilanti city,Michigan,19393.0
1907,Albany city,California,19097.0


In [7]:
# Strip the trailing " city" designation from each city name
# (e.g. "New York city" -> "New York"); the pattern is anchored to the end of
# the string, so "city" elsewhere in a name is left alone.
city_names_without_suffix = city_info_df['City'].str.replace(r'\s+city$', '', regex=True)
city_info_df.loc[:, 'City'] = city_names_without_suffix
city_info_df

Unnamed: 0,City,State,Population
0,New York,New York,8258035.0
1,Los Angeles,California,3820914.0
2,Chicago,Illinois,2664452.0
3,Houston,Texas,2314157.0
4,Phoenix,Arizona,1650070.0
...,...,...,...
1904,Plainview,Texas,19420.0
1905,Harvey,Illinois,19397.0
1906,Ypsilanti,Michigan,19393.0
1907,Albany,California,19097.0


In [8]:
# Filter for NFL cities
#
# Keeps only the rows of city_info_df whose City value exactly matches one of
# the names below (isin does exact, case-sensitive matching).
#
# NOTE(review): there is no comma after "Charlotte". Python concatenates
# adjacent string literals, so that entry and the next one become the single
# name "CharlotteChicago", which matches no row. As a result neither Charlotte
# nor Chicago survives this filter; both are added back by hand in a later
# cell. Adding the comma here changes the number of filtered rows, so the
# positional city_id list and the manual row additions further down must be
# updated in the same change.
filtered_cities = ["Glendale",
                   "Atlanta",
                   "Baltimore",
                   "Buffalo",
                   "Charlotte"  # <- missing comma: merges with "Chicago" below
                   "Chicago",
                   "Cincinnati",
                   "Cleveland",
                   "Arlington",
                   "Denver",
                   "Detroit",
                   "Green Bay",
                   "Houston",
                   "Indianapolis",
                   "Jacksonville",
                   "Kansas City",
                   "Las Vegas",
                   "Los Angeles",
                   "Miami",
                   "Minneapolis",
                   "Boston",
                   "New Orleans",
                   "New York",
                   "Philadelphia",
                   "Pittsburgh",
                   "San Francisco",
                   "Seattle",
                   "Tampa",
                   "Nashville",
                   "Washington"]

# Several names occur in more than one state, so this result can still contain
# same-named cities; those are removed in the next cell.
filtered_df = city_info_df[city_info_df['City'].isin(filtered_cities)]
filtered_df

Unnamed: 0,City,State,Population
0,New York,New York,8258035.0
1,Los Angeles,California,3820914.0
3,Houston,Texas,2314157.0
5,Philadelphia,Pennsylvania,1550542.0
9,Jacksonville,Florida,985843.0
16,San Francisco,California,808988.0
17,Seattle,Washington,755078.0
18,Denver,Colorado,716577.0
21,Washington,District of Columbia,678972.0
23,Las Vegas,Nevada,660929.0


In [9]:
# Remove the rows for same-named cities that matched the filter, identified by
# their index labels in the Census table, then renumber the remaining rows from 0.
# NOTE(review): these labels are tied to the row order of the current source
# file -- re-check them whenever the input data is refreshed.
duplicate_city_labels = [137, 174, 514, 820, 1190, 1360, 1801]
filtered_df = filtered_df.drop(index=duplicate_city_labels).reset_index(drop=True)

filtered_df

Unnamed: 0,City,State,Population
0,New York,New York,8258035.0
1,Los Angeles,California,3820914.0
2,Houston,Texas,2314157.0
3,Philadelphia,Pennsylvania,1550542.0
4,Jacksonville,Florida,985843.0
5,San Francisco,California,808988.0
6,Seattle,Washington,755078.0
7,Denver,Colorado,716577.0
8,Washington,District of Columbia,678972.0
9,Las Vegas,Nevada,660929.0


In [10]:
# Work on an independent copy so the columns and rows added below
# do not modify filtered_df.
nfl_city_cleaned_df = filtered_df.copy(deep=True)
nfl_city_cleaned_df

Unnamed: 0,City,State,Population
0,New York,New York,8258035.0
1,Los Angeles,California,3820914.0
2,Houston,Texas,2314157.0
3,Philadelphia,Pennsylvania,1550542.0
4,Jacksonville,Florida,985843.0
5,San Francisco,California,808988.0
6,Seattle,Washington,755078.0
7,Denver,Colorado,716577.0
8,Washington,District of Columbia,678972.0
9,Las Vegas,Nevada,660929.0


In [11]:
# Assign each NFL city its city_id by NAME rather than by row position.
#
# A positional list of ids only lines up while the frame has exactly the same
# rows in exactly the same order; any change upstream (different filter result,
# refreshed Census ranking) would silently attach ids to the wrong cities or
# fail with an opaque length-mismatch error. Looking the id up by city name
# gives the same result for the current data and stays correct if rows move.
#
# Ids 201-230 follow the order of the NFL city list used for filtering.
# NOTE(review): pairs for cities not visible in the truncated cell outputs were
# derived from that ordering -- confirm against the project's id table.
city_ids_by_name = {
    "Glendale": 201,
    "Atlanta": 202,
    "Baltimore": 203,
    "Buffalo": 204,
    "Charlotte": 205,
    "Chicago": 206,
    "Cincinnati": 207,
    "Cleveland": 208,
    "Arlington": 209,
    "Denver": 210,
    "Detroit": 211,
    "Green Bay": 212,
    "Houston": 213,
    "Indianapolis": 214,
    "Jacksonville": 215,
    "Kansas City": 216,
    "Las Vegas": 217,
    "Los Angeles": 218,
    "Miami": 219,
    "Minneapolis": 220,
    "Boston": 221,
    "New Orleans": 222,
    "New York": 223,
    "Philadelphia": 224,
    "Pittsburgh": 225,
    "San Francisco": 226,
    "Seattle": 227,
    "Tampa": 228,
    "Nashville": 229,
    "Washington": 230,
}

looked_up_ids = nfl_city_cleaned_df['City'].map(city_ids_by_name)

# Fail loudly (and name the offenders) instead of writing NaN ids.
cities_without_id = nfl_city_cleaned_df.loc[looked_up_ids.isna(), 'City'].tolist()
if cities_without_id:
    raise ValueError(f"No city_id defined for: {cities_without_id}")

nfl_city_cleaned_df['city_id'] = looked_up_ids.astype('int64')

nfl_city_cleaned_df

Unnamed: 0,City,State,Population,city_id
0,New York,New York,8258035.0,223
1,Los Angeles,California,3820914.0,218
2,Houston,Texas,2314157.0,213
3,Philadelphia,Pennsylvania,1550542.0,224
4,Jacksonville,Florida,985843.0,215
5,San Francisco,California,808988.0,226
6,Seattle,Washington,755078.0,227
7,Denver,Colorado,716577.0,210
8,Washington,District of Columbia,678972.0,230
9,Las Vegas,Nevada,660929.0,217


In [12]:
# Add the NFL cities that the name filter did not return.
#
# Charlotte and Chicago are absent because the two names are accidentally
# merged into one string in the filter list (missing comma between them).
# Indianapolis and Nashville presumably fail the exact-name match because
# their Census place names are longer than the plain city name -- TODO confirm.
#
# Each row is [City, State, Population, city_id]. A row is appended only when
# its city_id is not already present, so re-running this cell (or fixing the
# filter upstream) does not create duplicate cities.
missing_city_rows = [
    ["Charlotte", "North Carolina", 911311, 205],
    ["Chicago", "Illinois", 2664452, 206],
    ["Indianapolis", "Indiana", 879293, 214],
    ["Nashville", "Tennessee", 687788, 229],
]

for city_row in missing_city_rows:
    id_already_present = (nfl_city_cleaned_df['city_id'] == city_row[3]).any()
    if not id_already_present:
        # The frame has a 0..n-1 index, so label len(df) is the next free row.
        nfl_city_cleaned_df.loc[len(nfl_city_cleaned_df)] = city_row

nfl_city_cleaned_df

Unnamed: 0,City,State,Population,city_id
0,New York,New York,8258035.0,223
1,Los Angeles,California,3820914.0,218
2,Houston,Texas,2314157.0,213
3,Philadelphia,Pennsylvania,1550542.0,224
4,Jacksonville,Florida,985843.0,215
5,San Francisco,California,808988.0,226
6,Seattle,Washington,755078.0,227
7,Denver,Colorado,716577.0,210
8,Washington,District of Columbia,678972.0,230
9,Las Vegas,Nevada,660929.0,217


In [13]:
# Order the rows by ascending city_id (original index labels are kept for now)
nfl_city_cleaned_df_sorted = nfl_city_cleaned_df.sort_values(by=['city_id'], ascending=True)
nfl_city_cleaned_df_sorted

Unnamed: 0,City,State,Population,city_id
24,Glendale,Arizona,253855.0,201
13,Atlanta,Georgia,510823.0,202
12,Baltimore,Maryland,565239.0,203
23,Buffalo,New York,274678.0,204
26,Charlotte,North Carolina,911311.0,205
27,Chicago,Illinois,2664452.0,206
21,Cincinnati,Ohio,311097.0,207
20,Cleveland,Ohio,362656.0,208
18,Arlington,Texas,398431.0,209
7,Denver,Colorado,716577.0,210


In [15]:
# Put city_id first, then renumber the rows 0..n-1 so the index follows the
# sorted order instead of the pre-sort row labels.
final_column_order = ['city_id', 'City', 'State', 'Population']
nfl_city_cleaned_df_sorted = nfl_city_cleaned_df_sorted.loc[:, final_column_order]
nfl_city_cleaned_df_sorted_reset = nfl_city_cleaned_df_sorted.reset_index(drop=True)
nfl_city_cleaned_df_sorted_reset

Unnamed: 0,city_id,City,State,Population
0,201,Glendale,Arizona,253855.0
1,202,Atlanta,Georgia,510823.0
2,203,Baltimore,Maryland,565239.0
3,204,Buffalo,New York,274678.0
4,205,Charlotte,North Carolina,911311.0
5,206,Chicago,Illinois,2664452.0
6,207,Cincinnati,Ohio,311097.0
7,208,Cleveland,Ohio,362656.0
8,209,Arlington,Texas,398431.0
9,210,Denver,Colorado,716577.0


In [16]:
# Save the cleaned, sorted NFL city table to a CSV file in the current working
# directory. index=False leaves the DataFrame index out of the file, so the
# columns written are city_id, City, State, Population.
nfl_city_cleaned_df_sorted_reset.to_csv('updated_nfl_cities.csv', index=False)