In [2]:
import pandas as pd
from pathlib import Path
import requests
from datetime import datetime

In [3]:
#Store the filepath for each csv file into a variable

state_path = "MapData.csv"
population_path = "SUB-IP-EST2023-POP.csv"

In [4]:
#Read the csv files using pandas

state = pd.read_csv(state_path)
population = pd.read_csv(population_path)

In [5]:
#Store each csv file in a DataFrame

state_df = pd.DataFrame(state)
pop_df = pd.DataFrame(population)

In [6]:
#Removed some of the unnecessary columns by specifying the columns to keep.

state_df = state_df.loc[:, ['Geography', 'Estimate (Consumer expenditures per household on Entertainment / Recreation)', 'Estimate (Percent working age (25 to 64 years))']]
pop_df = pop_df.loc[:, ['City', 'State', 'avg_population']]

state_df.rename(columns={
    'Geography': 'State',
    'Estimate (Consumer expenditures per household on Entertainment / Recreation)' : 'Entertainment/Recreation (per household)',
    'Estimate (Percent working age (25 to 64 years))' : '% of Working age(25-64)',
    
}, inplace=True)




In [7]:
state_df.head()

Unnamed: 0,State,Entertainment/Recreation (per household),% of Working age(25-64)
0,Alabama,3050.64,50.9
1,Arizona,3641.95,49.9
2,Arkansas,2973.2,50.1
3,California,4509.62,53.3
4,Colorado,4266.24,54.2


In [8]:
pop_df.head()

Unnamed: 0,City,State,avg_population
0,Abbeville city,Alabama,2362
1,Adamsville city,Alabama,4262
2,Addison town,Alabama,666
3,Akron town,Alabama,226
4,Alabaster city,Alabama,33779


In [9]:
#The data in the avg_population column was listed as a string. We had to clean the data by changing the data type before being able to sort.

pop_df['avg_population'] = pop_df['avg_population'].str.replace(',', '').astype(float)

In [10]:
#This is where we found that there was a blank space before each state name in the pop_df
unique_states = pop_df['State'].unique()
unique_states

array([' Alabama', ' Alaska', ' Arizona', ' Arkansas', ' California',
       ' Colorado', ' Connecticut', ' Delaware', ' District of Columbia',
       ' Florida', ' Village of Islands village', ' Georgia', ' Hawaii',
       ' Idaho', ' Illinois', ' Indiana', ' Iowa', ' Kansas', ' Kentucky',
       ' Louisiana', ' Maine', ' Maryland', ' Massachusetts', ' Michigan',
       ' Minnesota', ' Mississippi', ' Missouri', ' Montana', ' Nebraska',
       ' Nevada', ' New Hampshire', ' New Jersey', ' New Mexico',
       ' New York', ' North Carolina', ' North Dakota', ' Ohio',
       ' Oklahoma', ' Oregon', ' Pennsylvania', ' Rhode Island',
       ' South Carolina', ' South Dakota', ' Tennessee',
       ' Moore County metropolitan government', ' Texas', ' Utah',
       ' Vermont', ' Virginia', ' Washington', ' West Virginia',
       ' Wisconsin', ' Wyoming'], dtype=object)

In [11]:
#Cleaned the States column in the pop_df using the .str.strip() method

pop_df['State'] = pop_df['State'].str.strip()
unique_states_cleaned = pop_df['State'].unique()
unique_states_cleaned

array(['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California',
       'Colorado', 'Connecticut', 'Delaware', 'District of Columbia',
       'Florida', 'Village of Islands village', 'Georgia', 'Hawaii',
       'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky',
       'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan',
       'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska',
       'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York',
       'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon',
       'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota',
       'Tennessee', 'Moore County metropolitan government', 'Texas',
       'Utah', 'Vermont', 'Virginia', 'Washington', 'West Virginia',
       'Wisconsin', 'Wyoming'], dtype=object)

In [12]:
#Finding the data for one specific state to verify.
alabama_cities = pop_df[pop_df['State'] == 'Alabama']
alabama_cities_sorted = alabama_cities.sort_values(by='avg_population', ascending=False)
alabama_cities_sorted.head()



Unnamed: 0,City,State,avg_population
215,Huntsville city,Alabama,220695.0
42,Birmingham city,Alabama,197945.0
282,Montgomery city,Alabama,197683.0
279,Mobile city,Alabama,184421.0
422,Tuscaloosa city,Alabama,108802.0


In [13]:
first_city_name = alabama_cities_sorted.iloc[0]['City']
print("The city in Alabama with the highest population is:", first_city_name)

The city in Alabama with the highest population is: Huntsville city


In [14]:
#Grouped the data by the states column and finds the index of the row with the maximum average population value for each group.

max_population_cities = pop_df.loc[pop_df.groupby('State')['avg_population'].idxmax()]
max_population_cities.head()

Unnamed: 0,City,State,avg_population
215,Huntsville city,Alabama,220695.0
471,Anchorage municipality,Alaska,288280.0
665,Phoenix city,Arizona,1632904.0
978,Little Rock city,Arkansas,202970.0
1442,Los Angeles city,California,3843029.0


In [15]:
max_population_cities.to_csv('max_population_cities.csv', index=False)