# VA City Coordinate Scraping

In [97]:
import requests

def fetch(url: str) -> str:
    """Download ``url`` and return the response body as text.

    The request carries a browser-like ``User-Agent`` header and a
    10-second timeout.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded body of the HTTP response.

    Raises:
        requests.HTTPError: raised by ``raise_for_status`` when the server
            answers with an error status.
    """
    browser_headers = {"User-Agent": "Mozilla/5.0"}
    reply = requests.get(url, headers=browser_headers, timeout=10)
    reply.raise_for_status()
    return reply.text

In [98]:
from bs4 import BeautifulSoup
import pandas as pd
from io import StringIO

def parse_locations(html: str) -> pd.DataFrame:
    """Extract the coordinate table from the locations page.

    Every ``<table class="tableizer-table">`` element is collected, the
    collection is rendered back to a string, and pandas parses that string.

    Args:
        html: Raw HTML of the locations page.

    Returns:
        The second DataFrame (index 1) produced by ``pd.read_html``.
    """
    matching_tables = BeautifulSoup(html, "html.parser").find_all(
        "table", class_="tableizer-table"
    )
    parsed_frames = pd.read_html(StringIO(str(matching_tables)))
    # NOTE(review): index 1 is hard-coded; presumably the first matching table
    # is not the data table -- confirm against the live page.
    return parsed_frames[1]

def parse_populations(html: str) -> pd.DataFrame:
    """Extract the population table from the population page.

    Looks up the element whose ``id`` is ``"ts"`` and lets pandas parse it.

    Args:
        html: Raw HTML of the population page.

    Returns:
        The first DataFrame produced by ``pd.read_html`` from that element.

    Raises:
        ValueError: if the page contains no element with ``id="ts"``.
    """
    table = BeautifulSoup(html, "html.parser").find(id="ts")
    if table is None:
        # Without this guard str(None) == "None" would be handed to pandas,
        # hiding the real cause (the page layout changed) behind a parser error.
        raise ValueError('no element with id="ts" found in the population page')
    return pd.read_html(StringIO(str(table)))[0]

In [99]:
# Source pages: one with a latitude/longitude table, one with a population table.
LOCATIONS_URL = "https://www.mapsofworld.com/usa/states/virginia/lat-long.html"
POPULATION_URL = "https://www.citypopulation.de/en/usa/cities/virginia/"

# Download each page, then parse it into a DataFrame.
locations_html = fetch(LOCATIONS_URL)
locations_df = parse_locations(locations_html)

populations_html = fetch(POPULATION_URL)
populations_df = parse_populations(populations_html)

# Quick look at the first rows of both frames.
print(locations_df.head())
print(populations_df.head())

          Location  Latitude  Longitude
0    Abingdon town     36.71     -81.97
1     Accomac town     37.72     -75.67
2           Adwolf     36.79     -81.59
3     Alberta town     36.86     -77.89
4  Alexandria city     38.82     -77.09
                  Name Status  Population Census (C) 1990-04-01  \
0           Alexandria   City                            111183   
1            Annandale    CDP                             34582   
2            Arlington   Cnty                            170895   
3              Ashburn    CDP                              2128   
4  Bailey's Crossroads    CDP                             19507   

   Population Census (C) 2000-04-01  Population Census (C) 2010-04-01  \
0                            128278                            139998   
1                             38212                             41008   
2                            189211                            207627   
3                             25279                             3

## Locations

In [100]:
# Inspect the column dtypes of the parsed locations table.
locations_df.dtypes

Unnamed: 0,0
Location,object
Latitude,float64
Longitude,float64


In [101]:
def isolate_name(name: str) -> str:
    """Strip a trailing lowercase ``city`` / ``town`` designation from a name.

    The comparison is case-sensitive on purpose: a lowercase ``city`` is only
    a designation, whereas a capitalised ``City`` is part of the place name
    itself and is left alone.

    Args:
        name: Place name, optionally followed by ``city`` or ``town``.

    Returns:
        The remaining words joined by single spaces when the last word is
        ``city`` or ``town``; otherwise ``name`` exactly as given.
    """
    words = name.split()
    if words and words[-1] in ("city", "town"):
        return " ".join(words[:-1])
    return name

In [102]:
# Run isolate_name over every place name; this overwrites the Location column.
locations_df['Location'] = locations_df['Location'].apply(isolate_name)

In [103]:
# Count how often each name occurs to spot names that are no longer unique.
locations_df['Location'].value_counts()

Unnamed: 0_level_0,count
Location,Unnamed: 1_level_1
Rose Hill,2
Wakefield,2
Woodlawn,2
Belle Haven,2
Oak Level,1
...,...
Floyd,1
Forest,1
Fort Belvoir,1
Fort Chiswell,1


In [104]:
# Show every row whose name is shared with another row (keep=False flags all occurrences).
locations_df[locations_df['Location'].duplicated(keep=False)]

Unnamed: 0,Location,Latitude,Longitude
29,Belle Haven,38.78,-77.06
30,Belle Haven,37.56,-75.83
459,Rose Hill,38.79,-77.11
460,Rose Hill,36.67,-83.38
557,Wakefield,38.82,-77.24
558,Wakefield,36.97,-76.99
584,Woodlawn,38.73,-77.11
585,Woodlawn,36.74,-80.82


In [105]:
# Keep one row per place name; drop_duplicates keeps the first occurrence by default.
# NOTE(review): the duplicate-name rows shown by the previous cell have different
# coordinates, so they look like distinct places. Which one survives is decided
# only by row order -- confirm this is intended.
locations_df = locations_df.drop_duplicates(subset=['Location'])

In [106]:
# Count distinct full rows as a sanity check after de-duplication.
locations_df.value_counts()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count
Location,Latitude,Longitude,Unnamed: 3_level_1
Yorktown,37.24,-76.51,1
Abingdon,36.71,-81.97,1
Accomac,37.72,-75.67,1
Adwolf,36.79,-81.59,1
Alberta,36.86,-77.89,1
...,...,...,...
Arrington,37.67,-78.89,1
Arlington,38.88,-77.10,1
Arcola,38.94,-77.53,1
Aquia Harbour,38.46,-77.38,1


## Populations

In [107]:
# Inspect the column dtypes of the parsed population table.
populations_df.dtypes

Unnamed: 0,0
Name,object
Status,object
Population Census (C) 1990-04-01,int64
Population Census (C) 2000-04-01,int64
Population Census (C) 2010-04-01,int64
Population Census (C) 2020-04-01,int64
Population Estimate (E) 2024-07-01,int64
Area,int64
Unnamed: 8,object


In [108]:
# List the exact column labels so they can be referenced in the next step.
populations_df.columns

Index(['Name', 'Status', 'Population Census (C) 1990-04-01',
       'Population Census (C) 2000-04-01', 'Population Census (C) 2010-04-01',
       'Population Census (C) 2020-04-01',
       'Population Estimate (E) 2024-07-01', 'Area', 'Unnamed: 8'],
      dtype='object')

In [109]:
# Keep only the place name and the most recent population estimate.
# Selecting the wanted columns (instead of enumerating every unwanted one)
# no longer depends on the auto-generated 'Unnamed: 8' label or on the exact
# set of census columns the source page happens to publish.
populations_df = populations_df[[
    'Name',
    'Population Estimate (E) 2024-07-01',
]]

populations_df.head()

Unnamed: 0,Name,Population Estimate (E) 2024-07-01
0,Alexandria,159102
1,Annandale,44360
2,Arlington,251820
3,Ashburn,49240
4,Bailey's Crossroads,25220


In [110]:
# Use 'Location' as the name column so it matches locations_df, and give the
# estimate column a shorter label.
populations_df = populations_df.rename(
    columns={
        'Name': 'Location',
        'Population Estimate (E) 2024-07-01': 'Population Estimate',
    }
)

In [111]:
# Confirm the renamed column labels.
populations_df.columns

Index(['Location', 'Population Estimate'], dtype='object')

In [112]:
# Inner join on the place name: only names present in BOTH tables survive,
# so a spelling difference between the two sources silently drops the row.
merged_df = locations_df.merge(populations_df, on='Location', how='inner')
merged_df.head(20)

Unnamed: 0,Location,Latitude,Longitude,Population Estimate
0,Alexandria,38.82,-77.09,159102
1,Annandale,38.83,-77.2,44360
2,Arlington,38.88,-77.1,251820
3,Ashburn,39.03,-77.47,49240
4,Blacksburg,37.23,-80.43,45452
5,Brambleton,38.98,-77.53,29280
6,Buckhall,38.72,-77.45,22170
7,Burke,38.78,-77.26,42850
8,Cave Spring,37.23,-80.01,27530
9,Centreville,38.84,-77.44,74530


In [113]:
# Keep the 40 rows with the highest population estimate, largest first.
cities_df = (
    merged_df
    .sort_values(by='Population Estimate', ascending=False)
    .head(40)
)

In [114]:
# Display the selected rows; the index still carries the labels from merged_df.
cities_df

Unnamed: 0,Location,Latitude,Longitude,Population Estimate
60,Virginia Beach,36.78,-76.02,454808
13,Chesapeake,36.68,-76.3,254997
2,Arlington,38.88,-77.1,251820
48,Richmond,37.53,-77.48,233655
43,Norfolk,36.92,-76.25,231105
42,Newport News,37.08,-76.52,183056
0,Alexandria,38.82,-77.09,159102
25,Hampton,37.05,-76.3,137596
58,Suffolk,36.7,-76.63,103105
49,Roanoke,37.28,-79.96,97912


In [115]:
# Renumber the rows 0..n-1 after sorting. Reassigning (rather than using
# inplace=True) avoids mutating a frame that was derived from .head() of
# another frame and keeps the step a plain "new value from old value".
cities_df = cities_df.reset_index(drop=True)

In [116]:
# Display the frame again to confirm the index was reset.
cities_df

Unnamed: 0,Location,Latitude,Longitude,Population Estimate
0,Virginia Beach,36.78,-76.02,454808
1,Chesapeake,36.68,-76.3,254997
2,Arlington,38.88,-77.1,251820
3,Richmond,37.53,-77.48,233655
4,Norfolk,36.92,-76.25,231105
5,Newport News,37.08,-76.52,183056
6,Alexandria,38.82,-77.09,159102
7,Hampton,37.05,-76.3,137596
8,Suffolk,36.7,-76.63,103105
9,Roanoke,37.28,-79.96,97912


In [118]:
# Drop the population column and write the remaining columns to cities.csv
# (index=False leaves the row index out of the file).
cities_df_no_pop = cities_df.drop(columns='Population Estimate')
cities_df_no_pop.to_csv('cities.csv', index=False)