# VA City Coordinate Scraping

In [91]:
import requests
from typing import Any

def fetch(url: str) -> requests.Response:
    """Download ``url`` with an HTTP GET and return the response object.

    The request carries a browser-like ``User-Agent`` header and uses a
    10-second timeout.

    Raises:
        requests.HTTPError: if the reply has an error status
            (raised by ``raise_for_status``).
    """
    request_headers = {"User-Agent": "Mozilla/5.0"}
    reply = requests.get(url, headers=request_headers, timeout=10)
    # Fail loudly on error statuses instead of handing back an error page.
    reply.raise_for_status()
    return reply

In [92]:
from bs4 import BeautifulSoup
import pandas as pd
from io import StringIO

def parse_locations(html: str) -> pd.DataFrame:
    """Extract the coordinate table from the locations page markup.

    Collects every ``<table class="tableizer-table">`` element found in
    ``html``, passes their combined markup to pandas, and returns the
    second table pandas parses out of it.
    """
    tables = BeautifulSoup(html, "html.parser").find_all(
        "table", class_="tableizer-table"
    )
    # str() of the whole result set keeps all matched tables in one document.
    parsed = pd.read_html(StringIO(str(tables)))
    return parsed[1]

def parse_populations(html: str) -> pd.DataFrame:
    """Extract the population table from the populations page markup.

    Looks up the element whose ``id`` is ``"ts"`` in ``html`` and returns
    the first table pandas parses out of that element's markup.
    """
    element = BeautifulSoup(html, "html.parser").find(id="ts")
    parsed = pd.read_html(StringIO(str(element)))
    return parsed[0]

In [93]:
# Source pages: place coordinates and place population figures for Virginia.
LOCATIONS_URL = "https://www.mapsofworld.com/usa/states/virginia/lat-long.html"
POPULATION_URL = "https://www.citypopulation.de/en/usa/cities/virginia/"

# Download each page and parse its table into a DataFrame.
locations_html = fetch(LOCATIONS_URL).text
locations_df = parse_locations(locations_html)

populations_html = fetch(POPULATION_URL).text
populations_df = parse_populations(populations_html)

# Rich display renders both previews as tables; print() wraps the wide
# populations frame across several text blocks.  `display` is provided by
# the IPython/Jupyter kernel, so no import is needed inside a notebook.
display(locations_df.head(), populations_df.head())

          Location  Latitude  Longitude
0    Abingdon town     36.71     -81.97
1     Accomac town     37.72     -75.67
2           Adwolf     36.79     -81.59
3     Alberta town     36.86     -77.89
4  Alexandria city     38.82     -77.09
                  Name Status  Population Census (C) 1990-04-01  \
0           Alexandria   City                            111183   
1            Annandale    CDP                             34582   
2            Arlington   Cnty                            170895   
3              Ashburn    CDP                              2128   
4  Bailey's Crossroads    CDP                             19507   

   Population Census (C) 2000-04-01  Population Census (C) 2010-04-01  \
0                            128278                            139998   
1                             38212                             41008   
2                            189211                            207627   
3                             25279                             3

## Locations

In [94]:
# Check the column dtypes pandas inferred for the locations table.
locations_df.dtypes

Location      object
Latitude     float64
Longitude    float64
dtype: object

In [95]:
def isolate_name(name: str) -> str:
    """Strip a trailing lowercase ``city``/``town`` designation from ``name``.

    The name is split on whitespace; when the final word is exactly
    ``"city"`` or ``"town"``, the remaining words are re-joined with single
    spaces and returned.  Any other name is returned unchanged.
    """
    words = name.split()
    # Only the lowercase suffixes are designations; per the data,
    # capitalised "City" is part of the place name and must stay.
    if words and words[-1] in ("city", "town"):
        return " ".join(words[:-1])
    return name

In [96]:
# Remove the trailing "city"/"town" designation from every location name.
locations_df['Location'] = locations_df['Location'].apply(isolate_name)

In [97]:
# Count how often each name occurs; counts above 1 reveal duplicate names.
locations_df['Location'].value_counts()

Location
Rose Hill        2
Wakefield        2
Woodlawn         2
Belle Haven      2
Oak Level        1
                ..
Floyd            1
Forest           1
Fort Belvoir     1
Fort Chiswell    1
Ferrum           1
Name: count, Length: 588, dtype: int64

In [98]:
# Show every row whose name occurs more than once (keep=False flags all occurrences).
locations_df[locations_df['Location'].duplicated(keep=False)]

Unnamed: 0,Location,Latitude,Longitude
29,Belle Haven,38.78,-77.06
30,Belle Haven,37.56,-75.83
459,Rose Hill,38.79,-77.11
460,Rose Hill,36.67,-83.38
557,Wakefield,38.82,-77.24
558,Wakefield,36.97,-76.99
584,Woodlawn,38.73,-77.11
585,Woodlawn,36.74,-80.82


In [99]:
# Keep only the first row for each name (drop_duplicates defaults to keep='first').
# NOTE(review): the duplicated names listed in the previous output have
# different coordinates, so they look like distinct places that share a
# name — confirm that keeping the first occurrence is acceptable.
locations_df = locations_df.drop_duplicates(subset=['Location'])

In [100]:
# Count distinct (Location, Latitude, Longitude) rows to check the result of de-duplication.
locations_df.value_counts()

Location    Latitude  Longitude
Abingdon    36.71     -81.97       1
Accomac     37.72     -75.67       1
Adwolf      36.79     -81.59       1
Alberta     36.86     -77.89       1
Alexandria  38.82     -77.09       1
                                  ..
Wyndham     37.69     -77.61       1
Wytheville  36.95     -81.09       1
Yogaville   37.67     -78.69       1
Yorkshire   38.79     -77.45       1
Yorktown    37.24     -76.51       1
Name: count, Length: 588, dtype: int64

In [101]:
# Round the numeric columns (Latitude, Longitude) to one decimal place.
# NOTE(review): the scraped coordinates carry two decimals; one decimal is a
# 0.1-degree grid.  Later output maps Hampton to zone ANZ632 (and one other
# city to ANZ638) while the remaining cities get VAZ zones — possibly a side
# effect of this rounding.  Confirm the coarser precision is intended.
locations_df = locations_df.round(1)

## Populations

In [102]:
# Check the column dtypes pandas inferred for the populations table.
populations_df.dtypes

Name                                  object
Status                                object
Population Census (C) 1990-04-01       int64
Population Census (C) 2000-04-01       int64
Population Census (C) 2010-04-01       int64
Population Census (C) 2020-04-01       int64
Population Estimate (E) 2024-07-01     int64
Area                                   int64
Unnamed: 8                            object
dtype: object

In [103]:
# List the exact column labels of the populations table.
populations_df.columns

Index(['Name', 'Status', 'Population Census (C) 1990-04-01',
       'Population Census (C) 2000-04-01', 'Population Census (C) 2010-04-01',
       'Population Census (C) 2020-04-01',
       'Population Estimate (E) 2024-07-01', 'Area', 'Unnamed: 8'],
      dtype='object')

In [104]:
# Keep only the place name and the 2024 population estimate.  Selecting the
# two wanted columns (instead of dropping every other column by label) keeps
# this cell working if the scraped table gains or renames other columns, and
# avoids the KeyError the drop raised when the cell was run a second time.
populations_df = populations_df[['Name', 'Population Estimate (E) 2024-07-01']].copy()

populations_df.head()

Unnamed: 0,Name,Population Estimate (E) 2024-07-01
0,Alexandria,159102
1,Annandale,44360
2,Arlington,251820
3,Ashburn,49240
4,Bailey's Crossroads,25220


In [105]:
# Shorter labels; 'Location' matches the key column of the locations table.
populations_df = populations_df.rename(
    columns={
        'Name': 'Location',
        'Population Estimate (E) 2024-07-01': 'Population Estimate',
    }
)

In [106]:
# Confirm the column labels after renaming.
populations_df.columns

Index(['Location', 'Population Estimate'], dtype='object')

In [107]:
# Inner join: keep only places present in both tables under the same name.
merged_df = pd.merge(
    left=locations_df,
    right=populations_df,
    on='Location',
    how='inner',
)
merged_df.head(20)

Unnamed: 0,Location,Latitude,Longitude,Population Estimate
0,Alexandria,38.8,-77.1,159102
1,Annandale,38.8,-77.2,44360
2,Arlington,38.9,-77.1,251820
3,Ashburn,39.0,-77.5,49240
4,Blacksburg,37.2,-80.4,45452
5,Brambleton,39.0,-77.5,29280
6,Buckhall,38.7,-77.4,22170
7,Burke,38.8,-77.3,42850
8,Cave Spring,37.2,-80.0,27530
9,Centreville,38.8,-77.4,74530


In [None]:
# Number of most-populous places to keep.
NUM_CITIES = 40

# Rank by population estimate, largest first, and keep the top NUM_CITIES rows.
cities_df = (
    merged_df
    .sort_values(by='Population Estimate', ascending=False)
    .head(NUM_CITIES)
)

In [109]:
# Inspect the selected rows; the index still carries the row labels from merged_df.
cities_df

Unnamed: 0,Location,Latitude,Longitude,Population Estimate
60,Virginia Beach,36.8,-76.0,454808
13,Chesapeake,36.7,-76.3,254997
2,Arlington,38.9,-77.1,251820
48,Richmond,37.5,-77.5,233655
43,Norfolk,36.9,-76.2,231105
42,Newport News,37.1,-76.5,183056
0,Alexandria,38.8,-77.1,159102
25,Hampton,37.0,-76.3,137596
58,Suffolk,36.7,-76.6,103105
49,Roanoke,37.3,-80.0,97912


In [110]:
# Renumber the rows 0..n-1 after sorting.  Reassigning the result instead of
# using inplace=True avoids mutating a frame derived from merged_df in place
# and keeps the step consistent with the other reassigning cells.
cities_df = cities_df.reset_index(drop=True)

In [111]:
# Re-display to confirm the index now starts at 0.
cities_df

Unnamed: 0,Location,Latitude,Longitude,Population Estimate
0,Virginia Beach,36.8,-76.0,454808
1,Chesapeake,36.7,-76.3,254997
2,Arlington,38.9,-77.1,251820
3,Richmond,37.5,-77.5,233655
4,Norfolk,36.9,-76.2,231105
5,Newport News,37.1,-76.5,183056
6,Alexandria,38.8,-77.1,159102
7,Hampton,37.0,-76.3,137596
8,Suffolk,36.7,-76.6,103105
9,Roanoke,37.3,-80.0,97912


In [112]:
# Population was only needed for ranking, so leave it out of the output frame.
cities_df = cities_df.drop(columns=['Population Estimate'])
cities_df

Unnamed: 0,Location,Latitude,Longitude
0,Virginia Beach,36.8,-76.0
1,Chesapeake,36.7,-76.3
2,Arlington,38.9,-77.1
3,Richmond,37.5,-77.5
4,Norfolk,36.9,-76.2
5,Newport News,37.1,-76.5
6,Alexandria,38.8,-77.1
7,Hampton,37.0,-76.3
8,Suffolk,36.7,-76.6
9,Roanoke,37.3,-80.0


## NOAA Regions
The NOAA weather alerts API is rate-limited, so cities are grouped by NOAA forecast zone: requesting data once per zone instead of once per city lowers the total number of API requests per hour.

In [113]:
# Base URL of the NOAA points endpoint; "<latitude>,<longitude>" is appended to it.
NOAA_POINTS_URL = 'https://api.weather.gov/points/'

In [114]:
# First row as a plain (name, latitude, longitude) tuple, used for a sample request.
location = tuple(cities_df.iloc[0])
location

('Virginia Beach', np.float64(36.8), np.float64(-76.0))

In [115]:
# Sample request for the first city, printed to inspect the shape of the reply.
# NOTE(review): the name `json` would shadow the standard-library module if
# that were ever imported; it is left as is because the following cell reads
# this variable.
json = fetch(f"{NOAA_POINTS_URL}{location[1]},{location[2]}").json()
print(f"JSON for {location[0]}:\n{json}")

JSON for Virginia Beach:
{'@context': ['https://geojson.org/geojson-ld/geojson-context.jsonld', {'@version': '1.1', 'wx': 'https://api.weather.gov/ontology#', 's': 'https://schema.org/', 'geo': 'http://www.opengis.net/ont/geosparql#', 'unit': 'http://codes.wmo.int/common/unit/', '@vocab': 'https://api.weather.gov/ontology#', 'geometry': {'@id': 's:GeoCoordinates', '@type': 'geo:wktLiteral'}, 'city': 's:addressLocality', 'state': 's:addressRegion', 'distance': {'@id': 's:Distance', '@type': 's:QuantitativeValue'}, 'bearing': {'@type': 's:QuantitativeValue'}, 'value': {'@id': 's:value'}, 'unitCode': {'@id': 's:unitCode', '@type': '@id'}, 'forecastOffice': {'@type': '@id'}, 'forecastGridData': {'@type': '@id'}, 'publicZone': {'@type': '@id'}, 'county': {'@type': '@id'}}], 'id': 'https://api.weather.gov/points/36.8,-76', 'type': 'Feature', 'geometry': {'type': 'Point', 'coordinates': [-76, 36.8]}, 'properties': {'@id': 'https://api.weather.gov/points/36.8,-76', '@type': 'wx:Point', 'cwa': 

In [116]:
# Get the forecast zone ID: split properties.forecastZone on '/' and keep the last segment.
json['properties']['forecastZone'].split('/')[-1]

'VAZ098'

In [117]:
def get_forecast_zone(row: pd.Series) -> str:
    """Return the NOAA forecast zone ID for one city row.

    Requests the NOAA points endpoint for the row's ``Latitude`` and
    ``Longitude`` values and returns the last ``/``-separated segment of
    ``properties.forecastZone`` from the JSON reply.
    """
    points_url = f"{NOAA_POINTS_URL}{row['Latitude']},{row['Longitude']}"
    payload = fetch(points_url).json()
    zone_ref = payload['properties']['forecastZone']
    return zone_ref.rsplit('/', 1)[-1]

# Look up the zone for every city; apply(axis=1) calls get_forecast_zone
# once per row, i.e. one HTTP request per city.
cities_df['NOAA Forecast Zone'] = cities_df.apply(get_forecast_zone, axis=1)
cities_df

Unnamed: 0,Location,Latitude,Longitude,NOAA Forecast Zone
0,Virginia Beach,36.8,-76.0,VAZ098
1,Chesapeake,36.7,-76.3,VAZ097
2,Arlington,38.9,-77.1,VAZ054
3,Richmond,37.5,-77.5,VAZ515
4,Norfolk,36.9,-76.2,VAZ095
5,Newport News,37.1,-76.5,VAZ524
6,Alexandria,38.8,-77.1,VAZ053
7,Hampton,37.0,-76.3,ANZ632
8,Suffolk,36.7,-76.6,VAZ096
9,Roanoke,37.3,-80.0,VAZ022


In [119]:
# Number of cities that fall into each forecast zone.
cities_df['NOAA Forecast Zone'].value_counts()

NOAA Forecast Zone
VAZ053    8
VAZ506    5
VAZ527    3
VAZ515    3
VAZ022    2
VAZ054    1
VAZ097    1
VAZ098    1
ANZ632    1
VAZ524    1
VAZ095    1
VAZ096    1
VAZ045    1
ANZ638    1
VAZ026    1
VAZ505    1
VAZ014    1
VAZ037    1
VAZ526    1
VAZ044    1
VAZ516    1
VAZ081    1
VAZ056    1
VAZ028    1
Name: count, dtype: int64

In [122]:
# Number of distinct forecast zones among the selected cities.
len(cities_df['NOAA Forecast Zone'].unique())

24

Using forecast zones cuts the number of API requests from 40 (one per city) to 24 (one per unique zone), a 40% reduction.

## Saving CSV

In [118]:
from pathlib import Path

# Output location: <folder_name>/<file_name> one level above the working directory.
folder_name = 'data'
file_name = 'cities.csv'

# Create the target folder (and any missing parents) if it does not exist yet.
folder = Path('..', folder_name)
folder.mkdir(parents=True, exist_ok=True)

# Write the frame without the index column.
file_path = folder.joinpath(file_name)
cities_df.to_csv(file_path, index=False)