In [1]:
# Import some libraries
import numpy as np              # useful for many scientific computing in Python
import pandas as pd             # primary data structure library
from bs4 import BeautifulSoup   # For webscraping
import requests

In [2]:
# Load the data file.
# Force Code to load as a string because we want to preserve the leading "+".
# We use the "+" later to tell an international country code apart from a
# US/Canada area code.
# Note: use the built-in `str` here; the `np.string_` alias was removed in
# NumPy 2.0 and raises AttributeError there.
df = pd.read_csv("chat_area_codes_from_members.csv", dtype={'Code': str})
df.head()

Unnamed: 0,Code
0,223
1,233
2,31
3,33
4,34


In [3]:
# Check the data types: 'Code' should be reported as object (string), not a
# numeric type, otherwise the leading "+" would have been lost on load.
df.dtypes

Code    object
dtype: object

In [4]:
# Load the country codes from the URL
countr_code_url = "https://countrycode.org/"
response = requests.get(countr_code_url, timeout=30)  # Don't hang forever if the site is unresponsive
response.raise_for_status()                           # Fail here on an HTTP error instead of parsing an error page
soup = BeautifulSoup(response.content, "html.parser")
soup.title                                            # Display the page title as a quick sanity check

<title>Country Codes, Phone Codes, Dialing Codes, Telephone Codes, ISO Country Codes</title>

In [5]:
rows = soup.find("table").find("tbody").find_all("tr")  # Extract all rows from the table on the page
country_codes = {}                                      # Maps calling code (str, no "+") -> country name
for row in rows:                                        # Loop through all the rows
    cells = row.find_all("td")                          # Extract the cells from the row
    if len(cells) < 2:                                  # Skip rows that lack a country or a code cell
        continue
    country = cells[0].get_text().strip()               # First cell is the country
    code = cells[1].get_text().strip()                  # Second cell is the calling code
    country_codes[code] = country                       # Save code/country to the dictionary

# Note: the calling code is the key and the country is the value.  This is
# because we want to look up the country by code later on.  Surrounding
# whitespace is stripped so that a lookup such as country_codes['49'] matches.

In [6]:
# Peek at the first 5 entries of the dictionary as a sanity check
first_few = {key: country_codes[key] for key in list(country_codes)[:5]}
first_few

{'93': 'Afghanistan',
 '355': 'Albania',
 '213': 'Algeria',
 '1-684': 'American Samoa',
 '376': 'Andorra'}

In [7]:
# Load Canada geo data: one row per area code with its latitude/longitude.
# The area_code column is also what getCountry checks to recognise Canadian codes.
df_ca_area_codes = pd.read_csv("ca-area-code-geo.csv")
df_ca_area_codes.head()

Unnamed: 0,area_code,latitude,longitude
0,204,51.203034,-98.729935
1,226,43.233831,-81.230922
2,236,50.432725,-121.515369
3,249,46.041823,-80.003286
4,250,50.837259,-121.84152


In [8]:
# Function to return the country based on the calling code
def getCountry(code):
  """Return the country for a calling code or a North American area code.

  Parameters
  ----------
  code : str
      Either an international calling code prefixed with "+" (e.g. "+49"),
      or a bare US/Canada area code (e.g. "204").

  Returns
  -------
  str or None
      The country name.  None is returned when the code is missing, empty,
      not numeric, or not present in ``country_codes``, so that the null
      check on the resulting column can flag the bad rows instead of the
      whole ``apply`` crashing on the first one.
  """
  if code is None:                                        # Nothing to look up
    return None
  code = str(code).strip()                                # Tolerate stray whitespace / non-string values
  if code.startswith("+"):                                # Does the code begin with "+"?
    return country_codes.get(code[1:].strip())            # Yes, look up the code without the "+"
  if not code.isdecimal():                                # Empty, NaN or otherwise non-numeric code
    return None
  if int(code) in df_ca_area_codes['area_code'].values:   # No "+", is the code in Canada?
    return 'Canada'                                       # Yes, return 'Canada'
  return 'United States'                                  # No, default to "United States"

In [9]:
# Exercise getCountry with an international code, a US area code and a Canadian area code
for test_code in ('+49', '201', '204'):
  print('{} belongs to {}'.format(test_code, getCountry(test_code)))

+49 belongs to Germany
201 belongs to United States
204 belongs to Canada


In [10]:
df2 = df.copy()                               # Work on a copy so the raw data stays untouched
# Derive the country from each code.  Mapping over the single 'Code' column
# avoids building a row object per record (as a row-wise DataFrame.apply
# does) and also behaves sensibly on an empty frame.
df2['Country'] = df2['Code'].map(getCountry)

In [11]:
# Count the nulls in each column; every count should be 0
df2.isnull().sum()

Code       0
Country    0
dtype: int64

In [12]:
# Out of curiosity, count how often each (Code, Country) combination occurs,
# most frequent first
df2.value_counts()

Code  Country             
+52   Mexico                  13
+971  United Arab Emirates     7
703   United States            7
+57   Colombia                 5
+966  Saudi Arabia             3
                              ..
407   United States            1
+233  Ghana                    1
438   Canada                   1
440   United States            1
949   United States            1
Length: 74, dtype: int64

In [13]:
# Load the world geo data: country-level latitude/longitude (plus US state
# columns).  The 'country' column is what we match country names against later.
df_codes = pd.read_csv("world_country_and_usa_states_latitude_and_longitude_values.csv")
df_codes.head()

Unnamed: 0,country_code,latitude,longitude,country,usa_state_code,usa_state_latitude,usa_state_longitude,usa_state
0,AD,42.546245,1.601554,Andorra,AK,63.588753,-154.493062,Alaska
1,AE,23.424076,53.847818,United Arab Emirates,AL,32.318231,-86.902298,Alabama
2,AF,33.93911,67.709953,Afghanistan,AR,35.20105,-91.831833,Arkansas
3,AG,17.060816,-61.796428,Antigua and Barbuda,AZ,34.048928,-111.093731,Arizona
4,AI,18.220554,-63.068615,Anguilla,CA,36.778261,-119.417932,California


In [14]:
# Load US geo data: one row per area code with its latitude/longitude
df_area_codes = pd.read_csv("us-area-code-geo.csv")
df_area_codes.head()

Unnamed: 0,area_code,latitude,longitude
0,201,40.83885,-74.045678
1,202,38.89511,-77.03637
2,203,41.291798,-73.122453
3,205,33.427671,-86.886473
4,206,47.564027,-122.348976


In [15]:
# Function to return latitude/longitude based on the country and/or area code of the row
def getLatitudeLongitude(row):
  ret_val = None
  match row['Country']:
    case 'United States':
      area_code = int(row['Code'])          # Covert the 'Code' from string to integer
      retVal = df_area_codes[['latitude', 'longitude']].loc[df_area_codes['area_code'] == area_code]
    case 'Canada':
      area_code = int(row['Code'])          # Covert the 'Code' from string to integer
      retVal = df_ca_area_codes[['latitude', 'longitude']].loc[df_ca_area_codes['area_code'] == area_code]
    case _:
      retVal = df_codes[['latitude', 'longitude']].loc[df_codes['country'] == row['Country']]

  if len(retVal) == 1:                                        # Found exactly one row?  Should always be true
    return pd.Series([retVal.iloc[0, 0], retVal.iloc[0, 1]])  # Yes, return latitude/longitude
  return None                                                 # No, return None

In [16]:
# Smoke-test the lookup on the first row of the dataset
row = df2.iloc[0]
ret = getLatitudeLongitude(row)
print('Code {} is located at {}, {}'.format(row['Code'], *ret))  # ret unpacks to (latitude, longitude)

Code +223 is located at 17.570692, -3.996166


In [17]:
# Smoke-test the lookup on the last row of the dataset
row = df2.iloc[-1]
ret = getLatitudeLongitude(row)
print('Code {} is located at {}, {}'.format(row['Code'], *ret))  # ret unpacks to (latitude, longitude)

Code 949 is located at 33.57348615, -117.7337162


In [18]:
# Look up the coordinates of every row with our function
coordinates = df2.apply(getLatitudeLongitude, axis=1)

# Attach them as two new columns on a fresh copy of the dataset
df3 = df2.copy()
df3[['latitude', 'longitude']] = coordinates
df3.head()

Unnamed: 0,Code,Country,latitude,longitude
0,223,Mali,17.570692,-3.996166
1,233,Ghana,7.946527,-1.023194
2,31,Netherlands,52.132633,5.291266
3,33,France,46.227638,2.213749
4,34,Spain,40.463667,-3.74922


In [19]:
# Verify that there are no null rows: list every row that has a null in any
# column.  An empty result means no value is missing.
df3[df3.isnull().any(axis=1)]

Unnamed: 0,Code,Country,latitude,longitude


In [20]:
# Save the enriched data (Code, Country, latitude, longitude) to a CSV file,
# leaving out the DataFrame index
df3.to_csv("chat_geo_data.csv", index=False)