In [200]:
# Necessary libraries
import pandas as pd
import Levenshtein as lev
import re
import unicodedata

In [201]:
# Load datasets
traffic = pd.read_csv("original data/trafficlist_forcountry.csv")
forest = pd.read_csv("original data/forest-cover-v1.csv")
air_city = pd.read_csv("original data/aap_air_quality_database_2018_v14.csv", skiprows=2)
air_country = pd.read_csv("original data/【12】GlobalPM25-1998-2022.csv")
weather = pd.read_csv("original data/GlobalWeatherRepository.csv")

In [202]:
# Data normalization function, to lowercase, remove special chars, and standardize
def normalize_text(text):
    if pd.isna(text):
        return ""
    text = str(text)
    # Convert to lowercase and remove extra whitespace
    text = text.lower().strip()
    # Remove special characters and accents
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')
    # Remove remaining special chars (keep only letters, numbers, spaces)
    text = re.sub(r'[^a-z0-9\s]', '', text)
    return text

In [203]:
# Normalize all country/city columns upfront
traffic['Location_normalized'] = traffic['Location'].apply(normalize_text)
forest['Country_normalized'] = forest['Country Name'].apply(normalize_text)
air_city['Country_normalized'] = air_city['Country'].apply(normalize_text)
air_country['Region_normalized'] = air_country['Region'].apply(normalize_text)
weather['country_normalized'] = weather['country'].apply(normalize_text)

# Get unique normalized values
countries_traffic = traffic['Location_normalized'].dropna().unique()
countries_forest = forest['Country_normalized'].dropna().unique()
countries_air_city = air_city['Country_normalized'].dropna().unique()
countries_air_country = air_country['Region_normalized'].dropna().unique()
countries_weather = weather['country_normalized'].dropna().unique()

In [204]:
# Defining the Levenshtein function
max_distance = 1  # Maximum distance between the data

def levenshtein_match(a, b):
    # Pure Levenshtein distance check with pre-normalized data
    return lev.distance(a, b) <= max_distance


In [205]:
# Traffic-Forest matching
matches_tf = []
for loc in countries_traffic:
    for country in countries_forest:
        if levenshtein_match(loc, country):
            distance = lev.distance(str(loc), str(country))
            matches_tf.append({
                "Source": "Traffic",
                "Target": "Forest",
                "Value_A": loc,
                "Value_B": country,
                "Distance": distance
            })

tf_df = pd.DataFrame(matches_tf)
if not tf_df.empty:
    print("Traffic-Forest matches (sorted by distance):")
    display(tf_df.sort_values('Distance'))
else:
    print("No Traffic-Forest matches found")

Traffic-Forest matches (sorted by distance):


Unnamed: 0,Source,Target,Value_A,Value_B,Distance
0,Traffic,Forest,san marino,san marino,0
98,Traffic,Forest,sao tome and principe,sao tome and principe,0
99,Traffic,Forest,el salvador,el salvador,0
100,Traffic,Forest,bhutan,bhutan,0
101,Traffic,Forest,tonga,tonga,0
...,...,...,...,...,...
48,Traffic,Forest,bahrain,bahrain,0
89,Traffic,Forest,iran,iraq,1
7,Traffic,Forest,iceland,ireland,1
117,Traffic,Forest,gambia,zambia,1


In [206]:
# Traffic-AirCountry matching
matches_tac = []
for loc in countries_traffic:
    for country in countries_air_country:
        if levenshtein_match(loc, country):
            distance = lev.distance(str(loc), str(country))
            matches_tac.append({
                "Source": "Traffic",
                "Target": "Air_Country",
                "Value_A": loc,
                "Value_B": country,
                "Distance": distance
            })

tac_df = pd.DataFrame(matches_tac)
if not tac_df.empty:
    print("Traffic-Air_Country matches (sorted by distance):")
    display(tac_df.sort_values('Distance'))
else:
    print("No Traffic-Air_Country matches found")

Traffic-Air_Country matches (sorted by distance):


Unnamed: 0,Source,Target,Value_A,Value_B,Distance
0,Traffic,Air_Country,san marino,san marino,0
111,Traffic,Air_Country,sao tome and principe,sao tome and principe,0
112,Traffic,Air_Country,el salvador,el salvador,0
113,Traffic,Air_Country,bhutan,bhutan,0
114,Traffic,Air_Country,tonga,tonga,0
...,...,...,...,...,...
102,Traffic,Air_Country,iran,iraq,1
136,Traffic,Air_Country,gambia,zambia,1
146,Traffic,Air_Country,zambia,gambia,1
128,Traffic,Air_Country,iraq,iran,1


In [207]:
# Traffic-Weather matching
matches_tw = []
for loc in countries_traffic:
    for country in countries_weather:
        if levenshtein_match(loc, country):
            distance = lev.distance(str(loc), str(country))
            matches_tw.append({
                "Source": "Traffic",
                "Target": "Weather",
                "Value_A": loc,
                "Value_B": country,
                "Distance": distance
            })

tw_df = pd.DataFrame(matches_tw)
if not tw_df.empty:
    print("Traffic-Weather matches (sorted by distance):")
    display(tw_df.sort_values('Distance'))
else:
    print("No Traffic-Weather matches found")

Traffic-Weather matches (sorted by distance):


Unnamed: 0,Source,Target,Value_A,Value_B,Distance
0,Traffic,Weather,san marino,san marino,0
103,Traffic,Weather,tonga,tonga,0
104,Traffic,Weather,india,india,0
105,Traffic,Weather,bolivia,bolivia,0
106,Traffic,Weather,venezuela,venezuela,0
...,...,...,...,...,...
7,Traffic,Weather,iceland,ireland,1
92,Traffic,Weather,iran,iraq,1
133,Traffic,Weather,zambia,gambia,1
124,Traffic,Weather,gambia,zambia,1


In [208]:
# AirCountry-Weather matching
matches_acw = []
for air in countries_air_country:
    for weather_loc in countries_weather:
        if levenshtein_match(air, weather_loc):
            distance = lev.distance(str(air), str(weather_loc))
            matches_acw.append({
                "Source": "Air_Country",
                "Target": "Weather",
                "Value_A": air,
                "Value_B": weather_loc,
                "Distance": distance
            })

acw_df = pd.DataFrame(matches_acw)
if not acw_df.empty:
    print("Air_Country-Weather matches (sorted by distance):")
    display(acw_df.sort_values('Distance'))
else:
    print("No Air_Country-Weather matches found")

Air_Country-Weather matches (sorted by distance):


Unnamed: 0,Source,Target,Value_A,Value_B,Distance
0,Air_Country,Weather,afghanistan,afghanistan,0
117,Air_Country,Weather,mauritius,mauritius,0
118,Air_Country,Weather,malawi,malawi,0
119,Air_Country,Weather,malaysia,malaysia,0
120,Air_Country,Weather,namibia,namibia,0
...,...,...,...,...,...
73,Air_Country,Weather,ireland,iceland,1
60,Air_Country,Weather,gambia,zambia,1
88,Air_Country,Weather,kyrgyzstan,kyrghyzstan,1
182,Air_Country,Weather,zambia,gambia,1


In [209]:
# City-level matching setup
cities_air_city = air_city['City/Town'].dropna().unique()
cities_weather = weather['location_name'].dropna().unique()

In [None]:
# Air-City-Weather city matching
matches_city = []
for city in cities_air_city:
    for weather_city in cities_weather:
        if levenshtein_match(city, weather_city):
            distance = lev.distance(str(city), str(weather_city))
            matches_city.append({
                "Source": "Air_City",
                "Target": "Weather",
                "Value_A": city,
                "Value_B": weather_city,
                "Distance": distance
            })

city_df = pd.DataFrame(matches_city)
if not city_df.empty:
    print("Air_City-Weather city matches (sorted by distance):")
    display(city_df.sort_values('Distance'))
else:
    print("No city-level matches found")

Air_City-Weather city matches (sorted by distance):


Unnamed: 0,Source,Target,Value_A,Value_B,Distance
0,Air_City,Weather,Tirana,Tirana,0
55,Air_City,Weather,Amsterdam,Amsterdam,0
54,Air_City,Weather,Ulaanbaatar,Ulaanbaatar,0
53,Air_City,Weather,Skopje,Skopje,0
52,Air_City,Weather,Mexico City,Mexico City,0
...,...,...,...,...,...
44,Air_City,Weather,Roma,Rome,1
31,Air_City,Weather,Vienne,Vienna,1
39,Air_City,Weather,Mehran,Tehran,1
11,Air_City,Weather,Rigi,Riga,1
