In [1]:
# Necessary libraries
import re
import unicodedata

import Levenshtein as lev
import pandas as pd

import dp

In [2]:
# Read the five raw CSV files from the "original data" folder into DataFrames.
traffic = pd.read_csv(filepath_or_buffer="original data/trafficlist_forcountry.csv")
forest = pd.read_csv(filepath_or_buffer="original data/forest-cover-v1.csv")
# skiprows=2: the first two lines of this file are skipped before parsing.
air_city = pd.read_csv(
    filepath_or_buffer="original data/aap_air_quality_database_2018_v14.csv",
    skiprows=2,
)
air_country = pd.read_csv(filepath_or_buffer="original data/【12】GlobalPM25-1998-2022.csv")
weather = pd.read_csv(filepath_or_buffer="original data/GlobalWeatherRepository.csv")

In [3]:
# Data normalization function, to lowercase, remove special chars, and standardize
def normalize_text(text):
    """Return a lowercase, ASCII-only, whitespace-normalized form of ``text``.

    Processing steps, in order:
      1. Missing values (anything ``pd.isna`` reports as missing) become ``""``.
      2. The value is converted with ``str()``, lowercased and stripped.
      3. NFKD decomposition followed by an ASCII encode with ``ignore`` drops
         accents and any other non-ASCII characters.
      4. Every character that is not ``a-z``, ``0-9`` or whitespace is deleted.
      5. Runs of whitespace are collapsed to a single space and the ends are
         stripped again.

    Args:
        text: Any scalar value (string, number, ``None``/``NaN`` ...).

    Returns:
        str: The normalized text, or ``""`` for missing input.
    """
    if pd.isna(text):
        return ""
    text = str(text).lower().strip()
    # Decompose accented characters, then discard everything outside ASCII.
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')
    # Keep only letters, digits and whitespace.
    text = re.sub(r'[^a-z0-9\s]', '', text)
    # Deleting punctuation can leave doubled or dangling spaces
    # (e.g. "bosnia & herzegovina" -> "bosnia  herzegovina"); collapse them so
    # they do not add spurious edits to later distance comparisons.
    return re.sub(r'\s+', ' ', text).strip()

In [4]:
# Normalize all country/city columns upfront
traffic['Location_normalized'] = traffic['Location'].apply(normalize_text)
forest['Country_normalized'] = forest['Country Name'].apply(normalize_text)
air_city['Country_normalized'] = air_city['Country'].apply(normalize_text)
air_country['Region_normalized'] = air_country['Region'].apply(normalize_text)
weather['country_normalized'] = weather['country'].apply(normalize_text)


def _unique_nonempty(normalized):
    """Return the distinct non-missing, non-empty values of a normalized column.

    normalize_text maps missing input to "" rather than NaN, so dropna() alone
    never removes those rows; the empty-string placeholder is filtered out
    explicitly as well.
    """
    present = normalized.dropna()
    return present[present != ""].unique()


# Get unique normalized values (missing/empty names excluded)
countries_traffic = _unique_nonempty(traffic['Location_normalized'])
countries_forest = _unique_nonempty(forest['Country_normalized'])
countries_air_city = _unique_nonempty(air_city['Country_normalized'])
countries_air_country = _unique_nonempty(air_country['Region_normalized'])
countries_weather = _unique_nonempty(weather['country_normalized'])

In [5]:
# Defining the Levenshtein function
max_distance = 1  # Default maximum edit distance for two values to count as a match


def levenshtein_match(a, b, max_dist=None):
    """Tell whether ``a`` and ``b`` are within the allowed Levenshtein distance.

    The inputs are passed to ``lev.distance`` unchanged; no normalization is
    performed here, so callers are expected to pass pre-normalized data.

    Args:
        a: First value to compare.
        b: Second value to compare.
        max_dist: Optional threshold for this call. When omitted (``None``),
            the module-level ``max_distance`` is read at call time, which is
            the behavior of the original two-argument function.

    Returns:
        bool: ``True`` when the distance is less than or equal to the threshold.
    """
    limit = max_distance if max_dist is None else max_dist
    return lev.distance(a, b) <= limit


In [6]:
# Practice run: Traffic - Forest matching.
# For each prefix in dp.TF_common_prefixes, compare the normalized names of
# the traffic and forest rows carrying that prefix and record every pair that
# levenshtein_match accepts. (`dp` is imported with the other libraries in the
# first cell.)
matches_tf = []

for prefix in dp.TF_common_prefixes:
    subset_a = dp.traffic[dp.traffic['country_prefix'] == prefix]
    subset_b = dp.forest[dp.forest['country_prefix'] == prefix]
    # The forest candidates are the same for every traffic name, so compute
    # them once per prefix instead of once per outer iteration.
    forest_names = subset_b['Country_normalized'].unique()
    for loc in subset_a['Country_normalized'].unique():
        for country in forest_names:
            if levenshtein_match(loc, country):
                # Use the notebook's own `lev` import (the one
                # levenshtein_match relies on) rather than reaching through dp.
                distance = lev.distance(str(loc), str(country))
                matches_tf.append({
                    "Prefix": prefix,
                    "Source": "Traffic",
                    "Target": "Forest",
                    "Country_A": loc,
                    "Country_B": country,
                    "Distance": distance
                })

tf_df = pd.DataFrame(matches_tf)
if not tf_df.empty:
    print("Traffic-Forest matches (sorted by distance):")
    display(tf_df.sort_values('Distance'))
else:
    print("No Traffic-Forest matches found")

    

  from .autonotebook import tqdm as notebook_tqdm


Traffic-Forest matches (sorted by distance):


Unnamed: 0,Prefix,Source,Target,Country_A,Country_B,Distance
0,ind,Traffic,Forest,india,india,0
96,pol,Traffic,Forest,poland,poland,0
97,hun,Traffic,Forest,hungary,hungary,0
98,gre,Traffic,Forest,greece,greece,0
99,gre,Traffic,Forest,grenada,grenada,0
...,...,...,...,...,...,...
52,cos,Traffic,Forest,costa rica,costa rica,0
53,eri,Traffic,Forest,eritrea,eritrea,0
46,lib,Traffic,Forest,liberia,liberia,0
150,mon,Traffic,Forest,mongolia,mongolia,0


In [7]:
# Traffic - Air_City matching.
# Within each prefix of dp.TCity_common_prefixes, pair every distinct traffic
# name with every distinct air_city name and keep the pairs accepted by
# levenshtein_match.
matches_tcity = []

for prefix in dp.TCity_common_prefixes:
    traffic_rows = dp.traffic['country_prefix'] == prefix
    air_city_rows = dp.air_city['country_prefix'] == prefix
    traffic_names = dp.traffic.loc[traffic_rows, 'Country_normalized'].unique()
    air_city_names = dp.air_city.loc[air_city_rows, 'Country_normalized'].unique()

    matches_tcity.extend(
        {
            "Prefix": prefix,
            "Source": "Traffic",
            "Target": "Air_City",
            "Country_A": left_name,
            "Country_B": right_name,
            "Distance": dp.lev.distance(str(left_name), str(right_name)),
        }
        for left_name in traffic_names
        for right_name in air_city_names
        if levenshtein_match(left_name, right_name)
    )

tac_df = dp.pd.DataFrame(matches_tcity)
if tac_df.empty:
    print("No Traffic-Air_City matches found")
else:
    print("Traffic-Air_City matches (sorted by distance):")
    display(tac_df.sort_values('Distance'))

Traffic-Air_City matches (sorted by distance):


Unnamed: 0,Prefix,Source,Target,Country_A,Country_B,Distance
0,ind,Traffic,Air_City,india,india,0
57,net,Traffic,Air_City,netherlands,netherlands,0
56,chi,Traffic,Air_City,china,china,0
55,chi,Traffic,Air_City,chile,chile,0
54,uga,Traffic,Air_City,uganda,uganda,0
...,...,...,...,...,...,...
22,kuw,Traffic,Air_City,kuwait,kuwait,0
21,fij,Traffic,Air_City,fiji,fiji,0
28,mex,Traffic,Air_City,mexico,mexico,0
80,mon,Traffic,Air_City,mongolia,mongolia,0


In [8]:
# Traffic - Air_Country matching.
# Within each prefix of dp.TC_common_prefixes, pair every distinct traffic
# name with every distinct air_country name and keep the pairs accepted by
# levenshtein_match.
matches_tc = []

for prefix in dp.TC_common_prefixes:
    traffic_rows = dp.traffic['country_prefix'] == prefix
    air_country_rows = dp.air_country['country_prefix'] == prefix
    traffic_names = dp.traffic.loc[traffic_rows, 'Country_normalized'].unique()
    air_country_names = dp.air_country.loc[air_country_rows, 'Country_normalized'].unique()

    matches_tc.extend(
        {
            "Prefix": prefix,
            "Source": "Traffic",
            "Target": "Air_Country",
            "Country_A": left_name,
            "Country_B": right_name,
            "Distance": dp.lev.distance(str(left_name), str(right_name)),
        }
        for left_name in traffic_names
        for right_name in air_country_names
        if levenshtein_match(left_name, right_name)
    )

# NOTE(review): `tac_df` is also the name assigned by the Traffic - Air_City
# cell, so running this cell replaces that result.
tac_df = dp.pd.DataFrame(matches_tc)
if tac_df.empty:
    print("No Traffic-Air_Country matches found")
else:
    print("Traffic-Air_Country matches (sorted by distance):")
    display(tac_df.sort_values('Distance'))

Traffic-Air_Country matches (sorted by distance):


Unnamed: 0,Prefix,Source,Target,Country_A,Country_B,Distance
0,ind,Traffic,Air_Country,india,india,0
109,hun,Traffic,Air_Country,hungary,hungary,0
110,gre,Traffic,Air_Country,greece,greece,0
111,gre,Traffic,Air_Country,grenada,grenada,0
112,alb,Traffic,Air_Country,albania,albania,0
...,...,...,...,...,...,...
53,swe,Traffic,Air_Country,sweden,sweden,0
172,mon,Traffic,Air_Country,mongolia,mongolia,0
22,pal,Traffic,Air_Country,palestine,palestina,1
135,ira,Traffic,Air_Country,iraq,iran,1


In [9]:
# Traffic - Weather matching.
# Within each prefix of dp.TW_common_prefixes, pair every distinct traffic
# name with every distinct weather name and keep the pairs accepted by
# levenshtein_match.
matches_tw = []

for prefix in dp.TW_common_prefixes:
    traffic_rows = dp.traffic['country_prefix'] == prefix
    weather_rows = dp.weather['country_prefix'] == prefix
    traffic_names = dp.traffic.loc[traffic_rows, 'Country_normalized'].unique()
    weather_names = dp.weather.loc[weather_rows, 'Country_normalized'].unique()

    matches_tw.extend(
        {
            "Prefix": prefix,
            "Source": "Traffic",
            "Target": "Weather",
            "Value_A": left_name,
            "Value_B": right_name,
            "Distance": dp.lev.distance(str(left_name), str(right_name)),
        }
        for left_name in traffic_names
        for right_name in weather_names
        if levenshtein_match(left_name, right_name)
    )

tw_df = dp.pd.DataFrame(matches_tw)
if tw_df.empty:
    print("No Traffic-Weather matches found")
else:
    print("Traffic-Weather matches (sorted by distance):")
    display(tw_df.sort_values('Distance'))

Traffic-Weather matches (sorted by distance):


Unnamed: 0,Prefix,Source,Target,Value_A,Value_B,Distance
0,ind,Traffic,Weather,india,india,0
101,hun,Traffic,Weather,hungary,hungary,0
102,gre,Traffic,Weather,greece,greece,0
103,gre,Traffic,Weather,grenada,grenada,0
104,alb,Traffic,Weather,albania,albania,0
...,...,...,...,...,...,...
55,bot,Traffic,Weather,botswana,botswana,0
48,slo,Traffic,Weather,slovakia,slovakia,0
124,ira,Traffic,Weather,iraq,iran,1
123,ira,Traffic,Weather,iran,iraq,1


In [10]:
# Air_Country - Weather matching.
# Within each prefix of dp.CW_common_prefixes, pair every distinct air_country
# name with every distinct weather name and keep the pairs accepted by
# levenshtein_match.
matches_cw = []

for prefix in dp.CW_common_prefixes:
    air_country_rows = dp.air_country['country_prefix'] == prefix
    weather_rows = dp.weather['country_prefix'] == prefix
    air_country_names = dp.air_country.loc[air_country_rows, 'Country_normalized'].unique()
    weather_names = dp.weather.loc[weather_rows, 'Country_normalized'].unique()

    matches_cw.extend(
        {
            "Prefix": prefix,
            "Source": "Air_Country",
            "Target": "Weather",
            "Value_A": left_name,
            "Value_B": right_name,
            "Distance": dp.lev.distance(str(left_name), str(right_name)),
        }
        for left_name in air_country_names
        for right_name in weather_names
        if levenshtein_match(left_name, right_name)
    )

acw_df = dp.pd.DataFrame(matches_cw)
if acw_df.empty:
    print("No Air_Country-Weather matches found")
else:
    print("Air_Country-Weather matches (sorted by distance):")
    display(acw_df.sort_values('Distance'))

Air_Country-Weather matches (sorted by distance):


Unnamed: 0,Prefix,Source,Target,Value_A,Value_B,Distance
0,ind,Air_Country,Weather,indonesia,indonesia,0
114,hun,Air_Country,Weather,hungary,hungary,0
115,gre,Air_Country,Weather,greece,greece,0
116,gre,Air_Country,Weather,grenada,grenada,0
117,alb,Air_Country,Weather,albania,albania,0
...,...,...,...,...,...,...
55,slo,Air_Country,Weather,slovenia,slovenia,0
180,van,Air_Country,Weather,vanuatu,vanuatu,0
97,kyr,Air_Country,Weather,kyrgyzstan,kyrghyzstan,1
139,ira,Air_Country,Weather,iraq,iran,1


In [11]:
# Forest - Air_City matching.
# Within each prefix of dp.FCity_common_prefixes, pair every distinct forest
# name with every distinct air_city name and keep the pairs accepted by
# levenshtein_match.
matches_fcity = []

for prefix in dp.FCity_common_prefixes:
    forest_rows = dp.forest['country_prefix'] == prefix
    air_city_rows = dp.air_city['country_prefix'] == prefix
    forest_names = dp.forest.loc[forest_rows, 'Country_normalized'].unique()
    air_city_names = dp.air_city.loc[air_city_rows, 'Country_normalized'].unique()

    matches_fcity.extend(
        {
            "Prefix": prefix,
            "Source": "Forest",
            "Target": "Air_City",
            "Value_A": left_name,
            "Value_B": right_name,
            "Distance": dp.lev.distance(str(left_name), str(right_name)),
        }
        for left_name in forest_names
        for right_name in air_city_names
        if levenshtein_match(left_name, right_name)
    )

# NOTE(review): `acw_df` is the same name the Air_Country - Weather cell
# assigns, so running this cell replaces that result.
acw_df = dp.pd.DataFrame(matches_fcity)
if acw_df.empty:
    print("No Forest-Air_City matches found")
else:
    print("Forest-Air_City matches (sorted by distance):")
    display(acw_df.sort_values('Distance'))

Forest-Air_City matches (sorted by distance):


Unnamed: 0,Prefix,Source,Target,Value_A,Value_B,Distance
0,ind,Forest,Air_City,indonesia,indonesia,0
57,chi,Forest,Air_City,china,china,0
56,chi,Forest,Air_City,chile,chile,0
55,uga,Forest,Air_City,uganda,uganda,0
54,bah,Forest,Air_City,bahrain,bahrain,0
...,...,...,...,...,...,...
22,fij,Forest,Air_City,fiji,fiji,0
21,ukr,Forest,Air_City,ukraine,ukraine,0
28,mex,Forest,Air_City,mexico,mexico,0
80,mon,Forest,Air_City,mongolia,mongolia,0


In [12]:
# Forest - Air_Country matching.
# For each prefix in dp.FC_common_prefixes, compare the normalized names of
# the forest and air_country rows carrying that prefix and record every pair
# that levenshtein_match accepts.
matches_fc = []

for prefix in dp.FC_common_prefixes:
    subset_a = dp.forest[dp.forest['country_prefix'] == prefix]
    subset_b = dp.air_country[dp.air_country['country_prefix'] == prefix]
    # The air_country candidates do not change between forest names, so they
    # are computed once per prefix.
    air_country_names = subset_b['Country_normalized'].unique()
    for loc in subset_a['Country_normalized'].unique():
        for country in air_country_names:
            if levenshtein_match(loc, country):
                distance = dp.lev.distance(str(loc), str(country))
                matches_fc.append({
                    "Prefix": prefix,
                    "Source": "Forest",
                    "Target": "Air_Country",
                    "Country_A": loc,
                    "Country_B": country,
                    "Distance": distance
                })

acw_df = dp.pd.DataFrame(matches_fc)
# The printed labels name Air_Country, the dataset actually matched in this
# cell (the records above carry "Target": "Air_Country").
if not acw_df.empty:
    print("Forest-Air_Country matches (sorted by distance):")
    display(acw_df.sort_values('Distance'))
else:
    print("No Forest-Air_Country matches found")

Forest-Air_City matches (sorted by distance):


Unnamed: 0,Prefix,Source,Target,Country_A,Country_B,Distance
0,ind,Forest,Air_Country,indonesia,indonesia,0
116,sol,Forest,Air_Country,solomon islands,solomon islands,0
117,pol,Forest,Air_Country,poland,poland,0
118,hun,Forest,Air_Country,hungary,hungary,0
119,gre,Forest,Air_Country,greece,greece,0
...,...,...,...,...,...,...
64,eri,Forest,Air_Country,eritrea,eritrea,0
65,pue,Forest,Air_Country,puerto rico,puerto rico,0
56,lib,Forest,Air_Country,libya,libya,0
182,van,Forest,Air_Country,vanuatu,vanuatu,0


In [13]:
# Air_Country - Air_City matching.
# Within each prefix of dp.CCity_common_prefixes, pair every distinct
# air_country name with every distinct air_city name and keep the pairs
# accepted by levenshtein_match.
matches_ccity = []

for prefix in dp.CCity_common_prefixes:
    air_country_rows = dp.air_country['country_prefix'] == prefix
    air_city_rows = dp.air_city['country_prefix'] == prefix
    air_country_names = dp.air_country.loc[air_country_rows, 'Country_normalized'].unique()
    air_city_names = dp.air_city.loc[air_city_rows, 'Country_normalized'].unique()

    matches_ccity.extend(
        {
            "Prefix": prefix,
            "Source": "Air_Country",
            "Target": "Air_City",
            "Country_A": left_name,
            "Country_B": right_name,
            "Distance": dp.lev.distance(str(left_name), str(right_name)),
        }
        for left_name in air_country_names
        for right_name in air_city_names
        if levenshtein_match(left_name, right_name)
    )

acw_df = dp.pd.DataFrame(matches_ccity)
if acw_df.empty:
    print("No Air_Country-Air_City matches found")
else:
    print("Air_Country-Air_City matches (sorted by distance):")
    display(acw_df.sort_values('Distance'))

Air_Country-Air_City matches (sorted by distance):


Unnamed: 0,Prefix,Source,Target,Country_A,Country_B,Distance
0,ind,Air_Country,Air_City,indonesia,indonesia,0
57,chi,Air_Country,Air_City,china,china,0
56,chi,Air_Country,Air_City,chile,chile,0
55,uga,Air_Country,Air_City,uganda,uganda,0
54,bah,Air_Country,Air_City,bahrain,bahrain,0
...,...,...,...,...,...,...
22,kuw,Air_Country,Air_City,kuwait,kuwait,0
21,fij,Air_Country,Air_City,fiji,fiji,0
28,mex,Air_Country,Air_City,mexico,mexico,0
81,mon,Air_Country,Air_City,mongolia,mongolia,0


In [14]:
# Forest - Weather matching.
# Within each prefix of dp.FW_common_prefixes, pair every distinct forest
# name with every distinct weather name and keep the pairs accepted by
# levenshtein_match.
matches_fw = []

for prefix in dp.FW_common_prefixes:
    forest_rows = dp.forest['country_prefix'] == prefix
    weather_rows = dp.weather['country_prefix'] == prefix
    forest_names = dp.forest.loc[forest_rows, 'Country_normalized'].unique()
    weather_names = dp.weather.loc[weather_rows, 'Country_normalized'].unique()

    matches_fw.extend(
        {
            "Prefix": prefix,
            "Source": "Forest",
            "Target": "Weather",
            "Country_A": left_name,
            "Country_B": right_name,
            "Distance": dp.lev.distance(str(left_name), str(right_name)),
        }
        for left_name in forest_names
        for right_name in weather_names
        if levenshtein_match(left_name, right_name)
    )

acw_df = dp.pd.DataFrame(matches_fw)
if acw_df.empty:
    print("No Forest - Weather matches found")
else:
    print("Forest - Weather matches (sorted by distance):")
    display(acw_df.sort_values('Distance'))

Forest - Weather matches (sorted by distance):


Unnamed: 0,Prefix,Source,Target,Country_A,Country_B,Distance
0,ind,Forest,Weather,indonesia,indonesia,0
101,pol,Forest,Weather,poland,poland,0
102,hun,Forest,Weather,hungary,hungary,0
103,gre,Forest,Weather,greece,greece,0
104,gre,Forest,Weather,grenada,grenada,0
...,...,...,...,...,...,...
55,mex,Forest,Weather,mexico,mexico,0
56,bot,Forest,Weather,botswana,botswana,0
49,swe,Forest,Weather,sweden,sweden,0
158,van,Forest,Weather,vanuatu,vanuatu,0


In [15]:
# City-level matching setup: distinct, non-missing city names taken from the
# 'City/Town' column of air_city and the 'location_name' column of weather.
air_city_names_present = air_city['City/Town'].dropna()
weather_names_present = weather['location_name'].dropna()
cities_air_city = pd.unique(air_city_names_present)
cities_weather = pd.unique(weather_names_present)

In [16]:
# Air_City - Weather city matching.
# Every air_city city name is compared with every weather location name; the
# pairs accepted by levenshtein_match are recorded with their distance.
# NOTE(review): the names are compared exactly as stored in the source columns
# (they are not passed through normalize_text), so this comparison is
# case- and accent-sensitive - confirm that is intended.
matches_city = [
    {
        "Source": "Air_City",
        "Target": "Weather",
        "Value_A": air_name,
        "Value_B": weather_name,
        "Distance": lev.distance(str(air_name), str(weather_name)),
    }
    for air_name in cities_air_city
    for weather_name in cities_weather
    if levenshtein_match(air_name, weather_name)
]

city_df = pd.DataFrame(matches_city)
if city_df.empty:
    print("No city-level matches found")
else:
    print("Air_City-Weather city matches (sorted by distance):")
    display(city_df.sort_values('Distance'))

Air_City-Weather city matches (sorted by distance):


Unnamed: 0,Source,Target,Value_A,Value_B,Distance
0,Air_City,Weather,Tirana,Tirana,0
55,Air_City,Weather,Amsterdam,Amsterdam,0
54,Air_City,Weather,Ulaanbaatar,Ulaanbaatar,0
53,Air_City,Weather,Skopje,Skopje,0
52,Air_City,Weather,Mexico City,Mexico City,0
...,...,...,...,...,...
44,Air_City,Weather,Roma,Rome,1
31,Air_City,Weather,Vienne,Vienna,1
39,Air_City,Weather,Mehran,Tehran,1
11,Air_City,Weather,Rigi,Riga,1
