In [2]:
# Necessary libraries
import pandas as pd
import Levenshtein as lev
import re
import unicodedata

In [3]:
# Load the five raw CSV inputs; all paths are relative to the notebook's working directory.
raw_csv_paths = {
    "traffic": "original data/trafficlist_forcountry.csv",
    "forest": "original data/forest-cover-v1.csv",
    "air_city": "original data/aap_air_quality_database_2018_v14.csv",
    "air_country": "original data/【12】GlobalPM25-1998-2022.csv",
    "weather": "original data/GlobalWeatherRepository.csv",
}

traffic = pd.read_csv(raw_csv_paths["traffic"])
forest = pd.read_csv(raw_csv_paths["forest"])
# skiprows=2: the first two lines of this file are skipped before the header is read.
air_city = pd.read_csv(raw_csv_paths["air_city"], skiprows=2)
air_country = pd.read_csv(raw_csv_paths["air_country"])
weather = pd.read_csv(raw_csv_paths["weather"])

In [4]:
# Data normalization function: lowercase, strip accents/special chars, standardize whitespace
def normalize_text(text):
    """Return a lowercase, ASCII-only, whitespace-normalized version of ``text``.

    Args:
        text: Any scalar value. Missing values (as detected by ``pd.isna``)
            produce an empty string; everything else is converted with ``str``.

    Returns:
        str: The input lowercased, with accents removed, every character other
        than ``a-z``, ``0-9`` and whitespace dropped, runs of whitespace
        collapsed to a single space, and no leading/trailing whitespace.
    """
    if pd.isna(text):
        return ""
    text = str(text).lower()
    # NFKD splits accented letters into base letter + combining mark; the ASCII
    # round-trip then discards the marks (and any other non-ASCII character).
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')
    # Remove remaining special chars (keep only letters, numbers, whitespace)
    text = re.sub(r'[^a-z0-9\s]', '', text)
    # Collapse whitespace last: removing punctuation can leave doubled or
    # trailing blanks (e.g. "rep. -" -> "rep "), which would otherwise inflate
    # the edit distance between otherwise identical names.
    return ' '.join(text.split())

In [5]:
# Normalize all country/city columns upfront
traffic['Location_normalized'] = traffic['Location'].apply(normalize_text)
forest['Country_normalized'] = forest['Country Name'].apply(normalize_text)
air_city['Country_normalized'] = air_city['Country'].apply(normalize_text)
air_country['Region_normalized'] = air_country['Region'].apply(normalize_text)
weather['country_normalized'] = weather['country'].apply(normalize_text)


def _unique_nonempty(normalized):
    """Return the distinct, non-empty values of a normalized name column.

    normalize_text() maps missing input to "" rather than NaN, so dropna()
    alone never removes anything; the empty string has to be filtered out
    explicitly or it ends up in the list of candidate names.
    """
    names = normalized.dropna()
    return names[names != ""].unique()


# Get unique normalized values
countries_traffic = _unique_nonempty(traffic['Location_normalized'])
countries_forest = _unique_nonempty(forest['Country_normalized'])
countries_air_city = _unique_nonempty(air_city['Country_normalized'])
countries_air_country = _unique_nonempty(air_country['Region_normalized'])
countries_weather = _unique_nonempty(weather['country_normalized'])

In [6]:
# Levenshtein-based matcher
max_distance = 1  # Largest edit distance that still counts as a match


def levenshtein_match(a, b):
    """Tell whether ``a`` and ``b`` are within ``max_distance`` edits of each other.

    The inputs are compared as given (they are expected to be normalized
    already); the threshold is the module-level ``max_distance``, read at
    call time.
    """
    edit_distance = lev.distance(a, b)
    return edit_distance <= max_distance


In [7]:
# Practice run: Traffic - Forest country matching
# NOTE(review): project-local module imported mid-notebook; every later
# matching cell relies on this cell having run first.
import dp

matches_tf = []

for prefix in dp.TF_common_prefixes:
    # Compare only names that share this prefix in both datasets.
    subset_a = dp.traffic[dp.traffic['country_prefix'] == prefix]
    subset_b = dp.forest[dp.forest['country_prefix'] == prefix]
    # Hoisted: the candidate list does not change while iterating subset_a.
    candidates = subset_b['Country_normalized'].unique()
    for loc in subset_a['Country_normalized'].unique():
        for country in candidates:
            # Compute the distance once; it serves both as the match test
            # and as the value recorded for the pair.
            distance = lev.distance(str(loc), str(country))
            if distance <= max_distance:
                matches_tf.append({
                    "Prefix": prefix,
                    "Source": "Traffic",
                    "Target": "Forest",
                    "Country_A": loc,
                    "Country_B": country,
                    "Distance": distance
                })

tf_df = pd.DataFrame(matches_tf)
if not tf_df.empty:
    print("Traffic-Forest matches (sorted by distance):")
    display(tf_df.sort_values('Distance'))
else:
    print("No Traffic-Forest matches found")

    

  from .autonotebook import tqdm as notebook_tqdm


Traffic-Forest matches (sorted by distance):


Unnamed: 0,Prefix,Source,Target,Country_A,Country_B,Distance
0,tun,Traffic,Forest,tunisia,tunisia,0
96,bar,Traffic,Forest,barbados,barbados,0
97,spa,Traffic,Forest,spain,spain,0
98,bhu,Traffic,Forest,bhutan,bhutan,0
99,dji,Traffic,Forest,djibouti,djibouti,0
...,...,...,...,...,...,...
52,lie,Traffic,Forest,liechtenstein,liechtenstein,0
53,ita,Traffic,Forest,italy,italy,0
47,ice,Traffic,Forest,iceland,iceland,0
150,mal,Traffic,Forest,mali,mali,0


In [8]:
# Traffic - Air_City country matching
matches_tcity = []

for prefix in dp.TCity_common_prefixes:
    # Compare only names that share this prefix in both datasets.
    subset_a = dp.traffic[dp.traffic['country_prefix'] == prefix]
    subset_b = dp.air_city[dp.air_city['country_prefix'] == prefix]
    # Hoisted: the candidate list does not change while iterating subset_a.
    candidates = subset_b['Country_normalized'].unique()
    for loc in subset_a['Country_normalized'].unique():
        for country in candidates:
            # Compute the distance once; it serves both as the match test
            # and as the value recorded for the pair.
            distance = lev.distance(str(loc), str(country))
            if distance <= max_distance:
                matches_tcity.append({
                    "Prefix": prefix,
                    "Source": "Traffic",
                    "Target": "Air_City",
                    "Country_A": loc,
                    "Country_B": country,
                    "Distance": distance
                })

tac_df = pd.DataFrame(matches_tcity)
if not tac_df.empty:
    print("Traffic-Air_City matches (sorted by distance):")
    display(tac_df.sort_values('Distance'))
else:
    print("No Traffic-Air_City matches found")

Traffic-Air_City matches (sorted by distance):


Unnamed: 0,Prefix,Source,Target,Country_A,Country_B,Distance
40,mex,Traffic,Air_City,mexico,mexico,0
57,pan,Traffic,Air_City,panama,panama,0
56,bra,Traffic,Air_City,brazil,brazil,0
55,spa,Traffic,Air_City,spain,spain,0
54,cos,Traffic,Air_City,costa rica,costa rica,0
...,...,...,...,...,...,...
23,jor,Traffic,Air_City,jordan,jordan,0
22,bel,Traffic,Air_City,belgium,belgium,0
21,ecu,Traffic,Air_City,ecuador,ecuador,0
39,sen,Traffic,Air_City,senegal,senegal,0


In [9]:
# Traffic - Air_Country matching
matches_tc = []

for prefix in dp.TC_common_prefixes:
    # Compare only names that share this prefix in both datasets.
    subset_a = dp.traffic[dp.traffic['country_prefix'] == prefix]
    subset_b = dp.air_country[dp.air_country['country_prefix'] == prefix]
    # Hoisted: the candidate list does not change while iterating subset_a.
    candidates = subset_b['Country_normalized'].unique()
    for loc in subset_a['Country_normalized'].unique():
        for country in candidates:
            # Compute the distance once; it serves both as the match test
            # and as the value recorded for the pair.
            # NOTE(review): distance-1 pairs need manual review; the recorded
            # output contains both a spelling variant (palestine/palestina)
            # and a false positive (iraq/iran).
            distance = lev.distance(str(loc), str(country))
            if distance <= max_distance:
                matches_tc.append({
                    "Prefix": prefix,
                    "Source": "Traffic",
                    "Target": "Air_Country",
                    "Country_A": loc,
                    "Country_B": country,
                    "Distance": distance
                })

# NOTE(review): `tac_df` is the same name the Traffic-Air_City cell assigns,
# so running this cell replaces that result. Name kept so existing references
# keep working; consider a dedicated name.
tac_df = pd.DataFrame(matches_tc)
if not tac_df.empty:
    print("Traffic-Air_Country matches (sorted by distance):")
    display(tac_df.sort_values('Distance'))
else:
    print("No Traffic-Air_Country matches found")

Traffic-Air_Country matches (sorted by distance):


Unnamed: 0,Prefix,Source,Target,Country_A,Country_B,Distance
0,lao,Traffic,Air_Country,laos,laos,0
109,spa,Traffic,Air_Country,spain,spain,0
110,bhu,Traffic,Air_Country,bhutan,bhutan,0
111,dji,Traffic,Air_Country,djibouti,djibouti,0
112,sie,Traffic,Air_Country,sierra leone,sierra leone,0
...,...,...,...,...,...,...
60,lie,Traffic,Air_Country,liechtenstein,liechtenstein,0
172,mal,Traffic,Air_Country,mali,mali,0
140,ira,Traffic,Air_Country,iraq,iran,1
15,pal,Traffic,Air_Country,palestine,palestina,1


In [10]:
# Traffic - Weather country matching
matches_tw = []

for prefix in dp.TW_common_prefixes:
    # Compare only names that share this prefix in both datasets.
    subset_a = dp.traffic[dp.traffic['country_prefix'] == prefix]
    subset_b = dp.weather[dp.weather['country_prefix'] == prefix]
    # Hoisted: the candidate list does not change while iterating subset_a.
    candidates = subset_b['Country_normalized'].unique()
    for loc in subset_a['Country_normalized'].unique():
        for country in candidates:
            # Compute the distance once; it serves both as the match test
            # and as the value recorded for the pair.
            distance = lev.distance(str(loc), str(country))
            if distance <= max_distance:
                # NOTE(review): this cell uses Value_A/Value_B where most
                # other country cells use Country_A/Country_B; keys kept
                # as-is so the resulting columns do not change.
                matches_tw.append({
                    "Prefix": prefix,
                    "Source": "Traffic",
                    "Target": "Weather",
                    "Value_A": loc,
                    "Value_B": country,
                    "Distance": distance
                })

tw_df = pd.DataFrame(matches_tw)
if not tw_df.empty:
    print("Traffic-Weather matches (sorted by distance):")
    display(tw_df.sort_values('Distance'))
else:
    print("No Traffic-Weather matches found")

Traffic-Weather matches (sorted by distance):


Unnamed: 0,Prefix,Source,Target,Value_A,Value_B,Distance
0,tun,Traffic,Weather,tunisia,tunisia,0
100,bar,Traffic,Weather,barbados,barbados,0
101,spa,Traffic,Weather,spain,spain,0
102,bhu,Traffic,Weather,bhutan,bhutan,0
103,dji,Traffic,Weather,djibouti,djibouti,0
...,...,...,...,...,...,...
49,ice,Traffic,Weather,iceland,iceland,0
158,mal,Traffic,Weather,mali,mali,0
128,ira,Traffic,Weather,iran,iraq,1
8,kyr,Traffic,Weather,kyrgyzstan,kyrghyzstan,1


In [11]:
# Air_Country - Weather country matching
matches_cw = []

for prefix in dp.CW_common_prefixes:
    # Compare only names that share this prefix in both datasets.
    subset_a = dp.air_country[dp.air_country['country_prefix'] == prefix]
    subset_b = dp.weather[dp.weather['country_prefix'] == prefix]
    # Hoisted: the candidate list does not change while iterating subset_a.
    candidates = subset_b['Country_normalized'].unique()
    for loc in subset_a['Country_normalized'].unique():
        for country in candidates:
            # Compute the distance once; it serves both as the match test
            # and as the value recorded for the pair.
            distance = lev.distance(str(loc), str(country))
            if distance <= max_distance:
                # NOTE(review): this cell uses Value_A/Value_B where most
                # other country cells use Country_A/Country_B; keys kept
                # as-is so the resulting columns do not change.
                matches_cw.append({
                    "Prefix": prefix,
                    "Source": "Air_Country",
                    "Target": "Weather",
                    "Value_A": loc,
                    "Value_B": country,
                    "Distance": distance
                })

acw_df = pd.DataFrame(matches_cw)
if not acw_df.empty:
    print("Air_Country-Weather matches (sorted by distance):")
    display(acw_df.sort_values('Distance'))
else:
    print("No Air_Country-Weather matches found")

Air_Country-Weather matches (sorted by distance):


Unnamed: 0,Prefix,Source,Target,Value_A,Value_B,Distance
0,tun,Air_Country,Weather,tunisia,tunisia,0
114,cro,Air_Country,Weather,croatia,croatia,0
115,bar,Air_Country,Weather,barbados,barbados,0
116,cos,Air_Country,Weather,costa rica,costa rica,0
117,dji,Air_Country,Weather,djibouti,djibouti,0
...,...,...,...,...,...,...
56,cam,Air_Country,Weather,cambodia,cambodia,0
180,mal,Air_Country,Weather,malaysia,malaysia,0
8,kyr,Air_Country,Weather,kyrgyzstan,kyrghyzstan,1
148,ira,Air_Country,Weather,iran,iraq,1


In [20]:
# Forest - Air_City country matching
matches_fcity = []

for prefix in dp.FCity_common_prefixes:
    # Compare only names that share this prefix in both datasets.
    subset_a = dp.forest[dp.forest['country_prefix'] == prefix]
    subset_b = dp.air_city[dp.air_city['country_prefix'] == prefix]
    # Hoisted: the candidate list does not change while iterating subset_a.
    candidates = subset_b['Country_normalized'].unique()
    for loc in subset_a['Country_normalized'].unique():
        for country in candidates:
            # Compute the distance once; it serves both as the match test
            # and as the value recorded for the pair.
            distance = lev.distance(str(loc), str(country))
            if distance <= max_distance:
                matches_fcity.append({
                    "Prefix": prefix,
                    "Source": "Forest",
                    "Target": "Air_City",
                    "Country_A": loc,
                    "Country_B": country,
                    "Distance": distance
                })

# NOTE(review): `acw_df` is also assigned by the Air_Country-Weather cell, so
# this overwrites that result. Name kept so existing references keep working.
acw_df = pd.DataFrame(matches_fcity)
if not acw_df.empty:
    print("Forest-Air_City matches (sorted by distance):")
    display(acw_df.sort_values('Distance'))
else:
    print("No Forest-Air_City matches found")

Forest-Air_City matches (sorted by distance):


Unnamed: 0,Prefix,Source,Target,Country_A,Country_B,Distance
40,den,Forest,Air_City,denmark,denmark,0
57,ken,Forest,Air_City,kenya,kenya,0
56,pan,Forest,Air_City,panama,panama,0
55,bra,Forest,Air_City,brazil,brazil,0
54,spa,Forest,Air_City,spain,spain,0
...,...,...,...,...,...,...
23,kuw,Forest,Air_City,kuwait,kuwait,0
22,jor,Forest,Air_City,jordan,jordan,0
21,bel,Forest,Air_City,belgium,belgium,0
39,mex,Forest,Air_City,mexico,mexico,0


In [13]:
# Forest - Air_Country matching
matches_fc = []

for prefix in dp.FC_common_prefixes:
    # Compare only names that share this prefix in both datasets.
    subset_a = dp.forest[dp.forest['country_prefix'] == prefix]
    subset_b = dp.air_country[dp.air_country['country_prefix'] == prefix]
    # Hoisted: the candidate list does not change while iterating subset_a.
    candidates = subset_b['Country_normalized'].unique()
    for loc in subset_a['Country_normalized'].unique():
        for country in candidates:
            # Compute the distance once; it serves both as the match test
            # and as the value recorded for the pair.
            distance = lev.distance(str(loc), str(country))
            if distance <= max_distance:
                matches_fc.append({
                    "Prefix": prefix,
                    "Source": "Forest",
                    "Target": "Air_Country",
                    "Country_A": loc,
                    "Country_B": country,
                    "Distance": distance
                })

# NOTE(review): `acw_df` is also assigned by the Air_Country-Weather cell, so
# this overwrites that result. Name kept so existing references keep working.
acw_df = pd.DataFrame(matches_fc)
# Labels corrected: this cell compares Forest with Air_Country, not Air_City.
if not acw_df.empty:
    print("Forest-Air_Country matches (sorted by distance):")
    display(acw_df.sort_values('Distance'))
else:
    print("No Forest-Air_Country matches found")

Forest-Air_City matches (sorted by distance):


Unnamed: 0,Prefix,Source,Target,Country_A,Country_B,Distance
0,far,Forest,Air_Country,faroe islands,faroe islands,0
116,cro,Forest,Air_Country,croatia,croatia,0
117,bar,Forest,Air_Country,barbados,barbados,0
118,cos,Forest,Air_Country,costa rica,costa rica,0
119,dji,Forest,Air_Country,djibouti,djibouti,0
...,...,...,...,...,...,...
64,gab,Forest,Air_Country,gabon,gabon,0
65,ita,Forest,Air_Country,italy,italy,0
57,cam,Forest,Air_Country,cambodia,cambodia,0
182,mal,Forest,Air_Country,malaysia,malaysia,0


In [14]:
# Air_Country - Air_City country matching
matches_ccity = []

for prefix in dp.CCity_common_prefixes:
    # Compare only names that share this prefix in both datasets.
    subset_a = dp.air_country[dp.air_country['country_prefix'] == prefix]
    subset_b = dp.air_city[dp.air_city['country_prefix'] == prefix]
    # Hoisted: the candidate list does not change while iterating subset_a.
    candidates = subset_b['Country_normalized'].unique()
    for loc in subset_a['Country_normalized'].unique():
        for country in candidates:
            # Compute the distance once; it serves both as the match test
            # and as the value recorded for the pair.
            distance = lev.distance(str(loc), str(country))
            if distance <= max_distance:
                matches_ccity.append({
                    "Prefix": prefix,
                    "Source": "Air_Country",
                    "Target": "Air_City",
                    "Country_A": loc,
                    "Country_B": country,
                    "Distance": distance
                })

# NOTE(review): `acw_df` is also assigned by the Air_Country-Weather cell, so
# this overwrites that result. Name kept so existing references keep working.
acw_df = pd.DataFrame(matches_ccity)
if not acw_df.empty:
    print("Air_Country-Air_City matches (sorted by distance):")
    display(acw_df.sort_values('Distance'))
else:
    print("No Air_Country-Air_City matches found")

Air_Country-Air_City matches (sorted by distance):


Unnamed: 0,Prefix,Source,Target,Country_A,Country_B,Distance
40,mex,Air_Country,Air_City,mexico,mexico,0
58,pan,Air_Country,Air_City,panama,panama,0
57,bra,Air_Country,Air_City,brazil,brazil,0
56,spa,Air_Country,Air_City,spain,spain,0
55,cos,Air_Country,Air_City,costa rica,costa rica,0
...,...,...,...,...,...,...
24,kuw,Air_Country,Air_City,kuwait,kuwait,0
23,jor,Air_Country,Air_City,jordan,jordan,0
22,bel,Air_Country,Air_City,belgium,belgium,0
20,bos,Air_Country,Air_City,bosnia and herzegovina,bosnia and herzegovina,0


In [15]:
# Forest - Weather country matching
matches_fw = []

for prefix in dp.FW_common_prefixes:
    # Compare only names that share this prefix in both datasets.
    subset_a = dp.forest[dp.forest['country_prefix'] == prefix]
    subset_b = dp.weather[dp.weather['country_prefix'] == prefix]
    # Hoisted: the candidate list does not change while iterating subset_a.
    candidates = subset_b['Country_normalized'].unique()
    for loc in subset_a['Country_normalized'].unique():
        for country in candidates:
            # Compute the distance once; it serves both as the match test
            # and as the value recorded for the pair.
            distance = lev.distance(str(loc), str(country))
            if distance <= max_distance:
                matches_fw.append({
                    "Prefix": prefix,
                    "Source": "Forest",
                    "Target": "Weather",
                    "Country_A": loc,
                    "Country_B": country,
                    "Distance": distance
                })

# NOTE(review): `acw_df` is also assigned by the Air_Country-Weather cell, so
# this overwrites that result. Name kept so existing references keep working.
acw_df = pd.DataFrame(matches_fw)
if not acw_df.empty:
    print("Forest - Weather matches (sorted by distance):")
    display(acw_df.sort_values('Distance'))
else:
    print("No Forest - Weather matches found")

Forest - Weather matches (sorted by distance):


Unnamed: 0,Prefix,Source,Target,Country_A,Country_B,Distance
0,tun,Forest,Weather,tunisia,tunisia,0
101,bar,Forest,Weather,barbados,barbados,0
102,cos,Forest,Weather,costa rica,costa rica,0
103,dji,Forest,Weather,djibouti,djibouti,0
104,bhu,Forest,Weather,bhutan,bhutan,0
...,...,...,...,...,...,...
55,gab,Forest,Weather,gabon,gabon,0
56,ita,Forest,Weather,italy,italy,0
39,som,Forest,Weather,somalia,somalia,0
158,mal,Forest,Weather,malaysia,malaysia,0


In [None]:
# Forest - Air_City: comparison of cities

# NOTE(review): `matches_fcity` is also the name used by the country-level
# Forest-Air_City cell; running this cell replaces that list.
matches_fcity = []

for prefix in dp.FCity_city_common_prefixes:
    # Compare only city names that share this prefix in both datasets.
    subset_a = dp.forest[dp.forest['city_prefix'] == prefix]
    subset_b = dp.air_city[dp.air_city['city_prefix'] == prefix]
    # Hoisted: the candidate list does not change while iterating subset_a.
    candidates = subset_b['City_normalized'].unique()
    for loc in subset_a['City_normalized'].unique():
        for city in candidates:
            # Compute the distance once; it serves both as the match test
            # and as the value recorded for the pair.
            # NOTE(review): distance-1 city pairs need manual review; the
            # recorded output pairs distinct names such as sanaa/sanya and
            # kingstown/kingston.
            distance = lev.distance(str(loc), str(city))
            if distance <= max_distance:
                matches_fcity.append({
                    "Prefix": prefix,
                    "Source": "Forest",
                    "Target": "Air_City",
                    "City_A": loc,
                    "City_B": city,
                    "Distance": distance
                })

# NOTE(review): `acw_df` is also assigned by the Air_Country-Weather cell, so
# this overwrites that result. Name kept so existing references keep working.
acw_df = pd.DataFrame(matches_fcity)
if not acw_df.empty:
    print("Forest-Air_City matches (sorted by distance):")
    display(acw_df.sort_values('Distance'))
else:
    print("No Forest-Air_City matches found")

Forest-Air_City matches (sorted by distance):


Unnamed: 0,Prefix,Source,Target,City_A,City_B,Distance
0,acc,Forest,Air_City,accra,accra,0
36,san,Forest,Air_City,santiago,santiago,0
73,ber,Forest,Air_City,berlin,berlin,0
38,san,Forest,Air_City,san salvador,san salvador,0
40,lim,Forest,Air_City,lima,lima,0
...,...,...,...,...,...,...
39,san,Forest,Air_City,sanaa,sanya,1
29,sai,Forest,Air_City,saint johns,saint john,1
2,vie,Forest,Air_City,vienna,vienne,1
31,kin,Forest,Air_City,kingstown,kingston,1


In [26]:
# Air_City - Weather: comparison of cities

# NOTE(review): `matches_cw` is also the name used by the country-level
# Air_Country-Weather cell; running this cell replaces that list.
matches_cw = []

for prefix in dp.CW_city_common_prefixes:
    # Compare only city names that share this prefix in both datasets.
    subset_a = dp.air_city[dp.air_city['city_prefix'] == prefix]
    subset_b = dp.weather[dp.weather['city_prefix'] == prefix]
    # Hoisted: the candidate list does not change while iterating subset_a.
    candidates = subset_b['City_normalized'].unique()
    for loc in subset_a['City_normalized'].unique():
        for city in candidates:
            # Compute the distance once; it serves both as the match test
            # and as the value recorded for the pair.
            distance = lev.distance(str(loc), str(city))
            if distance <= max_distance:
                # NOTE(review): "Air_city" differs in case from the
                # "Air_City" value used by the other cells; kept unchanged
                # so the recorded data does not change.
                matches_cw.append({
                    "Prefix": prefix,
                    "Source": "Air_city",
                    "Target": "Weather",
                    "City_A": loc,
                    "City_B": city,
                    "Distance": distance
                })

# NOTE(review): `acw_df` is also assigned by the Air_Country-Weather cell, so
# this overwrites that result. Name kept so existing references keep working.
acw_df = pd.DataFrame(matches_cw)
# Labels corrected: this cell compares Air_City with Weather, not Forest with Air_City.
if not acw_df.empty:
    print("Air_City-Weather matches (sorted by distance):")
    display(acw_df.sort_values('Distance'))
else:
    print("No Air_City-Weather matches found")

Forest-Air_City matches (sorted by distance):


Unnamed: 0,Prefix,Source,Target,City_A,City_B,Distance
0,acc,Air_city,Weather,accra,accra,0
38,san,Air_city,Weather,san salvador,san salvador,0
39,san,Air_city,Weather,san francisco,san francisco,0
40,san,Air_city,Weather,san rafael,san rafael,0
41,lim,Air_city,Weather,lima,lima,0
...,...,...,...,...,...,...
30,kin,Air_city,Weather,kingston,kingstown,1
2,vie,Air_city,Weather,vienne,vienna,1
36,san,Air_city,Weather,sanya,sanaa,1
12,yao,Air_city,Weather,yaoude,yaounde,1


In [27]:
# Forest - Weather: comparison of cities

# NOTE(review): `matches_fw` is also the name used by the country-level
# Forest-Weather cell; running this cell replaces that list.
matches_fw = []

for prefix in dp.FW_city_common_prefixes:
    # Compare only city names that share this prefix in both datasets.
    subset_a = dp.forest[dp.forest['city_prefix'] == prefix]
    subset_b = dp.weather[dp.weather['city_prefix'] == prefix]
    # Hoisted: the candidate list does not change while iterating subset_a.
    candidates = subset_b['City_normalized'].unique()
    for loc in subset_a['City_normalized'].unique():
        for city in candidates:
            # Compute the distance once; it serves both as the match test
            # and as the value recorded for the pair.
            distance = lev.distance(str(loc), str(city))
            if distance <= max_distance:
                matches_fw.append({
                    "Prefix": prefix,
                    "Source": "Forest",
                    "Target": "Weather",
                    "City_A": loc,
                    "City_B": city,
                    "Distance": distance
                })

# NOTE(review): `acw_df` is also assigned by the Air_Country-Weather cell, so
# this overwrites that result. Name kept so existing references keep working.
acw_df = pd.DataFrame(matches_fw)
# Labels corrected: this cell compares Forest with Weather, not Forest with Air_City.
if not acw_df.empty:
    print("Forest-Weather matches (sorted by distance):")
    display(acw_df.sort_values('Distance'))
else:
    print("No Forest-Weather matches found")

Forest-Air_City matches (sorted by distance):


Unnamed: 0,Prefix,Source,Target,City_A,City_B,Distance
0,acc,Forest,Weather,accra,accra,0
112,rig,Forest,Weather,riga,riga,0
113,lju,Forest,Weather,ljubljana,ljubljana,0
114,sko,Forest,Weather,skopje,skopje,0
115,dha,Forest,Weather,dhaka,dhaka,0
...,...,...,...,...,...,...
73,kin,Forest,Weather,kingston,kingstown,1
52,bog,Forest,Weather,bogota,bogot,1
57,nuk,Forest,Weather,nukualofa,nukualoia,1
171,geo,Forest,Weather,george town,georgetown,1
