In [60]:
# Necessary libraries
import pandas as pd
import Levenshtein as lev
import re
import unicodedata

In [61]:
# Load datasets
traffic = pd.read_csv("original data/trafficlist_forcountry.csv")
forest = pd.read_csv("original data/forest-cover-v1.csv")
air_city = pd.read_csv("original data/aap_air_quality_database_2018_v14.csv", skiprows=2)
air_country = pd.read_csv("original data/【12】GlobalPM25-1998-2022.csv")
weather = pd.read_csv("original data/GlobalWeatherRepository.csv")

In [62]:
# Data normalization function, to lowercase, remove special chars, and standardize
def normalize_text(text):
    if pd.isna(text):
        return ""
    text = str(text)
    # Convert to lowercase and remove extra whitespace
    text = text.lower().strip()
    # Remove special characters and accents
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')
    # Remove remaining special chars (keep only letters, numbers, spaces)
    text = re.sub(r'[^a-z0-9\s]', '', text)
    return text

In [63]:
# Normalize all country/city columns upfront
traffic['Location_normalized'] = traffic['Location'].apply(normalize_text)
forest['Country_normalized'] = forest['Country Name'].apply(normalize_text)
air_city['Country_normalized'] = air_city['Country'].apply(normalize_text)
air_country['Region_normalized'] = air_country['Region'].apply(normalize_text)
weather['country_normalized'] = weather['country'].apply(normalize_text)

# Get unique normalized values
countries_traffic = traffic['Location_normalized'].dropna().unique()
countries_forest = forest['Country_normalized'].dropna().unique()
countries_air_city = air_city['Country_normalized'].dropna().unique()
countries_air_country = air_country['Region_normalized'].dropna().unique()
countries_weather = weather['country_normalized'].dropna().unique()

In [64]:
# Defining the Levenshtein function
max_distance = 1  # Maximum distance between the data

def levenshtein_match(a, b):
    # Pure Levenshtein distance check with pre-normalized data
    return lev.distance(a, b) <= max_distance


In [65]:
# Pratice traffic - forest
import dp

matches_tf = []

for prefix in dp.TF_common_prefixes:
    subset_a = dp.traffic[dp.traffic['country_prefix'] == prefix]
    subset_b = dp.forest[dp.forest['country_prefix'] == prefix]
    for loc in subset_a['Country_normalized'].unique():
        for country in subset_b['Country_normalized'].unique():
            if levenshtein_match(loc, country):
                distance = dp.lev.distance(str(loc), str(country))
                matches_tf.append({
                    "Prefix": prefix,
                    "Source": "Traffic",
                    "Target": "Forest",
                    "Country_A": loc,
                    "Country_B": country,
                    "Distance": distance

                })

tf_df = dp.pd.DataFrame(matches_tf)
if not tf_df.empty:
    print("Traffic-Forest matches (sorted by distance):")
    display(tf_df.sort_values('Distance'))
else:
    print("No Traffic-Forest matches found")

    

Traffic-Forest matches (sorted by distance):


Unnamed: 0,Prefix,Source,Target,Country_A,Country_B,Distance
0,por,Traffic,Forest,portugal,portugal,0
96,chi,Traffic,Forest,chile,chile,0
97,chi,Traffic,Forest,china,china,0
98,can,Traffic,Forest,canada,canada,0
99,sou,Traffic,Forest,south africa,south africa,0
...,...,...,...,...,...,...
52,arm,Traffic,Forest,armenia,armenia,0
53,som,Traffic,Forest,somalia,somalia,0
47,lit,Traffic,Forest,lithuania,lithuania,0
150,pap,Traffic,Forest,papua new guinea,papua new guinea,0


In [66]:
# traffic - air_city
matches_tcity = []

for prefix in dp.TCity_common_prefixes:
    subset_a = dp.traffic[dp.traffic['country_prefix'] == prefix]
    subset_b = dp.air_city[dp.air_city['country_prefix'] == prefix]
    for loc in subset_a['Country_normalized'].unique():
        for country in subset_b['Country_normalized'].unique():
            if levenshtein_match(loc, country):
                distance = dp.lev.distance(str(loc), str(country))
                matches_tcity.append({
                    "Prefix": prefix,
                    "Source": "Traffic",
                    "Target": "Air_City",
                    "Country_A": loc,
                    "Country_B": country,
                    "Distance": distance
                })

tac_df = dp.pd.DataFrame(matches_tcity)
if not tac_df.empty:
    print("Traffic-Air_City matches (sorted by distance):")
    display(tac_df.sort_values('Distance'))
else:
    print("No Traffic-Air_City matches found")

Traffic-Air_City matches (sorted by distance):


Unnamed: 0,Prefix,Source,Target,Country_A,Country_B,Distance
0,por,Traffic,Air_City,portugal,portugal,0
58,aus,Traffic,Air_City,austria,austria,0
57,aus,Traffic,Air_City,australia,australia,0
56,pol,Traffic,Air_City,poland,poland,0
55,tha,Traffic,Air_City,thailand,thailand,0
...,...,...,...,...,...,...
23,per,Traffic,Air_City,peru,peru,0
22,mal,Traffic,Air_City,malta,malta,0
29,swe,Traffic,Air_City,sweden,sweden,0
80,uru,Traffic,Air_City,uruguay,uruguay,0


In [67]:
# Traffic - AirCountry matching
matches_tc = []

for prefix in dp.TC_common_prefixes:
    subset_a = dp.traffic[dp.traffic['country_prefix'] == prefix]
    subset_b = dp.air_country[dp.air_country['country_prefix'] == prefix]
    for loc in subset_a['Country_normalized'].unique():
        for country in subset_b['Country_normalized'].unique():
            if levenshtein_match(loc, country):
                distance = dp.lev.distance(str(loc), str(country))
                matches_tc.append({
                    "Prefix": prefix,
                    "Source": "Traffic",
                    "Target": "Air_Country",
                    "Country_A": loc,
                    "Country_B": country,
                    "Distance": distance
                })

tac_df = dp.pd.DataFrame(matches_tc)
if not tac_df.empty:
    print("Traffic-Air_Country matches (sorted by distance):")
    display(tac_df.sort_values('Distance'))
else:
    print("No Traffic-Air_Country matches found")

Traffic-Air_Country matches (sorted by distance):


Unnamed: 0,Prefix,Source,Target,Country_A,Country_B,Distance
0,por,Traffic,Air_Country,portugal,portugal,0
110,can,Traffic,Air_Country,canada,canada,0
112,sou,Traffic,Air_Country,south korea,south korea,0
113,sou,Traffic,Air_Country,south africa,south africa,0
114,sur,Traffic,Air_Country,suriname,suriname,0
...,...,...,...,...,...,...
55,mal,Traffic,Air_Country,malta,malta,0
172,pap,Traffic,Air_Country,papua new guinea,papua new guinea,0
111,pal,Traffic,Air_Country,palestine,palestina,1
46,ira,Traffic,Air_Country,iraq,iran,1


In [68]:
# Traffic-Weather matching
matches_tw = []

for prefix in dp.TW_common_prefixes:
    subset_a = dp.traffic[dp.traffic['country_prefix'] == prefix]
    subset_b = dp.weather[dp.weather['country_prefix'] == prefix]
    for loc in subset_a['Country_normalized'].unique():
        for country in subset_b['Country_normalized'].unique():
            if levenshtein_match(loc, country):
                distance = dp.lev.distance(str(loc), str(country))
                matches_tw.append({
                    "Prefix": prefix,
                    "Source": "Traffic",
                    "Target": "Weather",
                    "Value_A": loc,
                    "Value_B": country,
                    "Distance": distance
                })

tw_df = dp.pd.DataFrame(matches_tw)
if not tw_df.empty:
    print("Traffic-Weather matches (sorted by distance):")
    display(tw_df.sort_values('Distance'))
else:
    print("No Traffic-Weather matches found")

Traffic-Weather matches (sorted by distance):


Unnamed: 0,Prefix,Source,Target,Value_A,Value_B,Distance
0,por,Traffic,Weather,portugal,portugal,0
101,eth,Traffic,Weather,ethiopia,ethiopia,0
102,chi,Traffic,Weather,chile,chile,0
103,chi,Traffic,Weather,china,china,0
104,can,Traffic,Weather,canada,canada,0
...,...,...,...,...,...,...
56,per,Traffic,Weather,peru,peru,0
50,mal,Traffic,Weather,malta,malta,0
131,kyr,Traffic,Weather,kyrgyzstan,kyrghyzstan,1
41,ira,Traffic,Weather,iraq,iran,1


In [69]:
# AirCountry-Weather matching
matches_cw = []

for prefix in dp.CW_common_prefixes:
    subset_a = dp.air_country[dp.air_country['country_prefix'] == prefix]
    subset_b = dp.weather[dp.weather['country_prefix'] == prefix]
    for loc in subset_a['Country_normalized'].unique():
        for country in subset_b['Country_normalized'].unique():
            if levenshtein_match(loc, country):
                distance = dp.lev.distance(str(loc), str(country))
                matches_cw.append({
                    "Prefix": prefix,
                    "Source": "Air_Country",
                    "Target": "Weather",
                    "Value_A": loc,
                    "Value_B": country,
                    "Distance": distance
                })

acw_df = dp.pd.DataFrame(matches_cw)
if not acw_df.empty:
    print("Air_Country-Weather matches (sorted by distance):")
    display(acw_df.sort_values('Distance'))
else:
    print("No Air_Country-Weather matches found")

Air_Country-Weather matches (sorted by distance):


Unnamed: 0,Prefix,Source,Target,Value_A,Value_B,Distance
0,por,Air_Country,Weather,portugal,portugal,0
115,eth,Air_Country,Weather,ethiopia,ethiopia,0
116,chi,Air_Country,Weather,chile,chile,0
117,chi,Air_Country,Weather,china,china,0
118,can,Air_Country,Weather,canada,canada,0
...,...,...,...,...,...,...
57,mal,Air_Country,Weather,maldives,maldives,0
180,pap,Air_Country,Weather,papua new guinea,papua new guinea,0
150,kyr,Air_Country,Weather,kyrgyzstan,kyrghyzstan,1
47,ira,Air_Country,Weather,iraq,iran,1


In [70]:
# Forest - air_city matching
matches_fcity = []

for prefix in dp.FCity_common_prefixes:
    subset_a = dp.forest[dp.forest['country_prefix'] == prefix]
    subset_b = dp.air_city[dp.air_city['country_prefix'] == prefix]
    for loc in subset_a['Country_normalized'].unique():
        for country in subset_b['Country_normalized'].unique():
            if levenshtein_match(loc, country):
                distance = dp.lev.distance(str(loc), str(country))
                matches_fcity.append({
                    "Prefix": prefix,
                    "Source": "Forest",
                    "Target": "Air_City",
                    "Value_A": loc,
                    "Value_B": country,
                    "Distance": distance
                })

acw_df = dp.pd.DataFrame(matches_fcity)
if not acw_df.empty:
    print("Forest-Air_City matches (sorted by distance):")
    display(acw_df.sort_values('Distance'))
else:
    print("No Forest-Air_City matches found")

Forest-Air_City matches (sorted by distance):


Unnamed: 0,Prefix,Source,Target,Value_A,Value_B,Distance
0,por,Forest,Air_City,portugal,portugal,0
58,aus,Forest,Air_City,austria,austria,0
57,aus,Forest,Air_City,australia,australia,0
56,pol,Forest,Air_City,poland,poland,0
55,tha,Forest,Air_City,thailand,thailand,0
...,...,...,...,...,...,...
23,per,Forest,Air_City,peru,peru,0
22,mal,Forest,Air_City,malta,malta,0
29,swe,Forest,Air_City,sweden,sweden,0
80,uru,Forest,Air_City,uruguay,uruguay,0


In [71]:
# Forest - air_country matching
matches_fc = []

for prefix in dp.FC_common_prefixes:
    subset_a = dp.forest[dp.forest['country_prefix'] == prefix]
    subset_b = dp.air_country[dp.air_country['country_prefix'] == prefix]
    for loc in subset_a['Country_normalized'].unique():
        for country in subset_b['Country_normalized'].unique():
            if levenshtein_match(loc, country):
                distance = dp.lev.distance(str(loc), str(country))
                matches_fc.append({
                    "Prefix": prefix,
                    "Source": "Forest",
                    "Target": "Air_Country",
                    "Country_A": loc,
                    "Country_B": country,
                    "Distance": distance
                })

acw_df = dp.pd.DataFrame(matches_fc)
if not acw_df.empty:
    print("Forest-Air_City matches (sorted by distance):")
    display(acw_df.sort_values('Distance'))
else:
    print("No Forest-Air_City matches found")

Forest-Air_City matches (sorted by distance):


Unnamed: 0,Prefix,Source,Target,Country_A,Country_B,Distance
0,por,Forest,Air_Country,portugal,portugal,0
116,chi,Forest,Air_Country,china,china,0
117,can,Forest,Air_Country,canada,canada,0
118,pal,Forest,Air_Country,palau,palau,0
119,sou,Forest,Air_Country,south sudan,south sudan,0
...,...,...,...,...,...,...
63,lux,Forest,Air_Country,luxembourg,luxembourg,0
64,bul,Forest,Air_Country,bulgaria,bulgaria,0
65,cur,Forest,Air_Country,curacao,curacao,0
67,ser,Forest,Air_Country,serbia,serbia,0


In [72]:
# air_country - air_city matching
matches_ccity = []

for prefix in dp.CCity_common_prefixes:
    subset_a = dp.air_country[dp.air_country['country_prefix'] == prefix]
    subset_b = dp.air_city[dp.air_city['country_prefix'] == prefix]
    for loc in subset_a['Country_normalized'].unique():
        for country in subset_b['Country_normalized'].unique():
            if levenshtein_match(loc, country):
                distance = dp.lev.distance(str(loc), str(country))
                matches_ccity.append({
                    "Prefix": prefix,
                    "Source": "Air_Country",
                    "Target": "Air_City",
                    "Country_A": loc,
                    "Country_B": country,
                    "Distance": distance
                })

acw_df = dp.pd.DataFrame(matches_ccity)
if not acw_df.empty:
    print("Air_Country-Air_City matches (sorted by distance):")
    display(acw_df.sort_values('Distance'))
else:
    print("No Air_Country-Air_City matches found")

Air_Country-Air_City matches (sorted by distance):


Unnamed: 0,Prefix,Source,Target,Country_A,Country_B,Distance
0,por,Air_Country,Air_City,portugal,portugal,0
58,aus,Air_Country,Air_City,australia,australia,0
57,pol,Air_Country,Air_City,poland,poland,0
56,tha,Air_Country,Air_City,thailand,thailand,0
55,ban,Air_Country,Air_City,bangladesh,bangladesh,0
...,...,...,...,...,...,...
23,mal,Air_Country,Air_City,malta,malta,0
22,par,Air_Country,Air_City,paraguay,paraguay,0
29,cro,Air_Country,Air_City,croatia,croatia,0
81,uru,Air_Country,Air_City,uruguay,uruguay,0


In [73]:
# forest - weather
matches_fw = []

for prefix in dp.FW_common_prefixes:
    subset_a = dp.forest[dp.forest['country_prefix'] == prefix]
    subset_b = dp.weather[dp.weather['country_prefix'] == prefix]
    for loc in subset_a['Country_normalized'].unique():
        for country in subset_b['Country_normalized'].unique():
            if levenshtein_match(loc, country):
                distance = dp.lev.distance(str(loc), str(country))
                matches_fw.append({
                    "Prefix": prefix,
                    "Source": "Forest",
                    "Target": "Weather",
                    "Country_A": loc,
                    "Country_B": country,
                    "Distance": distance
                })

acw_df = dp.pd.DataFrame(matches_fw)
if not acw_df.empty:
    print("Forest - Weather matches (sorted by distance):")
    display(acw_df.sort_values('Distance'))
else:
    print("No Forest - Weather matches found")

AttributeError: module 'dp' has no attribute 'FW_common_prefixes'

In [None]:
# City-level matching setup
cities_air_city = air_city['City/Town'].dropna().unique()
cities_weather = weather['location_name'].dropna().unique()

In [None]:
# Air-City-Weather city matching
matches_city = []
for city in cities_air_city:
    for weather_city in cities_weather:
        if levenshtein_match(city, weather_city):
            distance = lev.distance(str(city), str(weather_city))
            matches_city.append({
                "Source": "Air_City",
                "Target": "Weather",
                "Value_A": city,
                "Value_B": weather_city,
                "Distance": distance
            })

city_df = pd.DataFrame(matches_city)
if not city_df.empty:
    print("Air_City-Weather city matches (sorted by distance):")
    display(city_df.sort_values('Distance'))
else:
    print("No city-level matches found")

Air_City-Weather city matches (sorted by distance):


Unnamed: 0,Source,Target,Value_A,Value_B,Distance
0,Air_City,Weather,Tirana,Tirana,0
55,Air_City,Weather,Amsterdam,Amsterdam,0
54,Air_City,Weather,Ulaanbaatar,Ulaanbaatar,0
53,Air_City,Weather,Skopje,Skopje,0
52,Air_City,Weather,Mexico City,Mexico City,0
...,...,...,...,...,...
44,Air_City,Weather,Roma,Rome,1
31,Air_City,Weather,Vienne,Vienna,1
39,Air_City,Weather,Mehran,Tehran,1
11,Air_City,Weather,Rigi,Riga,1
