In [131]:
import pandas as pd
from py_stringmatching import JaroWinkler

In [132]:
traffic = pd.read_csv("original data/trafficlist_forcountry.csv")
forest = pd.read_csv("original data/forest-cover-v1.csv")
air_city = pd.read_csv("original data/aap_air_quality_database_2018_v14.csv", skiprows=2)
air_country = pd.read_csv("original data/【12】GlobalPM25-1998-2022.csv")
weather = pd.read_csv("original data/GlobalWeatherRepository.csv")

In [133]:
# Country - Jaro-Winkler
jw = JaroWinkler()
threshold = 0.90 # Set the matching threshold (adjustable)
matches1 = [] # result

traffic['Location'] = traffic['Location'].str.lower().str.strip()
forest['Country Name'] = forest['Country Name'].str.lower().str.strip()
air_city['Country'] = air_city['Country'].str.lower().str.strip()
air_country['Region'] = air_country['Region'].str.lower().str.strip()
weather['country'] = weather['country'].str.lower().str.strip()

# 2. Extract prefix column
prefix_length = 3

traffic['country_prefix'] = traffic['Location'].str[:prefix_length]
forest['country_prefix'] = forest['Country Name'].str[:prefix_length]
air_city['country_prefix'] = air_city['Country'].str[:prefix_length]
air_country['country_prefix'] = air_country['Region'].str[:prefix_length]
weather['country_prefix'] = weather['country'].str[:prefix_length]


In [134]:
# traffic-forest

# Find common prefixes (blocking)
common_prefixes = set(traffic['country_prefix']).intersection(set(forest['country_prefix']))
print(f"Common country prefixes are: {common_prefixes}")

# For each common prefix, compare countries
for prefix in common_prefixes:
    subset_a = traffic[traffic['country_prefix'] == prefix]
    subset_b = forest[forest['country_prefix'] == prefix]
    
    for country_a in subset_a['Location'].unique():
        for country_b in subset_b['Country Name'].unique():
            score = jw.get_sim_score(str(country_a), str(country_b))
            if score >= threshold:
                matches1.append({
                    "Prefix": prefix,
                    "Country_A": country_a,
                    "Country_B": country_b,
                    "Similarity": round(score, 4)
                })

# Convert to DataFrame and display results
result_df = pd.DataFrame(matches1)
print(result_df)



Common country prefixes are: {'nep', 'hun', 'sie', 'new', 'nor', 'geo', 'sud', 'mex', 'sin', 'isr', 'egy', 'ton', 'uru', 'tur', 'lux', 'lat', 'mor', 'ecu', 'far', 'bru', 'pap', 'uni', 'jap', 'el ', 'pol', 'jor', 'zam', 'mal', 'sey', 'tun', 'nam', 'bur', 'pal', 'bot', 'mad', 'gui', 'tha', 'ben', 'slo', 'dji', 'eri', 'est', 'uzb', 'mol', 'nig', 'fin', 'tan', 'dom', 'cub', 'gib', 'swe', 'alg', 'lie', 'nau', 'alb', 'phi', 'fra', 'ind', 'moz', 'cen', 'ire', 'taj', 'bul', 'den', 'isl', 'gua', 'kaz', 'tri', 'oma', 'bar', 'ice', 'cha', 'som', 'net', 'cro', 'bel', 'sen', 'cze', 'spa', 'aze', 'lib', 'ita', 'bos', 'kyr', 'mon', 'pak', 'ira', 'fij', 'kir', 'arm', 'ven', 'rom', 'afg', 'ban', 'mic', 'aus', 'sol', 'com', 'cam', 'leb', 'can', 'sau', 'rwa', 'eth', 'gam', 'lao', 'bhu', 'mya', 'bol', 'kuw', 'sam', 'cyp', 'gre', 'san', 'chi', 'sou', 'lit', 'zim', 'por', 'qat', 'ukr', 'bah', 'sri', 'ken', 'arg', 'mac', 'ger', 'gha', 'syr', 'swi', 'rus', 'guy', 'and', 'ant', 'uga', 'sur', 'col', 'vie', 'tog

In [135]:
# traffic-air-city
# Find common prefixes (blocking)
common_prefixes = set(traffic['country_prefix']).intersection(set(air_city['country_prefix']))
print(f"Common country prefixes are: {common_prefixes}")

# For each common prefix, compare countries
for prefix in common_prefixes:
    subset_a = traffic[traffic['country_prefix'] == prefix]
    subset_b = air_city[air_city['country_prefix'] == prefix]
    
    for country_a in subset_a['Location'].unique():
        for country_b in subset_b['Country'].unique():
            score = jw.get_sim_score(str(country_a), str(country_b))
            if score >= threshold:
                matches1.append({
                    "Prefix": prefix,
                    "Country_A": country_a,
                    "Country_B": country_b,
                    "Similarity": round(score, 4)
                })

#  Convert to DataFrame and display results
result_df = pd.DataFrame(matches1)
print(result_df)


Common country prefixes are: {'nep', 'hun', 'new', 'nor', 'geo', 'mex', 'sin', 'isr', 'uru', 'tur', 'lux', 'lat', 'mor', 'ecu', 'uni', 'jap', 'el ', 'pol', 'jor', 'mal', 'mad', 'tha', 'slo', 'est', 'fin', 'cub', 'swe', 'alb', 'phi', 'fra', 'ind', 'ire', 'bul', 'den', 'gua', 'ice', 'net', 'cro', 'bel', 'sen', 'cze', 'spa', 'lib', 'ita', 'bos', 'mon', 'pak', 'fij', 'ira', 'rom', 'ban', 'aus', 'cam', 'leb', 'can', 'sau', 'kuw', 'cyp', 'gre', 'chi', 'sou', 'lit', 'por', 'ukr', 'bah', 'ken', 'ger', 'gha', 'swi', 'rus', 'and', 'uga', 'col', 'vie', 'ser', 'pan', 'cos', 'bra', 'per'}
    Prefix     Country_A     Country_B  Similarity
0      nep         nepal         nepal         1.0
1      hun       hungary       hungary         1.0
2      sie  sierra leone  sierra leone         1.0
3      new   new zealand   new zealand         1.0
4      nor        norway        norway         1.0
..     ...           ...           ...         ...
241    ser        serbia        serbia         1.0
242    pa

In [136]:
# traffic-air-country
common_prefixes = set(traffic['country_prefix']).intersection(set(air_country['country_prefix']))
print(f"Common country prefixes are: {common_prefixes}")

# For each common prefix, compare countries
for prefix in common_prefixes:
    subset_a = traffic[traffic['country_prefix'] == prefix]
    subset_b = air_country[air_country['country_prefix'] == prefix]
    
    for country_a in subset_a['Location'].unique():
        for country_b in subset_b['Region'].unique():
            score = jw.get_sim_score(str(country_a), str(country_b))
            if score >= threshold:
                matches1.append({
                    "Prefix": prefix,
                    "Country_A": country_a,
                    "Country_B": country_b,
                    "Similarity": round(score, 4)
                })

#  Convert to DataFrame and display results
result_df = pd.DataFrame(matches1)
print(result_df)

Common country prefixes are: {'nep', 'hun', 'sie', 'new', 'nor', 'geo', 'sud', 'mex', 'sin', 'isr', 'egy', 'ton', 'uru', 'tur', 'lux', 'lat', 'mor', 'ecu', 'far', 'cap', 'bru', 'pap', 'uni', 'jap', 'el ', 'pol', 'jor', 'zam', 'mal', 'sey', 'tun', 'nam', 'bur', 'pal', 'bot', 'mad', 'gui', 'tha', 'ben', 'slo', 'dji', 'eri', 'est', 'uzb', 'mol', 'nig', 'fin', 'tan', 'dom', 'cub', 'gib', 'swe', 'alg', 'lie', 'nau', 'alb', 'phi', 'fra', 'ind', 'moz', 'cen', 'ire', 'taj', 'bul', 'den', 'isl', 'gua', 'kaz', 'tri', 'oma', 'bar', 'gue', 'ice', 'cha', 'som', 'net', 'cro', 'bel', 'sen', 'kos', 'cze', 'spa', 'aze', 'lib', 'ita', 'bos', 'kyr', 'tai', 'mon', 'pak', 'ira', 'fij', 'kir', 'arm', 'ven', 'rom', 'afg', 'ban', 'mic', 'aus', 'sol', 'com', 'cam', 'leb', 'can', 'sau', 'rwa', 'eth', 'sai', 'gam', 'lao', 'bhu', 'mya', 'bol', 'kuw', 'jer', 'sam', 'cyp', 'gre', 'san', 'chi', 'sou', 'lit', 'zim', 'por', 'qat', 'ukr', 'bah', 'sri', 'ken', 'arg', 'mac', 'ger', 'gha', 'syr', 'swi', 'rus', 'guy', 'and

In [137]:
# traffic-weather
common_prefixes = set(traffic['country_prefix']).intersection(set(weather['country_prefix']))
print(f"Common country prefixes are: {common_prefixes}")

# For each common prefix, compare countries
for prefix in common_prefixes:
    subset_a = traffic[traffic['country_prefix'] == prefix]
    subset_b = weather[weather['country_prefix'] == prefix]
    
    for country_a in subset_a['Location'].unique():
        for country_b in subset_b['country'].unique():
            score = jw.get_sim_score(str(country_a), str(country_b))
            if score >= threshold:
                matches1.append({
                    "Prefix": prefix,
                    "Country_A": country_a,
                    "Country_B": country_b,
                    "Similarity": round(score, 4)
                })

#  Convert to DataFrame and display results
result_df = pd.DataFrame(matches1)
print(result_df)

Common country prefixes are: {'nep', 'hun', 'sie', 'new', 'nor', 'geo', 'sud', 'mex', 'sin', 'isr', 'egy', 'ton', 'uru', 'tur', 'lux', 'lat', 'mor', 'ecu', 'cap', 'bru', 'pap', 'uni', 'jap', 'el ', 'pol', 'jor', 'zam', 'mal', 'sey', 'tun', 'nam', 'bur', 'pal', 'bot', 'mad', 'gui', 'tha', 'ben', 'slo', 'dji', 'eri', 'est', 'uzb', 'nig', 'fin', 'tan', 'dom', 'cub', 'swe', 'alg', 'lie', 'alb', 'phi', 'fra', 'ind', 'moz', 'cen', 'ire', 'taj', 'bul', 'den', 'gua', 'kaz', 'tri', 'oma', 'bar', 'ice', 'cha', 'som', 'net', 'cro', 'bel', 'sen', 'cze', 'spa', 'aze', 'lib', 'ita', 'bos', 'kyr', 'mon', 'pak', 'ira', 'fij', 'kir', 'arm', 'ven', 'rom', 'afg', 'ban', 'mic', 'aus', 'sol', 'com', 'cam', 'leb', 'can', 'sau', 'rwa', 'eth', 'sai', 'gam', 'bhu', 'mya', 'bol', 'kuw', 'sam', 'cyp', 'gre', 'san', 'chi', 'sou', 'lit', 'zim', 'por', 'qat', 'ukr', 'bah', 'sri', 'ken', 'arg', 'mac', 'ger', 'gha', 'syr', 'swi', 'rus', 'guy', 'and', 'ant', 'uga', 'sur', 'vie', 'ser', 'pan', 'jam', 'cos', 'bra', 'per

In [138]:
# air-country-weather
common_prefixes = set(forest['country_prefix']).intersection(set(weather['country_prefix']))
print(f"Common country prefixes are: {common_prefixes}")

# For each common prefix, compare countries
for prefix in common_prefixes:
    subset_a = forest[forest['country_prefix'] == prefix]
    subset_b = weather[weather['country_prefix'] == prefix]
    
    for country_a in subset_a['Country Name'].unique():
        for country_b in subset_b['country'].unique():
            score = jw.get_sim_score(str(country_a), str(country_b))
            if score >= threshold:
                matches1.append({
                    "Prefix": prefix,
                    "Country_A": country_a,
                    "Country_B": country_b,
                    "Similarity": round(score, 4)
                })

#  Convert to DataFrame and display results
result_df = pd.DataFrame(matches1)
print(result_df)

Common country prefixes are: {'nep', 'hun', 'sie', 'new', 'nor', 'geo', 'hai', 'sud', 'egy', 'mex', 'isr', 'sin', 'ton', 'uru', 'tur', 'lux', 'lat', 'mor', 'mau', 'ecu', 'bru', 'pap', 'tuv', 'el ', 'jap', 'uni', 'pol', 'jor', 'zam', 'mal', 'sey', 'nam', 'bur', 'tun', 'pal', 'bot', 'mad', 'gui', 'ben', 'equ', 'tha', 'slo', 'dji', 'eri', 'est', 'uzb', 'nig', 'fin', 'tan', 'dom', 'cub', 'swe', 'alg', 'yem', 'lie', 'alb', 'phi', 'tim', 'fra', 'ind', 'moz', 'cen', 'ire', 'taj', 'bul', 'den', 'ang', 'van', 'gua', 'kaz', 'con', 'par', 'gab', 'tri', 'oma', 'bar', 'ice', 'cha', 'som', 'net', 'cro', 'bel', 'sen', 'cze', 'aze', 'spa', 'lib', 'ita', 'nic', 'bos', 'kyr', 'mon', 'pak', 'fij', 'ira', 'kir', 'arm', 'ven', 'rom', 'afg', 'ban', 'mic', 'aus', 'sol', 'com', 'cam', 'leb', 'can', 'les', 'sau', 'rwa', 'eth', 'gam', 'bhu', 'mya', 'bol', 'kuw', 'sam', 'cyp', 'gre', 'san', 'chi', 'sou', 'lit', 'zim', 'por', 'qat', 'ukr', 'bah', 'sri', 'ken', 'arg', 'mar', 'ger', 'gha', 'mac', 'syr', 'swi', 'rus

In [139]:
# forest-air-city
common_prefixes = set(forest['country_prefix']).intersection(set(air_city['country_prefix']))
print(f"Common country prefixes are: {common_prefixes}")

# For each common prefix, compare countries
for prefix in common_prefixes:
    subset_a = forest[forest['country_prefix'] == prefix]
    subset_b = air_city[air_city['country_prefix'] == prefix]
    
    for country_a in subset_a['Country Name'].unique():
        for country_b in subset_b['Country'].unique():
            score = jw.get_sim_score(str(country_a), str(country_b))
            if score >= threshold:
                matches1.append({
                    "Prefix": prefix,
                    "Country_A": country_a,
                    "Country_B": country_b,
                    "Similarity": round(score, 4)
                })

#  Convert to DataFrame and display results
result_df = pd.DataFrame(matches1)
print(result_df)

Common country prefixes are: {'nep', 'hun', 'new', 'nor', 'geo', 'mex', 'sin', 'isr', 'uru', 'tur', 'lux', 'lat', 'mor', 'ecu', 'uni', 'jap', 'el ', 'pol', 'jor', 'mal', 'mad', 'tha', 'slo', 'est', 'fin', 'cub', 'swe', 'alb', 'phi', 'fra', 'ind', 'ire', 'bul', 'den', 'gua', 'par', 'ice', 'net', 'cro', 'bel', 'sen', 'cze', 'spa', 'lib', 'ita', 'bos', 'mon', 'pak', 'fij', 'ira', 'rom', 'ban', 'aus', 'cam', 'leb', 'can', 'sau', 'kuw', 'cyp', 'gre', 'chi', 'sou', 'lit', 'por', 'ukr', 'bah', 'ken', 'ger', 'gha', 'swi', 'rus', 'and', 'uga', 'col', 'vie', 'ser', 'pan', 'cos', 'bra', 'per'}
    Prefix     Country_A     Country_B  Similarity
0      nep         nepal         nepal         1.0
1      hun       hungary       hungary         1.0
2      sie  sierra leone  sierra leone         1.0
3      new   new zealand   new zealand         1.0
4      nor        norway        norway         1.0
..     ...           ...           ...         ...
844    ser        serbia        serbia         1.0
84

In [140]:
# forest-air-country
common_prefixes = set(forest['country_prefix']).intersection(set(air_country['country_prefix']))
print(f"Common country prefixes are: {common_prefixes}")

# For each common prefix, compare countries
for prefix in common_prefixes:
    subset_a = forest[forest['country_prefix'] == prefix]
    subset_b = air_country[air_country['country_prefix'] == prefix]
    
    for country_a in subset_a['Country Name'].unique():
        for country_b in subset_b['Region'].unique():
            score = jw.get_sim_score(str(country_a), str(country_b))
            if score >= threshold:
                matches1.append({
                    "Prefix": prefix,
                    "Country_A": country_a,
                    "Country_B": country_b,
                    "Similarity": round(score, 4)
                })

#  Convert to DataFrame and display results
result_df = pd.DataFrame(matches1)
print(result_df)

Common country prefixes are: {'nep', 'hun', 'sie', 'new', 'nor', 'geo', 'hai', 'sud', 'egy', 'mex', 'isr', 'sin', 'ton', 'vir', 'uru', 'tur', 'lux', 'lat', 'mor', 'mau', 'ecu', 'far', 'bru', 'pap', 'tuv', 'uni', 'jap', 'el ', 'pol', 'jor', 'zam', 'mal', 'sey', 'nam', 'bur', 'tun', 'pal', 'bot', 'mad', 'gui', 'ben', 'equ', 'slo', 'tha', 'dji', 'fre', 'eri', 'est', 'uzb', 'mol', 'nig', 'fin', 'tan', 'dom', 'cub', 'gib', 'pue', 'alg', 'swe', 'yem', 'lie', 'nau', 'alb', 'phi', 'tim', 'fra', 'ind', 'moz', 'cen', 'ire', 'taj', 'wes', 'aru', 'bul', 'den', 'ang', 'van', 'cay', 'isl', 'gua', 'kaz', 'par', 'ame', 'gab', 'ber', 'oma', 'bar', 'tri', 'ice', 'cha', 'som', 'net', 'cro', 'bel', 'sen', 'cze', 'aze', 'spa', 'lib', 'ita', 'nic', 'bos', 'kyr', 'cur', 'mon', 'pak', 'fij', 'ira', 'kir', 'arm', 'ven', 'rom', 'afg', 'ban', 'mic', 'aus', 'sol', 'com', 'cam', 'leb', 'can', 'les', 'sau', 'rwa', 'eth', 'gam', 'lao', 'bhu', 'mya', 'bol', 'bri', 'kuw', 'sam', 'cyp', 'gre', 'san', 'chi', 'sou', 'lit

In [141]:
# air-country-air-city
common_prefixes = set(air_country['country_prefix']).intersection(set(air_city['country_prefix']))
print(f"Common country prefixes are: {common_prefixes}")

# For each common prefix, compare countries
for prefix in common_prefixes:
    subset_a = air_country[air_country['country_prefix'] == prefix]
    subset_b = air_city[air_city['country_prefix'] == prefix]
    
    for country_a in subset_a['Region'].unique():
        for country_b in subset_b['Country'].unique():
            score = jw.get_sim_score(str(country_a), str(country_b))
            if score >= threshold:
                matches1.append({
                    "Prefix": prefix,
                    "Country_A": country_a,
                    "Country_B": country_b,
                    "Similarity": round(score, 4)
                })

#  Convert to DataFrame and display results
result_df = pd.DataFrame(matches1)
print(result_df)

Common country prefixes are: {'nep', 'hun', 'new', 'nor', 'geo', 'mex', 'sin', 'isr', 'uru', 'tur', 'lux', 'lat', 'mor', 'ecu', 'uni', 'jap', 'el ', 'pol', 'jor', 'mal', 'mad', 'tha', 'slo', 'est', 'fin', 'cub', 'swe', 'alb', 'phi', 'fra', 'ind', 'ire', 'bul', 'den', 'rep', 'gua', 'par', 'ice', 'net', 'cro', 'bel', 'sen', 'cze', 'spa', 'lib', 'ita', 'bos', 'mon', 'pak', 'fij', 'ira', 'rom', 'ban', 'aus', 'cam', 'leb', 'can', 'sau', 'kuw', 'cyp', 'gre', 'chi', 'sou', 'lit', 'por', 'ukr', 'bah', 'ken', 'ger', 'gha', 'swi', 'rus', 'and', 'uga', 'col', 'vie', 'ser', 'pan', 'cos', 'bra', 'per'}
     Prefix     Country_A     Country_B  Similarity
0       nep         nepal         nepal         1.0
1       hun       hungary       hungary         1.0
2       sie  sierra leone  sierra leone         1.0
3       new   new zealand   new zealand         1.0
4       nor        norway        norway         1.0
...     ...           ...           ...         ...
1123    ser        serbia        serbia

In [142]:
# City - Jaro-Winkler
jw = JaroWinkler()
threshold = 0.90
matches2 = []

cities_air_city = air_city['City/Town'].dropna().unique()
cities_air_weather = weather['location_name'].dropna().unique()

In [143]:
#Blocking strategy
#Hash Strategy - city to city
#Categorize the cities by their country, and pick an example [ex: China] for comparison


from itertools import product

cities_air_city = air_city['City/Town'].dropna().unique()
cities_air_weather = weather['location_name'].dropna().unique()

# 1. Preprocessing: remove empty values, unify format (lowercase & remove spaces)
air_city['City/Town'] = air_city['City/Town'].str.lower().str.strip()
air_city['Country'] = air_city['Country'].str.lower().str.strip()
weather['location_name'] = weather['location_name'].str.lower().str.strip()
weather['country'] = weather['country'].str.lower().str.strip()

# 2. Filter records whose country is China
cities_air_city_China = air_city[air_city['Country'] == 'China']
cities_air_weather_China = weather[weather['country'] == 'China']

# 3. Finding common cities (Blocking)
common_cities = set(cities_air_city_China['City/Town']).intersection(
    set(cities_air_weather_China['location_name'])
)

print(f"China's common cities are: {common_cities}")

# 4. Make matching combinations in common cities
# air-city-weather
for city_a in cities_air_city:
    for city_b in cities_air_weather:
        score = jw.get_sim_score(str(city_a), str(city_b))
        if score >= threshold:
            matches2.append({
                "City_a": city_a,
                "City_B": city_b,
                "Similarity": round(score, 4)
            })

# reslt
result_df = pd.DataFrame(matches2)
print(result_df)


China's common cities are: set()
            City_a             City_B  Similarity
0           Tirana             Tirana      1.0000
1         Canberra           Canberra      1.0000
2         Brussels           Brussels      1.0000
3            Dhaka              Dhaka      1.0000
4            Sofia              Sofia      1.0000
..             ...                ...         ...
102  San Francisco      San Francisco      1.0000
103     San Rafael         San Rafael      1.0000
104    Victorville           Victoria      0.9023
105     Washington    Washington Park      0.9333
106     Washington  Washington Harbor      0.9176

[107 rows x 3 columns]
