# Packages

In [1]:
from folders import *
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns

# Matplotlib 3.6 deprecated the bundled "seaborn" style name in favour of
# "seaborn-v0_8", and later releases removed the old name. Use the new name
# when this Matplotlib provides it; otherwise fall back to the legacy one.
plt.style.use("seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn")

# Directory

In [2]:
# Project root used as the working directory for the rest of the notebook.
# Set the COMMUTING_ZONES_DIR environment variable to run on another machine;
# when it is unset, the original hard-coded location is used.
path = os.environ.get(
    "COMMUTING_ZONES_DIR",
    r"C:\Users\jigon\OneDrive\Documentos\Economía\Commuting-Zones-Costa-Rica",
)
os.chdir(path)

# Read the data

In [3]:
# Input locations (`*_file`, `costa_rican_municipalities_distance_matrix`) are
# not defined in this notebook; presumably they come from `from folders import *`.

# ENAHO 2020 and 2021 files, read with the SPSS reader.
enaho2020 = pd.read_spss(path=enaho2020_file)
enaho2021 = pd.read_spss(path=enaho2021_file)

# Municipality-to-municipality distance matrix.
distances = pd.read_excel(io=costa_rican_municipalities_distance_matrix)

# Both concordance sheets come from the same workbook: request them in a single
# call (a list of sheet names returns a dict keyed by sheet name) and unpack.
concordance_sheets = pd.read_excel(
    io=region_concordance_file,
    sheet_name=["municipalidades", "provincias"],
)
municipalities_correspondence = concordance_sheets["municipalidades"]
provinces_correspondence = concordance_sheets["provincias"]

# Aggregating distances
Since we only know the administrative region of origin $i$ and the destination municipality or province $j$, we aggregate these distances following Head and Mayer (2002): 
$$dist_{ij}=\left(\sum_{r\,\in\,i}\sum_{s\,\in\,j}\left(\dfrac{pop_{r}}{pop_{i}}\right)\left(\dfrac{pop_{s}}{pop_{j}}\right)d_{rs}^{\theta}\right)^{1/\theta}$$

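We assume $\theta=-1$, so the aggregate is a weighted harmonic mean of the municipality-to-municipality distances $d_{rs}$. In the code below, employment plays the role of the weight $pop$.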

In [4]:
# Replace zero entries with 1: the aggregation further down divides by these
# columns, so a zero would produce an infinite weight.
for measure in ("travel_time", "distance"):
    distances[measure] = distances[measure].mask(distances[measure].eq(0), 1)

# Strip leading/trailing whitespace from every place-name column; these columns
# are used as merge and group-by keys in the cells that follow.
for column in (
    'municipality_origin', 'municipality_destination',
    'region_origin', 'region_destination',
    'province_origin', 'province_destination',
):
    distances[column] = distances[column].str.strip()

## Municipalities

In [5]:
# Independent (deep) copy of the cleaned matrix for the municipality-level
# aggregation, so `distances` itself is left untouched.
mun_distances = distances.copy(deep=True)

In [6]:
# One row per origin municipality, with `employment_origin` rescaled to that
# municipality's share of total employment in its origin region (the
# pop_r / pop_i weight of the aggregation formula).
share_pop_origin_mun = (
    distances[['municipality_origin', 'region_origin', 'province_origin', 'employment_origin']]
    .drop_duplicates()
    .assign(
        employment_origin=lambda origins: origins['employment_origin']
        / origins.groupby(['region_origin'])['employment_origin'].transform("sum")
    )
)

In [7]:
# Collapse the municipality-to-municipality matrix to
# (origin region, destination municipality) pairs with the theta = -1
# aggregation: the reciprocal of the share-weighted sum of reciprocals.
# The destination is a single municipality, so its weight is 1 and only the
# origin employment share enters the weight.
#
# The table is derived from `distances` with no in-place mutation, so this cell
# can be re-run safely. Despite their names, the final `share_dist` and
# `share_time` columns hold the aggregated distance and travel time.
mun_distances = (
    distances
    # Drop the raw employment levels; the merge below brings in the shares.
    .drop(columns=['employment_origin', 'employment_destination'])
    .merge(share_pop_origin_mun, how="left", on=['municipality_origin', 'region_origin', 'province_origin'])
    .assign(
        share_dist=lambda links: links['employment_origin'] / links['distance'],
        share_time=lambda links: links['employment_origin'] / links['travel_time'],
    )
    # Select the two numeric columns *before* summing so the text columns are
    # never aggregated (summing them is wasteful and can fail in pandas >= 2).
    .groupby(['region_origin', 'municipality_destination'], as_index=False)[["share_dist", "share_time"]]
    .sum()
    # Invert the weighted sum of reciprocals (the outer 1/theta power).
    .assign(
        share_dist=lambda agg: 1 / agg['share_dist'],
        share_time=lambda agg: 1 / agg['share_time'],
    )
    .rename(columns={"region_origin": "REGION", 'municipality_destination': "municipality"})
)

## Provinces

In [8]:
# Independent (deep) copy of the cleaned matrix for the province-level
# aggregation, so `distances` itself is left untouched.
provinces_distances = distances.copy(deep=True)

In [9]:
# Origin weights: one row per origin municipality, with `employment_origin`
# rescaled to its share of total employment in its origin region.
share_pop_origin_provinces = (
    distances[['municipality_origin', 'region_origin', 'province_origin', 'employment_origin']]
    .drop_duplicates()
    .assign(
        employment_origin=lambda origins: origins['employment_origin']
        / origins.groupby(['region_origin'])['employment_origin'].transform("sum")
    )
)

# Destination weights: one row per destination municipality, with
# `employment_destination` rescaled to its share of total employment in its
# destination province.
share_pop_destination_provinces = (
    distances[['municipality_destination', 'region_destination', 'province_destination', 'employment_destination']]
    .drop_duplicates()
    .assign(
        employment_destination=lambda destinations: destinations['employment_destination']
        / destinations.groupby(['province_destination'])['employment_destination'].transform("sum")
    )
)

In [10]:
# Collapse the municipality-to-municipality matrix to
# (origin region, destination province) pairs with the theta = -1 aggregation:
# the reciprocal of the share-weighted sum of reciprocals. Each link is weighted
# by the origin municipality's share of its region's employment times the
# destination municipality's share of its province's employment.
#
# The table is derived from `distances` with no in-place mutation, so this cell
# can be re-run safely. Despite their names, the final `share_dist` and
# `share_time` columns hold the aggregated distance and travel time.
provinces_distances = (
    distances
    # Drop the raw employment levels; the merges below bring in the shares.
    .drop(columns=['employment_origin', 'employment_destination'])
    .merge(share_pop_origin_provinces, how="left", on=['municipality_origin', 'region_origin', 'province_origin'])
    .merge(share_pop_destination_provinces, how="left", on=['municipality_destination', 'region_destination', 'province_destination'])
    .assign(
        share_dist=lambda links: links['employment_origin'] * links['employment_destination'] / links['distance'],
        share_time=lambda links: links['employment_origin'] * links['employment_destination'] / links['travel_time'],
    )
    # Select the two numeric columns *before* summing so the text columns are
    # never aggregated (summing them is wasteful and can fail in pandas >= 2).
    .groupby(['region_origin', 'province_destination'], as_index=False)[["share_dist", "share_time"]]
    .sum()
    # Invert the weighted sum of reciprocals (the outer 1/theta power).
    .assign(
        share_dist=lambda agg: 1 / agg['share_dist'],
        share_time=lambda agg: 1 / agg['share_time'],
    )
    .rename(columns={"region_origin": "REGION", 'province_destination': "province"})
)

# Clean before saving

In [11]:
# Left-join each aggregate table to its concordance sheet, then discard the
# destination-name column that served only as the join key.
mun_distances = pd.merge(mun_distances, municipalities_correspondence, how="left", on="municipality")
mun_distances = mun_distances.drop(columns="municipality")

provinces_distances = pd.merge(provinces_distances, provinces_correspondence, how="left", on="province")
provinces_distances = provinces_distances.drop(columns="province")

# Stack the municipality-level and province-level rows into one table with a
# fresh 0..n-1 index.
agg_distances = pd.concat([mun_distances, provinces_distances], ignore_index=True)

# Save aggregate distances

In [12]:
# Write the stacked table to the Excel file named by
# `costa_rican_aggregate_distance_matrix`, leaving out the row index.
agg_distances.to_excel(excel_writer=costa_rican_aggregate_distance_matrix, index=False)