# Create migration matrix
VGsim needs a matrix with entries containing probability of an individual moving between locations per unit time, this notebook
obtains that by calculating number of people flying between countries through reversing an effective distance matrix that is available in some supplementary materials https://www.science.org/doi/10.1126/science.1245200#supplementary-materials. 

### Load libraries and datasets

In [260]:
import pandas as pd
import numpy as np

# Alpha2 code of where to start the pandemic, look in the effective distance matrix for the available countries.
pandemic_start_alpha2 = 'CN'
# Uniform probability of staying in any country per unit time.
# Magnitude as UK citizens leaving per day (from ONS) in 2022 / UK population in 2022 = 0.0029 so 1 minus this for magnitude
in_country_probability = 0.999

# Effective distance data

effective_distances = pd.read_csv('original_data/effective.distance.matrix.country.csv', header=0)
effective_distances.drop(effective_distances.columns[0], axis=1, inplace=True)

# Population sizes data

population_sizes = pd.read_csv('output/census_2013.csv', header=0)
alpha2_codes = effective_distances.columns

# Sampling data (obtained from sampling_multipliers.ipynb)

sampling_multipliers = pd.read_csv('output/sampling_multipliers.csv', header=0, index_col=0)

effective_distances.head()

Unnamed: 0,AE,AF,AG,AI,AL,AM,AO,AR,AS,AT,...,VI,VN,VU,WF,WS,YE,YT,ZA,ZM,ZW
0,0.0,5.614035,11.531617,15.902166,10.178471,7.76953,7.319591,9.130665,13.846666,6.344803,...,10.779283,7.031367,10.655119,14.960192,11.888486,6.435626,12.316025,5.262377,7.236649,8.955652
1,1.548074,0.0,13.07969,17.45024,10.173316,9.317603,8.867664,10.678739,15.394739,5.369434,...,12.327357,8.579441,12.203193,16.508266,13.43656,7.9837,13.864099,6.810451,8.784722,10.503726
2,7.181742,12.795778,0.0,5.366969,11.139288,13.218427,12.27586,8.597538,11.985323,8.737326,...,7.430663,11.554362,14.098395,16.611105,13.239721,13.617368,14.920257,8.765838,12.203931,12.459113
3,8.968363,14.582398,2.224032,0.0,13.363319,14.356298,14.254211,9.569931,12.957716,10.961358,...,2.213855,12.526755,15.070788,17.583498,14.212114,15.403988,14.410679,10.98987,14.427962,14.683145
4,6.413046,10.472095,11.76067,17.12764,0.0,10.349453,11.707611,8.216086,15.163964,3.566389,...,12.096582,11.296419,17.068165,18.930315,16.418362,11.336016,12.90528,10.016969,13.455061,13.435707


## Flux matrix
Invert effective distance function to get flux. Off diagonals matter only, since diagonal is zero in effective distance matrix.

In [261]:
flux_matrix = effective_distances.apply(lambda x: np.exp(1 - x))

## Clean flux matrix
Remove rows and columns of countries in flux matrix that aren't in population dataset.

In [262]:
missing_countries_columns = []
missing_countries_rows = []
for index, alpha2 in enumerate(alpha2_codes):
    country_mask = population_sizes['alpha2'] == alpha2
    country_data = population_sizes[country_mask]
    if len(country_data['population']) == 0:
        missing_countries_columns.append(alpha2)
        missing_countries_rows.append(index)
flux_matrix = flux_matrix.drop(missing_countries_columns, axis=1).drop(flux_matrix.index[missing_countries_rows])
flux_matrix.index = flux_matrix.columns

# Arbritralily say that 85% of people in each country stay in that country each unit of time. If migration rates are unrealistically high, this number is too low.
np.fill_diagonal(flux_matrix.values, 0)

In [263]:
normalising_matrix = (1-in_country_probability)*np.diag(np.reciprocal(flux_matrix.sum(axis=0)))
normalising_matrix

array([[0.00010989, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.01336507, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.00112343, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.00016248, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.00129997,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.00193516]])

In [264]:
normalised_matrix = np.matmul(flux_matrix.to_numpy(),normalising_matrix)
np.fill_diagonal(normalised_matrix, in_country_probability)
probability_matrix = pd.DataFrame(normalised_matrix, index=flux_matrix.columns, columns=flux_matrix.columns)
probability_matrix.head()

Unnamed: 0,AE,AF,AG,AL,AM,AO,AR,AS,AT,AU,...,VE,VG,VI,VN,VU,WS,YE,ZA,ZM,ZW
AE,0.999,0.0001324711,2.997255e-08,1.083707e-06,8.119638e-06,3.468076e-06,1.809255e-07,1.391127e-08,2.760247e-06,4.809794e-06,...,9.430056e-08,2.944346e-08,2.502199e-07,2.920553e-06,2.402221e-07,2.86256e-08,1.110469e-05,2.289121e-06,2.54327e-06,6.786123e-07
AF,6.352546e-05,0.999,6.373878e-09,1.089307e-06,1.726699e-06,7.375112e-07,3.847512e-08,2.958331e-09,7.320577e-06,1.022837e-06,...,2.005369e-08,6.261364e-09,5.321105e-08,6.210766e-07,5.108496e-08,6.08744e-09,2.361492e-06,4.86798e-07,5.408447e-07,1.443118e-07
AG,2.271312e-07,1.007233e-07,0.999,4.146048e-07,3.492291e-08,2.441227e-08,3.083429e-07,8.948239e-08,2.522832e-07,1.537201e-07,...,6.065759e-07,7.172158e-06,7.122121e-06,3.170687e-08,7.677453e-09,7.411751e-09,8.443363e-09,6.888659e-08,1.770636e-08,2.04215e-08
AL,4.899113e-07,1.02871e-06,2.383674e-08,0.999,6.153047e-07,4.309189e-08,4.515404e-07,3.726245e-09,4.442195e-05,7.888169e-09,...,4.618992e-07,7.886668e-09,6.70234e-08,4.103709e-08,3.939696e-10,3.086417e-10,8.265975e-08,1.971404e-08,5.067226e-09,7.690567e-09
AM,6.096118e-06,2.703377e-06,3.277042e-09,1.010938e-06,0.999,7.077407e-08,1.498807e-08,3.084098e-09,2.394362e-05,9.815492e-08,...,2.090623e-08,6.527553e-09,5.547321e-08,2.3654e-06,4.902285e-09,5.841713e-10,2.266167e-07,4.671478e-08,5.190129e-08,1.384865e-08


# Place starting place at (0,0)
By default, the first entry in the population data is where the pandemic starts (in VGsim), so we need to swap the first row and first column with the row and column of the country we want to start in. 


In [265]:
# Get column index (and by symmetry also row index) of starting country 
target_start_index = probability_matrix.columns.get_loc(pandemic_start_alpha2)
original_start_name = probability_matrix.columns[0]
# Swap the columns of the start country and the first country 
col_list = list(probability_matrix)
col_list[0], col_list[target_start_index] = col_list[target_start_index], col_list[0]
probability_matrix.columns = col_list

# Swap the rows 
ordered_proportions = probability_matrix.reindex(col_list)
ordered_proportions = ordered_proportions.transpose()
ordered_proportions.head()

Unnamed: 0,CN,AF,AG,AL,AM,AO,AR,AS,AT,AU,...,VE,VG,VI,VN,VU,WS,YE,ZA,ZM,ZW
CN,2.251074e-06,6.352546e-05,2.271312e-07,4.899113e-07,6.096118e-06,1.079414e-05,2.568488e-07,2.362503e-07,1.887108e-06,8.998446e-06,...,1.283912e-07,9.825151e-08,3.481949e-07,1.521097e-06,1.898966e-06,5.239387e-07,2.459809e-05,1.1695e-05,1.20148e-05,2.25041e-06
AF,1.03767e-06,0.999,1.007233e-07,1.02871e-06,2.703377e-06,4.786755e-06,1.139019e-07,1.047673e-07,1.031239e-05,3.990441e-06,...,5.693622e-08,4.35705e-08,1.544101e-07,6.745441e-07,8.421131e-07,2.323453e-07,1.090824e-05,5.186249e-06,5.328072e-06,9.979644e-07
AG,2.062721e-08,6.373878e-09,0.999,2.383674e-08,3.277042e-09,9.801886e-09,5.610882e-08,1.963798e-07,2.241458e-08,3.740947e-08,...,1.067234e-07,2.80172e-06,1.968416e-06,2.153321e-09,7.894618e-09,1.740734e-08,2.468069e-09,4.582014e-08,1.083865e-08,8.816943e-09
AL,3.259777e-07,1.089307e-06,4.146048e-07,0.999,1.010938e-06,2.939267e-07,1.391113e-06,1.292094e-07,6.637361e-05,3.264462e-08,...,1.374167e-06,5.373544e-08,1.904338e-07,4.675129e-08,6.889079e-09,1.145327e-08,3.792655e-07,2.210527e-07,8.819979e-08,4.906557e-08
AM,1.612077e-06,1.726699e-06,3.492291e-08,6.153047e-07,0.999,2.933978e-07,2.789226e-08,6.949516e-08,2.177476e-05,2.445887e-07,...,3.776743e-08,2.890156e-08,1.024247e-07,1.64935e-06,5.16162e-08,1.424129e-08,6.68606e-07,3.178842e-07,3.26577e-07,6.11689e-08


In [266]:
filtered_pop = population_sizes[population_sizes['alpha2'].isin(alpha2_codes)]
filtered_pop.set_index('alpha2', inplace=True)
filtered_pop = filtered_pop.reindex(col_list)
filtered_pop.head()

Unnamed: 0_level_0,population,fullname
alpha2,Unnamed: 1_level_1,Unnamed: 2_level_1
CN,1371860000,China
AF,32716210,Afghanistan
AG,89236,Antigua and Barbuda
AL,2889104,Albania
AM,2889930,Armenia


## Write matrix in .mg format


In [267]:
base_file = 'output/manypop'

In [268]:
with open(base_file + '.mg', 'w') as f:
    f.write('#Migration_format_version 0.0.1\n')

(ordered_proportions).to_csv(base_file + '.mg', header=False, index=None, mode="a", sep=' ', float_format='%.15f')

## Write population data in .pp format 

In [269]:
sampling_multipliers_complete = np.full(shape=len(filtered_pop['population']),fill_value=0.00,dtype=np.double)
outer_index = 0
for index, row in filtered_pop.iterrows():
    if index in sampling_multipliers.index:
        sampling_multipliers_complete[outer_index] = sampling_multipliers.loc[index,'proportion'].astype(float)
    outer_index+=1

In [270]:

population_data = pd.DataFrame({'size': filtered_pop['population'],
                                'contactDensity': np.full(shape=len(filtered_pop['population']), fill_value=1.00,
                                                          dtype=np.double),
                                'conDenAfterLD': np.full(shape=len(filtered_pop['population']), fill_value=0.1,
                                                         dtype=np.double),
                                'startLD': np.full(shape=len(filtered_pop['population']), fill_value=0.01,
                                                   dtype=np.double),
                                'endLD': np.full(shape=len(filtered_pop['population']), fill_value=0.002,
                                                 dtype=np.double),
                                'samplingMultiplier': sampling_multipliers_complete,
                                'fullname':filtered_pop['fullname']})
# VGsim seems to require 3 columns to be comma-seperated, while the rest are space seperated, so must merge some columns by commas: 
population_data = population_data.reindex(col_list)
population_data['id'] = range(len(filtered_pop['population']))
# Extract ID to Full country name to a separate table for relabelling countries after simulations
population_translation_data = population_data[['id','fullname']]
population_translation_data = population_translation_data.rename(columns={'id':'location'})
population_data['conDenAfterLD startLD endLD'] = population_data['conDenAfterLD'].astype(str) + "," + population_data[
    'startLD'].astype(str) + "," + population_data['endLD'].astype(str)
population_data = population_data[['id', 'size', 'contactDensity', 'conDenAfterLD startLD endLD', 'samplingMultiplier']]
population_data

Unnamed: 0_level_0,id,size,contactDensity,conDenAfterLD startLD endLD,samplingMultiplier
alpha2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CN,0,1371860000,1.0,"0.1,0.01,0.002",0.000004
AF,1,32716210,1.0,"0.1,0.01,0.002",0.000000
AG,2,89236,1.0,"0.1,0.01,0.002",0.000000
AL,3,2889104,1.0,"0.1,0.01,0.002",0.000000
AM,4,2889930,1.0,"0.1,0.01,0.002",0.000000
...,...,...,...,...,...
WS,204,201757,1.0,"0.1,0.01,0.002",0.000000
YE,205,27753304,1.0,"0.1,0.01,0.002",0.000000
ZA,206,54729551,1.0,"0.1,0.01,0.002",0.008850
ZM,207,15737793,1.0,"0.1,0.01,0.002",0.000000


In [271]:
with open(base_file + '.pp', 'w') as f:
    f.write('#Population_format_version 0.0.1\n')

population_data.to_csv(base_file + '.pp', header=True, index=None, mode="a", sep=' ')
population_translation_data.to_csv(r'./output/manypop_country_ids.csv', index=None, sep=',', mode='w')

# Output
You can access the resulting three data files relative in:
 1. `/migration_data/output/manypop.mg`
2. `/migration_data/output/manypop.pp`
3. `/migration_data/output/manypop_country_codes.csv` 