# Create migration matrix
VGsim needs a matrix with entries containing probability of an individual moving between locations per unit time, this notebook
obtains that by calculating number of people flying between countries through reversing an effective distance matrix that is available in some supplementary materials https://www.science.org/doi/10.1126/science.1245200#supplementary-materials. 

### Load libraries and datasets

In [89]:
import pandas as pd
import numpy as np

# Alpha2 code of where to start the pandemic, look in the effective distance matrix for the available countries.
pandemic_start_alpha2 = 'CN'
# Uniform probability of staying in any country per unit time.
# Magnitude as UK citizens leaving per day (from ONS) in 2022 / UK population in 2022 = 0.0029 so 1 minus this for magnitude, time scale in this simulation is per 7 days
in_country_probability = 0.99

# Effective distance data

effective_distances = pd.read_csv('original_data/effective.distance.matrix.country.csv', header=0)
effective_distances.drop(effective_distances.columns[0], axis=1, inplace=True)

# Population sizes data

population_sizes = pd.read_csv('output/census_2013.csv', header=0)
alpha2_codes = effective_distances.columns

# Sampling data (obtained from sampling_multipliers.ipynb)

sampling_multipliers = pd.read_csv('output/sampling_multipliers.csv', header=0, index_col=0)

effective_distances.head()

Unnamed: 0,AE,AF,AG,AI,AL,AM,AO,AR,AS,AT,...,VI,VN,VU,WF,WS,YE,YT,ZA,ZM,ZW
0,0.0,5.614035,11.531617,15.902166,10.178471,7.76953,7.319591,9.130665,13.846666,6.344803,...,10.779283,7.031367,10.655119,14.960192,11.888486,6.435626,12.316025,5.262377,7.236649,8.955652
1,1.548074,0.0,13.07969,17.45024,10.173316,9.317603,8.867664,10.678739,15.394739,5.369434,...,12.327357,8.579441,12.203193,16.508266,13.43656,7.9837,13.864099,6.810451,8.784722,10.503726
2,7.181742,12.795778,0.0,5.366969,11.139288,13.218427,12.27586,8.597538,11.985323,8.737326,...,7.430663,11.554362,14.098395,16.611105,13.239721,13.617368,14.920257,8.765838,12.203931,12.459113
3,8.968363,14.582398,2.224032,0.0,13.363319,14.356298,14.254211,9.569931,12.957716,10.961358,...,2.213855,12.526755,15.070788,17.583498,14.212114,15.403988,14.410679,10.98987,14.427962,14.683145
4,6.413046,10.472095,11.76067,17.12764,0.0,10.349453,11.707611,8.216086,15.163964,3.566389,...,12.096582,11.296419,17.068165,18.930315,16.418362,11.336016,12.90528,10.016969,13.455061,13.435707


## Flux matrix
Invert effective distance function to get flux. Off diagonals matter only, since diagonal is zero in effective distance matrix.

In [90]:
flux_matrix = effective_distances.apply(lambda x: np.exp(1 - x))

## Clean flux matrix
Remove rows and columns of countries in flux matrix that aren't in population dataset.

In [91]:
missing_countries_columns = []
missing_countries_rows = []
for index, alpha2 in enumerate(alpha2_codes):
    country_mask = population_sizes['alpha2'] == alpha2
    country_data = population_sizes[country_mask]
    if len(country_data['population']) == 0:
        missing_countries_columns.append(alpha2)
        missing_countries_rows.append(index)
flux_matrix = flux_matrix.drop(missing_countries_columns, axis=1).drop(flux_matrix.index[missing_countries_rows])
flux_matrix.index = flux_matrix.columns

# Arbritralily say that 85% of people in each country stay in that country each unit of time. If migration rates are unrealistically high, this number is too low.
np.fill_diagonal(flux_matrix.values, 0)
ordered_columns = np.array(flux_matrix.columns)
target_start_index = flux_matrix.columns.get_loc(pandemic_start_alpha2)
ordered_columns[[0, target_start_index]] = ordered_columns[[target_start_index, 0]]
ordered_matrix = pd.DataFrame(0., index=ordered_columns, columns=ordered_columns)
for row in ordered_columns:
    for col in ordered_columns:
        ordered_matrix.at[row, col] = flux_matrix.loc[row, col]
ordered_matrix.head()

Unnamed: 0,CN,AF,AG,AL,AM,AO,AR,AS,AT,AU,...,VE,VG,VI,VN,VU,WS,YE,ZA,ZM,ZW
CN,0.0,7.8e-05,1.8e-05,3.1e-05,0.000228,0.000298,0.000124,4.180121e-06,0.001003,0.019114,...,0.00011,5.528111e-06,9e-05,0.014072,4.108398e-05,2.115453e-05,3.3e-05,0.00086,1.7e-05,2.2e-05
AF,0.00443,0.0,6e-06,0.000104,0.000244,0.000383,6.3e-05,5.603303e-07,0.012658,0.006342,...,1.5e-05,7.410236e-07,1.2e-05,0.000511,1.363061e-05,3.970733e-06,0.000927,0.002996,0.000416,7.5e-05
AG,0.001401,8e-06,0.0,3.9e-05,5e-06,1.3e-05,0.000502,1.694864e-05,0.000436,0.000953,...,0.000446,0.0008488147,0.001611,2.6e-05,2.048515e-06,4.834559e-06,3e-06,0.000424,1.4e-05,1.1e-05
AL,0.001309,7.7e-05,2.1e-05,0.0,8.7e-05,2.2e-05,0.000735,7.05779e-07,0.076812,4.9e-05,...,0.00034,9.33376e-07,1.5e-05,3.4e-05,1.051199e-07,2.013217e-07,3.2e-05,0.000121,4e-06,4e-06
AM,0.010669,0.000202,3e-06,9.6e-05,0.0,3.7e-05,2.4e-05,5.841516e-07,0.041402,0.000609,...,1.5e-05,7.725267e-07,1.3e-05,0.001946,1.308039e-06,3.81045e-07,8.9e-05,0.000288,4e-05,7e-06


In [92]:
normalising_matrix = (1-in_country_probability)*np.diag(np.reciprocal(ordered_matrix.sum(axis=0)))
normalising_matrix

array([[0.00231952, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.13365069, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.01123432, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.00162478, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.01299974,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.01935162]])

In [93]:
normalised_matrix = np.matmul(ordered_matrix.to_numpy(),normalising_matrix)
np.fill_diagonal(normalised_matrix, in_country_probability)
probability_matrix = pd.DataFrame(normalised_matrix, index=ordered_matrix.columns, columns=ordered_matrix.columns)
probability_matrix.head()

Unnamed: 0,CN,AF,AG,AL,AM,AO,AR,AS,AT,AU,...,VE,VG,VI,VN,VU,WS,YE,ZA,ZM,ZW
CN,0.99,1e-05,2.062721e-07,3e-06,1.612077e-05,5.737221e-06,7.604802e-07,2.206945e-07,6e-06,3.082932e-05,...,1.496026e-06,4.671041e-07,3.969598e-06,0.0001710825,1.539751e-06,3.243152e-07,8.368131e-07,1.396734e-06,2.243683e-07,4.179451e-07
AF,1e-05,0.99,6.373878e-08,1.1e-05,1.726699e-05,7.375112e-06,3.847512e-07,2.958331e-08,7.3e-05,1.022837e-05,...,2.005369e-07,6.261364e-08,5.321105e-07,6.210766e-06,5.108496e-07,6.08744e-08,2.361492e-05,4.86798e-06,5.408447e-06,1.443118e-06
AG,3e-06,1e-06,0.99,4e-06,3.492291e-07,2.441227e-07,3.083429e-06,8.948239e-07,3e-06,1.537201e-06,...,6.065759e-06,7.172158e-05,7.122121e-05,3.170687e-07,7.677453e-08,7.411751e-08,8.443363e-08,6.888659e-07,1.770636e-07,2.04215e-07
AL,3e-06,1e-05,2.383674e-07,0.99,6.153047e-06,4.309189e-07,4.515404e-06,3.726245e-08,0.000444,7.888169e-08,...,4.618992e-06,7.886668e-08,6.70234e-07,4.103709e-07,3.939696e-09,3.086417e-09,8.265975e-07,1.971404e-07,5.067226e-08,7.690567e-08
AM,2.5e-05,2.7e-05,3.277042e-08,1e-05,0.99,7.077407e-07,1.498807e-07,3.084098e-08,0.000239,9.815492e-07,...,2.090623e-07,6.527553e-08,5.547321e-07,2.3654e-05,4.902285e-08,5.841713e-09,2.266167e-06,4.671478e-07,5.190129e-07,1.384865e-07


In [94]:
filtered_pop = population_sizes[population_sizes['alpha2'].isin(alpha2_codes)]
filtered_pop.set_index('alpha2', inplace=True)
filtered_pop = filtered_pop.reindex(ordered_columns)
filtered_pop.head()

Unnamed: 0_level_0,population,fullname
alpha2,Unnamed: 1_level_1,Unnamed: 2_level_1
CN,1371860000,China
AF,32716210,Afghanistan
AG,89236,Antigua and Barbuda
AL,2889104,Albania
AM,2889930,Armenia


## Write matrix in .mg format


In [95]:
base_file = 'output/manypop'

In [96]:
with open(base_file + '.mg', 'w') as f:
    f.write('#Migration_format_version 0.0.1\n')

probability_matrix.to_csv(base_file + '.mg', header=False, index=None, mode="a", sep=' ', float_format='%.15f')

## Write population data in .pp format 

In [97]:
sampling_multipliers_complete = np.full(shape=len(filtered_pop['population']),fill_value=0.00,dtype=np.double)
outer_index = 0
for index, row in filtered_pop.iterrows():
    if index in sampling_multipliers.index:
        sampling_multipliers_complete[outer_index] = sampling_multipliers.loc[index,'proportion'].astype(float)
    outer_index+=1

In [98]:

population_data = pd.DataFrame({'size': filtered_pop['population'],
                                'contactDensity': np.full(shape=len(filtered_pop['population']), fill_value=1.00,
                                                          dtype=np.double),
                                'conDenAfterLD': np.full(shape=len(filtered_pop['population']), fill_value=1.00,
                                                         dtype=np.double),
                                'startLD': np.full(shape=len(filtered_pop['population']), fill_value=1.,
                                                   dtype=np.double),
                                'endLD': np.full(shape=len(filtered_pop['population']), fill_value=0.002,
                                                 dtype=np.double),
                                'samplingMultiplier': np.full(shape=len(filtered_pop['population']), fill_value=1.,
                                                 dtype=np.double),
                                'fullname':filtered_pop['fullname']})
# VGsim seems to require 3 columns to be comma-seperated, while the rest are space seperated, so must merge some columns by commas: 
population_data = population_data.reindex(ordered_columns)
population_data['id'] = range(len(filtered_pop['population']))
# Extract ID to Full country name to a separate table for relabelling countries after simulations
population_translation_data = population_data[['id','fullname']]
population_translation_data = population_translation_data.rename(columns={'id':'location'})
population_data['conDenAfterLD startLD endLD'] = population_data['conDenAfterLD'].astype(str) + "," + population_data[
    'startLD'].astype(str) + "," + population_data['endLD'].astype(str)
population_data = population_data[['id', 'size', 'contactDensity', 'conDenAfterLD startLD endLD', 'samplingMultiplier']]
population_data

Unnamed: 0_level_0,id,size,contactDensity,conDenAfterLD startLD endLD,samplingMultiplier
alpha2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CN,0,1371860000,1.0,"1.0,1.0,0.002",1.0
AF,1,32716210,1.0,"1.0,1.0,0.002",1.0
AG,2,89236,1.0,"1.0,1.0,0.002",1.0
AL,3,2889104,1.0,"1.0,1.0,0.002",1.0
AM,4,2889930,1.0,"1.0,1.0,0.002",1.0
...,...,...,...,...,...
WS,204,201757,1.0,"1.0,1.0,0.002",1.0
YE,205,27753304,1.0,"1.0,1.0,0.002",1.0
ZA,206,54729551,1.0,"1.0,1.0,0.002",1.0
ZM,207,15737793,1.0,"1.0,1.0,0.002",1.0


In [99]:
with open(base_file + '.pp', 'w') as f:
    f.write('#Population_format_version 0.0.1\n')

population_data.to_csv(base_file + '.pp', header=True, index=None, mode="a", sep=' ')
population_translation_data.to_csv(r'./output/manypop_country_ids.csv', index=None, sep=',', mode='w')

# Output
You can access the resulting three data files relative in:
 1. `/migration_data/output/manypop.mg`
2. `/migration_data/output/manypop.pp`
3. `/migration_data/output/manypop_country_codes.csv` 