# Population data
This notebook collates processed census data, along with migration matrices and outputs them in a format that will allow for input into the simulation, as well as make it easy to process results after simulation.

## Migration matrix
VGsim needs a matrix with entries containing probability of an individual moving between locations per unit time, this notebook obtains that by calculating number of people flying between countries through reversing an effective distance matrix that is available in some supplementary materials https://www.science.org/doi/10.1126/science.1245200#supplementary-materials. 

### Load libraries and datasets

In [28]:
import pandas as pd
import numpy as np

# Alpha2 code of where to start the pandemic, look in the effective distance matrix for the available countries.
pandemic_start_alpha2 = 'CN'
# Uniform probability of staying in any country per unit time.
# Magnitude as UK citizens leaving per day (from ONS) in 2022 / UK population in 2022 = 0.0029 so 1 minus this for magnitude, time scale in this simulation is per 7 days
in_country_probability = 0.997

# Effective distance data

effective_distances = pd.read_csv('data_sources/effective.distance.matrix.country.csv', header=0)
effective_distances.drop(effective_distances.columns[0], axis=1, inplace=True)

# Population sizes data

population_sizes = pd.read_csv('data_sources/census_2013.csv', header=0)
alpha2_codes = effective_distances.columns

# Sampling data (obtained from 02_sampling_multipliers.ipynb)

sampling_multipliers = pd.read_csv('data_sources/sampling_multipliers.csv', header=0, index_col=0)

effective_distances.head()

Unnamed: 0,AE,AF,AG,AI,AL,AM,AO,AR,AS,AT,...,VI,VN,VU,WF,WS,YE,YT,ZA,ZM,ZW
0,0.0,5.614035,11.531617,15.902166,10.178471,7.76953,7.319591,9.130665,13.846666,6.344803,...,10.779283,7.031367,10.655119,14.960192,11.888486,6.435626,12.316025,5.262377,7.236649,8.955652
1,1.548074,0.0,13.07969,17.45024,10.173316,9.317603,8.867664,10.678739,15.394739,5.369434,...,12.327357,8.579441,12.203193,16.508266,13.43656,7.9837,13.864099,6.810451,8.784722,10.503726
2,7.181742,12.795778,0.0,5.366969,11.139288,13.218427,12.27586,8.597538,11.985323,8.737326,...,7.430663,11.554362,14.098395,16.611105,13.239721,13.617368,14.920257,8.765838,12.203931,12.459113
3,8.968363,14.582398,2.224032,0.0,13.363319,14.356298,14.254211,9.569931,12.957716,10.961358,...,2.213855,12.526755,15.070788,17.583498,14.212114,15.403988,14.410679,10.98987,14.427962,14.683145
4,6.413046,10.472095,11.76067,17.12764,0.0,10.349453,11.707611,8.216086,15.163964,3.566389,...,12.096582,11.296419,17.068165,18.930315,16.418362,11.336016,12.90528,10.016969,13.455061,13.435707


### Flux matrix
Invert effective distance function to get flux. Off diagonals matter only, since diagonal is zero in effective distance matrix.

In [29]:
flux_matrix = effective_distances.apply(lambda x: np.exp(1 - x))

### Clean flux matrix
Remove rows and columns of countries in flux matrix that aren't in population dataset.

In [30]:
missing_countries_columns = []
missing_countries_rows = []
for index, alpha2 in enumerate(alpha2_codes):
    country_mask = population_sizes['alpha2'] == alpha2
    country_data = population_sizes[country_mask]
    if len(country_data['population']) == 0:
        missing_countries_columns.append(alpha2)
        missing_countries_rows.append(index)
flux_matrix = flux_matrix.drop(missing_countries_columns, axis=1).drop(flux_matrix.index[missing_countries_rows])
flux_matrix.index = flux_matrix.columns

# Arbritralily say that 85% of people in each country stay in that country each unit of time. If migration rates are unrealistically high, this number is too low.
np.fill_diagonal(flux_matrix.values, 0)
ordered_columns = np.array(flux_matrix.columns)
target_start_index = flux_matrix.columns.get_loc(pandemic_start_alpha2)
ordered_columns[[0, target_start_index]] = ordered_columns[[target_start_index, 0]]
ordered_matrix = pd.DataFrame(0., index=ordered_columns, columns=ordered_columns)
for row in ordered_columns:
    for col in ordered_columns:
        ordered_matrix.at[row, col] = flux_matrix.loc[row, col]
ordered_matrix.head()

Unnamed: 0,CN,AF,AG,AL,AM,AO,AR,AS,AT,AU,...,VE,VG,VI,VN,VU,WS,YE,ZA,ZM,ZW
CN,0.0,7.8e-05,1.8e-05,3.1e-05,0.000228,0.000298,0.000124,4.180121e-06,0.001003,0.019114,...,0.00011,5.528111e-06,9e-05,0.014072,4.108398e-05,2.115453e-05,3.3e-05,0.00086,1.7e-05,2.2e-05
AF,0.00443,0.0,6e-06,0.000104,0.000244,0.000383,6.3e-05,5.603303e-07,0.012658,0.006342,...,1.5e-05,7.410236e-07,1.2e-05,0.000511,1.363061e-05,3.970733e-06,0.000927,0.002996,0.000416,7.5e-05
AG,0.001401,8e-06,0.0,3.9e-05,5e-06,1.3e-05,0.000502,1.694864e-05,0.000436,0.000953,...,0.000446,0.0008488147,0.001611,2.6e-05,2.048515e-06,4.834559e-06,3e-06,0.000424,1.4e-05,1.1e-05
AL,0.001309,7.7e-05,2.1e-05,0.0,8.7e-05,2.2e-05,0.000735,7.05779e-07,0.076812,4.9e-05,...,0.00034,9.33376e-07,1.5e-05,3.4e-05,1.051199e-07,2.013217e-07,3.2e-05,0.000121,4e-06,4e-06
AM,0.010669,0.000202,3e-06,9.6e-05,0.0,3.7e-05,2.4e-05,5.841516e-07,0.041402,0.000609,...,1.5e-05,7.725267e-07,1.3e-05,0.001946,1.308039e-06,3.81045e-07,8.9e-05,0.000288,4e-05,7e-06


In [31]:
normalising_matrix = (1-in_country_probability)*np.diag(np.reciprocal(ordered_matrix.sum(axis=0)))
normalising_matrix

array([[0.00069585, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.04009521, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.00337029, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.00048743, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.00389992,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.00580549]])

In [32]:
normalised_matrix = np.matmul(ordered_matrix.to_numpy(),normalising_matrix)
np.fill_diagonal(normalised_matrix, in_country_probability)
probability_matrix = pd.DataFrame(normalised_matrix, index=ordered_matrix.columns, columns=ordered_matrix.columns)
probability_matrix

Unnamed: 0,CN,AF,AG,AL,AM,AO,AR,AS,AT,AU,...,VE,VG,VI,VN,VU,WS,YE,ZA,ZM,ZW
CN,9.970000e-01,3.113009e-06,6.188163e-08,9.779330e-07,4.836230e-06,1.721166e-06,2.281441e-07,6.620835e-08,1.739494e-06,9.248795e-06,...,4.488077e-07,1.401312e-07,1.190879e-06,5.132475e-05,4.619252e-07,9.729457e-08,2.510439e-07,4.190202e-07,6.731049e-08,1.253835e-07
AF,3.082944e-06,9.970000e-01,1.912163e-08,3.267921e-06,5.180098e-06,2.212534e-06,1.154253e-07,8.874993e-09,2.196173e-05,3.068512e-06,...,6.016108e-08,1.878409e-08,1.596332e-07,1.863230e-06,1.532549e-07,1.826232e-08,7.084476e-06,1.460394e-06,1.622534e-06,4.329354e-07
AG,9.749293e-07,3.021700e-07,9.970000e-01,1.243814e-06,1.047687e-07,7.323680e-08,9.250288e-07,2.684472e-07,7.568497e-07,4.611603e-07,...,1.819728e-06,2.151647e-05,2.136636e-05,9.512060e-08,2.303236e-08,2.223525e-08,2.533009e-08,2.066598e-07,5.311908e-08,6.126451e-08
AL,9.106796e-07,3.086129e-06,7.151022e-08,9.970000e-01,1.845914e-06,1.292757e-07,1.354621e-06,1.117873e-08,1.332659e-04,2.366451e-08,...,1.385698e-06,2.366000e-08,2.010702e-07,1.231113e-07,1.181909e-09,9.259252e-10,2.479793e-07,5.914213e-08,1.520168e-08,2.307170e-08
AM,7.424159e-06,8.110132e-06,9.831127e-09,3.032814e-06,9.970000e-01,2.123222e-07,4.496422e-08,9.252295e-09,7.183087e-05,2.944648e-07,...,6.271870e-08,1.958266e-08,1.664196e-07,7.096201e-06,1.470686e-08,1.752514e-09,6.798502e-07,1.401443e-07,1.557039e-07,4.154594e-08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WS,3.620930e-06,6.970358e-07,5.222203e-08,3.435982e-08,4.272388e-08,3.169148e-08,3.670566e-07,2.896260e-03,4.288799e-08,7.658382e-05,...,3.787497e-07,1.182570e-07,1.004985e-06,2.316545e-06,9.869896e-06,9.970000e-01,5.843061e-08,3.049588e-07,3.988419e-08,9.040536e-08
YE,1.134951e-06,3.272472e-05,7.404206e-09,1.137796e-06,2.005818e-06,1.283428e-06,4.469456e-08,3.436541e-09,8.236621e-07,1.188178e-06,...,2.329534e-08,7.273505e-09,6.181254e-08,7.214728e-07,5.934277e-08,7.071467e-09,9.970000e-01,5.654882e-07,8.214628e-07,1.530191e-06
ZA,4.372082e-06,1.555875e-05,1.374604e-07,6.631581e-07,9.536526e-07,1.376932e-04,9.417097e-06,3.707656e-08,1.664343e-06,1.431471e-05,...,2.612825e-07,7.847325e-08,6.668905e-07,6.064917e-07,7.149391e-07,8.519433e-08,1.304247e-06,9.970000e-01,1.732889e-04,3.927934e-04
ZM,9.340782e-07,1.598421e-05,3.251594e-08,2.645994e-07,9.797310e-07,3.431254e-05,1.138543e-06,4.482621e-09,3.330567e-07,1.730673e-06,...,3.158951e-08,1.173983e-08,8.062822e-08,3.523995e-07,8.643738e-08,1.030014e-08,2.703974e-06,1.601931e-04,9.970000e-01,1.299010e-03


In [33]:
filtered_pop = population_sizes[population_sizes['alpha2'].isin(alpha2_codes)]
old_pop = filtered_pop.copy(deep=True)
filtered_pop.set_index('alpha2', inplace=True)
filtered_pop = filtered_pop.reindex(ordered_columns)

### Write matrix in .mg format


In [34]:
base_file = '../parameters/manypop'

In [35]:
with open(base_file + '.mg', 'w') as f:
    f.write('#Migration_format_version 0.0.1\n')

probability_matrix.to_csv(base_file + '.mg', header=False, index=None, mode="a", sep=' ', float_format='%.15f')

### Write population data in .pp format 

In [36]:
sampling_multipliers_complete = np.full(shape=len(filtered_pop['population']),fill_value=0.000000000000001,dtype=np.double)
outer_index = 0
for index, row in filtered_pop.iterrows():
    if index in sampling_multipliers.index:
        # Multiply by 10 to cancel out default sampling rate of 0.1, then multiply by 10 since this is a rate, and the current unit of time is 10 days
        sampling_multipliers_complete[outer_index] = sampling_multipliers.loc[index,'proportion'].astype(float)*10*10
    outer_index+=1
sampling_multipliers_complete

array([1.79034503e-05, 1.00000000e-15, 1.00000000e-15, 1.00000000e-15,
       1.00000000e-15, 3.90918953e-03, 1.00000000e-15, 1.00000000e-15,
       1.00000000e-15, 9.65873847e-04, 1.00000000e-15, 1.00000000e-15,
       1.00000000e-15, 1.00000000e-15, 1.00000000e-15, 1.00000000e-15,
       1.00000000e-15, 1.00000000e-15, 1.00000000e-15, 1.00000000e-15,
       1.00000000e-15, 1.00000000e-15, 1.00000000e-15, 1.00000000e-15,
       1.00000000e-15, 1.00000000e-15, 1.00000000e-15, 1.00000000e-15,
       1.00000000e-15, 1.00000000e-15, 3.76229406e-03, 1.00000000e-15,
       1.00000000e-15, 1.00000000e-15, 1.00000000e-15, 1.00000000e-15,
       1.00000000e-15, 1.00000000e-15, 1.00000000e-15, 1.00000000e-15,
       1.00000000e-15, 1.00000000e-15, 1.00000000e-15, 1.00000000e-15,
       1.00000000e-15, 1.00000000e-15, 8.55412651e-06, 1.00000000e-15,
       1.00000000e-15, 1.00000000e-15, 1.00000000e-15, 1.00000000e-15,
       1.00000000e-15, 2.36643736e-03, 1.00000000e-15, 1.00000000e-15,
      

In [37]:

population_data = pd.DataFrame({'size': filtered_pop['population'],
                                'contactDensity': np.full(shape=len(filtered_pop['population']), fill_value=1.00,
                                                          dtype=np.double),
                                'conDenAfterLD': np.full(shape=len(filtered_pop['population']), fill_value=0.2,
                                                         dtype=np.double),
                                'startLD': np.full(shape=len(filtered_pop['population']), fill_value=1.,
                                                   dtype=np.double),
                                'endLD': np.full(shape=len(filtered_pop['population']), fill_value=0.01,
                                                 dtype=np.double),
                                'samplingMultiplier': sampling_multipliers_complete,
                                'fullname':filtered_pop['fullname']})
# VGsim seems to require 3 columns to be comma-seperated, while the rest are space seperated, so must merge some columns by commas: 
population_data = population_data.reindex(ordered_columns)
population_data['id'] = range(0,len(filtered_pop['population']))
# Extract ID to Full country name to a separate table for relabelling countries after simulations
population_translation_data = population_data[['id','fullname']]
population_translation_data = population_translation_data.rename(columns={'id':'location'})
population_data['conDenAfterLD startLD endLD'] = population_data['conDenAfterLD'].astype(str) + "," + population_data[
    'startLD'].astype(str) + "," + population_data['endLD'].astype(str)
population_data = population_data[['id', 'size', 'contactDensity', 'conDenAfterLD startLD endLD', 'samplingMultiplier']]

In [38]:
translation_table = population_translation_data.merge(old_pop, left_on='fullname', right_on='fullname', how='outer')
translation_table

Unnamed: 0,location,fullname,alpha2,population
0,1,Afghanistan,AF,32716210
1,3,Albania,AL,2889104
2,51,Algeria,DZ,38760168
3,7,American Samoa,AS,52217
4,5,Angola,AO,27128337
...,...,...,...,...
204,202,Viet Nam,VN,91235504
205,201,Virgin Islands (U.S.),VI,107882
206,205,"Yemen, Rep.",YE,27753304
207,207,Zambia,ZM,15737793


In [39]:
with open(base_file + '.pp', 'w') as f:
    f.write('#Population_format_version 0.0.1\n')

population_data.to_csv(base_file + '.pp', header=True, index=None, mode="a", sep=' ')
translation_table.to_csv(f'{base_file}_country_ids.csv', index=None, sep=',', mode='w')

In [40]:
population_data.loc['GB']

id                                       64
size                               64602298
contactDensity                          1.0
conDenAfterLD startLD endLD    0.2,1.0,0.01
samplingMultiplier                 0.091332
Name: GB, dtype: object