# Sampling multiplier 
We want a realistic sampling bias in our simulations. This sheet computes a sampling multiplier for each country in our simulation as the ratio of sequenced cases to reported cases in that country, normalised so that the multipliers sum to 1 across countries. COVID is used because it is the most widely available dataset for this purpose.

- COVID sequencing data is obtained from https://www.biorxiv.org/content/10.1101/2024.07.12.603240v1
- COVID case data is obtained from https://ourworldindata.org/covid-cases


In [82]:
import pandas as pd
import pycountry
# Per-sequence metadata (tab-separated); later cells use its 'country' column.
unclean_seq_m = pd.read_csv('original_data/Viridian_2M_noShortDel_MAPLE_metaData.tsv', sep='\t')
# Case data; later cells use its 'iso_code' and 'new_cases' columns.
cases_by_country = pd.read_csv('original_data/owid-covid-data.csv')

In [83]:
# Keep only rows with a usable country: drop those where it is missing
# or recorded as the literal placeholder 'UNKNOWN'.
seq_m = unclean_seq_m[
    unclean_seq_m['country'].notna() & (unclean_seq_m['country'] != 'UNKNOWN')
]
seq_m.head()

Unnamed: 0,strain,collapsedTo,support,rootSupport,supportGroup,supportTo,mutationsInf,Ns,country,date,lineage
1,ERR4806745,ERR4806745_MinorSeqsClade,,,,,,,United Kingdom,2020-10-08,A
2,ERR5989690,ERR4806745_MinorSeqsClade,,,,,,,United Kingdom,2020,A
3,ERR6061709,ERR4806745_MinorSeqsClade,,,,,,,United Kingdom,2020,A
4,ERR5628278,ERR4806745_MinorSeqsClade,,,,,,,United Kingdom,2020,A
5,ERR4806964,ERR4806745_MinorSeqsClade,,,,,,,United Kingdom,2020-10-13,A


In [84]:
# Number of sequence rows per country; the index holds the values of the 'country' column.
seq_count = seq_m.groupby('country').size()
# Names that pycountry cannot resolve on its own, mapped by hand to their
# ISO 3166-1 alpha-3 code. Extend this table rather than adding branches to convert().
COUNTRY_CODE_OVERRIDES = {'Russia': 'RUS'}


def convert(country_name):
    """Translate a country name into its ISO 3166-1 alpha-3 code.

    Resolution order:
      1. the hand-maintained ``COUNTRY_CODE_OVERRIDES`` table;
      2. an exact match on pycountry's ``name`` field;
      3. ``pycountry.countries.lookup``, which matches case-insensitively
         against the other record fields as well (e.g. common and official
         names), so spellings that differ from the ISO short name still resolve.

    Parameters
    ----------
    country_name : str
        Country name as it appears in the sequencing metadata.

    Returns
    -------
    str
        The alpha-3 code, or ``country_name`` unchanged when no step matched.
    """
    override = COUNTRY_CODE_OVERRIDES.get(country_name)
    if override is not None:
        return override

    country = pycountry.countries.get(name=country_name)
    if country is None:
        try:
            country = pycountry.countries.lookup(country_name)
        except LookupError:
            # Nothing matched: hand the name back unchanged so the unresolved
            # value stays visible in the result instead of being dropped.
            return country_name
    return country.alpha_3
# Re-label the index from country names to the codes returned by convert().
seq_count.index = seq_count.index.map(convert)
# Different spellings of one country can resolve to the same code, which would
# leave duplicate labels and make seq_count[code] return a Series instead of a
# single count. Merge such entries by summing them; sort=False keeps the
# existing row order, so the result is unchanged when there are no duplicates.
seq_count = seq_count.groupby(level=0, sort=False).sum()
seq_count.head()

country
AGO      304
AUS    10605
AUT       21
BWA       35
BRA      500
dtype: int64

In [96]:
# Total reported cases per 'iso_code': the sum of the 'new_cases' column over
# all rows sharing that code.
total_cases_by_country = cases_by_country.groupby('iso_code')[['new_cases']].sum()

# Hong Kong is missing from the case data, so its total is filled in by hand from
# https://www.worldometers.info/coronavirus/country/china-hong-kong-sar/
total_cases_by_country.loc['HKG', 'new_cases'] = 2937609.0

In [104]:
# Sampling multiplier per country: (sequences) / (reported cases), normalised
# so that the multipliers sum to 1 across all countries.
case_totals = total_cases_by_country['new_cases']

# Collect one record per country and build the frame once; growing a frame row
# by row with .loc leaves the numeric column with object dtype.
records = []
for code, n_sequenced in seq_count.items():
    iso_record = pycountry.countries.get(alpha_3=code)
    if iso_record is None:
        raise LookupError(
            f'{code!r} is not an ISO 3166-1 alpha-3 code; extend convert() to map it'
        )
    n_cases = case_totals.loc[code]  # KeyError if the case data has no row for this code
    if not n_cases > 0:
        # A zero (or NaN) total would produce inf/NaN and silently wreck the
        # normalisation below, so stop here and name the offending country.
        raise ValueError(f'No reported cases for {code!r}; cannot compute a sampling ratio')
    records.append({'alpha2': iso_record.alpha_2, 'proportion': n_sequenced / n_cases})

sampling_proportions = pd.DataFrame(records, columns=['alpha2', 'proportion'])
sampling_proportions['proportion'] = (
    sampling_proportions['proportion'] / sampling_proportions['proportion'].sum()
)
sampling_proportions

Unnamed: 0,alpha2,proportion
0,AO,0.024503
1,AU,0.007745
2,AT,3e-05
3,BW,0.000917
4,BR,0.000115
5,CM,0.015977
6,CA,0.006688
7,CL,0.000456
8,CN,4e-06
9,HR,9.9e-05


In [106]:
# Write the normalised multipliers (columns: alpha2, proportion) to disk, omitting the row index.
sampling_proportions.to_csv('output/sampling_multipliers.csv', index=False)