# Generate Passport Index datasets
* Data by Passport Index 2024: https://www.passportindex.org/
* In both tidy and matrix formats
* Using ISO-2, ISO-3, and full country names

In [1]:
import httpx # see https://www.python-httpx.org/
import pandas as pd
import json
import time

# Load tqdm!
from tqdm import tqdm
tqdm.pandas()

In [2]:
headers = {
    'Host': 'www.passportindex.org',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:126.0) Gecko/20100101 Firefox/126.0',
    'Accept': '*/*',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate, br, zstd',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'X-Requested-With': 'XMLHttpRequest',
    'Origin': 'https://www.passportindex.org',
    'Referer': 'https://www.passportindex.org/comparebyPassport.php'
}

mounts = {
    'https://': httpx.HTTPTransport(http2=True)
}


with httpx.Client() as client:

    csrf_token = (
        client.request(
            url='https://www.passportindex.org/comparebyPassport.php',
            method='get',
            headers=headers
        ).text
        .split('let csrf_token = "')[1]
        .split('";')[0]
        .strip()
    )

    print(f'CSRF token: {csrf_token}')


    country_data = (
        pd.read_csv(
            'https://gist.githubusercontent.com/ilyankou/b2580c632bdea4af2309dcaa69860013/raw/420fb417bcd17d833156efdf64ce8a1c3ceb2691/country-codes',
            dtype=str
        )
        .fillna('NA')
    )

    
    def get_data(country_code):

        time.sleep(6.1)
        
        res = client.request(
            url='https://www.passportindex.org/incl/compare2.php',
            method='post',
            headers=headers,
            data={
                'compare': '3',
                'cc': str.lower(country_code),
                'year': '2024',
                'csrf_token': csrf_token
            }
        ).json()
        return res

    
    #print(get_data('by'))
    country_data['travel'] = country_data.ISO2.progress_apply(get_data)

CSRF token: 5d04a1bd6a23c293a9ee9567e3e0966cf61d1edbc9a3d151e889ef6423106bd0


100%|█████████████████████████████████████████| 199/199 [22:18<00:00,  6.72s/it]


## Get data from PassportIndex

In [3]:
assert country_data['travel'].isna().sum() == 0, \
    f"Didn't fetch {country_data['travel'].isna().sum()} rows!"

## Clean up the data

In [4]:
obj = {}

for passport in country_data.ISO2.tolist():
    
    # Add passport to the object
    if passport not in obj:
        obj[passport] = {}
    
    # Add destinations for the given passport
    for dest in country_data.query(f'ISO2 == "{passport}"').iloc[0]['travel']:
        
        text = dest['text'].lower()
        res = ''
        
        # ** Visa required, incl Cuba's tourist card **
        if text == 'visa required' or text == 'tourist card':
            res = 'visa required'
        
        # ** Visa on arrival **
        elif 'visa on arrival' in text:
            res = 'visa on arrival'
            
        # ** Covid-19 ban ** 
        elif text == 'covid-19 ban':
            res = 'covid ban'
            
        # ** Visa-free, incl. Seychelles' tourist registration **
        elif 'visa-free' in text or 'tourist registration' in text or 'visa waiver' in text:
            res = dest['dur'] if dest['dur'] != '' else 'visa free'
            
        # ** eVisas, incl eVisitors (Australia), eTourist cards (Suriname),
        # eTA (US), and pre-enrollment (Ivory Coast), or EVW (UK) **
        elif 'evis' in text or 'etourist' in text or text == 'eta' or text == 'pre-enrollment' or text == 'evw':
            res = 'e-visa'
            
        # ** No admission, including Trump ban **
        elif text == 'trump ban' or text == 'not admitted':
            res = 'no admission'
        
        # Update the result!
        obj[passport][ dest['code'] ] = res if res != '' else dest['text']

In [5]:
for x, y in country_data.head(5).iterrows():
    print(x, y['ISO3'])

0 AFG
1 ALB
2 DZA
3 AND
4 AGO


In [6]:
#iso2to3
#matrix

## Save

In [7]:
# ISO-2: Matrix
matrix = pd.DataFrame(obj).T.fillna(-1)
matrix.to_csv('passport-index-matrix-iso2.csv', index_label='Passport')

# ISO-2: Tidy
matrix.stack().to_csv(
    'passport-index-tidy-iso2.csv',
    index_label=['Passport', 'Destination'],
    header=['Requirement'])

# ISO-3: Matrix
iso2to3 =  { y['ISO2']:y['ISO3'] for _,y in country_data.iterrows() }
matrix.rename(columns=iso2to3, index=iso2to3).to_csv('passport-index-matrix-iso3.csv', index_label='Passport')

# ISO-3: Tidy
matrix.rename(columns=iso2to3, index=iso2to3).stack().to_csv(
    'passport-index-tidy-iso3.csv',
    index_label=['Passport', 'Destination'],
    header=['Requirement'])


# Country names: Matrix
iso2name =  { y['ISO2']:y['Country'] for _,y in country_data.iterrows() }
matrix.rename(columns=iso2name, index=iso2name).to_csv('passport-index-matrix.csv', index_label='Passport')

# Country names: Tidy
matrix.rename(columns=iso2name, index=iso2name).stack().to_csv(
    'passport-index-tidy.csv',
    index_label=['Passport', 'Destination'],
    header=['Requirement'])

In [8]:
# Print all values
tidy = matrix.rename(columns=iso2to3, index=iso2to3).stack()
tidy.value_counts()

visa required      13784
90                  7650
e-visa              7067
visa on arrival     5703
visa free           1864
30                  1815
180                  734
-1                   199
120                  115
21                   106
14                   105
60                   102
15                   102
360                   97
42                    60
no admission          34
28                    23
240                   15
45                    15
10                     6
7                      4
31                     1
Name: count, dtype: int64

In [9]:
# Which countries have no admission? (Typically war zones etc)
tidy[ tidy == 'no admission' ]

ARM  AZE    no admission
BGD  IRQ    no admission
     LBY    no admission
HTI  SUR    no admission
IRN  LBY    no admission
ISR  DZA    no admission
     BGD    no admission
     BRN    no admission
     IRN    no admission
     LBN    no admission
     LBY    no admission
     MYS    no admission
     PAK    no admission
     SAU    no admission
     SYR    no admission
     YEM    no admission
XKX  ARM    no admission
     KHM    no admission
     CUB    no admission
     HKG    no admission
     SYC    no admission
PRK  JPN    no admission
     LKA    no admission
PAK  LBY    no admission
PSE  MDG    no admission
     SYR    no admission
PHL  KWT    no admission
SOM  AUS    no admission
     CAN    no admission
SDN  LBY    no admission
SYR  LBY    no admission
TWN  GEO    no admission
TJK  KGZ    no admission
YEM  LBY    no admission
dtype: object

In [10]:
assert tidy.loc[('BLR', 'RUS')] == '90', "Check data!"

### Difference with previous run
* Typically the difference between two consecutive months would be in low hundreds.
* If difference is higher, double-check the data!

In [11]:
(tidy
 .reset_index()
 .rename(
     columns={'level_0': 'Passport', 'level_1': 'Destination', 0: 'Requirement'}
 )
 .merge(
     pd.read_csv('legacy/2024-06-10/passport-index-tidy-iso3.csv'), # SELECT VERSION TO COMPARE!
     how='left',
     left_on=['Passport', 'Destination'],
     right_on=['Passport', 'Destination'],
     suffixes=('_previous', '_new')
 )
 .assign(
     is_different=lambda df_: df_.Requirement_previous.ne(df_.Requirement_new)
 )
 .query('is_different & (Passport != Destination)')
 .drop(columns=['is_different'])
)

Unnamed: 0,Passport,Destination,Requirement_previous,Requirement_new
84,AFG,JPN,visa required,e-visa
208,ALB,AZE,90,e-visa
250,ALB,SLV,180,90
283,ALB,JPN,visa required,e-visa
482,DZA,JPN,visa required,e-visa
...,...,...,...,...
38491,VAT,JPN,visa required,e-visa
38690,VEN,JPN,visa required,e-visa
39088,YEM,JPN,visa required,e-visa
39287,ZMB,JPN,visa required,e-visa
