# Data Cleaning

## Import Packages

In [1]:
import pandas as pd
import numpy as np
from itertools import chain

## Get Country Codes

In [2]:
# Read the arrivals sheet; header=2 takes the column labels from the third row
# of the worksheet (zero-based row 2).
country_df = pd.read_excel(
    io='../data/unwto-all-data-download_2022.xlsx',
    sheet_name='Inbound Tourism-Arrivals',
    header=2,
)

In [3]:
# Keep only the rows where both the code ('C.') and the text in
# 'Basic data and indicators' are present, and drop every other column.
country_df = country_df.loc[
    country_df['Basic data and indicators'].notna() & country_df['C.'].notna(),
    ['C.', 'Basic data and indicators'],
]

In [4]:
# Give the two lookup columns descriptive names for the exported file.
countries = country_df.rename(
    columns={'C.': 'country_code', 'Basic data and indicators': 'country'}
)

## Function to Standardize Data Cleaning

In [5]:
def cleanse_tourism_df(
    sheet,
    path='../data/unwto-all-data-download_2022.xlsx',
    years=range(1995, 2023),
):
    """Load one sheet of the UNWTO workbook and reshape it to one row per (code, year).

    Parameters
    ----------
    sheet : str
        Name of the worksheet to read. The same name, lower-cased and with
        spaces and hyphens turned into underscores, prefixes every output column.
    path : str, optional
        Location of the workbook. Defaults to the 2022 download in ``../data``.
    years : iterable of int, optional
        Year columns to keep. Defaults to 1995 through 2022 inclusive.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``('C.', 'year')`` with one column per measure, named
        ``<sheet prefix>_<'S.' value formatted to two decimals>``.
        Cells that held ``'..'`` are NaN.

    Raises
    ------
    KeyError
        If the sheet lacks ``'C.'``, ``'S.'`` or one of the requested year columns.
    ValueError
        If a (code, year, measure) combination occurs more than once; raised by ``pivot``.
    """
    # header=2: the column labels sit on the third row of the worksheet
    raw = pd.read_excel(path, sheet_name=sheet, header=2)

    # remove irrelevant rows: those without a measure number, or with number 0
    has_measure = raw['S.'].notna() & (raw['S.'] != 0)

    # measure numbers repeat across sheets, so they are prefixed with the sheet name
    prefix = sheet.replace(' ', '_').replace('-', '_').lower()

    # .loc + assign builds a new frame instead of writing into a filtered slice,
    # which is what triggers pandas' SettingWithCopyWarning
    measures = raw.loc[has_measure].assign(
        table_measure=lambda rows: prefix + '_' + rows['S.'].map('{:,.2f}'.format)
    )

    # keep the identifiers plus the requested years, then put the years onto rows
    id_cols = ['C.', 'table_measure']
    tidy = measures[id_cols + list(years)].melt(
        id_vars=id_cols,
        var_name='year',
        value_name='value',
    )

    # '..' entries are treated as missing
    tidy['value'] = tidy['value'].replace('..', np.nan)

    # put the measures onto columns
    return tidy.pivot(index=['C.', 'year'], columns='table_measure', values='value')

## Creating Data Frames

In [6]:
# Each group below runs cleanse_tourism_df over its sheet names in order and
# unpacks the results into one variable per sheet.

# inbound tourism tables
(
    inbound_accommodations,
    inbound_regions,
    inbound_arrivals,
    inbound_purpose,
    inbound_transport,
    inbound_expenditure,
) = map(cleanse_tourism_df, [
    'Inbound Tourism-Accommodation',
    'Inbound Tourism-Regions',
    'Inbound Tourism-Arrivals',
    'Inbound Tourism-Purpose',
    'Inbound Tourism-Transport',
    'Inbound Tourism-Expenditure',
])

# domestic tourism tables
domestic_trips, domestic_accommodations = map(cleanse_tourism_df, [
    'Domestic Tourism-Trips',
    'Domestic Tourism-Accommodation',
])

# outbound tourism tables
outbound_departures, outbound_expenditure = map(cleanse_tourism_df, [
    'Outbound Tourism-Departures',
    'Outbound Tourism-Expenditure',
])

# tourism industry table
tourism_industry = cleanse_tourism_df('Tourism Industries')

# tourism employment table
employment = cleanse_tourism_df('Employment')

In [7]:
# Join the per-sheet frames side by side (axis=1), move the index levels
# back into ordinary columns, and give the code column its export name.
tourism_df = (
    pd.concat(
        [
            inbound_accommodations,
            inbound_regions,
            inbound_arrivals,
            inbound_purpose,
            inbound_transport,
            inbound_expenditure,
            domestic_trips,
            domestic_accommodations,
            outbound_departures,
            outbound_expenditure,
            tourism_industry,
            employment,
        ],
        axis=1,
    )
    .reset_index()
    .rename(columns={'C.': 'country_code'})
)

In [8]:
# Preview the first five rows (head's default) of the combined table.
tourism_df.head()

table_measure,country_code,year,inbound_tourism_accommodation_1.29,inbound_tourism_accommodation_1.30,inbound_tourism_accommodation_1.31,inbound_tourism_accommodation_1.32,inbound_tourism_regions_1.10,inbound_tourism_regions_1.11,inbound_tourism_regions_1.12,inbound_tourism_regions_1.13,...,tourism_industries_4.17,tourism_industries_4.18,tourism_industries_4.19,employment_5.10,employment_5.20,employment_5.30,employment_5.40,employment_5.50,employment_5.60,employment_5.70
0,4.0,1995,,,,,,,,,...,,,,,,,,,,
1,4.0,1996,,,,,,,,,...,,,,,,,,,,
2,4.0,1997,,,,,,,,,...,,,,,,,,,,
3,4.0,1998,,,,,,,,,...,,,,,,,,,,
4,4.0,1999,,,,,,,,,...,,,,,,,,,,


## Exporting Data to CSV

In [9]:
# Write both tables to ../data as CSV; index=False leaves the row index out of the files.
tourism_df.to_csv(path_or_buf='../data/tourism_metrics.csv', index=False)
countries.to_csv(path_or_buf='../data/country_codes.csv', index=False)