# Data Cleaning

## Import Packages

In [1]:
import pandas as pd
import numpy as np
from itertools import chain

## Get Country Codes

In [2]:
# Only the arrivals sheet is loaded here; it is used below as the source of
# the country code / country name pairs. header=2 takes the column labels
# from the third row of the sheet (pandas counts header rows from 0).
country_df = pd.read_excel(
    io='../data/raw/unwto-all-data-download_2022.xlsx',
    sheet_name='Inbound Tourism-Arrivals',
    header=2,
)

In [19]:
# Narrow to the code and name columns, then keep only the rows where both
# are populated (how='any' drops a row as soon as either value is missing).
country_df = country_df[['C.', 'Basic data and indicators']].dropna(how='any')

In [20]:
# Give the two lookup columns descriptive names for the exported table.
countries = country_df.rename(
    {'C.': 'country_code', 'Basic data and indicators': 'country'},
    axis='columns',
)

## Function to Standardize Data Cleaning

In [21]:
def cleanse_tourism_df(
    sheet,
    path='../data/raw/unwto-all-data-download_2022.xlsx',
    years=range(1995, 2023),
    raw=None,
):
    """Reshape one sheet of the tourism workbook to one row per country/year.

    Args:
        sheet: Name of the worksheet. It is also used (lower-cased, with
            spaces and hyphens turned into underscores) as the prefix of
            the output column names.
        path: Workbook location, or anything else ``pd.read_excel`` accepts
            as its first argument. Ignored when ``raw`` is given.
        years: Labels of the year columns to keep from the sheet.
        raw: Optional already-loaded sheet. When supplied, nothing is read
            from disk, so a caller can parse the workbook once and reuse
            it. The frame is not modified.

    Returns:
        A DataFrame indexed by ('C.', 'year') with one column per measure,
        named '<sheet prefix>_<measure number formatted to two decimals>'.
        Cells that held the '..' placeholder are NaN.
    """
    if raw is None:
        # header=2: the column labels sit on the third row of the sheet
        raw = pd.read_excel(path, sheet_name=sheet, header=2)

    # Keep only rows with a non-zero measure number in 'S.'. The explicit
    # copy makes the column assignment below act on an independent frame
    # instead of a slice of `raw` (avoids SettingWithCopyWarning).
    df = raw[raw['S.'].notna() & (raw['S.'] != 0)].copy()

    # Measure numbers repeat across sheets, so qualify them with the sheet name.
    prefix = sheet.replace(' ', '_').replace('-', '_').lower()
    df['table_measure'] = prefix + '_' + df['S.'].map('{:,.2f}'.format)

    # Drop everything except the identifiers and the requested year columns.
    id_cols = ['C.', 'table_measure']
    df = df[id_cols + list(years)]

    # Years go onto rows ...
    df = pd.melt(df, id_vars=id_cols, var_name='year', value_name='value')

    # ... the '..' placeholder becomes a real missing value ...
    df['value'] = df['value'].replace('..', np.nan)

    # ... and the measures go onto columns. The (C., year) index is kept so
    # that frames from different sheets can be aligned on it.
    return df.pivot(index=['C.', 'year'], columns='table_measure', values='value')

## Creating Data Frames

In [23]:
# Every sheet goes through the same cleaning function; the targets on the
# left line up one-to-one, in order, with the sheet names on the right.

# inbound tourism tables
(
    inbound_accommodations,
    inbound_regions,
    inbound_arrivals,
    inbound_purpose,
    inbound_transport,
    inbound_expenditure,
) = map(cleanse_tourism_df, (
    'Inbound Tourism-Accommodation',
    'Inbound Tourism-Regions',
    'Inbound Tourism-Arrivals',
    'Inbound Tourism-Purpose',
    'Inbound Tourism-Transport',
    'Inbound Tourism-Expenditure',
))

# domestic tourism tables
domestic_trips, domestic_accommodations = map(cleanse_tourism_df, (
    'Domestic Tourism-Trips',
    'Domestic Tourism-Accommodation',
))

# outbound tourism tables
outbound_departures, outbound_expenditure = map(cleanse_tourism_df, (
    'Outbound Tourism-Departures',
    'Outbound Tourism-Expenditure',
))

# tourism industry table
tourism_industry = cleanse_tourism_df('Tourism Industries')

# tourism employment table
employment = cleanse_tourism_df('Employment')

In [34]:
# Place the per-sheet frames side by side (column-wise concat aligns them on
# their index), then turn the index levels back into ordinary columns and
# give the country-code column a descriptive name.
tourism_df = (
    pd.concat(
        [
            inbound_accommodations,
            inbound_regions,
            inbound_arrivals,
            inbound_purpose,
            inbound_transport,
            inbound_expenditure,
            domestic_trips,
            domestic_accommodations,
            outbound_departures,
            outbound_expenditure,
            tourism_industry,
            employment,
        ],
        axis='columns',
    )
    .reset_index()
    .rename(columns={'C.': 'country_code'})
)

In [35]:
# Maps the '<sheet>_<measure number>' column labels to descriptive names.
# The entries are grouped per sheet and expanded into flat
# '<sheet>_<measure>' keys, in the order listed.
column_name_map = {
    # identifier columns keep their names
    'country_code': 'country_code',
    'year': 'year',
    **{
        f'{table}_{measure}': label
        for table, measures in {
            'inbound_tourism_accommodation': {
                '1.29': 'total_guests',
                '1.30': 'total_overnights',
                '1.31': 'hotel_guests',
                '1.32': 'hotel_overnights',
            },
            'inbound_tourism_regions': {
                '1.10': 'total_middle_east',
                '1.11': 'total_south_asia',
                '1.12': 'total_other_region_not_classified',
                '1.13': 'total_nationals_residing_abroad',
                '1.50': 'total_regions',
                '1.60': 'total_africa',
                '1.70': 'total_americas',
                '1.80': 'total_east_asia_pacific',
                '1.90': 'total_europe',
            },
            'inbound_tourism_arrivals': {
                '1.10': 'total_region_arrivals',  # marked "remove" by the author
                '1.20': 'overnight_visitors',
                '1.30': 'same_day_visitors',
                '1.40': 'same_day_cruise_passengers',
            },
            'inbound_tourism_purpose': {
                '1.14': 'total_by_purpose',
                '1.15': 'visitors_personal',
                '1.18': 'visitors_business_professional',
            },
            'inbound_tourism_transport': {
                '1.19': 'total_arrivals_transport',  # marked "remove" by the author
                '1.20': 'arrivals_by_air',
                '1.21': 'arrivals_by_water',
                '1.22': 'arrivals_by_land',
            },
            'inbound_tourism_expenditure': {
                '1.33': 'total_inbound_tourism_expenditure',
                '1.34': 'travel_inbound_tourism_expenditure',
                '1.35': 'transport_inbound_tourism_expenditure',
            },
            'domestic_tourism_trips': {
                '2.10': 'total_domestic_trip_visitors',
                '2.20': 'domestic_trip_overnight_visitors',
                '2.30': 'domestic_trip_same_day_visitors',
            },
            'domestic_tourism_accommodation': {
                '2.19': 'total_guests_domestic',
                '2.20': 'total_overnights_domestic',
                '2.21': 'hotel_guests_domestic',
                '2.22': 'hotel_overnights_domestic',
            },
            'outbound_tourism_departures': {
                '3.10': 'total_departures',
                '3.20': 'departures_overnight_visitors',
                '3.30': 'departures_same_day_visitors',
            },
            'outbound_tourism_expenditure': {
                '3.40': 'outbound_total_expenditure',
                '3.50': 'outbound_travel_expenditure',
                '3.60': 'outbound_transport_expenditure',
            },
            'tourism_industries': {
                '4.13': 'establishments',
                '4.14': 'rooms',
                '4.15': 'bed_places',
                '4.16': 'occupancy_rate_per_room',
                '4.17': 'occupancy_rate_per_bed_place',
                '4.18': 'avg_stay_length',
                '4.19': 'available_capacity',
            },
            'employment': {
                '5.10': 'total_tourism_employees',
                '5.20': 'employees_visitor_accomodation_services',
                '5.30': 'employees_other_accomodation_services',
                '5.40': 'employees_food_and_beverage_service',
                '5.50': 'employees_passenger_transportation',
                '5.60': 'employees_travel_agencies',
                '5.70': 'employees_other_tourism_industries',
            },
        }.items()
        for measure, label in measures.items()
    },
}

In [36]:
# Apply the descriptive names; labels without an entry in the map are left as-is.
tourism_df = tourism_df.rename(column_name_map, axis='columns')

In [37]:
# preview the first five rows (head() defaults to n=5)
tourism_df.head()

table_measure,country_code,year,total_guests,total_overnights,hotel_guests,hotel_overnights,total_middle_east,total_south_asia,total_other_region_not_classified,total_nationals_residing_abroad,...,occupancy_rate_per_bed_place,avg_stay_length,available_capacity,total_tourism_employees,employees_visitor_accomodation_services,employees_other_accomodation_services,employees_food_and_beverage_service,employees_passenger_transportation,employees_travel_agencies,employees_other_tourism_industries
0,4.0,1995,,,,,,,,,...,,,,,,,,,,
1,4.0,1996,,,,,,,,,...,,,,,,,,,,
2,4.0,1997,,,,,,,,,...,,,,,,,,,,
3,4.0,1998,,,,,,,,,...,,,,,,,,,,
4,4.0,1999,,,,,,,,,...,,,,,,,,,,


### GDP Per Capita

In [30]:
# Load the GDP-per-capita sheet; its column labels are on the 13th row
# (header=12, counted from 0). 'Country' is lower-cased for the export.
gdp_per_capita_total = pd.read_excel(
    io='../data/raw/RealPerCapitaGDPValues.xlsx',
    sheet_name='GDP Per Cap',
    header=12,
).rename(columns={'Country': 'country'})

# Columns of interest: the country name plus every column whose label
# starts with 'y' (the per-year value columns).
gdp_per_capita_cols = [
    label
    for label in gdp_per_capita_total.columns
    if label == 'country' or label.startswith('y')
]

# Long format: one row per country and year column.
gdp_per_capita = gdp_per_capita_total[gdp_per_capita_cols].melt(
    id_vars=['country'],
    var_name='year',
    value_name='gdp_per_capita',
)

# Strip the 'y' from the year labels (the values stay strings).
gdp_per_capita['year'] = gdp_per_capita['year'].str.replace('y', '')

## Exporting Data to CSV

In [None]:
# Persist the three cleaned tables; index=False keeps the positional row
# index out of the files.
tourism_df.to_csv(path_or_buf='../data/tourism_metrics.csv', index=False)
countries.to_csv(path_or_buf='../data/country_codes.csv', index=False)
gdp_per_capita.to_csv(path_or_buf='../data/gdp_per_capita.csv', index=False)