In [106]:
# numerical processing
import numpy as np
# storing and analysing data
import pandas as pd

In [107]:
# 'Country Name' values that clean_data() filters out of the input tables.
# Judging by the labels these are regional / income-group aggregates rather
# than individual countries.
drop_countries = [
    'World',
    'High income',
    'OECD members',
    'Post-demographic dividend',
    'North America',
    'IDA & IBRD total',
    'Low & middle income',
    'Middle income',
    'IBRD only',
    'East Asia & Pacific',
    'Europe & Central Asia',
    'Upper middle income',
    'Late-demographic dividend',
    'European Union',
    'Euro area',
    'Early-demographic dividend',
    'East Asia & Pacific (excluding high income)',
    'East Asia & Pacific (IDA & IBRD countries)',
    'Lower middle income',
    'Latin America & Caribbean',
    'Latin America & the Caribbean (IDA & IBRD countries)',
    'Latin America & Caribbean (excluding high income)',
    'Middle East & North Africa',
    'Arab World',
    'Europe & Central Asia (IDA & IBRD countries)',
    'Europe & Central Asia (excluding high income)',
    'South Asia',
    'South Asia (IDA & IBRD)',
    'Sub-Saharan Africa (excluding high income)',
    'Middle East & North Africa (IDA & IBRD countries)',
    'Central Europe and the Baltics',
    'Small states',
    'IDA blend',
    'IDA only',
    'Middle East & North Africa (excluding high income)',
    'IDA total',
    'Sub-Saharan Africa',
    'Sub-Saharan Africa (IDA & IBRD countries)',
    'Least developed countries: UN classification',
    'Pre-demographic dividend',
    'Heavily indebted poor countries (HIPC)',
    'Fragile and conflict affected situations',
    'Other small states',
    'Low income',
]

In [2]:
# year labels 1970..2017 as strings; clean_data() uses them as the
# value columns when melting the wide tables
years = list(map(str, range(1970, 2018)))
# sanity check: first five labels, last five labels, total number of labels
print(years[:5], years[-5:], len(years), sep='\n')

['1970', '1971', '1972', '1973', '1974']
['2013', '2014', '2015', '2016', '2017']
48


In [109]:
# clean data
def clean_data(file_name, save_as, countries_to_drop=None, year_columns=None):
    '''Reshape a wide per-country CSV into a long CSV and save it.

    The input is read with pandas, rows whose 'Country Name' is listed in
    ``countries_to_drop`` are removed, the year columns are melted into
    ('Country Name', 'Year', 'Count') rows, rows containing missing values
    are dropped, 'Count' is cast to a 64-bit integer and the result is
    written to ``save_as`` without the index.

    Parameters
    ----------
    file_name : str or path-like
        CSV to read. It must contain a 'Country Name' column and every
        column named in ``year_columns``.
    save_as : str or path-like
        Destination of the long-format CSV.
    countries_to_drop : list-like of str, optional
        'Country Name' values to exclude. Defaults to the notebook-level
        ``drop_countries`` list.
    year_columns : list-like of str, optional
        Columns to melt into the 'Year' / 'Count' pair. Defaults to the
        notebook-level ``years`` list.

    Returns
    -------
    None
        The cleaned table is only written to disk.
    '''
    # Resolve the notebook globals lazily so existing two-argument calls keep
    # working, while callers (and tests) can pass the inputs explicitly.
    if countries_to_drop is None:
        countries_to_drop = drop_countries
    if year_columns is None:
        year_columns = years

    # read csv file
    wide = pd.read_csv(file_name)
    # drop the unwanted rows
    kept = wide[~wide['Country Name'].isin(countries_to_drop)]
    # convert to long format, then discard missing values and renumber rows
    long_form = (
        kept.melt(id_vars=['Country Name'], value_vars=list(year_columns),
                  var_name='Year', value_name='Count')
            .dropna()
            .reset_index(drop=True)
    )
    # Use an explicit 64-bit dtype: plain 'int' maps to the platform default
    # integer, which can be 32-bit (e.g. Windows with NumPy < 2) and would
    # silently overflow for counts above ~2.1 billion.
    long_form['Count'] = long_form['Count'].astype('int64')
    # save as a csv file
    long_form.to_csv(save_as, index=False)

In [110]:
# (input csv, output csv) pairs to run through clean_data, in order
conversion_jobs = [
    ('passengers_count_clean.csv', 'passengers_count_long.csv'),
    ('departures_count_clean.csv', 'departures_count_long.csv'),
]
for wide_csv, long_csv in conversion_jobs:
    clean_data(wide_csv, long_csv)

187
187
