In [1]:
import pandas as pd
from sqlalchemy import create_engine
import pymysql
import chardet

# Import CSV files/create dataframes

In [11]:
#------------------------Read CSV Arrival Data----------------------------#
#(This file contains number of arrivals by travelers, per year up to 2018)#

arrivals_file = "resources/data/travel/API_ST.INT.ARVL_DS2_en_csv_v2_10515875.csv"
with open(arrivals_file, 'rb') as f:
    result = chardet.detect(f.read())
arrivals_df = pd.read_csv(arrivals_file, encoding=result['encoding'])
arrivals_df.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018
0,Aruba,ABW,"International tourism, number of arrivals",ST.INT.ARVL,,,,,,,...,813000.0,824000.0,869000.0,904000.0,979000.0,1072000.0,1225000.0,1102000.0,1070500.0,
1,Afghanistan,AFG,"International tourism, number of arrivals",ST.INT.ARVL,,,,,,,...,,,,,,,,,,
2,Angola,AGO,"International tourism, number of arrivals",ST.INT.ARVL,,,,,,,...,366000.0,425000.0,481000.0,528000.0,650000.0,595000.0,592000.0,397000.0,261000.0,
3,Albania,ALB,"International tourism, number of arrivals",ST.INT.ARVL,,,,,,,...,1711000.0,2191000.0,2469000.0,3156000.0,2857000.0,3341000.0,3784000.0,4070000.0,4643000.0,
4,Andorra,AND,"International tourism, number of arrivals",ST.INT.ARVL,,,,,,,...,1830000.0,1808000.0,2242000.0,2238000.0,2328000.0,2363000.0,2663000.0,2831000.0,,


## Issues

<b>Error with csv read:</b> ParserError: Error tokenizing data. C error: Expected 3 fields in line 5, saw 64 <br>
<b>How error resolved:</b> This looked to be a parsing error. Added "error_bad_lines=False" to code to skip the offending lines.
<br>
<br>
<b>Error with csv read:</b> UnicodeDecodeError: 'utf-8' codec can't decode byte 0x92 in position 18: invalid start byte <br>
<b>How error resolved:</b> Imported chardet which is used for unkown encoding types. Then used a with loop and used chardet to read file.

In [16]:
#----------------------------------Read CSV metadata file-----------------------------------------#
#(This file contains country code column, and some demographic info such as region & income group)#

metadata_file =  "resources/data/travel/Metadata_Country_API_ST.INT.ARVL_DS2_en_csv_v2_10515875.csv"
metadata_df = pd.read_csv(metadata_file, encoding='utf-8')
metadata_df.head()

Unnamed: 0,Country Code,Region,IncomeGroup,SpecialNotes,TableName,Unnamed: 5
0,ABW,Latin America & Caribbean,High income,Mining is included in agriculture\nElectricty ...,Aruba,
1,AFG,South Asia,Low income,Fiscal year end: March 20; reporting period fo...,Afghanistan,
2,AGO,Sub-Saharan Africa,Lower middle income,,Angola,
3,ALB,Europe & Central Asia,Upper middle income,,Albania,
4,AND,Europe & Central Asia,High income,WB-3 code changed from ADO to AND to align wit...,Andorra,


# Clean Dataframes

In [23]:
#------------Arrivals dataframe------------#

# First list columns and keep only columns we are interested in
list(arrivals_df)
new_arrivals_df = arrivals_df[['Country Name','Country Code','2008','2009','2010','2011','2012','2013','2014',\
 '2015','2016','2017','2018']]
new_arrivals_df.head()

# Noticed on inspection that there is no data for 2018 so will drop column
print(new_arrivals_df['2018'])
del new_arrivals_df['2018']
new_arrivals_df.head()

0     NaN
1     NaN
2     NaN
3     NaN
4     NaN
5     NaN
6     NaN
7     NaN
8     NaN
9     NaN
10    NaN
11    NaN
12    NaN
13    NaN
14    NaN
15    NaN
16    NaN
17    NaN
18    NaN
19    NaN
20    NaN
21    NaN
22    NaN
23    NaN
24    NaN
25    NaN
26    NaN
27    NaN
28    NaN
29    NaN
       ..
234   NaN
235   NaN
236   NaN
237   NaN
238   NaN
239   NaN
240   NaN
241   NaN
242   NaN
243   NaN
244   NaN
245   NaN
246   NaN
247   NaN
248   NaN
249   NaN
250   NaN
251   NaN
252   NaN
253   NaN
254   NaN
255   NaN
256   NaN
257   NaN
258   NaN
259   NaN
260   NaN
261   NaN
262   NaN
263   NaN
Name: 2018, Length: 264, dtype: float64


Unnamed: 0,Country Name,Country Code,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017
0,Aruba,ABW,827000.0,813000.0,824000.0,869000.0,904000.0,979000.0,1072000.0,1225000.0,1102000.0,1070500.0
1,Afghanistan,AFG,,,,,,,,,,
2,Angola,AGO,294000.0,366000.0,425000.0,481000.0,528000.0,650000.0,595000.0,592000.0,397000.0,261000.0
3,Albania,ALB,1247000.0,1711000.0,2191000.0,2469000.0,3156000.0,2857000.0,3341000.0,3784000.0,4070000.0,4643000.0
4,Andorra,AND,2059000.0,1830000.0,1808000.0,2242000.0,2238000.0,2328000.0,2363000.0,2663000.0,2831000.0,


In [27]:
#------------Metadata dataframe------------#

# First list columns and keep only columns we are interested in
list(metadata_df)
new_metadata_df = metadata_df[['Country Code','Region','IncomeGroup']]
new_metadata_df.head()

Unnamed: 0,Country Code,Region,IncomeGroup
0,ABW,Latin America & Caribbean,High income
1,AFG,South Asia,Low income
2,AGO,Sub-Saharan Africa,Lower middle income
3,ALB,Europe & Central Asia,Upper middle income
4,AND,Europe & Central Asia,High income


# Create Database connection to SQL

In [41]:
# Connect to SQL & create a database

rds_connection_string = "root:Hionlife946!@127.0.0.1/"
engine = create_engine(f'mysql://{rds_connection_string}')
connection = engine.connect()
#connection.execute('create database country_travel_db')

In [46]:
# Connect to database

rds_connection_string = "root:Hionlife946!@127.0.0.1/country_travel_db"
engine = create_engine(f'mysql://{rds_connection_string}')

# Create tables

#connection.execute('ALTER DATABASE country_travel_db CHARACTER SET utf8;')
new_arrivals_df.to_sql(name='arrivals', con=engine, if_exists='fail', index=False)

UnicodeEncodeError: 'latin-1' codec can't encode character '\u2019' in position 18: ordinal not in range(256)

## Issues
<br>
<b>Error creating table: </b>UnicodeEncodeError: 'latin-1' codec can't encode character '\u2019' in position 18: ordinal not in range(256)