# 3. GDP and countries

This dataset includes the GDP per country. (Source: https://data.worldbank.org/indicator/NY.GDP.MKTP.CD)

Relevant variables include:

1. **Country Name**: country name
2. **Country Code**: three-letter code of each country
3. **years** (columns `1960` to `2019`): one column per year, holding the GDP value for that year

A new data frame is created from the raw data. The relevant variables listed above are analyzed and cleaned when necessary. The columns are renamed to avoid conflicts with other data frames. The final data frame is then saved to a CSV file.

### Data loading and cleaning

In [1]:
import pandas as pd
import numpy as np

#### 1. GDP (total per country)

In [2]:
# Import raw data: the World Bank total-GDP table (current US$).
# As the preview below shows, it has one row per country and one column per year.

gdp = pd.read_csv("../data/raw/6_gdp_countries.csv")

In [3]:
gdp.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,Aruba,ABW,GDP (current US$),NY.GDP.MKTP.CD,,,,,,,...,2390503000.0,2549721000.0,2534637000.0,2581564000.0,2649721000.0,2691620000.0,2646927000.0,2700559000.0,,
1,Afghanistan,AFG,GDP (current US$),NY.GDP.MKTP.CD,537777811.1,548888895.6,546666677.8,751111191.1,800000044.4,1006667000.0,...,15856570000.0,17804280000.0,20001620000.0,20561050000.0,20484870000.0,19907110000.0,19362640000.0,20191760000.0,19362970000.0,
2,Angola,AGO,GDP (current US$),NY.GDP.MKTP.CD,,,,,,,...,83799500000.0,112000000000.0,128000000000.0,137000000000.0,146000000000.0,116000000000.0,101000000000.0,122000000000.0,106000000000.0,
3,Albania,ALB,GDP (current US$),NY.GDP.MKTP.CD,,,,,,,...,11926960000.0,12890870000.0,12319780000.0,12776280000.0,13228240000.0,11386930000.0,11861350000.0,13025060000.0,15102500000.0,
4,Andorra,AND,GDP (current US$),NY.GDP.MKTP.CD,,,,,,,...,3355695000.0,3442063000.0,3164615000.0,3281585000.0,3350736000.0,2811489000.0,2877312000.0,3013387000.0,3236544000.0,


In [4]:
# Discard the indicator metadata columns, which are not needed here, and
# rename the country name / country code columns so they match the naming
# used by the EEE and POM data.

gdp = (
    gdp
    .drop(columns=["Indicator Name", "Indicator Code"])
    .rename(columns={"Country Name": "country_name", "Country Code": "country"})
)

In [5]:
# Reshape from wide to long: the two country columns are kept as identifiers,
# and every remaining (year) column becomes one row per country, using melt's
# default "variable" / "value" output columns.

gdp_transform = pd.melt(gdp, id_vars=["country_name", "country"])

In [6]:
# Give the generic melt output columns meaningful names:
# "variable" holds the year, "value" holds the GDP figure.

gdp_transform = gdp_transform.rename({"variable": "year", "value": "gdp"}, axis="columns")

In [7]:
gdp_transform.dtypes

country_name     object
country          object
year             object
gdp             float64
dtype: object

In [8]:
gdp_transform.head()

Unnamed: 0,country_name,country,year,gdp
0,Aruba,ABW,1960,
1,Afghanistan,AFG,1960,537777811.1
2,Angola,AGO,1960,
3,Albania,ALB,1960,
4,Andorra,AND,1960,


In [9]:
# check for missing values

gdp_transform.isna().sum()

country_name       0
country            0
year               0
gdp             3748
dtype: int64

In [10]:
# Save the long-format total-GDP table to CSV, without the row index.
# NOTE(review): this writes under "../Data/..." while the raw files are read from
# "../data/..."; the two spellings differ in case and are different directories
# on a case-sensitive filesystem. Confirm which one the repository uses.
gdp_transform.to_csv("../Data/clean_data/4_gdp_total_countries.csv", index=False)

#### 2. GDP (per capita)

In [11]:
# Import raw data: the World Bank GDP-per-capita table (current US$).
# As the preview below shows, it has the same layout as the total-GDP table:
# one row per country and one column per year.

gdp_pc = pd.read_csv("../data/raw/7_gdp_percapita.csv")

In [12]:
gdp_pc.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,Aruba,ABW,GDP per capita (current US$),NY.GDP.PCAP.CD,,,,,,,...,23512.6026,24985.99328,24713.69805,25025.09956,25533.56978,25796.38025,25239.60041,25630.26649,,
1,Afghanistan,AFG,GDP per capita (current US$),NY.GDP.PCAP.CD,59.773194,59.860874,58.458015,78.706388,82.095231,101.108305,...,543.303042,591.162347,641.872034,637.165044,613.856333,578.466353,547.22811,556.302138,520.896603,
2,Angola,AGO,GDP per capita (current US$),NY.GDP.PCAP.CD,,,,,,,...,3587.883798,4615.468028,5100.095808,5254.882338,5408.410496,4166.979684,3506.072885,4095.812942,3432.385736,
3,Albania,ALB,GDP per capita (current US$),NY.GDP.PCAP.CD,,,,,,,...,4094.362119,4437.178067,4247.614279,4413.081743,4578.66672,3952.829458,4124.108907,4532.890162,5268.848504,
4,Andorra,AND,GDP per capita (current US$),NY.GDP.PCAP.CD,,,,,,,...,39736.35406,41100.72994,38392.9439,40626.75163,42300.33413,36039.6535,37224.10892,39134.39337,42029.76274,


In [13]:
# Discard the indicator metadata columns, which are not needed here, and
# rename the country name / country code columns so they match the naming
# used by the EEE and POM data.

gdp_pc = (
    gdp_pc
    .drop(columns=["Indicator Name", "Indicator Code"])
    .rename(columns={"Country Name": "country_name", "Country Code": "country"})
)

In [14]:
# Reshape from wide to long: the two country columns are kept as identifiers,
# and every remaining (year) column becomes one row per country, holding the
# per-capita GDP value in melt's default "variable" / "value" output columns.

gdp_pc_transform = pd.melt(gdp_pc, id_vars=["country_name", "country"])

In [15]:
# Give the generic melt output columns meaningful names:
# "variable" holds the year, "value" holds the per-capita GDP figure.

gdp_pc_transform = gdp_pc_transform.rename({"variable": "year", "value": "gdp_pc"}, axis="columns")

In [16]:
gdp_pc_transform.head()

Unnamed: 0,country_name,country,year,gdp_pc
0,Aruba,ABW,1960,
1,Afghanistan,AFG,1960,59.773194
2,Angola,AGO,1960,
3,Albania,ALB,1960,
4,Andorra,AND,1960,


In [17]:
# Save the long-format per-capita GDP table to CSV, without the row index.
# NOTE(review): this writes under "../Data/..." while the raw files are read from
# "../data/..."; the two spellings differ in case and are different directories
# on a case-sensitive filesystem. Confirm which one the repository uses.
gdp_pc_transform.to_csv("../Data/clean_data/5_gdp_per_capita.csv", index=False)

#### 3. Countries aggregates

In [18]:
# Import raw data: the country metadata table. As the output below shows, it
# holds the country code, region, income group and display name of each entry.
countries = pd.read_csv("../data/raw/8_countries_income_groups.csv")

In [19]:
countries

Unnamed: 0,Country Code,Region,IncomeGroup,TableName
0,ABW,Latin America & Caribbean,High income,Aruba
1,AFG,South Asia,Low income,Afghanistan
2,AGO,Sub-Saharan Africa,Lower middle income,Angola
3,ALB,Europe & Central Asia,Upper middle income,Albania
4,AND,Europe & Central Asia,High income,Andorra
...,...,...,...,...
258,XKX,Europe & Central Asia,Upper middle income,Kosovo
259,YEM,Middle East & North Africa,Low income,"Yemen, Rep."
260,ZAF,Sub-Saharan Africa,Upper middle income,South Africa
261,ZMB,Sub-Saharan Africa,Lower middle income,Zambia


In [20]:
# Drop the "Region" column that comes with the raw data (a custom "region"
# column is built in the next section) and rename the remaining columns to
# the snake_case names used for the GDP tables.
countries = (
    countries
    .drop(columns=["Region"])
    .rename(columns={
        "Country Code": "country",
        "IncomeGroup": "income_group",
        "TableName": "country_name",
    })
)

#### Create column "region" with "Europe" for the 28 EU countries, and "Other" for the rest of the countries

In [21]:
# Three-letter country codes of the 28 EU countries used for the "region"
# classification (the list includes GBR).

eu_countries = [
    "AUT", "BEL", "BGR", "CYP", "CZE", "DEU", "DNK",
    "ESP", "EST", "FIN", "FRA", "GBR", "GRC", "HRV",
    "HUN", "IRL", "ITA", "LTU", "LUX", "LVA", "MLT",
    "NLD", "POL", "PRT", "ROU", "SVK", "SVN", "SWE",
]

In [22]:
# Create a function that checks a country code against the list of EU countries

def match_country(country, eu_codes=None):
    """Label a country code as "Europe" or "Other".

    Parameters
    ----------
    country : str
        Country code to classify.
    eu_codes : collection of str, optional
        Codes that count as EU countries. When omitted, the notebook-level
        ``eu_countries`` list is used.

    Returns
    -------
    str
        "Europe" if ``country`` is exactly one of the EU codes, otherwise "Other".
    """
    if eu_codes is None:
        eu_codes = eu_countries
    # Exact membership, not a substring test: a longer string that merely
    # contains an EU code must not be labelled "Europe", and a missing (NaN)
    # code falls through to "Other" instead of raising a TypeError.
    return "Europe" if country in eu_codes else "Other"

In [23]:
# Create the "region" column by mapping every country code through
# match_country. Only the "country" column is needed, so map over that
# Series instead of applying a lambda to each full row.

countries["region"] = countries["country"].map(match_country)

In [24]:
countries.dtypes

country         object
income_group    object
country_name    object
region          object
dtype: object

In [25]:
countries.head()

Unnamed: 0,country,income_group,country_name,region
0,ABW,High income,Aruba,Other
1,AFG,Low income,Afghanistan,Other
2,AGO,Lower middle income,Angola,Other
3,ALB,Upper middle income,Albania,Other
4,AND,High income,Andorra,Other


In [27]:
# Sanity check: select the rows whose region is exactly "Europe" and count the
# distinct values per column. The "country" count should be 28, one per code
# in eu_countries.

countries.loc[countries["region"] == "Europe"].nunique()

country         28
income_group     2
country_name    28
region           1
dtype: int64

In [28]:
# Save the cleaned country table (code, income group, name, region) to CSV,
# without the row index.
# NOTE(review): this writes under "../Data/..." while the raw files are read from
# "../data/..."; the two spellings differ in case and are different directories
# on a case-sensitive filesystem. Confirm which one the repository uses.
countries.to_csv("../Data/clean_data/6_countries.csv", index=False)