In [1]:
import pandas as pd
import numpy as np
import json 
import time 
import requests
import pycountry
import csv

In [2]:


# Eurostat dissemination API (SDMX 3.0) endpoint for the GDP dataflow
# nama_10r_3gdp, requested as uncompressed CSV.
url = (
    "https://ec.europa.eu/eurostat/api/dissemination/sdmx/3.0/data/dataflow/"
    "ESTAT/nama_10r_3gdp/1.0?compress=false&format=csvdata"
)

# Download the CSV and parse it directly into a DataFrame
df = pd.read_csv(url)

# Show the loaded frame as the cell output
df


Unnamed: 0,DATAFLOW,LAST UPDATE,freq,unit,geo,TIME_PERIOD,OBS_VALUE,OBS_FLAG
0,ESTAT:NAMA_10R_3GDP(1.0),19/02/24 23:00:00,A,EUR_HAB,AL,2008,3000.0,
1,ESTAT:NAMA_10R_3GDP(1.0),19/02/24 23:00:00,A,EUR_HAB,AL,2009,3000.0,
2,ESTAT:NAMA_10R_3GDP(1.0),19/02/24 23:00:00,A,EUR_HAB,AL,2010,3100.0,
3,ESTAT:NAMA_10R_3GDP(1.0),19/02/24 23:00:00,A,EUR_HAB,AL,2011,3200.0,
4,ESTAT:NAMA_10R_3GDP(1.0),19/02/24 23:00:00,A,EUR_HAB,AL,2012,3300.0,
...,...,...,...,...,...,...,...,...
268489,ESTAT:NAMA_10R_3GDP(1.0),19/02/24 23:00:00,A,PPS_HAB_EU27_2020,TRC34,2018,28.0,
268490,ESTAT:NAMA_10R_3GDP(1.0),19/02/24 23:00:00,A,PPS_HAB_EU27_2020,TRC34,2019,30.0,
268491,ESTAT:NAMA_10R_3GDP(1.0),19/02/24 23:00:00,A,PPS_HAB_EU27_2020,TRC34,2020,33.0,
268492,ESTAT:NAMA_10R_3GDP(1.0),19/02/24 23:00:00,A,PPS_HAB_EU27_2020,TRC34,2021,28.0,


In [3]:
# Discard the DATAFLOW, LAST UPDATE, OBS_FLAG and freq columns, then shorten
# the remaining headers: TIME_PERIOD -> date, OBS_VALUE -> value, geo -> id.
df2 = (
    df
    .drop(columns=['DATAFLOW', 'LAST UPDATE', 'OBS_FLAG', 'freq'])
    .rename(columns={'TIME_PERIOD': 'date', 'OBS_VALUE': 'value', 'geo': 'id'})
)

# Show the trimmed frame
df2

Unnamed: 0,unit,id,date,value
0,EUR_HAB,AL,2008,3000.0
1,EUR_HAB,AL,2009,3000.0
2,EUR_HAB,AL,2010,3100.0
3,EUR_HAB,AL,2011,3200.0
4,EUR_HAB,AL,2012,3300.0
...,...,...,...,...
268489,PPS_HAB_EU27_2020,TRC34,2018,28.0
268490,PPS_HAB_EU27_2020,TRC34,2019,30.0
268491,PPS_HAB_EU27_2020,TRC34,2020,33.0
268492,PPS_HAB_EU27_2020,TRC34,2021,28.0


In [4]:
# Turn the yearly period into a 'YYYY-01-01' date string.
# The value is rebuilt from the raw TIME_PERIOD column of `df` instead of from
# df2['date'] itself, so re-running this cell cannot append '-01-01' twice.
# df2 was derived from df without dropping or reordering rows, so the
# assignment lines up on the shared index.
df2['date'] = df['TIME_PERIOD'].astype(str) + '-01-01'
df2

Unnamed: 0,unit,id,date,value
0,EUR_HAB,AL,2008-01-01,3000.0
1,EUR_HAB,AL,2009-01-01,3000.0
2,EUR_HAB,AL,2010-01-01,3100.0
3,EUR_HAB,AL,2011-01-01,3200.0
4,EUR_HAB,AL,2012-01-01,3300.0
...,...,...,...,...
268489,PPS_HAB_EU27_2020,TRC34,2018-01-01,28.0
268490,PPS_HAB_EU27_2020,TRC34,2019-01-01,30.0
268491,PPS_HAB_EU27_2020,TRC34,2020-01-01,33.0
268492,PPS_HAB_EU27_2020,TRC34,2021-01-01,28.0


In [5]:

def nuts2_to_country(nuts2_code):
    """Return the country name for a geo / NUTS code.

    The first two characters of the code are treated as the country prefix.
    The prefixes 'UK' and 'EL' are mapped by hand to 'United Kingdom' and
    'Greece' (presumably because they differ from the ISO alpha-2 codes that
    pycountry knows); every other prefix is resolved through
    ``pycountry.countries.get(alpha_2=...)``.

    Parameters
    ----------
    nuts2_code : str
        Geo code such as 'AL' or 'TRC34'. Only the first two characters are
        used, so codes of any length are accepted.

    Returns
    -------
    str
        The country name, or 'Unknown' when the input is not a non-empty
        string or the prefix is not found by pycountry.
    """
    # Missing values (None / NaN) and empty strings carry no prefix; report
    # them as 'Unknown' instead of failing on .startswith().
    if not isinstance(nuts2_code, str) or not nuts2_code:
        return 'Unknown'

    # Hand-mapped prefixes, checked before the generic lookup
    if nuts2_code.startswith('UK'):
        return 'United Kingdom'
    if nuts2_code.startswith('EL'):
        return 'Greece'

    # Generic case: look the two-letter prefix up as an alpha-2 country code
    country = pycountry.countries.get(alpha_2=nuts2_code[:2])
    return country.name if country else 'Unknown'

# Derive a 'country' column from the geo code held in the 'id' column
df2['country'] = df2['id'].map(nuts2_to_country)

# Show the frame with the new column
df2

Unnamed: 0,unit,id,date,value,country
0,EUR_HAB,AL,2008-01-01,3000.0,Albania
1,EUR_HAB,AL,2009-01-01,3000.0,Albania
2,EUR_HAB,AL,2010-01-01,3100.0,Albania
3,EUR_HAB,AL,2011-01-01,3200.0,Albania
4,EUR_HAB,AL,2012-01-01,3300.0,Albania
...,...,...,...,...,...
268489,PPS_HAB_EU27_2020,TRC34,2018-01-01,28.0,Türkiye
268490,PPS_HAB_EU27_2020,TRC34,2019-01-01,30.0,Türkiye
268491,PPS_HAB_EU27_2020,TRC34,2020-01-01,33.0,Türkiye
268492,PPS_HAB_EU27_2020,TRC34,2021-01-01,28.0,Türkiye


In [6]:
# Count the rows whose country could not be resolved
unknown_count = (
    df2['country']
    .value_counts()
    .get('Unknown', 0)
)

print(f"Count of 'Unknown' entries: {unknown_count}")

Count of 'Unknown' entries: 161


In [7]:
def get_iso3_code(country_name):
    """Return the alpha-3 code pycountry reports for ``country_name``.

    The name is resolved with ``pycountry.countries.lookup``; when that lookup
    raises ``LookupError`` the string 'Unknown' is returned instead.
    """
    try:
        match = pycountry.countries.lookup(country_name)
    except LookupError:
        # No country matched the given name
        return 'Unknown'
    return match.alpha_3

# Derive an 'iso3' column from the country names
df2['iso3'] = df2['country'].map(get_iso3_code)

# Show the frame with the new column
df2

Unnamed: 0,unit,id,date,value,country,iso3
0,EUR_HAB,AL,2008-01-01,3000.0,Albania,ALB
1,EUR_HAB,AL,2009-01-01,3000.0,Albania,ALB
2,EUR_HAB,AL,2010-01-01,3100.0,Albania,ALB
3,EUR_HAB,AL,2011-01-01,3200.0,Albania,ALB
4,EUR_HAB,AL,2012-01-01,3300.0,Albania,ALB
...,...,...,...,...,...,...
268489,PPS_HAB_EU27_2020,TRC34,2018-01-01,28.0,Türkiye,TUR
268490,PPS_HAB_EU27_2020,TRC34,2019-01-01,30.0,Türkiye,TUR
268491,PPS_HAB_EU27_2020,TRC34,2020-01-01,33.0,Türkiye,TUR
268492,PPS_HAB_EU27_2020,TRC34,2021-01-01,28.0,Türkiye,TUR


In [8]:
# Load the code -> region name lookup from disk.
# The encoding is passed explicitly because the names contain non-ASCII
# characters (e.g. 'Région', 'Şırnak') and open() otherwise falls back to the
# platform default. Assumes the file is UTF-8, the standard JSON encoding.
with open('NUTS3_names.json', 'r', encoding='utf-8') as file:
    df10 = json.load(file)

In [9]:
# Two-column lookup table: one row per (code, name) entry of df10
df20 = pd.DataFrame({'id': list(df10.keys()), 'name': list(df10.values())})
df20

Unnamed: 0,id,name
0,BE,Belgium
1,BE1,Région de Bruxelles-Capitale/Brussels Hoofdste...
2,BE10,Région de Bruxelles-Capitale/Brussels Hoofdste...
3,BE100,Arr. de Bruxelles-Capitale/Arr. Brussel-Hoofdstad
4,BE2,Vlaams Gewest
...,...,...
2058,TRC3,"Mardin, Batman, Şırnak, Siirt"
2059,TRC31,Mardin
2060,TRC32,Batman
2061,TRC33,Şırnak


In [10]:
# Attach the region name to every observation. This is an inner join, so rows
# whose 'id' has no entry in df20 are dropped from the result.
df100 = df2.merge(df20, how='inner', on='id')
df100

Unnamed: 0,unit,id,date,value,country,iso3,name
0,EUR_HAB,AL,2008-01-01,3000.00,Albania,ALB,Albania
1,EUR_HAB,AL,2009-01-01,3000.00,Albania,ALB,Albania
2,EUR_HAB,AL,2010-01-01,3100.00,Albania,ALB,Albania
3,EUR_HAB,AL,2011-01-01,3200.00,Albania,ALB,Albania
4,EUR_HAB,AL,2012-01-01,3300.00,Albania,ALB,Albania
...,...,...,...,...,...,...,...
264934,MIO_PPS_EU27_2020,NO0B2,2017-01-01,104.75,Norway,NOR,Svalbard
264935,MIO_PPS_EU27_2020,NO0B2,2018-01-01,108.16,Norway,NOR,Svalbard
264936,MIO_PPS_EU27_2020,NO0B2,2019-01-01,120.48,Norway,NOR,Svalbard
264937,MIO_PPS_EU27_2020,NO0B2,2020-01-01,85.53,Norway,NOR,Svalbard


In [None]:
# UNITS: 
# [MIO_EUR]
# Million euro
# [EUR_HAB]
# Euro per inhabitant
# [EUR_HAB_EU27_2020]
# Euro per inhabitant in percentage of the EU27 (from 2020) average
# [MIO_NAC]
# Million units of national currency
# [MIO_PPS_EU27_2020]
# Million purchasing power standards (PPS, EU27 from 2020)
# [PPS_EU27_2020_HAB]
# Purchasing power standard (PPS, EU27 from 2020), per inhabitant
# [PPS_HAB_EU27_2020]
# Purchasing power standard (PPS, EU27 from 2020), per inhabitant in percentage of the EU27 (from 2020) average

In [14]:
# Keep only the EUR_HAB rows (euro per inhabitant, i.e. GDP per capita in euro)
gdp_pc_df = df100[df100['unit'] == 'EUR_HAB']

# Show the filtered frame
gdp_pc_df

Unnamed: 0,unit,id,date,value,country,iso3,name
0,EUR_HAB,AL,2008-01-01,3000.0,Albania,ALB,Albania
1,EUR_HAB,AL,2009-01-01,3000.0,Albania,ALB,Albania
2,EUR_HAB,AL,2010-01-01,3100.0,Albania,ALB,Albania
3,EUR_HAB,AL,2011-01-01,3200.0,Albania,ALB,Albania
4,EUR_HAB,AL,2012-01-01,3300.0,Albania,ALB,Albania
...,...,...,...,...,...,...,...
264736,EUR_HAB,TRC34,2018-01-01,3500.0,Türkiye,TUR,Siirt
264737,EUR_HAB,TRC34,2019-01-01,4200.0,Türkiye,TUR,Siirt
264738,EUR_HAB,TRC34,2020-01-01,4100.0,Türkiye,TUR,Siirt
264739,EUR_HAB,TRC34,2021-01-01,3800.0,Türkiye,TUR,Siirt


In [16]:
# Every remaining row has unit == 'EUR_HAB', so the column is dropped
gdp_pc_df2 = gdp_pc_df.drop(columns=['unit'])

In [17]:
# Write the result to CSV without the DataFrame index
gdp_pc_df2.to_csv(path_or_buf='gdp_euro_hab.csv', index=False)