In [1]:
import pandas as pd
import numpy as np
import json 
import time 
import requests
import pycountry
import csv

In [10]:
url = "https://ec.europa.eu/eurostat/api/dissemination/sdmx/3.0/data/dataflow/ESTAT/demo_r_gind3/1.0?compress=false&format=csvdata"
response = requests.get(url)

# Step 2: Save the response content as a CSV file
csv_file = "population_euro_data.csv"
with open(csv_file, 'wb') as file:
    file.write(response.content)

print(f"Data successfully exported to {csv_file}")


# Fetch the data
df = pd.read_csv(url)

# Display the first few rows of the dataframe
df



Data successfully exported to population_euro_data.csv


Unnamed: 0,DATAFLOW,LAST UPDATE,freq,indic_de,geo,TIME_PERIOD,OBS_VALUE,OBS_FLAG
0,ESTAT:DEMO_R_GIND3(1.0),14/06/24 23:00:00,A,CNMIGRAT,AL,2000,-28527.0,
1,ESTAT:DEMO_R_GIND3(1.0),14/06/24 23:00:00,A,CNMIGRAT,AL,2001,-43694.0,
2,ESTAT:DEMO_R_GIND3(1.0),14/06/24 23:00:00,A,CNMIGRAT,AL,2002,-38304.0,
3,ESTAT:DEMO_R_GIND3(1.0),14/06/24 23:00:00,A,CNMIGRAT,AL,2003,-38108.0,
4,ESTAT:DEMO_R_GIND3(1.0),14/06/24 23:00:00,A,CNMIGRAT,AL,2004,-37837.0,
...,...,...,...,...,...,...,...,...
450324,ESTAT:DEMO_R_GIND3(1.0),14/06/24 23:00:00,A,NATGROWRT,UKN14,2018,4.4,
450325,ESTAT:DEMO_R_GIND3(1.0),14/06/24 23:00:00,A,NATGROWRT,UKN15,2017,1.4,
450326,ESTAT:DEMO_R_GIND3(1.0),14/06/24 23:00:00,A,NATGROWRT,UKN15,2018,0.8,
450327,ESTAT:DEMO_R_GIND3(1.0),14/06/24 23:00:00,A,NATGROWRT,UKN16,2017,4.3,


In [23]:
url = "https://ec.europa.eu/eurostat/api/dissemination/sdmx/3.0/data/dataflow/ESTAT/demo_r_gind3/1.0?compress=false&format=json"
req = requests.get(url)

req

<Response [200]>

In [24]:
data = req.json()

In [25]:
with open('pop_euro_data.json', 'w') as json_file:
        json.dump(data, json_file, indent=4)

In [11]:
# Dropping 'DATAFLOW' and 'LAST UPDATE' columns, and renaming 'TIME_PERIOD' to 'date'
df2 = df.drop(['DATAFLOW', 'LAST UPDATE', 'OBS_FLAG', 'freq'], axis=1).rename(columns={'TIME_PERIOD': 'date', 'OBS_VALUE': 'value', 'geo': 'id'})

# Display the modified DataFrame
df2

Unnamed: 0,indic_de,id,date,value
0,CNMIGRAT,AL,2000,-28527.0
1,CNMIGRAT,AL,2001,-43694.0
2,CNMIGRAT,AL,2002,-38304.0
3,CNMIGRAT,AL,2003,-38108.0
4,CNMIGRAT,AL,2004,-37837.0
...,...,...,...,...
450324,NATGROWRT,UKN14,2018,4.4
450325,NATGROWRT,UKN15,2017,1.4
450326,NATGROWRT,UKN15,2018,0.8
450327,NATGROWRT,UKN16,2017,4.3


In [12]:
# filtering out all other pop indices, leaving JAN = population on 1 Jan, total


df3 = df2.loc[(df2['indic_de'] == 'JAN')]

# Display the filtered DataFrame
df3

Unnamed: 0,indic_de,id,date,value
284679,JAN,AL,2000,3057026.0
284680,JAN,AL,2001,3063320.0
284681,JAN,AL,2002,3057018.0
284682,JAN,AL,2003,3044993.0
284683,JAN,AL,2004,3034231.0
...,...,...,...,...
328850,JAN,UKN16,2015,115171.0
328851,JAN,UKN16,2016,115581.0
328852,JAN,UKN16,2017,116057.0
328853,JAN,UKN16,2018,116612.0


In [13]:
df3['date'] = df3['date'].astype(str) + '-01-01' 
df3

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3['date'] = df3['date'].astype(str) + '-01-01'


Unnamed: 0,indic_de,id,date,value
284679,JAN,AL,2000-01-01,3057026.0
284680,JAN,AL,2001-01-01,3063320.0
284681,JAN,AL,2002-01-01,3057018.0
284682,JAN,AL,2003-01-01,3044993.0
284683,JAN,AL,2004-01-01,3034231.0
...,...,...,...,...
328850,JAN,UKN16,2015-01-01,115171.0
328851,JAN,UKN16,2016-01-01,115581.0
328852,JAN,UKN16,2017-01-01,116057.0
328853,JAN,UKN16,2018-01-01,116612.0


In [15]:

# Function to extract country name from NUTS code
def nuts2_to_country(nuts2_code):
    # Check if the code starts with 'UK'
    if nuts2_code.startswith('UK'):
        return 'United Kingdom'
    
    # Check if the code starts with 'EL'
    if nuts2_code.startswith('EL'):
        return 'Greece'
    # Extract the country code from the NUTS2 code
    country_code = nuts2_code[:2]  # Extract first two characters
    country = pycountry.countries.get(alpha_2=country_code)
    
    # Return the country name if found, otherwise 'Unknown'
    return country.name if country else 'Unknown'

# Apply the function to the 'geo' column and create a new 'country' column
df3['country'] = df3['id'].apply(nuts2_to_country)

# Display the modified DataFrame
df3

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3['country'] = df3['id'].apply(nuts2_to_country)


Unnamed: 0,indic_de,id,date,value,country
284679,JAN,AL,2000-01-01,3057026.0,Albania
284680,JAN,AL,2001-01-01,3063320.0,Albania
284681,JAN,AL,2002-01-01,3057018.0,Albania
284682,JAN,AL,2003-01-01,3044993.0,Albania
284683,JAN,AL,2004-01-01,3034231.0,Albania
...,...,...,...,...,...
328850,JAN,UKN16,2015-01-01,115171.0,United Kingdom
328851,JAN,UKN16,2016-01-01,115581.0,United Kingdom
328852,JAN,UKN16,2017-01-01,116057.0,United Kingdom
328853,JAN,UKN16,2018-01-01,116612.0,United Kingdom


In [17]:
#  checking number of unkowns
unknown_count = df3['country'].value_counts().get('Unknown', 0)

print(f"Count of 'Unknown' entries: {unknown_count}")



Count of 'Unknown' entries: 0


In [18]:
# Function to get ISO3 code from country name
def get_iso3_code(country_name):
    try:
        country = pycountry.countries.lookup(country_name)
        return country.alpha_3
    except LookupError:
        return 'Unknown'

# Apply the function to the 'country' column and create the 'iso3' column
df3['iso3'] = df3['country'].apply(get_iso3_code)

# Display the modified DataFrame
df3

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3['iso3'] = df3['country'].apply(get_iso3_code)


Unnamed: 0,indic_de,id,date,value,country,iso3
284679,JAN,AL,2000-01-01,3057026.0,Albania,ALB
284680,JAN,AL,2001-01-01,3063320.0,Albania,ALB
284681,JAN,AL,2002-01-01,3057018.0,Albania,ALB
284682,JAN,AL,2003-01-01,3044993.0,Albania,ALB
284683,JAN,AL,2004-01-01,3034231.0,Albania,ALB
...,...,...,...,...,...,...
328850,JAN,UKN16,2015-01-01,115171.0,United Kingdom,GBR
328851,JAN,UKN16,2016-01-01,115581.0,United Kingdom,GBR
328852,JAN,UKN16,2017-01-01,116057.0,United Kingdom,GBR
328853,JAN,UKN16,2018-01-01,116612.0,United Kingdom,GBR


In [19]:
df4 = df3.drop(['indic_de'], axis=1)

df4



Unnamed: 0,id,date,value,country,iso3
284679,AL,2000-01-01,3057026.0,Albania,ALB
284680,AL,2001-01-01,3063320.0,Albania,ALB
284681,AL,2002-01-01,3057018.0,Albania,ALB
284682,AL,2003-01-01,3044993.0,Albania,ALB
284683,AL,2004-01-01,3034231.0,Albania,ALB
...,...,...,...,...,...
328850,UKN16,2015-01-01,115171.0,United Kingdom,GBR
328851,UKN16,2016-01-01,115581.0,United Kingdom,GBR
328852,UKN16,2017-01-01,116057.0,United Kingdom,GBR
328853,UKN16,2018-01-01,116612.0,United Kingdom,GBR


In [26]:
with open('NUTS3_names.json', 'r') as file:
    df10 = json.load(file)
    df10

In [27]:
df20 = pd.DataFrame(list(df10.items()), columns=['id', 'name'])
df20

Unnamed: 0,id,name
0,BE,Belgium
1,BE1,Région de Bruxelles-Capitale/Brussels Hoofdste...
2,BE10,Région de Bruxelles-Capitale/Brussels Hoofdste...
3,BE100,Arr. de Bruxelles-Capitale/Arr. Brussel-Hoofdstad
4,BE2,Vlaams Gewest
...,...,...
2058,TRC3,"Mardin, Batman, Şırnak, Siirt"
2059,TRC31,Mardin
2060,TRC32,Batman
2061,TRC33,Şırnak


In [36]:
df100 = pd.merge(df4, df20, on='id', how='inner')
df100

Unnamed: 0,id,date,value,country,iso3,name
0,AL,2000-01-01,3057026.0,Albania,ALB,Albania
1,AL,2001-01-01,3063320.0,Albania,ALB,Albania
2,AL,2002-01-01,3057018.0,Albania,ALB,Albania
3,AL,2003-01-01,3044993.0,Albania,ALB,Albania
4,AL,2004-01-01,3034231.0,Albania,ALB,Albania
...,...,...,...,...,...,...
44171,UKN16,2015-01-01,115171.0,United Kingdom,GBR,Fermanagh and Omagh (NUTS 2016)
44172,UKN16,2016-01-01,115581.0,United Kingdom,GBR,Fermanagh and Omagh (NUTS 2016)
44173,UKN16,2017-01-01,116057.0,United Kingdom,GBR,Fermanagh and Omagh (NUTS 2016)
44174,UKN16,2018-01-01,116612.0,United Kingdom,GBR,Fermanagh and Omagh (NUTS 2016)


In [37]:
# Define the replacements
replacements = {
    '(NUTS 2016)': '',
    '(NUTS 2021)': '',
    '()':''
}

# Apply the replacements to the 'name' column
df100['name'] = df100['name'].replace(replacements, regex=True)
df100

Unnamed: 0,id,date,value,country,iso3,name
0,AL,2000-01-01,3057026.0,Albania,ALB,Albania
1,AL,2001-01-01,3063320.0,Albania,ALB,Albania
2,AL,2002-01-01,3057018.0,Albania,ALB,Albania
3,AL,2003-01-01,3044993.0,Albania,ALB,Albania
4,AL,2004-01-01,3034231.0,Albania,ALB,Albania
...,...,...,...,...,...,...
44171,UKN16,2015-01-01,115171.0,United Kingdom,GBR,Fermanagh and Omagh ()
44172,UKN16,2016-01-01,115581.0,United Kingdom,GBR,Fermanagh and Omagh ()
44173,UKN16,2017-01-01,116057.0,United Kingdom,GBR,Fermanagh and Omagh ()
44174,UKN16,2018-01-01,116612.0,United Kingdom,GBR,Fermanagh and Omagh ()


In [38]:
df100.to_csv('pop_euro.csv', index=False)