In [1]:
import pandas as pd
import numpy as np
import json 
import time 
import requests
import pycountry
import csv

# Cols:
# id - id of the geography (e.g. AT11) - will be context dependent (for countries, use iso3)
# date - yyyy-mm-dd
# value 
# name - of the georaphy (Burgenland)

# country (Austria)
# iso3 - 

In [2]:


# Step 1: Modify the URL to request CSV format
url = "https://ec.europa.eu/eurostat/api/dissemination/sdmx/3.0/data/dataflow/ESTAT/tgs00010/1.0?compress=false&format=csvdata"
response = requests.get(url)

# Step 2: Save the response content as a CSV file
csv_file = "map_builder_test.csv"
with open(csv_file, 'wb') as file:
    file.write(response.content)

print(f"Data successfully exported to {csv_file}")


Data successfully exported to map_builder_test.csv


In [3]:
df = pd.read_csv('map_builder_test.csv')
df

Unnamed: 0,DATAFLOW,LAST UPDATE,freq,isced11,sex,age,unit,geo,TIME_PERIOD,OBS_VALUE,OBS_FLAG
0,ESTAT:TGS00010(1.0),13/06/24 23:00:00,A,ED0-2,F,Y_GE15,PC,AT11,2012,,u
1,ESTAT:TGS00010(1.0),13/06/24 23:00:00,A,ED0-2,F,Y_GE15,PC,AT11,2013,,u
2,ESTAT:TGS00010(1.0),13/06/24 23:00:00,A,ED0-2,F,Y_GE15,PC,AT11,2014,,bu
3,ESTAT:TGS00010(1.0),13/06/24 23:00:00,A,ED0-2,F,Y_GE15,PC,AT11,2015,,u
4,ESTAT:TGS00010(1.0),13/06/24 23:00:00,A,ED0-2,F,Y_GE15,PC,AT11,2016,,u
...,...,...,...,...,...,...,...,...,...,...,...
61618,ESTAT:TGS00010(1.0),13/06/24 23:00:00,A,UNK,T,Y_GE15,PC,UKN0,2015,,u
61619,ESTAT:TGS00010(1.0),13/06/24 23:00:00,A,UNK,T,Y_GE15,PC,UKN0,2016,,u
61620,ESTAT:TGS00010(1.0),13/06/24 23:00:00,A,UNK,T,Y_GE15,PC,UKN0,2017,,u
61621,ESTAT:TGS00010(1.0),13/06/24 23:00:00,A,UNK,T,Y_GE15,PC,UKN0,2018,,u


In [4]:
# Dropping 'DATAFLOW' and 'LAST UPDATE' columns, and renaming 'TIME_PERIOD' to 'date'
df2 = df.drop(['DATAFLOW', 'LAST UPDATE', 'unit', 'OBS_FLAG', 'freq', 'age'], axis=1).rename(columns={'TIME_PERIOD': 'date', 'OBS_VALUE': 'value'})

# Display the modified DataFrame
df2

Unnamed: 0,isced11,sex,geo,date,value
0,ED0-2,F,AT11,2012,
1,ED0-2,F,AT11,2013,
2,ED0-2,F,AT11,2014,
3,ED0-2,F,AT11,2015,
4,ED0-2,F,AT11,2016,
...,...,...,...,...,...
61618,UNK,T,UKN0,2015,
61619,UNK,T,UKN0,2016,
61620,UNK,T,UKN0,2017,
61621,UNK,T,UKN0,2018,


In [5]:
# FREQ = annual
# AGE = 15 years or over
# SEX = all
# EDUCATION = "All ISCED 2011 levels"
# Filter the DataFrame where sex is 'T' and isced11 is 'TOTAL'
df3 = df2.loc[(df2['sex'] == 'T') & (df2['isced11'] == 'TOTAL')]

# Display the filtered DataFrame
df3


Unnamed: 0,isced11,sex,geo,date,value
47172,TOTAL,T,AT11,2012,4.6
47173,TOTAL,T,AT11,2013,4.3
47174,TOTAL,T,AT11,2014,4.8
47175,TOTAL,T,AT11,2015,5.2
47176,TOTAL,T,AT11,2016,5.7
...,...,...,...,...,...
50940,TOTAL,T,UKN0,2015,6.1
50941,TOTAL,T,UKN0,2016,5.7
50942,TOTAL,T,UKN0,2017,4.5
50943,TOTAL,T,UKN0,2018,3.6


In [6]:
df3['date'] = df3['date'].astype(str) + '-01-01' 
df3

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3['date'] = df3['date'].astype(str) + '-01-01'


Unnamed: 0,isced11,sex,geo,date,value
47172,TOTAL,T,AT11,2012-01-01,4.6
47173,TOTAL,T,AT11,2013-01-01,4.3
47174,TOTAL,T,AT11,2014-01-01,4.8
47175,TOTAL,T,AT11,2015-01-01,5.2
47176,TOTAL,T,AT11,2016-01-01,5.7
...,...,...,...,...,...
50940,TOTAL,T,UKN0,2015-01-01,6.1
50941,TOTAL,T,UKN0,2016-01-01,5.7
50942,TOTAL,T,UKN0,2017-01-01,4.5
50943,TOTAL,T,UKN0,2018-01-01,3.6


In [48]:

# Function to extract country name from NUTS2 code
def nuts2_to_country(nuts2_code):
    # Check if the code starts with 'UK'
      # Check if the code starts with 'UK'
    if nuts2_code.startswith('UK'):
        return 'United Kingdom'
    
    # Check if the code starts with 'EL'
    if nuts2_code.startswith('EL'):
        return 'Greece'
    # Extract the country code from the NUTS2 code
    country_code = nuts2_code[:2]  # Extract first two characters
    country = pycountry.countries.get(alpha_2=country_code)
    
    # Return the country name if found, otherwise 'Unknown'
    return country.name if country else 'Unknown'

# Apply the function to the 'geo' column and create a new 'country' column
df3['country'] = df3['geo'].apply(nuts2_to_country)

# Display the modified DataFrame
df3


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3['country'] = df3['geo'].apply(nuts2_to_country)


Unnamed: 0,isced11,sex,geo,date,value,country
47172,TOTAL,T,AT11,2012-01-01,4.6,Austria
47173,TOTAL,T,AT11,2013-01-01,4.3,Austria
47174,TOTAL,T,AT11,2014-01-01,4.8,Austria
47175,TOTAL,T,AT11,2015-01-01,5.2,Austria
47176,TOTAL,T,AT11,2016-01-01,5.7,Austria
...,...,...,...,...,...,...
50940,TOTAL,T,UKN0,2015-01-01,6.1,United Kingdom
50941,TOTAL,T,UKN0,2016-01-01,5.7,United Kingdom
50942,TOTAL,T,UKN0,2017-01-01,4.5,United Kingdom
50943,TOTAL,T,UKN0,2018-01-01,3.6,United Kingdom


In [49]:
# checking how well function to add country names worked
unknown_count = df3['country'].value_counts().get('Unknown', 0)

print(f"Count of 'Unknown' entries: {unknown_count}")



Count of 'Unknown' entries: 12


In [9]:
with open('NUTS2_names.json', 'r') as file:
    df10 = json.load(file)
    df10

In [10]:
df20 = pd.DataFrame(list(df10.items()), columns=['geo', 'name'])
df20

Unnamed: 0,geo,name
0,EA20,Euro area – 20 countries (from 2023)
1,BE10,Région de Bruxelles-Capitale/Brussels Hoofdste...
2,BE21,Prov. Antwerpen
3,BE22,Prov. Limburg (BE)
4,BE23,Prov. Oost-Vlaanderen
...,...,...
340,TRB1,"Malatya, Elazığ, Bingöl, Tunceli"
341,TRB2,"Van, Muş, Bitlis, Hakkari"
342,TRC1,"Gaziantep, Adıyaman, Kilis"
343,TRC2,"Şanlıurfa, Diyarbakır"


In [50]:
df100 = pd.merge(df3, df20, on='geo', how='inner')
df100

Unnamed: 0,isced11,sex,geo,date,value,country,name
0,TOTAL,T,AT11,2012-01-01,4.6,Austria,Burgenland
1,TOTAL,T,AT11,2013-01-01,4.3,Austria,Burgenland
2,TOTAL,T,AT11,2014-01-01,4.8,Austria,Burgenland
3,TOTAL,T,AT11,2015-01-01,5.2,Austria,Burgenland
4,TOTAL,T,AT11,2016-01-01,5.7,Austria,Burgenland
...,...,...,...,...,...,...,...
3768,TOTAL,T,UKN0,2015-01-01,6.1,United Kingdom,Northern Ireland (UK) (NUTS 2021)
3769,TOTAL,T,UKN0,2016-01-01,5.7,United Kingdom,Northern Ireland (UK) (NUTS 2021)
3770,TOTAL,T,UKN0,2017-01-01,4.5,United Kingdom,Northern Ireland (UK) (NUTS 2021)
3771,TOTAL,T,UKN0,2018-01-01,3.6,United Kingdom,Northern Ireland (UK) (NUTS 2021)


In [51]:
df200 = df100.drop(['isced11', 'sex'], axis=1).rename(columns={'geo': 'id'})

# Display the modified DataFrame
df200

Unnamed: 0,id,date,value,country,name
0,AT11,2012-01-01,4.6,Austria,Burgenland
1,AT11,2013-01-01,4.3,Austria,Burgenland
2,AT11,2014-01-01,4.8,Austria,Burgenland
3,AT11,2015-01-01,5.2,Austria,Burgenland
4,AT11,2016-01-01,5.7,Austria,Burgenland
...,...,...,...,...,...
3768,UKN0,2015-01-01,6.1,United Kingdom,Northern Ireland (UK) (NUTS 2021)
3769,UKN0,2016-01-01,5.7,United Kingdom,Northern Ireland (UK) (NUTS 2021)
3770,UKN0,2017-01-01,4.5,United Kingdom,Northern Ireland (UK) (NUTS 2021)
3771,UKN0,2018-01-01,3.6,United Kingdom,Northern Ireland (UK) (NUTS 2021)


In [52]:


# Function to get ISO3 code from country name
def get_iso3_code(country_name):
    try:
        country = pycountry.countries.lookup(country_name)
        return country.alpha_3
    except LookupError:
        return 'Unknown'

# Apply the function to the 'country' column and create the 'iso3' column
df200['iso3'] = df200['country'].apply(get_iso3_code)

# Display the modified DataFrame
df200

Unnamed: 0,id,date,value,country,name,iso3
0,AT11,2012-01-01,4.6,Austria,Burgenland,AUT
1,AT11,2013-01-01,4.3,Austria,Burgenland,AUT
2,AT11,2014-01-01,4.8,Austria,Burgenland,AUT
3,AT11,2015-01-01,5.2,Austria,Burgenland,AUT
4,AT11,2016-01-01,5.7,Austria,Burgenland,AUT
...,...,...,...,...,...,...
3768,UKN0,2015-01-01,6.1,United Kingdom,Northern Ireland (UK) (NUTS 2021),GBR
3769,UKN0,2016-01-01,5.7,United Kingdom,Northern Ireland (UK) (NUTS 2021),GBR
3770,UKN0,2017-01-01,4.5,United Kingdom,Northern Ireland (UK) (NUTS 2021),GBR
3771,UKN0,2018-01-01,3.6,United Kingdom,Northern Ireland (UK) (NUTS 2021),GBR


In [53]:
# Define the replacements
replacements = {
    '(NUTS 2013)': '',
    '(NUTS 2016)': '',
    '(NUTS 2021)': '',
    '()': ''
}

# Apply the replacements to the 'name' column
df200['name'] = df200['name'].replace(replacements, regex=True)
df200



Unnamed: 0,id,date,value,country,name,iso3
0,AT11,2012-01-01,4.6,Austria,Burgenland,AUT
1,AT11,2013-01-01,4.3,Austria,Burgenland,AUT
2,AT11,2014-01-01,4.8,Austria,Burgenland,AUT
3,AT11,2015-01-01,5.2,Austria,Burgenland,AUT
4,AT11,2016-01-01,5.7,Austria,Burgenland,AUT
...,...,...,...,...,...,...
3768,UKN0,2015-01-01,6.1,United Kingdom,Northern Ireland (UK) (),GBR
3769,UKN0,2016-01-01,5.7,United Kingdom,Northern Ireland (UK) (),GBR
3770,UKN0,2017-01-01,4.5,United Kingdom,Northern Ireland (UK) (),GBR
3771,UKN0,2018-01-01,3.6,United Kingdom,Northern Ireland (UK) (),GBR


In [54]:
# Replace 'your_file_name.csv' with the desired file path and name
df200.to_csv('unemp_nuts2_euro.csv', index=False)
