# Mandatory data

In [29]:
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import Markdown
import geopandas
import plotly.express as px
import numpy

## Editing the table
Doing some basic cleanup to make the table more readable.

In [30]:
# Load the raw Eurostat export; the file is tab-separated.
url = "../Data/Annual_population_eurostat(mandatory)_raw.tsv"
table_raw = pd.read_csv(url, sep='\t')
print(table_raw.columns)

# The first column header packs every dimension name into one string; give it a short name.
# Raw string on purpose: the header contains a literal backslash, and "\T" is not a
# valid escape sequence, so a normal string literal makes Python emit a SyntaxWarning.
table_raw = table_raw.rename(
    columns={r"freq,citizen,age,unit,sex,geo\TIME_PERIOD": "title"}
)
display(table_raw.head(5))


Index(['freq,citizen,age,unit,sex,geo\TIME_PERIOD', '2015 ', '2016 ', '2017 ',
       '2018 ', '2019 ', '2020 ', '2021 ', '2022 ', '2023 ', '2024 '],
      dtype='object')


  table_raw.rename(columns={"freq,citizen,age,unit,sex,geo\TIME_PERIOD":"title"},inplace=True)


Unnamed: 0,title,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
0,"A,EU28_FOR,TOTAL,NR,F,AT",288028,310759,329981,348012,365125,383295,:,:,:,:
1,"A,EU28_FOR,TOTAL,NR,F,BE",416232,428008,434889,440355,448655,457258,:,:,:,:
2,"A,EU28_FOR,TOTAL,NR,F,CH",594052,610566,622893,630371,637330,645916,:,:,:,:
3,"A,EU28_FOR,TOTAL,NR,F,CY",55730,57716,57673,59787,63044,62935,:,:,:,:
4,"A,EU28_FOR,TOTAL,NR,F,CZ",72465,76736,81670,86132,91296,96186,:,:,:,:


In [31]:
# Each "title" holds one comma-separated key, e.g. "A,EU28_FOR,TOTAL,NR,F,AT".
# Split every key once with the vectorised string accessor instead of looping over rows.
title_parts = table_raw["title"].str.split(",")

# Positions follow the order of the original header: freq,citizen,age,unit,sex,geo.
citizens = title_parts.str[1].tolist()
sex = title_parts.str[4].tolist()
country = title_parts.str[5].tolist()

# Attach the extracted fields, drop the now-redundant key and index the rows by country code.
# NOTE: this cell consumes the "title" column, so re-running it requires reloading table_raw.
table_raw = (
    table_raw
    .assign(citizen=citizens, sex=sex, country=country)
    .drop(columns=["title"])
    .set_index("country")
)



#### Reindexing, adding ISO3 codes


In [32]:
# Lookup table with country names, alpha-2 / alpha-3 codes and numeric country codes.
country_codes = pd.read_csv("../Data/country-region-codes.csv")
display(country_codes.head())

# The raw year headers carry a trailing space ('2015 ', ...); map them to plain ints.
year_columns = {f"{year} ": year for year in range(2015, 2025)}
table = table_raw.reset_index().rename(columns={**year_columns, "country": "alpha-2"})

# Eurostat deviates from ISO 3166 alpha-2 for two countries: it uses EL for Greece
# and UK for the United Kingdom. Translate them so the merge below does not silently
# drop those rows.
table["alpha-2"] = table["alpha-2"].replace({"EL": "GR", "UK": "GB"})

# Inner join: any geo code still absent from the lookup is dropped
# (presumably Eurostat aggregates such as EU totals - verify against the raw file).
table = (
    pd.merge(table, country_codes, on="alpha-2")
    .drop(columns=["country-code", "alpha-2"])
    .rename(columns={"name": "Country", "alpha-3": "ISO3"})
)

# Put the descriptive columns first, followed by the years in ascending order.
new_order = ["Country", "ISO3", "citizen", "sex", *range(2015, 2025)]

table = table[new_order]
display(table.head())

Unnamed: 0,name,alpha-2,alpha-3,country-code
0,Afghanistan,AF,AFG,4
1,Åland Islands,AX,ALA,248
2,Albania,AL,ALB,8
3,Algeria,DZ,DZA,12
4,American Samoa,AS,ASM,16


Unnamed: 0,Country,ISO3,citizen,sex,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
0,Austria,AUT,EU28_FOR,F,288028,310759,329981,348012,365125,383295,:,:,:,:
1,Belgium,BEL,EU28_FOR,F,416232,428008,434889,440355,448655,457258,:,:,:,:
2,Switzerland,CHE,EU28_FOR,F,594052,610566,622893,630371,637330,645916,:,:,:,:
3,Cyprus,CYP,EU28_FOR,F,55730,57716,57673,59787,63044,62935,:,:,:,:
4,Czechia,CZE,EU28_FOR,F,72465,76736,81670,86132,91296,96186,:,:,:,:


#### Fixing some values: trimming whitespace and marking missing entries as NaN

In [33]:
# Show one raw cell that holds the missing-value marker in this data set (":" plus padding).
print(table.iloc[1,10])

replaced = table.copy()

# Strip padding from `sex` and every year column (position 3 onwards).
# The .str accessor passes missing entries through as NaN, whereas
# apply(str.strip) raises on any non-string value.
for column in replaced.columns[3:]:
    replaced[column] = replaced[column].str.strip()

# Use the marker's literal value instead of reading it from a fixed cell position,
# which would replace real data if that cell ever held a number.
MISSING_MARKER = ":"
replaced = replaced.replace(to_replace=MISSING_MARKER, value=numpy.nan)

display(replaced.head())



: 


Unnamed: 0,Country,ISO3,citizen,sex,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
0,Austria,AUT,EU28_FOR,F,288028,310759,329981,348012,365125,383295,,,,
1,Belgium,BEL,EU28_FOR,F,416232,428008,434889,440355,448655,457258,,,,
2,Switzerland,CHE,EU28_FOR,F,594052,610566,622893,630371,637330,645916,,,,
3,Cyprus,CYP,EU28_FOR,F,55730,57716,57673,59787,63044,62935,,,,
4,Czechia,CZE,EU28_FOR,F,72465,76736,81670,86132,91296,96186,,,,


### Fixing provisional data: removing Eurostat status flags (e.g. `p`, `e`, `b`) from the values

In [34]:
# Status flags to remove from the values
# (presumably appended after the number, e.g. "12345 p" - verify against the raw file).
tags = ["b","p","ep","e","be","bep"]

# Match any flag as a whole word (pattern originally produced with DeepSeek AI).
pattern = r'\b(?:' + '|'.join(tags) + r')\b'

# Only the year columns can carry flags. They are the columns whose label is not a
# string; the label columns (Country, ISO3, citizen, sex) are left untouched so the
# regex cannot delete a standalone word there.
year_columns = [column for column in replaced.columns if not isinstance(column, str)]

df = replaced.copy()
for column in year_columns:
    df[column] = df[column].str.replace(pattern, '', regex=True).str.strip()



#### Saving to file

In [35]:
# Write the cleaned table to disk; the positional row index is not written.
path = "../Data/Annual_population_eurostat(mandatory).csv"
df.to_csv(path_or_buf=path, index=False)
