In [6]:
import pandas as pd

# Read the source workbook into a DataFrame. pandas defaults apply: the first
# sheet is read and its first row is taken as the (provisional) header.
file_path = '../data/europe/educ_edited.xlsx'
df_original = pd.read_excel(io=file_path)

In [7]:
# Skip the 9 rows that precede the real table and renumber from 0.
df_remooved = df_original.iloc[9:].reset_index(drop=True)

# The first remaining row carries the column headers: promote it to the column
# labels. The row itself is still present in the data at this point.
header_row = df_remooved.iloc[0]
df_remooved.columns = header_row

# Copy of the table without the header row.
# NOTE(review): `df` is not read by any of the other cells shown in this
# notebook; it is kept in case a cell outside this view relies on it.
df = df_remooved.iloc[1:].reset_index(drop=True)

# Relabel the first column, then in one pass:
#   * drop rows 0 and 1 (the header row promoted above and the row right below it),
#   * renumber,
#   * drop rows 38 to 47 (numbering after the previous step),
#   * renumber again.
first_col_name = df_remooved.columns[0]
df_remooved = (
    df_remooved
    .rename(columns={first_col_name: 'University ↓ / Student →'})
    .iloc[2:]
    .reset_index(drop=True)
    .drop(index=range(38, 48))
    .reset_index(drop=True)
)

df_remooved

Unnamed: 0,University ↓ / Student →,Belgium,NaN,Bulgaria,NaN.1,Czechia,NaN.2,Denmark,NaN.3,Germany,...,Gibraltar,NaN.4,Holy See,NaN.5,Monaco,NaN.6,Russia,NaN.7,San Marino,NaN.8
0,Belgium,:,m,166,,62,,36,,747,...,0,,0,,3,,284,,2,
1,Bulgaria,28,d,:,m,12,d,20,d,1813,...,0,d,0,d,0,d,325,d,0,d
2,Czechia,38,,119,,:,m,32,,904,...,4,,0,,0,,7781,,0,
3,Denmark,230,,378,,507,,:,m,3248,...,0,,0,,1,,126,,1,
4,Germany,3089,e,5410,e,1512,e,770,e,:,...,2,e,10,e,8,e,11024,e,2,e
5,Estonia,13,,6,,3,,7,,89,...,0,,0,,0,,451,,0,
6,Ireland,231,,77,,325,,51,,1087,...,0,,0,,2,,156,,1,
7,Greece,95,,280,,15,,8,,1294,...,0,,0,,0,,583,,0,
8,Spain,640,,440,,122,,94,,2434,...,2,,1,,43,,781,,0,
9,France,2530,bd,616,bd,334,bd,188,bd,4391,...,0,bd,0,bd,171,bd,3001,bd,3,bd


In [8]:
# Keep column 0 (row labels) and the labelled columns at positions 1, 3, 5, ...;
# drop the companion columns at positions 2, 4, 6, ... . In the preview above
# those companions have no header label (shown as NaN) and hold short codes
# such as "d", "e", "bd" -- presumably data-quality flags; confirm against the
# source workbook.
n_columns = df_remooved.shape[1]

# Labels of the columns about to be removed (kept under the original name).
columns_to_drop = df_remooved.columns[2::2]

# Guard against running this cell twice: after one run, positions 2, 4, 6, ...
# hold labelled data columns, and dropping them would silently lose data.
if not columns_to_drop.isna().all():
    raise ValueError(
        "Expected the columns at positions 2, 4, 6, ... to be unlabelled, "
        "but found labelled columns there. Was this cell already run? "
        "Re-run the notebook from the cell that builds df_remooved."
    )

# Select strictly by position. A label-based drop would also remove any other
# column that happens to share a label with one of the dropped columns.
positions_to_drop = range(2, n_columns, 2)
positions_to_keep = [pos for pos in range(n_columns) if pos not in positions_to_drop]
df_remooved = df_remooved.iloc[:, positions_to_keep]

df_remooved

Unnamed: 0,University ↓ / Student →,Belgium,Bulgaria,Czechia,Denmark,Germany,Estonia,Ireland,Greece,Spain,...,Türkiye,Ukraine,Kosovo*,Andorra,Belarus,Gibraltar,Holy See,Monaco,Russia,San Marino
0,Belgium,:,166,62,36,747,31,74,411,924,...,771,389,24,3,43,0,0,3,284,2
1,Bulgaria,28,:,12,20,1813,3,399,4019,96,...,868,1771,182,1,28,0,0,0,325,0
2,Czechia,38,119,:,32,904,17,50,195,239,...,520,6529,40,0,1096,4,0,0,7781,0
3,Denmark,230,378,507,:,3248,121,111,812,1506,...,162,125,5,0,11,0,0,1,126,1
4,Germany,3089,5410,1512,770,:,365,990,4356,8252,...,16585,9038,824,10,1143,2,10,8,11024,2
5,Estonia,13,6,3,7,89,:,12,16,46,...,136,520,5,0,60,0,0,0,451,0
6,Ireland,231,77,325,51,1087,17,31,170,828,...,199,183,1,1,7,0,0,2,156,1
7,Greece,95,280,15,8,1294,5,5,:,62,...,132,290,0,0,44,0,0,0,583,0
8,Spain,640,440,122,94,2434,83,220,417,:,...,333,554,0,1226,66,2,1,43,781,0
9,France,2530,616,334,188,4391,72,404,2754,5170,...,2706,2250,100,294,198,0,0,171,3001,3


In [9]:
# Swap the ":" placeholder for the string "0" before any numeric conversion,
# so the conversion below does not trigger pandas' silent-downcasting warning.
df_replaced = df_remooved.replace(':', '0')

# Convert every column except the first (text labels) to numbers, column by
# column. NOTE: with errors='coerce' followed by fillna(0), any cell that
# cannot be parsed as a number ends up as 0 without any warning.
value_columns = df_replaced.columns[1:]
df_replaced[value_columns] = (
    df_replaced[value_columns]
    .apply(pd.to_numeric, errors='coerce')
    .fillna(0)
)
df_replaced

Unnamed: 0,University ↓ / Student →,Belgium,Bulgaria,Czechia,Denmark,Germany,Estonia,Ireland,Greece,Spain,...,Türkiye,Ukraine,Kosovo*,Andorra,Belarus,Gibraltar,Holy See,Monaco,Russia,San Marino
0,Belgium,0,166,62,36,747,31,74,411,924,...,771,389,24,3,43,0,0,3,284,2
1,Bulgaria,28,0,12,20,1813,3,399,4019,96,...,868,1771,182,1,28,0,0,0,325,0
2,Czechia,38,119,0,32,904,17,50,195,239,...,520,6529,40,0,1096,4,0,0,7781,0
3,Denmark,230,378,507,0,3248,121,111,812,1506,...,162,125,5,0,11,0,0,1,126,1
4,Germany,3089,5410,1512,770,0,365,990,4356,8252,...,16585,9038,824,10,1143,2,10,8,11024,2
5,Estonia,13,6,3,7,89,0,12,16,46,...,136,520,5,0,60,0,0,0,451,0
6,Ireland,231,77,325,51,1087,17,31,170,828,...,199,183,1,1,7,0,0,2,156,1
7,Greece,95,280,15,8,1294,5,5,0,62,...,132,290,0,0,44,0,0,0,583,0
8,Spain,640,440,122,94,2434,83,220,417,0,...,333,554,0,1226,66,2,1,43,781,0
9,France,2530,616,334,188,4391,72,404,2754,5170,...,2706,2250,100,294,198,0,0,171,3001,3


In [10]:
# Save the cleaned table to a new Excel workbook (directly into an available
# folder), on a sheet named 'List1' and without the row index.
output_path = '../data/europe/data_europe.xlsx'
df_replaced.to_excel(excel_writer=output_path, sheet_name='List1', index=False)

# Confirmation message (Czech: "File was saved as:").
print("Soubor byl uložen jako:", output_path)

Soubor byl uložen jako: ../data/europe/data_europe.xlsx
