# Load Excel Data

In [1]:
import pandas as pd

# Remote workbook that holds the source table
url = "https://dsia.msmt.cz/vystupy/f2/f25.xlsx"

# Column labels are taken from the 6th spreadsheet row (0-based index 5);
# the four rows directly below it (0-based 6-9, i.e. rows 7-10) are skipped
# so that they do not end up as data rows.
rows_to_skip = [6, 7, 8, 9]

# Read only the "2023" sheet of the workbook
df = pd.read_excel(url, sheet_name="2023", header=5, skiprows=rows_to_skip)
df

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Hlavní město\nPraha,Středočeský\n kraj,...,Královéhradecký\nkraj,Pardubický\nkraj,Vysočina,Jihomoravský\nkraj,Olomoucký\nkraj,Zlínský\nkraj,Moravskoslezský\nkraj,kraj\nneurčen,Unnamed: 23,Unnamed: 24
0,,11000,Univerzita Karlova,,,,51517,40335,12383,8773,...,2179,1440,1479,969,761,992,1397,0,59,11182
1,,11110,1. lékařská fakulta,,,,4895,3484,989,721,...,146,132,135,125,117,131,215,0,9,1411
2,,11120,3. lékařská fakulta,,,,2560,1693,509,320,...,90,75,72,54,37,49,102,0,1,867
3,,11130,2. lékařská fakulta,,,,2109,1537,566,333,...,68,66,71,31,35,39,40,0,2,572
4,,11140,Lékařská fakulta v Plzni,,,,2495,1730,143,196,...,42,24,52,23,15,33,29,0,4,765
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
202,,7R000,"ART & DESIGN INSTITUT, s.r.o.",,,,197,123,49,31,...,4,1,2,4,0,1,2,0,4,74
203,,7S000,"Panevropská univerzita, a.s.",,,,2283,1886,373,421,...,43,58,33,50,51,44,528,0,3,397
204,,7T000,"Vysoká škola kreativní komunikace, s.r.o.",,,,597,434,166,107,...,17,18,8,5,10,8,11,0,3,163
205,,7U000,"Vysoká škola finanční a správní, a.s.",,,,2538,1860,420,362,...,39,35,19,24,14,12,23,0,3,678


In [2]:
# Positions (0-based) of the columns to discard: column 0 and columns 2-7.
# Position 1 is kept on purpose - it holds the institution code that is
# renamed to "kód" below.
cols_to_drop = [0, 2, 3, 4, 5, 6, 7]

# Drop the unwanted columns (positions are resolved to labels first)
df = df.drop(columns=df.columns[cols_to_drop])

# Rename the first remaining column to "kód".
# Use DataFrame.rename instead of writing into `df.columns.values`: an Index
# is immutable, and mutating its backing array bypasses pandas and can leave
# cached label lookups stale, so `df["kód"]` might not resolve afterwards.
df = df.rename(columns={df.columns[0]: "kód"})

# Remove the last three columns from the DataFrame
# (in the loaded sheet: "kraj\nneurčen" and two trailing unnamed columns)
df = df.iloc[:, :-3]

In [3]:
import json

# Load the lookup table that maps an institution/faculty code to a region
with open('../../data/czech/faculty_to_region.json', 'r', encoding='utf-8') as mapping_file:
    faculty_to_region = json.load(mapping_file)

# Translate every code in "kód" to its region and store the result in a new
# 'University ↓ / Student →' column; codes missing from the mapping become NaN.
# NOTE(review): JSON object keys are always strings, so this assumes the "kód"
# values are strings as well - confirm.
df = df.assign(**{'University ↓ / Student →': df['kód'].map(faculty_to_region)})

In [4]:
# Row labels of university-level totals. Those universities are also listed
# faculty by faculty, so keeping the totals would count their students twice.
university_total_rows = [
    0, 18, 27, 36, 47, 56, 59, 66, 72, 77, 87,
    93, 103, 112, 120, 129, 138, 146, 153, 161, 168, 174,
]

df = (
    df.drop(columns=["kód"])                # the code is no longer needed once mapped
      .drop(index=university_total_rows)    # discard the duplicate university totals
      .reset_index(drop=True)
      .groupby('University ↓ / Student →')  # one row per region of the institution
      .sum()                                # add up the student counts per region
)

# Discard the group labelled "různé kraje"
df = df.loc[df.index != "různé kraje"]

In [5]:
def clean_region_label(label):
    """Normalise a region label used as a column or row name.

    Line breaks and repeated whitespace are collapsed to a single space
    (e.g. "Středočeský\\n kraj" -> "Středočeský kraj"), surrounding whitespace
    is removed, and "Vysočina" is expanded to "Kraj Vysočina". A label that
    already contains "Kraj Vysočina" is left alone, so applying the function
    twice never yields "Kraj Kraj Vysočina". Non-string labels are returned
    unchanged.
    """
    if not isinstance(label, str):
        return label
    label = ' '.join(label.split())
    if 'Vysočina' in label and 'Kraj Vysočina' not in label:
        label = label.replace('Vysočina', 'Kraj Vysočina')
    return label


# Apply the same clean-up to the column labels and to the row labels so that
# region names are spelled identically on both axes
df.columns = [clean_region_label(col) for col in df.columns]
df.index = [clean_region_label(idx) for idx in df.index]

# Reset the index so that the region labels become a regular column again
df = df.reset_index()

# Rename that first column to 'University ↓ / Student →'
df = df.rename(columns={df.columns[0]: 'University ↓ / Student →'})

# Save the result to a new Excel file; the same path variable feeds the
# message below so that the reported location always matches the real one
output_path = '../../data/czech/data_czech.xlsx'
df.to_excel(output_path, sheet_name='List1', index=False)

print(f"Data was successfully exported to '{output_path}'.")
df

Data was successfully exported to '../data/czech/data_czech.xlsx'.


Unnamed: 0,University ↓ / Student →,Hlavní město Praha,Středočeský kraj,Jihočeský kraj,Plzeňský kraj,Karlovarský kraj,Ústecký kraj,Liberecký kraj,Královéhradecký kraj,Pardubický kraj,Kraj Vysočina,Jihomoravský kraj,Olomoucký kraj,Zlínský kraj,Moravskoslezský kraj
0,Hlavní město Praha,29879,24305,5838,4205,2652,6997,4035,4703,3343,3160,3240,2056,2077,3449
1,Jihomoravský kraj,986,1432,1195,316,226,555,458,1590,2648,5103,19820,3690,4856,4570
2,Jihočeský kraj,572,1268,7129,984,197,211,109,78,111,1081,178,67,36,78
3,Královéhradecký kraj,366,978,143,80,67,269,475,3213,1519,443,177,170,136,224
4,Liberecký kraj,324,1026,37,41,46,655,2689,711,87,48,49,25,17,42
5,Moravskoslezský kraj,232,296,99,80,35,283,109,228,400,233,718,1989,1423,17357
6,Olomoucký kraj,640,748,212,167,101,297,281,801,1621,644,2230,6435,2833,4082
7,Pardubický kraj,262,962,76,94,33,125,265,1344,2110,384,208,180,98,189
8,Plzeňský kraj,657,1243,1229,6071,1352,951,182,146,100,140,75,53,66,85
9,Středočeský kraj,548,937,141,103,83,232,231,173,51,65,37,27,24,39
