In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import regex as re
import seaborn as sns


In [4]:
# Import the data
df = pd.read_csv("../data/chart-6.csv")

Title:	Academic staff by academic cost centre, sex and academic year			
Location:	UK			
Academic years:	2014/15 to 2021/22			
Data source:	HESA			
Data source link:	https://www.hesa.ac.uk/data-and-analysis/staff/chart-6			
Data file canonical link:	https://www.hesa.ac.uk/data-and-analysis/staff/chart-6.csv			
Licence:	Creative Commons Attribution 4.0 International Licence			
Code page:	Unicode UTF-8			
Disclaimer	Please note that this data includes rounded totals.  Caution must be taken when importing into a pivot table so as not to double count.			
				
Last updated:	Feb-23			


In [5]:
# Count unique cost centre groups and cost centres
num_cost_centre_groups = df['Cost centre group'].nunique()
num_cost_centres = df['Cost centre'].nunique()

num_cost_centre_groups, num_cost_centres


(11, 46)

In [6]:
# Group the data by cost centre group, cost centre, and academic year
grouped_data = df.groupby(['Cost centre group', 'Cost centre', 'Academic year']).sum().reset_index()

# Sort the grouped data
sorted_grouped_data = grouped_data.sort_values(['Cost centre group', 'Cost centre', 'Academic year'])

# Show the sorted grouped data
sorted_grouped_data


Unnamed: 0,Cost centre group,Cost centre,Academic year,Sex,Number
0,Administration & business studies,133 Business & management studies,2014/15,FemaleMale,14365
1,Administration & business studies,133 Business & management studies,2015/16,FemaleMale,14840
2,Administration & business studies,133 Business & management studies,2016/17,FemaleMale,15460
3,Administration & business studies,133 Business & management studies,2017/18,FemaleMale,16045
4,Administration & business studies,133 Business & management studies,2018/19,FemaleMale,16510
...,...,...,...,...,...
363,Total academic services,Total academic services,2017/18,FemaleMale,1145
364,Total academic services,Total academic services,2018/19,FemaleMale,1260
365,Total academic services,Total academic services,2019/20,FemaleMale,1485
366,Total academic services,Total academic services,2020/21,FemaleMale,1540


In [8]:
# Pivot the data to create separate columns for Male and Female
pivot_data = df.pivot_table(index=['Cost centre group', 'Cost centre', 'Academic year'], 
                              columns='Sex', 
                              values='Number', 
                              aggfunc='sum').reset_index()

# Reset column names after pivot
pivot_data.columns.name = ''

# Fill missing values with 0
pivot_data.fillna(0, inplace=True)

# Convert numbers to integers
pivot_data[['Female', 'Male']] = pivot_data[['Female', 'Male']].astype(int)

pivot_data


Unnamed: 0,Cost centre group,Cost centre,Academic year,Female,Male
0,Administration & business studies,133 Business & management studies,2014/15,6025,8340
1,Administration & business studies,133 Business & management studies,2015/16,6275,8565
2,Administration & business studies,133 Business & management studies,2016/17,6600,8860
3,Administration & business studies,133 Business & management studies,2017/18,6950,9095
4,Administration & business studies,133 Business & management studies,2018/19,7215,9295
...,...,...,...,...,...
363,Total academic services,Total academic services,2017/18,660,485
364,Total academic services,Total academic services,2018/19,725,535
365,Total academic services,Total academic services,2019/20,895,590
366,Total academic services,Total academic services,2020/21,905,635
