# ETL OF PROJECT

In [70]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Extract data

In [71]:
# Define raw data path
student_path = "../data/raw/2024 UWA Student Data.xlsx"
government_path = "../data/raw/2024 allocation of units of study.xlsx"

# Load datasets
student_df = pd.read_excel(student_path, sheet_name="Sheet1")
government_df = pd.read_excel(
    government_path, sheet_name="2024AllocationOfUnitsOfStudy"
)

## Tramsform data

### Exploratory Data Analysis

In [72]:
def explore_dataframe(df, df_name):
    print(f"--- Exploring DataFrame: {df_name} ---")
    print("\nShape:")
    print(df.shape)
    print("\nInfo:")
    df.info()
    print("\nFirst 5 Rows:")
    print(df.head())
    print("\nDescriptive Statistics (for numeric columns):")
    print(df.describe())
    print("\nMissing Values Count:")
    print(df.isnull().sum())
    print("-" * 50 + "\n")

In [73]:
# Extract relevant sheets
explore_dataframe(student_df, "Student Data")

--- Exploring DataFrame: Student Data ---

Shape:
(17672, 11)

Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17672 entries, 0 to 17671
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   UWACourseID                 17672 non-null  object 
 1   CourseTypeBroadName         17671 non-null  object 
 2   UWAUnitID                   17671 non-null  float64
 3   FundingGroupName            17671 non-null  object 
 4   UnitLevelCode               17671 non-null  float64
 5   UnitLevelName               17671 non-null  object 
 6   UnitPrimaryFOEDetailedName  17671 non-null  object 
 7   UnitPrimaryFOENarrowName    17671 non-null  object 
 8   UnitPrimaryFOECode          17671 non-null  float64
 9   UnitPrimaryFOEBroadName     17671 non-null  object 
 10  2024 EFTSL                  17672 non-null  float64
dtypes: float64(4), object(7)
memory usage: 1.5+ MB

First 5 Rows:
  UWACourseID 

In [74]:
# Extract relevant sheets
explore_dataframe(government_df, "Government Data")

--- Exploring DataFrame: Government Data ---

Shape:
(441, 13)

Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 441 entries, 0 to 440
Data columns (total 13 columns):
 #   Column                                                             Non-Null Count  Dtype 
---  ------                                                             --------------  ----- 
 0   Funding Cluster                                                    441 non-null    object
 1   Discipline Code
(FOE)                                              441 non-null    int64 
 2   2024 Maximum Student Contibution                                   441 non-null    int64 
 3   2024 Commonwealth Contribution                                     441 non-null    int64 
 4   2024 Grandfathered Maximum Student Contibution                     441 non-null    int64 
 5   2024 Grandfathered Commonwealth Contribution                       441 non-null    int64 
 6   Funding Cluster varies for FOE depending on E312 or E392 (Ye

### Data Cleaning

### Missing Values Analysis

In [75]:
# For student data, there is one row - grand total row - that has all NaN values. We can remove this row.
student_df.dropna(axis=0, how="any", inplace=True)
student_df.isna().sum()

UWACourseID                   0
CourseTypeBroadName           0
UWAUnitID                     0
FundingGroupName              0
UnitLevelCode                 0
UnitLevelName                 0
UnitPrimaryFOEDetailedName    0
UnitPrimaryFOENarrowName      0
UnitPrimaryFOECode            0
UnitPrimaryFOEBroadName       0
2024 EFTSL                    0
dtype: int64

### Remove duplicates

In [76]:
# remove duplicates from both datasets
student_df.drop_duplicates(inplace=True)
government_df.drop_duplicates(inplace=True)

#### Normalize column names

In [77]:
# Student Data columns
# create rename map dictionary
rename_map = {
    "UWACourseID": "course_id",
    "CourseTypeBroadName": "course_type_broad",
    "UWAUnitID": "unit_id",
    "FundingGroupName": "funding_group",
    "UnitLevelCode": "unit_level_code",
    "UnitLevelName": "unit_level_name",
    "UnitPrimaryFOEDetailedName": "unit_foe_detailed",
    "UnitPrimaryFOENarrowName": "unit_foe_narrow",
    "UnitPrimaryFOECode": "foe_code",
    "UnitPrimaryFOEBroadName": "unit_foe_broad",
    "2024 EFTSL": "eftsl_2024",
}
# rename columns
student_df.rename(columns=rename_map, inplace=True)

In [78]:
# government Data columns
# create rename map dictionary
rename_map = {
    "Funding Cluster": "funding_cluster",
    "Discipline Code\n(FOE)": "foe_code",
    "2024 Maximum Student Contibution": "max_student_contrib_2024",
    "2024 Commonwealth Contribution": "commonwealth_contrib_2024",
    "2024 Grandfathered Maximum Student Contibution": "max_student_contrib_gf_2024",
    "2024 Grandfathered Commonwealth Contribution": "commonwealth_contrib_gf_2024",
    "Funding Cluster varies for FOE depending on E312 or E392 (Yes/No)": "is_funding_cluster_variable",
    "Special Course Type Code for the Course of Study\n(E312 of 27)": "special_course_code",
    "Maximum student contribution indicator\n(E392 =8)": "max_contrib_indicator",
    "DETAILED Discipline (FOE) - Title": "foe_detailed_title",
    "DETAILED Discipline (FOE) ": "foe_detailed",
    "NARROW Discipline (FOE) ": "foe_narrow",
    "BROAD Discipline (FOE) ": "foe_broad",
}
government_df.rename(columns=rename_map, inplace=True)

#### Normalnize column values

Remove leading/trailing spaces from string columns

In [79]:
student_df = student_df.map(lambda x: x.strip() if isinstance(x, str) else x)
government_df = government_df.map(lambda x: x.strip() if isinstance(x, str) else x)

Split the grouped-values

In [80]:
# check the data
student_df["funding_group"]

0                Domestic - Fee-Paying
1              International - Onshore
2              International - Onshore
3              International - Onshore
4                Domestic - Fee-Paying
                     ...              
17666          International - Onshore
17667    Domestic - C'wealth Supported
17668          International - Onshore
17669    Domestic - C'wealth Supported
17670    Domestic - C'wealth Supported
Name: funding_group, Length: 17671, dtype: object

In [81]:
# split funding_group into funding_type and funding_nationality
split_data = student_df["funding_group"].str.split(" - ", expand=True)
student_df["funding_nation"] = split_data[0]
student_df["funding_type"] = split_data[1]
# drop original funding_group column
student_df.drop(columns=["funding_group"], inplace=True)
# check the data
student_df[["funding_nation", "funding_type"]].head()

Unnamed: 0,funding_nation,funding_type
0,Domestic,Fee-Paying
1,International,Onshore
2,International,Onshore
3,International,Onshore
4,Domestic,Fee-Paying


In [82]:
# check the data
government_df[["foe_detailed", "foe_narrow", "foe_broad"]]

Unnamed: 0,foe_detailed,foe_narrow,foe_broad
0,090701 - Psychology,0907 - Behavioural Science,09 - SOCIETY AND CULTURE
1,090701 - Psychology,0907 - Behavioural Science,09 - SOCIETY AND CULTURE
2,090701 - Psychology,0907 - Behavioural Science,09 - SOCIETY AND CULTURE
3,090700 - Behavioural Science,0907 - Behavioural Science,09 - SOCIETY AND CULTURE
4,090700 - Behavioural Science,0907 - Behavioural Science,09 - SOCIETY AND CULTURE
...,...,...,...
436,"060799 - Dental Studies, n.e.c.",0607 - Dental Studies,06 - HEALTH
437,061100 - Veterinary Studies,0611 - Veterinary Studies,06 - HEALTH
438,061101 - Veterinary Science,0611 - Veterinary Studies,06 - HEALTH
439,061103 - Veterinary Assisting,0611 - Veterinary Studies,06 - HEALTH


In [83]:
# split the data
columns_to_clean = ["foe_detailed", "foe_narrow", "foe_broad"]
for column in columns_to_clean:
    government_df[column] = government_df[column].str.split(" - ", n=1).str[1]

# check the data
government_df[["foe_detailed", "foe_narrow", "foe_broad"]].head()

Unnamed: 0,foe_detailed,foe_narrow,foe_broad
0,Psychology,Behavioural Science,SOCIETY AND CULTURE
1,Psychology,Behavioural Science,SOCIETY AND CULTURE
2,Psychology,Behavioural Science,SOCIETY AND CULTURE
3,Behavioural Science,Behavioural Science,SOCIETY AND CULTURE
4,Behavioural Science,Behavioural Science,SOCIETY AND CULTURE


Transform data types from float to int

In [84]:
# student data
columns_to_int = ["course_id", "unit_id", "unit_level_code", "foe_code"]
for col in columns_to_int:
    student_df[col] = student_df[col].astype("int64")

### Error Flag

#### FOE Error

The FOE codes 60000 and 120000 appear only in the student dataset.

In [85]:
# extract unique foe_codes from both datasets
student_codes = set(student_df["foe_code"].unique())
gov_codes = set(government_df["foe_code"].unique())

# check if all student codes are in government codes
all_in = student_codes.issubset(gov_codes)
print(all_in)  # returns False or True

False


In [86]:
# find out which codes in student_codes are not in gov_codes
diff = student_codes - gov_codes
print(diff)

{60000, 120000}


In [87]:
# find out which codes in gov_codes are not in student_codes
mask = student_df["foe_code"].astype(str).str.contains("60000|120000", na=False)
student_df[mask].head()

Unnamed: 0,course_id,course_type_broad,unit_id,unit_level_code,unit_level_name,unit_foe_detailed,unit_foe_narrow,foe_code,unit_foe_broad,eftsl_2024,funding_nation,funding_type
8771,3057,Postgraduate Coursework,192415,5,Level Five,Mixed Field Programmes,Mixed Field Programmes,120000,Mixed Field Programmes,0.125,International,Onshore Exchange
8782,3057,Postgraduate Coursework,205484,5,Level Five,Mixed Field Programmes,Mixed Field Programmes,120000,Mixed Field Programmes,0.125,International,Onshore Exchange
14101,4638,Higher Degree Research,242719,5,Level Five,Health,Health,60000,Health,1.0,Domestic,RTP


The FOE codes 60000 and 120000 appear only in the student dataset.

In [88]:
foe_err_stu = [60000, 120000]

Flag B (Fee mismatch):1 if abs(actual - expected) / expected > 0.10 else 0   

#### EFTSL Error

Overloaded 1 if EFTSL > 3 else 0 

In [90]:
student_df['overload'] = (student_df['eftsl_2024'] > 3).astype(int)

### Merge datasets based on FOE code

##### Special code in gov dataset

This error happens when the foe of gov dataset cannot map back to student dataset, as less information, causing the funding cluster error.

In [91]:
# check the data
government_df[['is_funding_cluster_variable', 'special_course_code', 'max_contrib_indicator']].head()

Unnamed: 0,is_funding_cluster_variable,special_course_code,max_contrib_indicator
0,Yes,Not E312=27,Not E392=8
1,Yes,27,Not E392=8
2,Yes,Not E312=27,8
3,Yes,Any E312 value,Not E392=8
4,Yes,Any E312 value,8


i) FOE 090701 (psychology) units of study are classified under funding cluster 1, unless the course's "special course type code" is flagged as being a postgraduate clinical psychology course (E312=27, "A course of study in clinical psychology (as defined in the Commonwealth Grant Scheme Guidelines"). That is,  if the FOE is 090701 and E312 =27 (**postgraduate clinical psychology**) then the unit of study is allocated to **funding cluster 2**, otherwise it is **allocated to funding cluster 1**.

In [92]:
# check the data
government_df["special_course_code"].value_counts()

special_course_code
Any E312 value    438
Not E312=27         2
27                  1
Name: count, dtype: int64

In [93]:
# transform to 'Yes'/'No'
government_df["special_course_code"] = np.where(
    government_df["special_course_code"] == 27, "Yes", "No"
)
government_df["special_course_code"].value_counts()

special_course_code
No     440
Yes      1
Name: count, dtype: int64

ii)  Professional pathway psychology and professional pathway social work units of study are classified under funding cluster 2. They are identified as having a narrow FOE of 0907 (behavioural Science) or a narrow FOE of 0905 (Human Welfare Studies and Services) respectively and having an E392 (Maximum student contribution indicator) equal to 8 (**students who are not subject to pre-2021 grandfathering but are doing a professional pathways units**). 
Note: In the "2024AllocationOfUnitsOfStudy" tab, the grandfathered amounts for pathway FOEs are "greyed out" (columns E and F). This is because the E392 = "8" value only applied to non-grandfathered students. The value is still shown for theoretical reference. However, grandfathering arrangements for units now defined as professional pathway students are the same as those for non-professional pathway students.

In [94]:
government_df["max_contrib_indicator"].value_counts()

max_contrib_indicator
Any E392 value    414
Not E392=8         14
8                  13
Name: count, dtype: int64

In [95]:
# transform to 'Yes'/'No'
government_df["max_contrib_indicator"] = np.where(
    government_df["max_contrib_indicator"] == 8, "Yes", "No"
)
government_df["max_contrib_indicator"].value_counts()

max_contrib_indicator
No     428
Yes     13
Name: count, dtype: int64

In [96]:
government_df[
    ["is_funding_cluster_variable", "special_course_code", "max_contrib_indicator"]
].head()

Unnamed: 0,is_funding_cluster_variable,special_course_code,max_contrib_indicator
0,Yes,No,No
1,Yes,Yes,No
2,Yes,No,Yes
3,Yes,No,No
4,Yes,No,Yes


In [98]:
# As these columns are hard to map back to student data, set err_flag.
special_code = list(
    government_df.loc[
        government_df["is_funding_cluster_variable"] == "Yes", "foe_code"
    ].drop_duplicates()
)

In [103]:
# filter the foe_err, cluster_err then merge
student_df_ok = student_df[~student_df["foe_code"].isin(foe_err_stu)]
government_df_ok = government_df[~government_df["foe_code"].isin(cluster_err_gov)]

merged_ok = pd.merge(student_df_ok, government_df_ok, on="foe_code", how="outer")

# foe error
merged_ok["foe_error"] = 0  # right flag
foe_err_df = pd.DataFrame({"foe_code": list(foe_err_stu)})
foe_err_df["foe_error"] = 1 # wrong flag = 1

# cluster error
merged_ok["special_foe"] = 0  # normal
special_code_df = pd.DataFrame({"foe_code": list(special_code)})
special_code_df["special_code"] = 1 # wrong flag = 1

# merge
tmpt_df = pd.concat([merged_ok, foe_err_df], ignore_index=True)
merged_df = pd.concat([tmpt_df, special_code_df], ignore_index=True)

In [104]:
# check foe error
merged_df[merged_df["foe_error"] == 1]

Unnamed: 0,course_id,course_type_broad,unit_id,unit_level_code,unit_level_name,unit_foe_detailed,unit_foe_narrow,foe_code,unit_foe_broad,eftsl_2024,...,is_funding_cluster_variable,special_course_code,max_contrib_indicator,foe_detailed_title,foe_detailed,foe_narrow,foe_broad,foe_error,special_foe,special_code
17878,,,,,,,,60000,,,...,,,,,,,,1.0,,
17879,,,,,,,,120000,,,...,,,,,,,,1.0,,


In [105]:
# check foe error
merged_df[merged_df["special_code"] == 1]

Unnamed: 0,course_id,course_type_broad,unit_id,unit_level_code,unit_level_name,unit_foe_detailed,unit_foe_narrow,foe_code,unit_foe_broad,eftsl_2024,...,is_funding_cluster_variable,special_course_code,max_contrib_indicator,foe_detailed_title,foe_detailed,foe_narrow,foe_broad,foe_error,special_foe,special_code
17880,,,,,,,,90701,,,...,,,,,,,,,,1.0
17881,,,,,,,,90700,,,...,,,,,,,,,,1.0
17882,,,,,,,,90799,,,...,,,,,,,,,,1.0
17883,,,,,,,,90500,,,...,,,,,,,,,,1.0
17884,,,,,,,,90501,,,...,,,,,,,,,,1.0
17885,,,,,,,,90503,,,...,,,,,,,,,,1.0
17886,,,,,,,,90505,,,...,,,,,,,,,,1.0
17887,,,,,,,,90507,,,...,,,,,,,,,,1.0
17888,,,,,,,,90509,,,...,,,,,,,,,,1.0
17889,,,,,,,,90511,,,...,,,,,,,,,,1.0


### Create measure to calculate CSP student payment

In [109]:
student_df['funding_type'].value_counts

<bound method IndexOpsMixin.value_counts of 0                Fee-Paying
1                   Onshore
2                   Onshore
3                   Onshore
4                Fee-Paying
                ...        
17666               Onshore
17667    C'wealth Supported
17668               Onshore
17669    C'wealth Supported
17670    C'wealth Supported
Name: funding_type, Length: 17671, dtype: object>

In [110]:
# calculate government payment for domestic student
merged_df['CSP_gov_payment'] = 0.0 
        
for row in merged_df.itertuples():
    if row.funding_nation == 'Domestic' and row.funding_type == "C'wealth Supported":
        if row.commonwealth_contrib_2024 >= row.commonwealth_contrib_gf_2024:
            merged_df.loc[row.Index, 'CSP_gov_payment'] = row.commonwealth_contrib_2024 * row.eftsl_2024
        else:
            merged_df.loc[row.Index, 'CSP_gov_payment'] = row.commonwealth_contrib_gf_2024 * row.eftsl_2024
    else:
        merged_df.loc[row.Index, 'CSP_gov_payment'] = 0.0 # no government payment for international students

In [111]:
# export merged data
merged_df.to_csv("../data/processed/merged_data.csv", index=False)