# ETL OF PROJECT

In [2]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Extract data

In [255]:
# Define raw data path
student_path = "../data/raw/2024 UWA Student Data.xlsx"
government_path = "../data/raw/2024 allocation of units of study.xlsx"

# Load datasets
student_df = pd.read_excel(student_path, sheet_name="Sheet1")
government_df = pd.read_excel(government_path, sheet_name="2024AllocationOfUnitsOfStudy")

## Tramsform data

### Exploratory Data Analysis

In [256]:
def explore_dataframe(df, df_name):
    print(f"--- Exploring DataFrame: {df_name} ---")
    print("\nShape:")
    print(df.shape)
    print("\nInfo:")
    df.info()
    print("\nFirst 5 Rows:")
    print(df.head())
    print("\nDescriptive Statistics (for numeric columns):")
    print(df.describe())
    print("\nMissing Values Count:")
    print(df.isnull().sum())
    print("-" * 50 + "\n")

In [257]:
# Extract relevant sheets
explore_dataframe(student_df, "Student Data")

--- Exploring DataFrame: Student Data ---

Shape:
(17672, 11)

Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17672 entries, 0 to 17671
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   UWACourseID                 17672 non-null  object 
 1   CourseTypeBroadName         17671 non-null  object 
 2   UWAUnitID                   17671 non-null  float64
 3   FundingGroupName            17671 non-null  object 
 4   UnitLevelCode               17671 non-null  float64
 5   UnitLevelName               17671 non-null  object 
 6   UnitPrimaryFOEDetailedName  17671 non-null  object 
 7   UnitPrimaryFOENarrowName    17671 non-null  object 
 8   UnitPrimaryFOECode          17671 non-null  float64
 9   UnitPrimaryFOEBroadName     17671 non-null  object 
 10  2024 EFTSL                  17672 non-null  float64
dtypes: float64(4), object(7)
memory usage: 1.5+ MB

First 5 Rows:
   UWACourseID

In [258]:
# Extract relevant sheets
explore_dataframe(government_df, "Government Data")

--- Exploring DataFrame: Government Data ---

Shape:
(441, 13)

Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 441 entries, 0 to 440
Data columns (total 13 columns):
 #   Column                                                             Non-Null Count  Dtype 
---  ------                                                             --------------  ----- 
 0   Funding Cluster                                                    441 non-null    object
 1   Discipline Code
(FOE)                                              441 non-null    int64 
 2   2024 Maximum Student Contibution                                   441 non-null    int64 
 3   2024 Commonwealth Contribution                                     441 non-null    int64 
 4   2024 Grandfathered Maximum Student Contibution                     441 non-null    int64 
 5   2024 Grandfathered Commonwealth Contribution                       441 non-null    int64 
 6   Funding Cluster varies for FOE depending on E312 or E392 (Ye

### Data Cleaning

#### Normalnize column names

In [259]:
# Student Data columns
# create rename map dictionary
rename_map = {
    'UWACourseID': 'course_id',
    'CourseTypeBroadName': 'course_type_broad',
    'UWAUnitID': 'unit_id',
    'FundingGroupName': 'funding_group',
    'UnitLevelCode': 'unit_level_code',
    'UnitLevelName': 'unit_level_name',
    'UnitPrimaryFOEDetailedName': 'unit_foe_detailed',
    'UnitPrimaryFOENarrowName': 'unit_foe_narrow',
    'UnitPrimaryFOECode': 'foe_code',
    'UnitPrimaryFOEBroadName': 'unit_foe_broad',
    '2024 EFTSL': 'eftsl_2024'
}
# rename columns
student_df.rename(columns=rename_map, inplace=True)

In [260]:
# government Data columns
# create rename map dictionary
rename_map = {
    'Funding Cluster': 'funding_cluster',
    'Discipline Code\n(FOE)': 'foe_code',
    '2024 Maximum Student Contibution': 'max_student_contrib_2024',
    '2024 Commonwealth Contribution': 'commonwealth_contrib_2024',
    '2024 Grandfathered Maximum Student Contibution': 'max_student_contrib_gf_2024',
    '2024 Grandfathered Commonwealth Contribution': 'commonwealth_contrib_gf_2024',
    'Funding Cluster varies for FOE depending on E312 or E392 (Yes/No)': 'is_funding_cluster_variable',
    'Special Course Type Code for the Course of Study\n(E312 of 27)': 'special_course_code',
    'Maximum student contribution indicator\n(E392 =8)': 'max_contrib_indicator',
    'DETAILED Discipline (FOE) - Title': 'foe_detailed_title',
    'DETAILED Discipline (FOE) ': 'foe_detailed',
    'NARROW Discipline (FOE) ': 'foe_narrow',
    'BROAD Discipline (FOE) ': 'foe_broad'
}
government_df.rename(columns=rename_map, inplace=True)

### Missing Values Analysis

In [261]:
# For student data, there is one row - grand total row - that has all NaN values. We can remove this row.
student_df.dropna(axis= 0,how='any', inplace=True)
student_df.isna().sum()

course_id            0
course_type_broad    0
unit_id              0
funding_group        0
unit_level_code      0
unit_level_name      0
unit_foe_detailed    0
unit_foe_narrow      0
foe_code             0
unit_foe_broad       0
eftsl_2024           0
dtype: int64

### Duplicates

In [262]:
# remove duplicates from both datasets
student_df.drop_duplicates(inplace=True)
government_df.drop_duplicates(inplace=True)

#  drop columns that are not needed, 'foe_detailed_title','foe_detailed_y', 'foe_narrow_y', 'foe_broad_y' in student data are duplicates of government data
# government data has more complete information
student_df.drop(columns=[ 'unit_foe_detailed', 'unit_foe_narrow', 'unit_foe_broad'], inplace=True)

### Data type

In [263]:
# transform data types
# student data
columns_to_int= ['course_id', 'unit_id', 'unit_level_code', 'foe_code']
for col in columns_to_int:
    student_df[col] = student_df[col].astype('int64')


### Split columns

In [264]:
student_df['funding_group']

1        Domestic - C'wealth Supported
2        Domestic - C'wealth Supported
3        Domestic - C'wealth Supported
4        Domestic - C'wealth Supported
5        Domestic - C'wealth Supported
                     ...              
17667            Domestic - Fee-Paying
17668            Domestic - Fee-Paying
17669          International - Onshore
17670          International - Onshore
17671            Domestic - Fee-Paying
Name: funding_group, Length: 17671, dtype: object

In [265]:
# split funding_group into funding_type and funding_nationality
split_data = student_df['funding_group'].str.split(' - ', expand=True)
student_df['funding_nation'] = split_data[0]
student_df['funding_type'] = split_data[1]
# drop original funding_group column
student_df.drop(columns=['funding_group'], inplace=True)

In [266]:
government_df[['foe_detailed', 'foe_narrow', 'foe_broad']]

Unnamed: 0,foe_detailed,foe_narrow,foe_broad
0,090701 - Psychology,0907 - Behavioural Science,09 - SOCIETY AND CULTURE
1,090701 - Psychology,0907 - Behavioural Science,09 - SOCIETY AND CULTURE
2,090701 - Psychology,0907 - Behavioural Science,09 - SOCIETY AND CULTURE
3,090700 - Behavioural Science,0907 - Behavioural Science,09 - SOCIETY AND CULTURE
4,090700 - Behavioural Science,0907 - Behavioural Science,09 - SOCIETY AND CULTURE
...,...,...,...
436,"060799 - Dental Studies, n.e.c.",0607 - Dental Studies,06 - HEALTH
437,061100 - Veterinary Studies,0611 - Veterinary Studies,06 - HEALTH
438,061101 - Veterinary Science,0611 - Veterinary Studies,06 - HEALTH
439,061103 - Veterinary Assisting,0611 - Veterinary Studies,06 - HEALTH


In [267]:
# split foe related columns in government data
# As government data has more complete information
cols = ['foe_detailed', 'foe_narrow', 'foe_broad']
for col in cols:
    split_data = government_df[col].str.split(' - ', expand=True)
    government_df[col] = split_data[1]

government_df[cols]

Unnamed: 0,foe_detailed,foe_narrow,foe_broad
0,Psychology,Behavioural Science,SOCIETY AND CULTURE
1,Psychology,Behavioural Science,SOCIETY AND CULTURE
2,Psychology,Behavioural Science,SOCIETY AND CULTURE
3,Behavioural Science,Behavioural Science,SOCIETY AND CULTURE
4,Behavioural Science,Behavioural Science,SOCIETY AND CULTURE
...,...,...,...
436,"Dental Studies, n.e.c.",Dental Studies,HEALTH
437,Veterinary Studies,Veterinary Studies,HEALTH
438,Veterinary Science,Veterinary Studies,HEALTH
439,Veterinary Assisting,Veterinary Studies,HEALTH


### Data integration

In [268]:
# merge datasets on FOE Broad
merged_df = pd.merge(student_df, government_df, on = 'foe_code')


In [269]:
merged_df.columns

Index(['course_id', 'course_type_broad', 'unit_id', 'unit_level_code',
       'unit_level_name', 'foe_code', 'eftsl_2024', 'funding_nation',
       'funding_type', 'funding_cluster', 'max_student_contrib_2024',
       'commonwealth_contrib_2024', 'max_student_contrib_gf_2024',
       'commonwealth_contrib_gf_2024', 'is_funding_cluster_variable',
       'special_course_code', 'max_contrib_indicator', 'foe_detailed_title',
       'foe_detailed', 'foe_narrow', 'foe_broad'],
      dtype='object')

In [270]:
# Create new column to calculate the total funding for domestic students        
merged_df['stud_payment'] = 0.0
merged_df['gov_payment'] = 0.0

for row in merged_df.itertuples():
    if row.funding_nation == 'Domestic':
        if row.max_student_contrib_2024 <= row.max_student_contrib_gf_2024:
            merged_df.loc[row.Index, 'stud_payment'] = row.max_student_contrib_2024 * row.eftsl_2024
        else:
            merged_df.loc[row.Index, 'stud_payment'] = row.max_student_contrib_gf_2024 * row.eftsl_2024
    else:
        merged_df.loc[row.Index, 'stud_payment'] = row.eftsl_2024 * 20000 # need student payment for international students
  
for row in merged_df.itertuples():
    if row.funding_nation == 'Domestic':
        if row.commonwealth_contrib_2024 >= row.commonwealth_contrib_gf_2024:
            merged_df.loc[row.Index, 'gov_payment'] = row.commonwealth_contrib_2024 * row.eftsl_2024
        else:
            merged_df.loc[row.Index, 'gov_payment'] = row.commonwealth_contrib_gf_2024 * row.eftsl_2024
    else:
        merged_df.loc[row.Index, 'gov_payment'] = 0.0 # no government payment for international students

merged_df['total_payment'] = merged_df['stud_payment'] + merged_df['gov_payment']

In [271]:
# export merged data
merged_df.to_csv("../data/processed/merged_data.csv", index=False)

### Dimension Tables

#### Dimension Table for Funding category

In [272]:
dim_funding = merged_df[['foe_code','funding_cluster','funding_nation','funding_type']].drop_duplicates().reset_index(drop=True)
dim_funding.head()

Unnamed: 0,foe_code,funding_cluster,funding_nation,funding_type
0,60100,Funding Cluster 4,Domestic,C'wealth Supported
1,10103,Funding Cluster 2,Domestic,C'wealth Supported
2,91901,Funding Cluster 1,Domestic,C'wealth Supported
3,80101,Funding Cluster 1,Domestic,C'wealth Supported
4,80307,Funding Cluster 1,Domestic,C'wealth Supported


####  Dimension Table for unit category

In [273]:
dim_unit = merged_df[['unit_id', 'unit_level_code', 'unit_level_name']].drop_duplicates().reset_index(drop=True)
dim_unit.head()

Unnamed: 0,unit_id,unit_level_code,unit_level_name
0,240072,3,Level Three
1,240586,3,Level Three
2,239439,4,Level Four
3,240587,4,Level Four
4,243526,1,Level One


#### Dimension table for FOE category

In [274]:
dim_foe = merged_df[['foe_code', 'foe_detailed', 'foe_narrow', 'foe_broad', 'special_course_code','max_contrib_indicator']].drop_duplicates().reset_index(drop=True)
dim_foe.head()

Unnamed: 0,foe_code,foe_detailed,foe_narrow,foe_broad,special_course_code,max_contrib_indicator
0,60100,Medical Studies,Medical Studies,HEALTH,Any E312 value,Any E392 value
1,10103,Statistics,Mathematical Sciences,NATURAL AND PHYSICAL SCIENCES,Any E312 value,Any E392 value
2,91901,Economics,Economics and Econometrics,SOCIETY AND CULTURE,Any E312 value,Any E392 value
3,80101,Accounting,Accounting,MANAGEMENT AND COMMERCE,Any E312 value,Any E392 value
4,80307,Organisation Management,Business and Management,MANAGEMENT AND COMMERCE,Any E312 value,Any E392 value


#### Dimension Table for course category

In [275]:
dim_course = merged_df[['course_id', 'course_type_broad']].drop_duplicates().reset_index(drop=True)
dim_course.head()

Unnamed: 0,course_id,course_type_broad
0,4951,Postgraduate Coursework
1,4733,Undergraduate
2,3952,Postgraduate Coursework
3,2846,Postgraduate Coursework
4,3086,Higher Degree Research


#### Fact table

In [276]:
fact_table = merged_df[['course_id', 'unit_id', 'foe_code']]

In [277]:
# add measures to fact table
fact_table['eftsl_2024'] = merged_df['eftsl_2024']
fact_table['stud_payment'] = merged_df['stud_payment']
fact_table['gov_payment'] = merged_df['gov_payment']
fact_table['total_payment'] = merged_df['total_payment']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fact_table['eftsl_2024'] = merged_df['eftsl_2024']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fact_table['stud_payment'] = merged_df['stud_payment']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fact_table['gov_payment'] = merged_df['gov_payment']
A value is trying to be set on a copy of a sli

In [278]:
fact_table.head()

Unnamed: 0,course_id,unit_id,foe_code,eftsl_2024,stud_payment,gov_payment,total_payment
0,4951,240072,60100,97.0,1233840.0,2948315.0,4182155.0
1,4951,240586,60100,92.5,1176600.0,2811537.5,3988137.5
2,4951,239439,60100,86.5,1100280.0,2629167.5,3729447.5
3,4951,240587,60100,86.5,1100280.0,2629167.5,3729447.5
4,4733,243526,10103,80.0,355600.0,1193200.0,1548800.0


## Load data

In [279]:
# set processed data path
PROCESSED_DATA_PATH = '../data/processed/'

# save dimension tables and fact table as csv files
dim_funding.to_csv(os.path.join(PROCESSED_DATA_PATH, 'dim_funding.csv'), index=False)
dim_unit.to_csv(os.path.join(PROCESSED_DATA_PATH, 'dim_unit.csv'), index=False)
dim_foe.to_csv(os.path.join(PROCESSED_DATA_PATH, 'dim_foe.csv'), index=False)
dim_course.to_csv(os.path.join(PROCESSED_DATA_PATH, 'dim_course.csv'), index=False)
fact_table.to_csv(os.path.join(PROCESSED_DATA_PATH, 'fact_table.csv'), index=False)