# New South Wales Department of Education (NSW DOE) - Data Case Study 
## Data Analysis

In [1]:
import pandas as pd
import duckdb
from ydata_profiling import ProfileReport
import matplotlib.pyplot as plt
import seaborn as sns

### connect to the database

In [2]:
con = duckdb.connect('../../database/nsw_doe_data_case_study.duckdb',read_only=False)

#### Exploratory Data Analysis:

In [None]:
df = con.sql('select * from public_school_nsw_master_dataset').df()

In [None]:
print(df.shape)
df.head()

In [None]:
df = con.sql('select * from public_school_nsw_master_dataset').df()
profile_public_school_nsw_master_dataset = ProfileReport(df, title="Public School NSW Data Profiling Report")
profile_public_school_nsw_master_dataset.to_file("profile_public_school_nsw_master_dataset.html")
profile_public_school_nsw_master_dataset

In [None]:
df = con.sql('select * from multi_age_composite_unpivoted').df()
profile_multi_age_composite_unpivoted = ProfileReport(df, title="Multi Age Composite Profiling Report")
profile_multi_age_composite_unpivoted.to_file("profile_multi_age_composite_unpivoted.html")
profile_multi_age_composite_unpivoted

In [None]:
df = con.sql('select * from student_attendance_dataset').df()
profile_student_attendance_dataset = ProfileReport(df, title="Student Attendance Profiling Report")
profile_student_attendance_dataset.to_file("profile_student_attendance_dataset.html")
profile_student_attendance_dataset

In [3]:
# Retrieving the data from views into pandas DataFrames
school_master_data = pd.read_sql("SELECT * FROM school_master_data", con)
school_attendance_data = pd.read_sql("SELECT * FROM school_attendance_data", con)
school_composite_data = pd.read_sql("SELECT * FROM school_composite_data", con)
school_characteristics = pd.read_sql("SELECT * FROM school_characteristics", con)
print("Attendance Data Summary:")
print(school_attendance_data['Attendance_pct'].describe())
print("\nComposite Class Counts Summary:")
print(school_composite_data['Composite_Value'].describe())
print("\nCharacteristics Summary:")
print(school_characteristics[['latest_year_enrolment_FTE', 'Indigenous_pct', 'LBOTE_pct']].describe())

Attendance Data Summary:
count     22956
unique      381
top        94.2
freq        370
Name: Attendance_pct, dtype: object

Composite Class Counts Summary:
count     53952.000000
mean         81.766761
std        2080.187794
min           0.000000
25%           6.000000
50%          40.000000
75%          90.000000
max      176734.000000
Name: Composite_Value, dtype: float64

Characteristics Summary:
       latest_year_enrolment_FTE  Indigenous_pct    LBOTE_pct
count                2166.000000     1685.000000  1840.000000
mean                  365.111588       15.362611    30.189674
std                   323.158491       17.025052    30.153953
min                     2.000000        0.000000     0.000000
25%                   101.250000        4.000000     6.000000
50%                   293.000000       10.000000    16.000000
75%                   532.000000       21.000000    52.000000
max                  2079.000000      100.000000   100.000000


  school_master_data = pd.read_sql("SELECT * FROM school_master_data", con)
  school_attendance_data = pd.read_sql("SELECT * FROM school_attendance_data", con)
  school_composite_data = pd.read_sql("SELECT * FROM school_composite_data", con)
  school_characteristics = pd.read_sql("SELECT * FROM school_characteristics", con)


In [None]:
# Histogram for Attendance Rates
sns.histplot(school_attendance_data['Attendance_pct'], kde=True)
plt.title("Distribution of Attendance Rates")
plt.show()
# Histogram for Composite Class Counts
sns.histplot(school_composite_data['Composite_Value'], kde=True)
plt.title("Distribution of Composite Class Counts")
plt.show()


In [6]:
merged_data = pd.merge(school_attendance_data, school_composite_data, on=['School_Code', 'Year'], how='inner')

# Average attendance rates over years
average_attendance_per_year = merged_data.groupby('Year')['Attendance_pct'].mean()
average_attendance_per_year.plot()
plt.title("Average Attendance Rate Over Years")
plt.ylabel("Attendance Rate")
plt.show()

# Average composite class counts over years
average_composite_per_year = merged_data.groupby('Year')['Composite_Value'].mean()
average_composite_per_year.plot()
plt.title("Average Composite Class Counts Over Years")
plt.ylabel("Composite Counts")
plt.show()


KeyError: 'School_Code'

### Action: Do your analysis below