# New South Wales Department of Education (NSW DOE) - Data Case Study 
## Data Analysis

In [5]:
import pandas as pd
import duckdb
from ydata_profiling import ProfileReport
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from scipy.stats import ttest_ind

### Connect to the database

In [8]:
# Open the case-study database in read-only mode. Every query in this notebook
# is a SELECT, so write access is not needed, and a read-write connection takes
# an exclusive lock on the file -- which fails with "Could not set lock on file"
# when another process already has it open. Read-only connections can share the
# file with other read-only processes.
# NOTE(review): this will still fail while some other process holds the file
# open for writing; close that connection first.
con = duckdb.connect('../../database/nsw_doe_data_case_study.duckdb', read_only=True)

IOException: IO Error: Could not set lock on file "/workspaces/doe_case_study_task/analysis/analysis-jupyter/../../database/nsw_doe_data_case_study.duckdb": Resource temporarily unavailable

#### Exploratory Data Analysis:

#### Data profile check

In [None]:
# Pull every row of public_school_nsw_master_dataset into a pandas DataFrame.
df = con.sql('select * from public_school_nsw_master_dataset').df()

# Build the profiling report for that frame.
profile_public_school_nsw_master_dataset = ProfileReport(
    df=df,
    title="Public School NSW Data Profiling Report",
)

# Write a standalone HTML copy (path is relative to the working directory).
profile_public_school_nsw_master_dataset.to_file(
    output_file="profile_public_school_nsw_master_dataset.html"
)

# Bare final expression so Jupyter renders the report inline.
profile_public_school_nsw_master_dataset

In [None]:
# Pull every row of multi_age_composite_unpivoted into a pandas DataFrame.
df = con.sql('select * from multi_age_composite_unpivoted').df()

# Build the profiling report for that frame.
profile_multi_age_composite_unpivoted = ProfileReport(
    df=df,
    title="Multi Age Composite Profiling Report",
)

# Write a standalone HTML copy (path is relative to the working directory).
profile_multi_age_composite_unpivoted.to_file(
    output_file="profile_multi_age_composite_unpivoted.html"
)

# Bare final expression so Jupyter renders the report inline.
profile_multi_age_composite_unpivoted

In [None]:
# Pull every row of student_attendance_unpivoted into a pandas DataFrame.
df = con.sql('select * from student_attendance_unpivoted').df()

# Build the profiling report for that frame.
profile_student_attendance_dataset = ProfileReport(
    df=df,
    title="Student Attendance Profiling Report",
)

# Write a standalone HTML copy (path is relative to the working directory).
profile_student_attendance_dataset.to_file(
    output_file="student_attendance_unpivoted.html"
)

# Bare final expression so Jupyter renders the report inline.
profile_student_attendance_dataset

In [None]:
# Profile the nsw_composite_school_attendance_data table.
# The report is bound to its own name (matching the table it describes) so it
# does not overwrite `profile_student_attendance_dataset`, which holds the
# report for a different table (student_attendance_unpivoted).
df = con.sql('select * from nsw_composite_school_attendance_data').df()
profile_nsw_composite_school_attendance_data = ProfileReport(
    df,
    title="nsw_composite_school_attendance_data Profiling Report",
)

# Write a standalone HTML copy (path is relative to the working directory).
profile_nsw_composite_school_attendance_data.to_file("nsw_composite_school_attendance_data.html")

# Bare final expression so Jupyter renders the report inline.
profile_nsw_composite_school_attendance_data

## Action: Data analysis

<span style="color:yellow; font-size:30px;">Hypothesis Formulation:</span>

| Title                   | Description                                                          |
|-------------------------|----------------------------------------------------------------------|
| **Objective**           | Determine if multi-age composite classes have an impact on attendance rates. |
| **Null Hypothesis (H₀)** | Multi-age composite classes have no impact on attendance rates.      |
| **Alternative Hypothesis (H₁)** | Multi-age composite classes have a significant impact on attendance rates. |


<span style="color:yellow; font-size:30px;">Statistical Test:</span>

In [7]:
# Compare schools above vs. at-or-below the mean ICSEA value on a set of
# composite-class, attendance and demographic columns, using independent
# two-sample t-tests with a Bonferroni-adjusted significance threshold.
# Requires `con`, the DuckDB connection opened earlier in the notebook.

# Query data from the database
query = "SELECT * FROM nsw_composite_school_attendance_data"
df = con.sql(query).df()

# Descriptive statistics
desc_stats = df.describe()
print(desc_stats)

# Correlation matrix, restricted to numeric/boolean columns. On pandas >= 2.0
# DataFrame.corr() no longer drops non-numeric columns silently and raises if
# the frame contains any, so the selection is made explicitly here.
correlations = df.select_dtypes(include=["number", "bool"]).corr()
print(correlations)

# Separate schools based on ICSEA_value.
# Rows with a missing ICSEA_value satisfy neither comparison and therefore
# land in neither group.
mean_icsea = df['ICSEA_value'].mean()
high_icsea = df[df['ICSEA_value'] > mean_icsea]
low_icsea = df[df['ICSEA_value'] <= mean_icsea]

# Columns for t-tests.
# 'ICSEA_value' is deliberately not tested: the two groups are defined by
# splitting on it, so a difference in its means is guaranteed by construction,
# and counting it would only inflate the Bonferroni divisor for the real tests.
cols_for_ttest = [
    'Composite_class_count',
    'Composite_class_students',
    'Pct_composite_classes',
    'Pct_composite_class_students',
    'Attendance_pct',
    'latest_year_enrolment_FTE',
    'Indigenous_pct',
    'LBOTE_pct'
]

# Bonferroni correction for multiple testing: one shared threshold for the
# whole family of tests, so it is computed once rather than on every iteration.
adjusted_alpha = 0.05 / len(cols_for_ttest)

# T-tests
significant_cols = []

for col in cols_for_ttest:
    # nan_policy='omit' drops missing values within each group before testing.
    p_val = ttest_ind(high_icsea[col], low_icsea[col], nan_policy='omit').pvalue

    if p_val < adjusted_alpha:
        significant_cols.append(col)

print("\nColumns with significant differences between high and low ICSEA schools:")
print(significant_cols)

NameError: name 'con' is not defined