# Week 2 - Seminar - Exploratory Data Analysis and Plotting


In [2]:
import duckdb
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Silence pandas' chained-assignment check (SettingWithCopyWarning), which is
# raised when a column is assigned on a DataFrame that pandas thinks may be a
# slice of another one. Setting the option to None disables the warning globally.
pd.options.mode.chained_assignment = None

Copy the data preprocessing from the lecture notebook. If you're using Google Colab, keep in mind that your data might be saved in the `MyDrive/attendance` folder - if so, adjust the file paths in the query below accordingly.

In [3]:
# Build the working DataFrame with a single DuckDB query over three parquet
# files (paths are relative to the notebook's working directory):
#   * attendance - missed-lesson counts collapsed to one row per
#     (report_period, school_code, student_class, subject_code, electronic_diary);
#     only rows with division_code IS NULL are kept.
#   * school     - school name and municipality, again restricted to rows with
#     division_code IS NULL.
#   * subject    - English subject name, keyed by (electronic_diary, subject_code).
# The final SELECT joins the three and aggregates once more, this time per
# school / class / report period / English subject name, so attendance rows
# whose subject codes map to the same English name are summed together.
# NOTE(review): FIRST() is used without an ORDER BY, so which
# student_count_diary value is kept depends on row order - presumably the count
# is constant within each group; confirm against the data.
# `.df()` converts the DuckDB result into a pandas DataFrame.
df = duckdb.sql(
    """
        WITH attendance AS (
            SELECT
                report_period,
                school_code,
                student_class,
                subject_code,
                electronic_diary,
                FIRST(student_count_diary) student_count_diary,
                SUM(excused_lessons_illness) excused_lessons_illness,
                SUM(excused_lessons_other) excused_lessons_other,
                SUM(unexcused_lessons) unexcused_lessons
            FROM 'attendance/attendance.parquet'
            WHERE division_code IS NULL
            GROUP BY all
        ),
            school AS (
            SELECT 
                school_code, 
                school_name, 
                municipality_name
            FROM 'attendance/school.parquet' 
            WHERE division_code IS NULL
        ),
            subject AS (
            SELECT
                electronic_diary,
                subject_code,
                subject_name_en,
            FROM 'attendance/subject.parquet'
        )
        SELECT
            school.school_name,
            school.school_code,
            school.municipality_name,
            attendance.student_class,
            attendance.report_period,
            subject.subject_name_en,
            FIRST(attendance.student_count_diary) AS student_count,
            SUM(attendance.excused_lessons_illness) AS excused_lessons_illness,
            SUM(attendance.excused_lessons_other) AS excused_lessons_other,
            SUM(attendance.unexcused_lessons) AS unexcused_lessons
        FROM attendance
        JOIN school ON attendance.school_code = school.school_code
        JOIN subject ON attendance.subject_code = subject.subject_code
            AND attendance.electronic_diary = subject.electronic_diary
        GROUP BY all
""").df()


In [4]:
# Narrow the data down to Mathematics records of student class 12 only.
df = df.loc[
    df["subject_name_en"].eq("Mathematics")
    & df["student_class"].eq(12)
]

In [5]:
# Filter on report period: September 2018 through May 2024, i.e. six school years.
# `.copy()` makes the filtered frame independent of the previous `df`, so the
# column assignments below are not writes into a slice and do not depend on
# SettingWithCopyWarning being silenced.
df = df[df["report_period"].between("2018-09-01", "2024-05-31")].copy()

# Get rid of summer months
df["month"] = df["report_period"].dt.month
df = df[~df["month"].isin([6, 7, 8])].copy()

# Make sure the set of schools is the same across the years
# First, count how many distinct report periods each school has
# NOTE(review): schools are identified by school_name here, not school_code;
# if names are not unique or change over time the counts may be off - confirm.
schools = df.groupby("school_name").agg(
    report_period_count=("report_period", "nunique")
)
# Then, keep only the schools that have data for every month of every school year
n_school_years = 6  # 2018/19 ... 2023/24
months_per_school_year = 9  # September ... May (summer months removed above)
expected_period_count = n_school_years * months_per_school_year
schools = schools[schools["report_period_count"] == expected_period_count]
df = df[df["school_name"].isin(schools.index)].copy()

# Label each row with the calendar year in which its school year started:
# months up to May belong to the school year that began the previous calendar year.
df["school_year"] = df["report_period"].dt.year - (
    df["report_period"].dt.month <= 5
).astype(int)

# Missed lessons per student, by reason
df["excused_illness_per_student"] = df["excused_lessons_illness"] / df["student_count"]
df["excused_other_per_student"] = df["excused_lessons_other"] / df["student_count"]
df["unexcused_per_student"] = df["unexcused_lessons"] / df["student_count"]

# Total missed lessons (all reasons), absolute and per student
df["total_missed"] = (
    df["excused_lessons_illness"]
    + df["excused_lessons_other"]
    + df["unexcused_lessons"]
)
df["total_missed_per_student"] = df["total_missed"] / df["student_count"]

# Last expression of the cell: show a random sample of the prepared data
df.sample(5)


Unnamed: 0,school_name,school_code,municipality_name,student_class,report_period,subject_name_en,student_count,excused_lessons_illness,excused_lessons_other,unexcused_lessons,month,school_year,excused_illness_per_student,excused_other_per_student,unexcused_per_student,total_missed,total_missed_per_student
1853047,Druskininkų „Ryto“ gimnazija,195328350,Druskininkų sav.,12,2023-02-01,Mathematics,123,68.0,32.0,45.0,2,2022,0.552846,0.260163,0.365854,145.0,1.178862
1507126,Molėtų gimnazija,191227820,Molėtų r. sav.,12,2023-10-01,Mathematics,74,94.0,59.0,3.0,10,2023,1.27027,0.797297,0.040541,156.0,2.108108
3857421,Prienų r. Stakliškių gimnazija,190192277,Prienų r. sav.,12,2021-03-01,Mathematics,14,0.0,10.0,0.0,3,2020,0.0,0.714286,0.0,10.0,0.714286
3854836,Šalčininkų r. Eišiškių gimnazija,191416098,Šalčininkų r. sav.,12,2020-01-01,Mathematics,36,64.0,21.0,0.0,1,2019,1.777778,0.583333,0.0,85.0,2.361111
3407861,Šalčininkų r. Kalesninkų Liudviko Narbuto gimn...,191416664,Šalčininkų r. sav.,12,2020-10-01,Mathematics,12,13.0,2.0,0.0,10,2020,1.083333,0.166667,0.0,15.0,1.25
