# school_data.ipynb code implementation overview

Author: Yael Gonzalez

## Data Cleaning

Import required packages:

In [84]:
import math

import numpy as np
from given_data import year_2013, year_2014, year_2015, year_2016, year_2017, year_2018, year_2019, year_2020, year_2021, year_2022

In given_data.py we have school enrollment data provided in single NumPy arrays for each year. There are 20 schools with data for Grade 10, Grade 11, and Grade 12 across ten years. **The schools are arranged from smallest school code to largest.**

Displaying *year_2013* to see its structure:

In [4]:
# Show the flat 1-D enrollment array for 2013 exactly as imported from given_data.
year_2013

array([591, 572, 558, 472, 346,   0,  45,  57,  52, 160, 176, 189, 426,
       483, 567, 620, 584, 585, 658, 631, 632, 289, 280, 311, 496, 465,
       528, 523, 467, 517, 487, 413, 457,  29,  29,  45, 399, 361, 380,
       210, 225, 359, 657, 566, 501, 163, 146, 228, 587, 611, 648, 514,
       577, 522, 435, 364, 509, 504, 530, 512])

We can see that the 2013 enrollments are listed school by school, from smallest school code to largest, with each school contributing three consecutive values for grades 10, 11, and 12.

So, we can reshape it to a 2D array, where we have 20 schools and 3 grades. Like so:

In [20]:
# View the flat array as a table: one row per school (20), one column per grade (10, 11, 12).
year_2013.reshape(20, 3) # 20 schools, 3 grades (10, 11, 12)

array([[591, 572, 558],
       [472, 346,   0],
       [ 45,  57,  52],
       [160, 176, 189],
       [426, 483, 567],
       [620, 584, 585],
       [658, 631, 632],
       [289, 280, 311],
       [496, 465, 528],
       [523, 467, 517],
       [487, 413, 457],
       [ 29,  29,  45],
       [399, 361, 380],
       [210, 225, 359],
       [657, 566, 501],
       [163, 146, 228],
       [587, 611, 648],
       [514, 577, 522],
       [435, 364, 509],
       [504, 530, 512]])

Each row represents a school that contains the enrollment data for the 3 grades.

Next, we check if any year contains NaN data:

In [24]:
# Print, one line per year from 2013 to 2022, whether that year's array
# contains at least one NaN value.
for yearly_enrollment in (
    year_2013, year_2014, year_2015, year_2016, year_2017,
    year_2018, year_2019, year_2020, year_2021, year_2022,
):
    print(np.any(np.isnan(yearly_enrollment)))

False
False
False
False
False
False
False
False
True
True


We can see that years 2021 and 2022 contain NaN data.

In [27]:
# View 2021 as a 20 x 3 (school x grade) table to see where the NaN entries are.
year_2021.reshape(20, 3)

array([[518., 453., 493.],
       [464., 493., 420.],
       [  4.,   7.,  42.],
       [164., 136., 139.],
       [475., 424., 450.],
       [664., 623., 642.],
       [663., 679., 737.],
       [489., 522., 566.],
       [482., 431., 413.],
       [507., 476., 643.],
       [448., 404., 448.],
       [ nan,  nan,  nan],
       [652., 497., 552.],
       [435., 384., 404.],
       [574., 474., 498.],
       [139., 123., 176.],
       [750., 728., 747.],
       [529., 571., 508.],
       [578., 459., 484.],
       [478., 487., 571.]])

In [44]:
# Boolean mask over the 20 school rows of 2021: True where any grade is NaN
rows_with_nan = np.isnan(year_2021.reshape(20, 3)).any(axis=1)

# Report the index of the first flagged row
print("Year 2021 contains NaN values in row: ", np.flatnonzero(rows_with_nan)[0])

Year 2021 contains NaN values in row:  11


In [28]:
# View 2022 as a 20 x 3 (school x grade) table to see where the NaN entries are.
year_2022.reshape(20, 3)

array([[419., 445., 446.],
       [427., 378., 352.],
       [  2.,   4.,  38.],
       [170., 148., 127.],
       [424., 365., 424.],
       [583., 602., 601.],
       [706., 728., 714.],
       [558., 522., 437.],
       [372., 289., 318.],
       [514., 510., 491.],
       [413., 463., 405.],
       [ nan,  nan,  nan],
       [488., 520., 458.],
       [375., 403., 391.],
       [465., 486., 290.],
       [109., 127., 166.],
       [815., 748., 742.],
       [565., 521., 435.],
       [605., 589., 477.],
       [496., 507., 527.]])

In [45]:
# Boolean mask over the 20 school rows of 2022: True where any grade is NaN
rows_with_nan = np.isnan(year_2022.reshape(20, 3)).any(axis=1)

# Report the index of the first flagged row
print("Year 2022 contains NaN values in row: ", np.flatnonzero(rows_with_nan)[0])

Year 2022 contains NaN values in row:  11


This means that the school in the 12th position (row index 11, counting from the smallest school code to the largest) is missing data for the years 2021 and 2022.



In [64]:
# Row index (0-based) of the school whose 2021 and 2022 rows are NaN.
SCHOOL12_ROW = 11

# The eight years for which that school has complete data.
years_2013_to_2020 = (
    year_2013, year_2014, year_2015, year_2016,
    year_2017, year_2018, year_2019, year_2020,
)

# Enrollment history of that school, one list per grade.
# In each 20 x 3 table, columns 0, 1, 2 hold grades 10, 11, 12.
school12_grade10_years_2013_to_2020 = [year.reshape(20, 3)[SCHOOL12_ROW, 0] for year in years_2013_to_2020]
school12_grade11_years_2013_to_2020 = [year.reshape(20, 3)[SCHOOL12_ROW, 1] for year in years_2013_to_2020]
school12_grade12_years_2013_to_2020 = [year.reshape(20, 3)[SCHOOL12_ROW, 2] for year in years_2013_to_2020]

print(school12_grade10_years_2013_to_2020)
print(school12_grade11_years_2013_to_2020)
print(school12_grade12_years_2013_to_2020)

[29, 36, 48, 41, 45, 38, 61, 34]
[29, 44, 37, 46, 50, 45, 56, 59]
[45, 44, 43, 48, 56, 57, 69, 64]


In [65]:
# Replacement values for the missing 2021/2022 rows: the mean of each grade's
# 2013-2020 enrollment, floored so the result is a whole number of students.
# (`math` is imported in the import cell at the top of the notebook.)
school12_mean_grade10 = math.floor(np.mean(school12_grade10_years_2013_to_2020))
school12_mean_grade11 = math.floor(np.mean(school12_grade11_years_2013_to_2020))
school12_mean_grade12 = math.floor(np.mean(school12_grade12_years_2013_to_2020))

In [71]:
# Work on a copy: reshape() returns a view of year_2021, so writing into the
# view would also overwrite the NaN entries of the imported raw array and make
# the earlier NaN-check cells give different results when re-run.
clean_2D_2021 = year_2021.reshape(20, 3).copy()

# Fill the NaN row (index 11) with the floored 2013-2020 means for grades 10, 11, 12.
clean_2D_2021[11] = [school12_mean_grade10, school12_mean_grade11, school12_mean_grade12]

print(clean_2D_2021)

[[518. 453. 493.]
 [464. 493. 420.]
 [  4.   7.  42.]
 [164. 136. 139.]
 [475. 424. 450.]
 [664. 623. 642.]
 [663. 679. 737.]
 [489. 522. 566.]
 [482. 431. 413.]
 [507. 476. 643.]
 [448. 404. 448.]
 [ 41.  45.  53.]
 [652. 497. 552.]
 [435. 384. 404.]
 [574. 474. 498.]
 [139. 123. 176.]
 [750. 728. 747.]
 [529. 571. 508.]
 [578. 459. 484.]
 [478. 487. 571.]]


In [73]:
# Flatten the cleaned 20 x 3 table back to the same 1-D layout as the raw yearly arrays.
clean_2021 = clean_2D_2021.reshape(-1)
print(clean_2021)

[518. 453. 493. 464. 493. 420.   4.   7.  42. 164. 136. 139. 475. 424.
 450. 664. 623. 642. 663. 679. 737. 489. 522. 566. 482. 431. 413. 507.
 476. 643. 448. 404. 448.  41.  45.  53. 652. 497. 552. 435. 384. 404.
 574. 474. 498. 139. 123. 176. 750. 728. 747. 529. 571. 508. 578. 459.
 484. 478. 487. 571.]


In [68]:
# Work on a copy: reshape() returns a view of year_2022, so writing into the
# view would also overwrite the NaN entries of the imported raw array and make
# the earlier NaN-check cells give different results when re-run.
clean_2D_2022 = year_2022.reshape(20, 3).copy()

# Fill the NaN row (index 11) with the floored 2013-2020 means for grades 10, 11, 12.
clean_2D_2022[11] = [school12_mean_grade10, school12_mean_grade11, school12_mean_grade12]

print(clean_2D_2022)

[[419. 445. 446.]
 [427. 378. 352.]
 [  2.   4.  38.]
 [170. 148. 127.]
 [424. 365. 424.]
 [583. 602. 601.]
 [706. 728. 714.]
 [558. 522. 437.]
 [372. 289. 318.]
 [514. 510. 491.]
 [413. 463. 405.]
 [ 41.  45.  53.]
 [488. 520. 458.]
 [375. 403. 391.]
 [465. 486. 290.]
 [109. 127. 166.]
 [815. 748. 742.]
 [565. 521. 435.]
 [605. 589. 477.]
 [496. 507. 527.]]


In [None]:
# NOTE(review): the original cell was a bare `np.array()`, which raises
# TypeError because the `object` argument is required. Presumably the intent
# was to combine the yearly tables into one array -- confirm before relying on it.
# Stack the ten yearly 20 x 3 tables (with the cleaned 2021 and 2022 tables)
# into one array indexed as [year, school, grade].
all_years = (
    year_2013, year_2014, year_2015, year_2016, year_2017,
    year_2018, year_2019, year_2020, clean_2D_2021, clean_2D_2022,
)
enrollment_data = np.array([year.reshape(20, 3) for year in all_years])
enrollment_data.shape

In [2]:
# Lookup table from school code to school name, listed in ascending code order.
school_dict = dict([
    (1224, "Centennial High School"),
    (1679, "Robert Thirsk School"),
    (9626, "Louise Dean School"),
    (9806, "Queen Elizabeth High School"),
    (9813, "Forest Lawn High School"),
    (9815, "Crescent Heights High School"),
    (9816, "Western Canada High School"),
    (9823, "Central Memorial High School"),
    (9825, "James Fowler High School"),
    (9826, "Ernest Manning High School"),
    (9829, "William Aberhart High School"),
    (9830, "National Sport School"),
    (9836, "Henry Wise Wood High School"),
    (9847, "Bowness High School"),
    (9850, "Lord Beaverbrook High School"),
    (9856, "Jack James High School"),
    (9857, "Sir Winston Churchill High School"),
    (9858, "Dr. E. P. Scarlett High School"),
    (9860, "John G Diefenbaker High School"),
    (9865, "Lester B. Pearson High School"),
])

In [3]:
# Derive the code array from school_dict instead of repeating the twenty codes
# by hand, so the two cannot drift apart. Sorting keeps the smallest-to-largest
# order in which the yearly enrollment arrays list the schools.
school_codes = np.array(sorted(school_dict))

In [None]:
def main() -> None:
    """Entry point of the school enrollment statistics program.

    Prints the program title and the raw ``year_2013`` array, then the
    headers for the requested-school and general-statistics sections.
    The user prompt and the statistics themselves are still to be written
    (see the stage comments below).
    """
    print("ENSF 692 School Enrollment Statistics")

    # Print Stage 1 requirements here
    # NOTE(review): presumably a placeholder -- this dumps the flat 2013 array as-is.
    print(year_2013)

    # Prompt for user input

    # Print Stage 2 requirements here
    print("\n***Requested School Statistics***\n")

    # Print Stage 3 requirements here
    print("\n***General Statistics for All Schools***\n")

In [None]:
# Call main() only when this code runs as the top-level program, not when it is imported.
if __name__ == '__main__':
    main()