# Preparing Winter Test Results

By Kenneth Burchfiel
Released under the MIT License

This script creates a modified version of the test_results table that can then be 'un-modified' by our data cleaning notebook.

In [17]:
from pathlib import Path

import numpy as np
import pandas as pd
import sqlalchemy
rng = np.random.default_rng(seed=1150) 

pfn_db_engine = sqlalchemy.create_engine(
'sqlite:///'+'../../data/network_database.db')

pfn_db_engine

Engine(sqlite:///../../data/network_database.db)

Reading in fall test results (these will be converted into winter scores):

In [18]:
df_test_results = pd.read_sql("Select * from test_results where Period == 'Fall'", 
con = pfn_db_engine)

df_test_results

Unnamed: 0,Student_ID,School,Grade,Starting_Year,Period,Score
0,42026,CA,1,2023,Fall,47
1,43491,CA,1,2023,Fall,49
2,41637,CA,1,2023,Fall,57
3,40365,CA,1,2023,Fall,63
4,41516,CA,1,2023,Fall,51
...,...,...,...,...,...,...
3995,41060,SA,K,2023,Fall,51
3996,43942,SA,K,2023,Fall,57
3997,40479,SA,K,2023,Fall,57
3998,41160,SA,K,2023,Fall,57


Simulating a general decrease in test scores from the fall to the winter:

In [19]:
# Drawing one random integer per row from -3 to 14 (`high` is exclusive) and
# subtracting it from that row's score, so most scores fall while a few rise
# slightly. The number of draws comes from the table itself rather than a
# hard-coded 4000, so this cell keeps working if the number of fall rows
# in the database changes.
score_changes = rng.integers(low=-3, high=15, size=len(df_test_results))
df_test_results['Score'] = df_test_results['Score'] - score_changes
df_test_results

Unnamed: 0,Student_ID,School,Grade,Starting_Year,Period,Score
0,42026,CA,1,2023,Fall,34
1,43491,CA,1,2023,Fall,44
2,41637,CA,1,2023,Fall,43
3,40365,CA,1,2023,Fall,56
4,41516,CA,1,2023,Fall,47
...,...,...,...,...,...,...
3995,41060,SA,K,2023,Fall,42
3996,43942,SA,K,2023,Fall,44
3997,40479,SA,K,2023,Fall,56
3998,41160,SA,K,2023,Fall,43


Reformatting the Student_ID values so that they'll have either an 'ID:' or 'Student ID: ' prefix (depending on the school) along with a hyphen between the first 2 and last 3 digits:

In [20]:
# Converting Student_ID to text once so that both slices below can reuse it:
id_text = df_test_results['Student_ID'].astype('str')
# Characters 0-1, a hyphen, then characters 2-4 (e.g. 42026 -> '42-026'):
hyphenated_id = id_text.str[0:2] + '-' + id_text.str[2:5]
# CA and SA rows receive the 'ID:' prefix; every other school's rows
# receive 'Student ID: '.
df_test_results['Identification Code'] = np.where(
    df_test_results['School'].isin(['CA', 'SA']),
    'ID:' + hyphenated_id,
    'Student ID: ' + hyphenated_id,
)
df_test_results

Unnamed: 0,Student_ID,School,Grade,Starting_Year,Period,Score,Identification Code
0,42026,CA,1,2023,Fall,34,ID:42-026
1,43491,CA,1,2023,Fall,44,ID:43-491
2,41637,CA,1,2023,Fall,43,ID:41-637
3,40365,CA,1,2023,Fall,56,ID:40-365
4,41516,CA,1,2023,Fall,47,ID:41-516
...,...,...,...,...,...,...,...
3995,41060,SA,K,2023,Fall,42,ID:41-060
3996,43942,SA,K,2023,Fall,44,ID:43-942
3997,40479,SA,K,2023,Fall,56,ID:40-479
3998,41160,SA,K,2023,Fall,43,ID:41-160


We'll now create a 'Test Day' field that can be used to help split each school's data into 6 different files (one for each test day). This will increase the value of a Python-based approach that allows all of these 24 files to be cleaned automatically. (If there were just one file per school, the user might be tempted to perform the cleaning tasks manually.)

In [21]:
# The remainder of each row's index divided by 5 cycles through 0-4; adding 1
# shifts that to test days 1-5. (A 6th test day value is introduced later in
# the notebook.)
df_test_results['Test Day'] = (df_test_results.index % 5) + 1
df_test_results

Unnamed: 0,Student_ID,School,Grade,Starting_Year,Period,Score,Identification Code,Test Day
0,42026,CA,1,2023,Fall,34,ID:42-026,1
1,43491,CA,1,2023,Fall,44,ID:43-491,2
2,41637,CA,1,2023,Fall,43,ID:41-637,3
3,40365,CA,1,2023,Fall,56,ID:40-365,4
4,41516,CA,1,2023,Fall,47,ID:41-516,5
...,...,...,...,...,...,...,...,...
3995,41060,SA,K,2023,Fall,42,ID:41-060,1
3996,43942,SA,K,2023,Fall,44,ID:43-942,2
3997,40479,SA,K,2023,Fall,56,ID:40-479,3
3998,41160,SA,K,2023,Fall,43,ID:41-160,4


We'll now simulate a condition in which 10% of students retook their test and received a generally higher score on their second attempt. We'll then add these additional tests to our dataset, thus creating duplicate values that our data cleaning script will need to process.

In [22]:
# Creating a retakes dataset from a random 10% of the existing rows. Passing
# the notebook's seeded generator as random_state makes the selection
# reproducible from run to run (an unseeded sample() picks different rows
# every time the notebook is executed).
df_retakes = df_test_results.sample(frac=0.1, random_state=rng).copy()
# Each retake score differs from the original score by a random integer from
# -2 to 7 (`high` is exclusive), so most retakes are higher.
df_retakes['Score'] = df_retakes['Score'] + rng.integers(
    low=-2, high=8, size=len(df_retakes))
# All of these retakes occurred on the 6th day of testing.
df_retakes['Test Day'] = 6

# Adding these retakes back to our original dataset. The result contains two
# rows for every student who retook the test.
df_test_results = pd.concat(
    [df_test_results, df_retakes]).reset_index(drop=True)
df_test_results

Unnamed: 0,Student_ID,School,Grade,Starting_Year,Period,Score,Identification Code,Test Day
0,42026,CA,1,2023,Fall,34,ID:42-026,1
1,43491,CA,1,2023,Fall,44,ID:43-491,2
2,41637,CA,1,2023,Fall,43,ID:41-637,3
3,40365,CA,1,2023,Fall,56,ID:40-365,4
4,41516,CA,1,2023,Fall,47,ID:41-516,5
...,...,...,...,...,...,...,...,...
4395,40300,CA,4,2023,Fall,47,ID:40-300,6
4396,41662,SA,11,2023,Fall,65,ID:41-662,6
4397,42593,SA,K,2023,Fall,34,ID:42-593,6
4398,40603,CA,5,2023,Fall,44,ID:40-603,6


Adding in a 'School/Test Day' column that will prove useful when saving results in this table to separate files:

In [23]:
# Joining each row's school and test day into one label
# (e.g. 'CA Test Day 1'):
school_text = df_test_results['School'].astype('str')
test_day_text = df_test_results['Test Day'].astype('str')
df_test_results['School/Test Day'] = school_text + ' Test Day ' + test_day_text
df_test_results

Unnamed: 0,Student_ID,School,Grade,Starting_Year,Period,Score,Identification Code,Test Day,School/Test Day
0,42026,CA,1,2023,Fall,34,ID:42-026,1,CA Test Day 1
1,43491,CA,1,2023,Fall,44,ID:43-491,2,CA Test Day 2
2,41637,CA,1,2023,Fall,43,ID:41-637,3,CA Test Day 3
3,40365,CA,1,2023,Fall,56,ID:40-365,4,CA Test Day 4
4,41516,CA,1,2023,Fall,47,ID:41-516,5,CA Test Day 5
...,...,...,...,...,...,...,...,...,...
4395,40300,CA,4,2023,Fall,47,ID:40-300,6,CA Test Day 6
4396,41662,SA,11,2023,Fall,65,ID:41-662,6,SA Test Day 6
4397,42593,SA,K,2023,Fall,34,ID:42-593,6,SA Test Day 6
4398,40603,CA,5,2023,Fall,44,ID:40-603,6,CA Test Day 6


Using np.select() to create longer forms of grade names (e.g. 'K' to 'Kindergarten', '1' to '1st Grade', '12' to '12th Grade', etc.)

In [24]:
# Grades whose long form can't be produced by simply appending 'th Grade':
special_grade_names = {
    'K': 'Kindergarten',
    '1': '1st Grade',
    '2': '2nd Grade',
    '3': '3rd Grade',
}
# One condition and one matching choice per special grade:
condlist = [df_test_results['Grade'] == short_name
            for short_name in special_grade_names]
choicelist = list(special_grade_names.values())
# Every grade not matched above falls through to the default ('4th Grade',
# '5th Grade', and so on).
df_test_results["Student's Grade"] = np.select(
    condlist, choicelist, df_test_results['Grade'] + 'th Grade')

df_test_results["Student's Grade"].value_counts()
            

Student's Grade
10th Grade      368
Kindergarten    364
2nd Grade       361
4th Grade       356
8th Grade       339
3rd Grade       338
6th Grade       337
5th Grade       327
7th Grade       326
12th Grade      324
11th Grade      323
9th Grade       320
1st Grade       317
Name: count, dtype: int64

In [25]:
# Displaying the table to confirm that the "Student's Grade" column is present:
df_test_results

Unnamed: 0,Student_ID,School,Grade,Starting_Year,Period,Score,Identification Code,Test Day,School/Test Day,Student's Grade
0,42026,CA,1,2023,Fall,34,ID:42-026,1,CA Test Day 1,1st Grade
1,43491,CA,1,2023,Fall,44,ID:43-491,2,CA Test Day 2,1st Grade
2,41637,CA,1,2023,Fall,43,ID:41-637,3,CA Test Day 3,1st Grade
3,40365,CA,1,2023,Fall,56,ID:40-365,4,CA Test Day 4,1st Grade
4,41516,CA,1,2023,Fall,47,ID:41-516,5,CA Test Day 5,1st Grade
...,...,...,...,...,...,...,...,...,...,...
4395,40300,CA,4,2023,Fall,47,ID:40-300,6,CA Test Day 6,4th Grade
4396,41662,SA,11,2023,Fall,65,ID:41-662,6,SA Test Day 6,11th Grade
4397,42593,SA,K,2023,Fall,34,ID:42-593,6,SA Test Day 6,Kindergarten
4398,40603,CA,5,2023,Fall,44,ID:40-603,6,CA Test Day 6,5th Grade


Adding percent signs to scores:

In [26]:
# Scores become text with a trailing percent sign (e.g. 34 -> '34%'):
score_text = df_test_results['Score'].astype('str')
df_test_results['Score'] = score_text + '%'
df_test_results

Unnamed: 0,Student_ID,School,Grade,Starting_Year,Period,Score,Identification Code,Test Day,School/Test Day,Student's Grade
0,42026,CA,1,2023,Fall,34%,ID:42-026,1,CA Test Day 1,1st Grade
1,43491,CA,1,2023,Fall,44%,ID:43-491,2,CA Test Day 2,1st Grade
2,41637,CA,1,2023,Fall,43%,ID:41-637,3,CA Test Day 3,1st Grade
3,40365,CA,1,2023,Fall,56%,ID:40-365,4,CA Test Day 4,1st Grade
4,41516,CA,1,2023,Fall,47%,ID:41-516,5,CA Test Day 5,1st Grade
...,...,...,...,...,...,...,...,...,...,...
4395,40300,CA,4,2023,Fall,47%,ID:40-300,6,CA Test Day 6,4th Grade
4396,41662,SA,11,2023,Fall,65%,ID:41-662,6,SA Test Day 6,11th Grade
4397,42593,SA,K,2023,Fall,34%,ID:42-593,6,SA Test Day 6,Kindergarten
4398,40603,CA,5,2023,Fall,44%,ID:40-603,6,CA Test Day 6,5th Grade


Removing a random set of rows from the dataset (in order to show how to handle missing data):

In [27]:
# Keeping a random 71% of the rows (in shuffled order) and discarding the
# rest. Passing the notebook's seeded generator as random_state makes the same
# rows get removed on every run. reset_index() already returns a new
# DataFrame, so no extra .copy() is needed.
# See https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.sample.html
df_test_results = df_test_results.sample(
    frac=0.71, random_state=rng).reset_index(drop=True)
df_test_results

Unnamed: 0,Student_ID,School,Grade,Starting_Year,Period,Score,Identification Code,Test Day,School/Test Day,Student's Grade
0,42239,SA,12,2023,Fall,48%,ID:42-239,5,SA Test Day 5,12th Grade
1,41246,HA,10,2023,Fall,28%,Student ID: 41-246,4,HA Test Day 4,10th Grade
2,40527,SA,4,2023,Fall,49%,ID:40-527,5,SA Test Day 5,4th Grade
3,40589,SA,8,2023,Fall,57%,ID:40-589,4,SA Test Day 4,8th Grade
4,41879,HA,2,2023,Fall,47%,Student ID: 41-879,6,HA Test Day 6,2nd Grade
...,...,...,...,...,...,...,...,...,...,...
3119,40710,SA,9,2023,Fall,49%,ID:40-710,3,SA Test Day 3,9th Grade
3120,41269,DA,10,2023,Fall,41%,Student ID: 41-269,4,DA Test Day 4,10th Grade
3121,40209,SA,4,2023,Fall,32%,ID:40-209,2,SA Test Day 2,4th Grade
3122,40768,DA,2,2023,Fall,44%,Student ID: 40-768,4,DA Test Day 4,2nd Grade


Removing original columns:

In [28]:
# These columns have reformatted counterparts (or are no longer needed), so
# they are left out of the table from here on.
original_columns = ['Starting_Year', 'Student_ID', 'Period', 'School', 'Grade']
df_test_results = df_test_results.drop(columns=original_columns)
df_test_results

Unnamed: 0,Score,Identification Code,Test Day,School/Test Day,Student's Grade
0,48%,ID:42-239,5,SA Test Day 5,12th Grade
1,28%,Student ID: 41-246,4,HA Test Day 4,10th Grade
2,49%,ID:40-527,5,SA Test Day 5,4th Grade
3,57%,ID:40-589,4,SA Test Day 4,8th Grade
4,47%,Student ID: 41-879,6,HA Test Day 6,2nd Grade
...,...,...,...,...,...
3119,49%,ID:40-710,3,SA Test Day 3,9th Grade
3120,41%,Student ID: 41-269,4,DA Test Day 4,10th Grade
3121,32%,ID:40-209,2,SA Test Day 2,4th Grade
3122,44%,Student ID: 40-768,4,DA Test Day 4,2nd Grade


In [29]:
# Displaying the table once more; nothing has changed since the column removal above.
df_test_results

Unnamed: 0,Score,Identification Code,Test Day,School/Test Day,Student's Grade
0,48%,ID:42-239,5,SA Test Day 5,12th Grade
1,28%,Student ID: 41-246,4,HA Test Day 4,10th Grade
2,49%,ID:40-527,5,SA Test Day 5,4th Grade
3,57%,ID:40-589,4,SA Test Day 4,8th Grade
4,47%,Student ID: 41-879,6,HA Test Day 6,2nd Grade
...,...,...,...,...,...
3119,49%,ID:40-710,3,SA Test Day 3,9th Grade
3120,41%,Student ID: 41-269,4,DA Test Day 4,10th Grade
3121,32%,ID:40-209,2,SA Test Day 2,4th Grade
3122,44%,Student ID: 40-768,4,DA Test Day 4,2nd Grade


Splitting this table into separate results for each school/test day pair, then saving each pair's results to its own .csv file within the data cleaning section of Python for Nonprofits:

In [30]:
# Determining all school/test day pairs present in the dataset:
pair_list = list(df_test_results['School/Test Day'].unique())
pair_list.sort()
pair_list

['CA Test Day 1',
 'CA Test Day 2',
 'CA Test Day 3',
 'CA Test Day 4',
 'CA Test Day 5',
 'CA Test Day 6',
 'DA Test Day 1',
 'DA Test Day 2',
 'DA Test Day 3',
 'DA Test Day 4',
 'DA Test Day 5',
 'DA Test Day 6',
 'HA Test Day 1',
 'HA Test Day 2',
 'HA Test Day 3',
 'HA Test Day 4',
 'HA Test Day 5',
 'HA Test Day 6',
 'SA Test Day 1',
 'SA Test Day 2',
 'SA Test Day 3',
 'SA Test Day 4',
 'SA Test Day 5',
 'SA Test Day 6']

In [31]:
# Saving each pair's results to its own .csv file.
# The output folder is created first if it doesn't already exist; otherwise
# to_csv() would raise an OSError on a fresh checkout.
output_folder = Path(
    '../../part_x_data_cleaning/winter_2023_2024_test_results')
output_folder.mkdir(parents=True, exist_ok=True)

for pair in pair_list:
    df_pair = df_test_results.query("`School/Test Day` == @pair").copy()
    # The school and test day are conveyed by the file name, so the columns
    # that were only needed for splitting the data are left out of the file.
    df_pair.drop(['School/Test Day', 'Test Day'], axis = 1, inplace = True)
    df_pair.to_csv(output_folder / f'{pair} Results.csv', index = False)

# Displaying the final pair's table as an example of what each file contains:
df_pair


Unnamed: 0,Score,Identification Code,Student's Grade
6,47%,ID:40-922,2nd Grade
8,42%,ID:41-206,11th Grade
59,46%,ID:43-698,4th Grade
153,48%,ID:42-472,3rd Grade
187,28%,ID:40-936,3rd Grade
...,...,...,...
2838,24%,ID:43-134,3rd Grade
2961,42%,ID:42-465,2nd Grade
3013,72%,ID:42-209,4th Grade
3048,34%,ID:41-380,4th Grade


These results can now be processed within the data cleaning section of Python for Nonprofits. That section's script will 'clean' the results by converting them into a format that matches the fall and spring results already in our dataset.