This school district isn't real. I'm making a database just so I can do a mockup dashboard.

I'm only doing 7th-12th grades, with 7-8 in "junior high" and the rest in high school.

Tables created in this notebook:

Students  
Schools  
Absences  
Discipline  
Student Surveys  

Tables that could be added in the future:  
Grades  
Parents  
Teachers  
Enrollment (to courses)  
Courses  
Staff  

In [1]:
import pandas as pd
import numpy as np
import spacy
import random
from datetime import date, datetime, timedelta
import secrets
import string
import json
import csv
import os
import openai
from openai import OpenAI

# Load spaCy's small English pipeline once, up front.
# NOTE(review): `nlp` is not referenced in the cells visible here; presumably a
# later cell uses it - confirm before removing.
nlp = spacy.load('en_core_web_sm')

### Students Table

student_id  (str, primary key)  
first_name  (str)  
last_name  (str)  
dob  (date)  
gender  (str)  
grade_year  (int)  
zip_code  (str)  
current_school  (str, foreign key)

In [2]:
# Start from an empty frame; the following cells add one column at a time.
students = pd.DataFrame()

In [3]:
# 10,000 random and unique 8-character student IDs.
# Random draws alone only make collisions unlikely, and student_id is the
# primary key, so duplicates are explicitly rejected and redrawn.
legal_characters = string.ascii_letters + string.digits
# A dict is used as an ordered set: keys stay in generation order.
unique_ids = dict()
while len(unique_ids) < 10000 :
    candidate = ''.join(secrets.choice(legal_characters) for _ in range(8))
    unique_ids[candidate] = None
student_id = list(unique_ids)

students['student_id'] = student_id

In [4]:
# Names are left blank for now; they won't factor into the dashboard.
# Both columns are filled with the same empty-string placeholder.
first_name = last_name = ""

for name_column, placeholder in (('first_name', first_name), ('last_name', last_name)) :
    students[name_column] = placeholder

In [5]:
# Draw dates of birth uniformly between two boundaries:
#   oldest   - the oldest possible high school senior this year
#   youngest - the youngest possible 7th grader
# Real life would have different frequencies in different months,
# attrition as age increases, etc, but that is ignored for now.
oldest, youngest = (pd.Timestamp(boundary).timestamp() for boundary in ("2006-9-1", "2012-9-1"))
# Sample epoch seconds, turn them into timestamps, and keep only the calendar date.
birth_seconds = np.random.uniform(oldest, youngest, 10000)
date_of_birth = list(pd.to_datetime(birth_seconds, unit='s').date)

students['dob'] = date_of_birth

In [6]:
# For gender, we'll randomize it with extremely ballparked proportions.
# `values` and `weights` are parallel lists: weights[i] is the relative
# frequency of values[i].
values = ['female', 'male', 'nonbinary', 'genderqueer', 'genderfluid',
          'agender', 'two-spirit', 'prefer not to say', 'other']
weights = [45, 45, 2, 1, 1, .5, .5, 4, 1]

gender = random.choices(values, weights, k=10000)

students['gender'] = gender

In [7]:
# Generate grade by age.
# I know this is too clean for life - this is a virtual set!
# We can introduce dirtiness later if we need to.
#
# Each grade maps to the earliest birth date it admits. The dict is ordered
# from the youngest grade to the oldest, so the first cutoff a birthday
# reaches (scanning in insertion order) is that student's grade.
cutoffs = {
    7: date(2011, 9, 1),
    8: date(2010, 9, 1),
    9: date(2009, 9, 1),
    10: date(2008, 9, 1),
    11: date(2007, 9, 1),
    12: date(2006, 9, 1)
}

grade_year = []

for b_day in date_of_birth :
    for grade, cutoff in cutoffs.items() :
        if b_day >= cutoff :
            grade_year.append(grade)
            break
    else :
        # No cutoff matched (born before the grade-12 cutoff). Fail here with a
        # clear message rather than leaving grade_year shorter than date_of_birth.
        raise ValueError(f"No grade cutoff covers date of birth {b_day}")

students['grade_year'] = grade_year

In [8]:
# Zip codes stand in for full addresses.
# The two lists are parallel: each ratio is the relative population of the
# zip code at the same position.
available_zipcodes = ['12345', '23456', '34567', '45678', '56789']
zip_pop_ratios = [5, 4, 2, 1.5, 0.5]
zip_code = random.choices(population=available_zipcodes, weights=zip_pop_ratios, k=10000)

students['zip_code'] = zip_code

In [9]:
# Assign each student a current school from their zip code and grade.
# Every zip code sends its students to the two junior highs and the two high
# schools in its own proportions.

current_school = []

junior_highs = ['ajh', 'bjh']
high_schools = ['chs', 'dhs']

# zip code -> (weights over junior_highs, weights over high_schools)
school_weights_by_zip = {
    '12345': ([10, .5], [20, .5]),
    '23456': ([10, 1], [10, .5]),
    '34567': ([3, 7], [7, 3]),
    '45678': ([1, 10], [1, 9]),
    '56789': ([.5, 15], [.5, 20]),
}

for s_grade, s_zipcode in zip(grade_year, zip_code) :
    # A zip code without an entry gets no school appended.
    if s_zipcode not in school_weights_by_zip :
        continue
    junior_weights, high_weights = school_weights_by_zip[s_zipcode]
    # Grades 7 and 8 attend a junior high; every other grade a high school.
    if s_grade in (7, 8) :
        current_school.append(random.choices(junior_highs, junior_weights)[0])
    else :
        current_school.append(random.choices(high_schools, high_weights)[0])

students['current_school'] = current_school

students.head()

Unnamed: 0,student_id,first_name,last_name,dob,gender,grade_year,zip_code,current_school
0,heSLV36I,,,2007-09-21,male,11,23456,chs
1,OUeZNY6q,,,2007-01-18,female,12,23456,chs
2,nIGn7Jgh,,,2008-05-06,perefer not to say,11,45678,dhs
3,Nh1iBWas,,,2007-09-05,male,11,23456,chs
4,KRpBGRXg,,,2012-05-18,male,7,23456,ajh


In [10]:
# Let's take a peek at our work: column names, non-null counts and dtypes.
students.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   student_id      10000 non-null  object
 1   first_name      10000 non-null  object
 2   last_name       10000 non-null  object
 3   dob             10000 non-null  object
 4   gender          10000 non-null  object
 5   grade_year      10000 non-null  int64 
 6   zip_code        10000 non-null  object
 7   current_school  10000 non-null  object
dtypes: int64(1), object(7)
memory usage: 625.1+ KB


In [11]:
# Write the students table to CSV, quoting every field and omitting the index.
# NOTE(review): assumes ../data exists relative to the working directory - confirm.
students.to_csv('../data/students.csv', index=False, quoting=csv.QUOTE_ALL)

### Schools Table

school_id  (str, primary key)  
school_name  (str)

(Obviously we could have more here, but this isn't the real world)

In [12]:
# One entry per school: short id -> display name.
school_directory = {
    'ajh': 'Avery Alex Junior High',
    'bjh': 'Bruno Buckster Junior High',
    'chs': 'Carmina Chavez High School',
    'dhs': 'Devon Ducation High School',
}

# Parallel lists, kept under their own names.
school_ids = list(school_directory.keys())
school_names = list(school_directory.values())

schools = pd.DataFrame({"school_id": school_ids, "school_name": school_names})

In [13]:
# Write the schools table to CSV, quoting every field and omitting the index.
# NOTE(review): assumes ../data exists relative to the working directory - confirm.
schools.to_csv('../data/schools.csv', index=False, quoting=csv.QUOTE_ALL)

### Absences Table

absence_id: (int, primary key)  
student_id: (str, foreign key)  
absence_type: (str)  
school_id:  (str, foreign key)  
date:  (date)  

In [14]:
# Placeholder for the absences table.
# NOTE: this empty frame is never filled. The generation cell further down
# rebinds `absences` to a list and only then converts it to a DataFrame.
absences = pd.DataFrame()

In [15]:
# Build the list of school days for the current semester:
# every Monday-Friday between the first and last day, inclusive.
# Holidays are ignored.

first_day = date(2024, 8, 26)
last_day = date(2024, 12, 20)

# Number of calendar days in the closed range [first_day, last_day]
semester_length = (last_day - first_day).days + 1

school_days = []
for offset in range(semester_length) :
    a_day = first_day + timedelta(days=offset)
    # weekday() is 0 for Monday through 6 for Sunday
    if a_day.weekday() <= 4 :
        school_days.append(a_day)

len(school_days)

85

In [16]:
# Let's define our absence types.
# Order matters: the per-zip and per-student type weights are lists of eight
# numbers addressed by position, so index i there refers to absence_types[i].
absence_types = [
    "medical",      # 0
    "family",       # 1
    "religious",    # 2
    "bereavement",  # 3
    "personal",     # 4
    "unknown",      # 5
    "suspension",   # 6
    "tardy"         # 7
]

In [17]:
# This is a bit more difficult to simulate.
# Let's decide what realities we want to simulate in this data.
# It won't be comprehensive, but we can't have nothing, can we?

# 1. Students have individual likelihoods of absence and
#    tendencies toward specific absence types.
# 2. Zip codes have distinct absence rates and tendencies
#    toward specific absence types.
# 3. If a student has a single absence, it will be more likely
#    that they will have an absence on the following day as
#    well. This effect does not stack.
# 4. The absence rates for zip codes will demonstrate trends
#    over time.


# Let's make a dictionary of each zip code's general tendency
# toward absence and absence types. I'll arbitrarily choose
# the values. The general tendency will be the first item
# in a list, and the second will be a sub-list of the zip's
# tendency toward each of the eight possible absence types
# (same order as absence_types).
# These are starting values only: the generation loop passes this dict
# through modify_zip_absence_tendencies once per school day, which
# changes it in place.
zip_absence_tendencies = {
    '12345':[.3, [3, 5, 4, .05, 3, 20, 2, 30]],
    '23456':[.1, [3, 5, 2, .05, 5, 15, .1, 30]],
    '34567':[.15, [3, 3, 1, .05, 5, 3, .01, 40]],
    '45678':[.08, [3, 4, 1, .05, 2, 1, .001, 40]],
    '56789':[.02, [3, 5, 1, .05, 3, .5, .00001, 50]]
}

In [18]:
# Give every student an individual tendency toward absence and toward each
# absence type. These draws are random and independent of zip code; the two
# are only combined later, when the absence list is generated.
#
# Each entry is a two-item list: [general tendency, list of eight type weights].
student_absence_tendencies = dict()
for a_student in students['student_id'] :
    # General tendency: a log-normal draw centred (median) on 0.01, which
    # biases strongly toward low values, capped at 0.6.
    # NOTE(review): the earlier wording of this comment promised a maximum of
    # 8%, but the cap in the code is 0.6 - confirm which one is intended.
    general_tendency = min(np.random.lognormal(mean=np.log(0.01)), 0.6)

    # Type weights: eight draws from the same distribution, each capped at 20.
    # Purely random for now; might be tweaked later.
    type_tendencies = []
    for _ in range(8) :
        type_tendencies.append(min(np.random.lognormal(mean=np.log(0.01)), 20))

    # Store both pieces under the student's id.
    student_absence_tendencies[a_student] = [general_tendency, type_tendencies]

In [19]:
# Now we're ready to start filling in absence data!

# Let's iterate over each day, deciding whether each student
# was absent on that day and why.

# First, a holder list for the absence instances (one dict per absence).
absences = []

# Make the zip code tendencies fluctuate over time
def modify_zip_absence_tendencies(originals) :
    """Apply one small random drift step to every zip code's absence tendencies.

    Each value of `originals` is `[general_rate, type_weights]`, where
    `type_weights` holds eight weights in the same order as `absence_types`.
    Every adjusted number moves by a random fraction of its own current value,
    so calling this once per school day compounds into a trend.

    Parameters
    ----------
    originals : dict
        Keyed by the zip codes '12345', '23456', '34567', '45678', '56789'.
        Modified in place.

    Returns
    -------
    dict
        The same `originals` object, after modification.
    """
    # Positions inside each type-weight list (order of absence_types)
    FAMILY, PERSONAL, UNKNOWN, SUSPENSION = 1, 4, 5, 6

    # General absences will decrease; "unknown" will increase relatively
    originals['12345'][0] -= originals['12345'][0] * (random.random()/100)
    originals['12345'][1][UNKNOWN] += originals['12345'][1][UNKNOWN] * (random.random()/80)
    # General absences will stay the same; suspensions will increase relatively
    originals['23456'][1][SUSPENSION] += originals['23456'][1][SUSPENSION] * (random.random()/150)
    # General absences will slightly decrease; personal will decrease relatively
    originals['34567'][0] -= originals['34567'][0] * (random.random()/150)
    originals['34567'][1][PERSONAL] -= originals['34567'][1][PERSONAL] * (random.random()/200)
    # General absences will stay the same; personal will increase relatively
    originals['45678'][1][PERSONAL] += originals['45678'][1][PERSONAL] * (random.random()/200)
    # General absences will increase; family will increase relatively
    originals['56789'][0] += originals['56789'][0] * (random.random()/100)
    originals['56789'][1][FAMILY] += originals['56789'][1][FAMILY] * (random.random()/150)
    # Return the resulting object, which becomes the new base truth for the zip codes
    return(originals)


# Track which students are on an absence "streak": student id -> absence type.
# A student who was absent on one school day is more likely to be absent on
# the next, with the same type. The boost is applied once, not compounded.
streaks = dict()

# Walk through the semester one school day at a time
for ab_day in school_days :
    # Drift the zip code tendencies a little before deciding today's absences
    zip_absence_tendencies = modify_zip_absence_tendencies(zip_absence_tendencies)
    for ab_student, ab_zipcode, ab_school in zip(students['student_id'], students['zip_code'],
                                                 students['current_school']) :
        student_rate, student_type_weights = student_absence_tendencies[ab_student]
        zip_rate, zip_type_weights = zip_absence_tendencies[ab_zipcode]
        # Blend the individual's tendency with their zip's tendency
        chance = (student_rate + zip_rate)/15
        # None unless the student was absent on the previous school day
        ongoing_type = streaks.get(ab_student)
        if ongoing_type is not None :
            # A streaking student loses a third of their chance of attending
            chance += (1-chance)/3
        # Roll for today's absence
        if random.random() >= chance :
            # Present today: any running streak ends
            streaks.pop(ab_student, None)
            continue
        if ongoing_type is None :
            # Fresh absence: pick a reason from the combined student + zip weights
            type_chances = [own + area for own, area in zip(student_type_weights, zip_type_weights)]
            absence_type = random.choices(absence_types, weights=type_chances, k=1)[0]
            # This absence starts a streak
            streaks[ab_student] = absence_type
        else :
            # Continuing streak: the reason carries over unchanged
            absence_type = ongoing_type
        # Record the absence in the master list
        absences.append({
            'student_id':ab_student,
            'absence_type':absence_type,
            'school_id':ab_school,
            'date':ab_day
            })

# Turn the list of records into a table and inspect counts per school and type
absences = pd.DataFrame(absences)
absences.groupby(['school_id', 'absence_type'])['date'].count()

school_id  absence_type
ajh        family           261
           medical          203
           personal         452
           religious        210
           suspension        69
           tardy           1828
           unknown         1013
bjh        family           108
           medical           57
           personal          85
           religious         27
           suspension         3
           tardy            720
           unknown           80
chs        bereavement        7
           family           588
           medical          407
           personal         877
           religious        392
           suspension       160
           tardy           3782
           unknown         1933
dhs        bereavement        3
           family            92
           medical           68
           personal          84
           religious         29
           suspension         1
           tardy            934
           unknown           84
Name: date, dtyp

In [20]:
# Let's see the resulting overall absence rate for the district:
# recorded absences divided by (students x school days).
len(absences) / (len(students) * len(school_days))

0.017125882352941175

In [21]:
# Generate the absence_id. Privacy is not a concern here, so it is simply a
# sequential number starting at 0, in the current row order.
absence_id = list(range(len(absences)))

# Attach the ids and put absence_id at the front of the column order.
col_order = ['absence_id', 'student_id', 'absence_type', 'school_id', 'date']
absences = absences.assign(absence_id=absence_id)[col_order]
absences.head()

Unnamed: 0,absence_id,student_id,absence_type,school_id,date
0,0,nuZOHwaa,family,bjh,2024-08-26
1,1,sWigONib,unknown,chs,2024-08-26
2,2,7cYzzjsn,tardy,dhs,2024-08-26
3,3,gAfallzt,unknown,chs,2024-08-26
4,4,fA2eedtF,family,chs,2024-08-26


In [22]:
# Write the absences table to CSV, quoting every field and omitting the index.
# NOTE(review): assumes ../data exists relative to the working directory - confirm.
absences.to_csv('../data/absences.csv', index=False, quoting=csv.QUOTE_ALL)

### Discipline Table

incident_id  (int, primary key)  
student_id  (str, foreign key)  
date_of_event  (date)  
incident_type  (str)  
description  (text)  
action_taken  (str)   
date_of_action  (date)  
TODO: reporter  (str, foreign key)  
TODO: owner  (str, foreign key)  

In [34]:
# We already know something about discipline actions - we know when
# students were absent due to suspensions. Let's log those first.

# We can get these four data directly or indirectly from the absences
# table. Let's initialize these lists, then iterate over the suspension
# absences and fill in what we can.
student_id = []
date_of_suspension_event = []
action_taken = []
date_of_action = []

def _previous_weekday(a_date) :
    """Return the closest Monday-Friday date strictly before a_date."""
    # Monday steps back 3 days and Sunday 2, both to Friday; other days step back 1.
    days_back = {0: 3, 6: 2}.get(a_date.weekday(), 1)
    return a_date - timedelta(days=days_back)

# Only suspension absences matter here. Row order is kept as in `absences`.
suspension_absences = absences.loc[absences['absence_type'] == 'suspension']
# (student, date) pairs for constant-time "was this student suspended that day?"
# lookups, instead of filtering the whole table once per row.
suspended_on = set(zip(suspension_absences['student_id'], suspension_absences['date']))

for absent_student, absent_day in zip(suspension_absences['student_id'],
                                      suspension_absences['date']) :
    # To spot multi-day suspensions we look at the previous school day
    # (Friday when today is Monday).
    previous_school_day = _previous_weekday(absent_day)

    if (absent_student, previous_school_day) in suspended_on :
        # Continuation of a suspension whose disciplinary action has
        # already been recorded: skip it.
        continue

    # This is the first day of a suspension, so log it as a disciplinary action.
    student_id.append(absent_student)
    # Let's say the student was marked present on the day of the event,
    # so the action was taken on that day and the suspension "began" on
    # the next day.
    # NOTE: for a suspension absence on the very first school day this lands
    # on the weekday before the semester starts.
    date_of_action.append(previous_school_day)
    # There's a 90% chance the event happened on the day the action was taken;
    # otherwise it happened on the school day before that. Stepping back by
    # weekday keeps the event from landing on a weekend when the action day
    # is a Monday.
    date_of_suspension_event.append(random.choices(
        [previous_school_day, _previous_weekday(previous_school_day)],
        [9, 1], k=1)[0])
    # Obviously, the action taken was a suspension
    action_taken.append('suspension')

In [35]:
# The suspensions logged so far still lack an incident_type and a description.
# The incident_type is filled in here; descriptions are generated later for
# all incidents at once.

# The disciplinary actions a school can take. Other tables of weights are
# lists in this same order.
actions = [
    "warning",
    "detention",
    "ISS",
    "suspension",
    "parent conference",
]

# Incident type -> relative likelihood of being the reason behind a suspension.
suspension_reason_weights = {
    'bullying': 3,
    'altercation': 6,
    'theft': 4,
    'disruption': 1,
    'dress code': 1,
    'academic dishonesty': 5,
}
incident_types = list(suspension_reason_weights.keys())
weights = list(suspension_reason_weights.values())

# One weighted draw per suspension recorded in action_taken.
incident_type = []
for _ in action_taken :
    incident_type.append(random.choices(incident_types, weights, k=1)[0])

In [36]:
# Now we'll generate the rest of the incidents.
# To make things interesting, let's observe the following tendencies:
# 1. Different zip codes will have different likelihoods of various incident types.
# 2. Different students will have different likelihoods of various incident types.
# 3. Different schools will have different action weights, meaning that each school
#    tends to take different actions for the same types of infractions.

# Students first. Each entry is [general incident rate, six incident-type weights].
student_incident_tendencies = dict()
for i_student in students['student_id'] :
    # Same approach as for absences: a log-normal draw, here capped at 0.08.
    incident_rate = min(np.random.lognormal(mean=np.log(0.009)), 0.08)

    # Incident type weights: squaring the draw and capping it at 7 is meant to
    # leave each student with at most 1 or 2 strong tendencies, and often none.
    incident_type_weights = []
    for _ in range(6) :
        incident_type_weights.append(min(np.random.lognormal(mean=np.log(.2))**2, 7))

    # Store both pieces under the student's id.
    student_incident_tendencies[i_student] = [incident_rate, incident_type_weights]

# Now, zip code tendencies.
# Same layout as the absence tendencies: [general incident rate, type weights].
# The six type weights are addressed by position; the trend comments in
# modify_zip_incident_tendencies treat index 1 as altercation, 3 as disruption,
# 4 as dress code and 5 as academic dishonesty, i.e. the order of incident_types.
# These are starting values: the dict is modified once per school day.
zip_incident_tendencies = {
    '12345':[.1, [1, 3, 3, 3, 2, 2]],
    '23456':[.08, [2, 2, 2, 3, 2, 2]],
    '34567':[.05, [3, 2, 1, 1, 2, 2]],
    '45678':[.03, [2, 1, 1, 1, 1, 3]],
    '56789':[.02, [3, 1, 1, 1, 1, 5]]
}

# Now, the schools' tendencies toward different actions taken for different
# incident types. Unfortunately, we can't calculate suspensions like the others,
# because we've already put suspensions in the attendance.
# We make it a dict of dicts to make it easier to reference later.
#
# Layout: school id -> incident type -> list of five weights, one per entry of
# `actions` in order (warning, detention, ISS, suspension, parent conference).
# The suspension weight (4th position) is 0 everywhere, so this table never
# yields a suspension.
school_incident_tendencies = {
    'ajh':{
        "bullying":[1, 2, 5, 0, 1],
        "altercation":[.5, 3, 3, 0, 1],
        "theft":[.5, 5, 3, 0, 4],
        "disruption":[1, 4, 4, 0, 1],
        "dress code":[2, 2, 2, 0, .5],
        "academic dishonesty":[1, 1, 2, 0, 3]
    },
    'bjh':{
        "bullying":[3, 2, 2, 0, 1],
        "altercation":[4, 2, 2, 0, 1],
        "theft":[2, 3, 2, 0, 2],
        "disruption":[4, 2, 1, 0, 1],
        "dress code":[4, .5, .5, 0, .1],
        "academic dishonesty":[2, 3, 3, 0, 3]
    },
    'chs':{
        "bullying":[1, 2, 6, 0, 3],
        "altercation":[.5, 2, 7, 0, 1],
        "theft":[.5, 2, 5, 0, 3],
        "disruption":[1, 4, 4, 0, .5],
        "dress code":[2, 2, 2, 0, .5],
        "academic dishonesty":[1, 1, 6, 0, 2]
    },
    'dhs':{
        "bullying":[4, 1, 1, 0, .5],
        "altercation":[4, 1, 1, 0, 1],
        "theft":[0, 0, 0, 0, 1],
        "disruption":[5, 1, 1, 0, .5],
        "dress code":[5, .5, .5, 0, .1],
        "academic dishonesty":[0, 0, 0, 0, 1]
    }
}

In [37]:
# Now that we have our tendencies, let's build the table!

# Incident records (one dict each) are collected here.
discipline_list = []

# Index the absences by day so the per-student check below is a fast lookup:
# keys are dates, values are the set of students with an absence row that day.
absence_dict = dict()
for absent_on, absent_who in zip(absences['date'], absences['student_id']) :
    absence_dict.setdefault(absent_on, set()).add(absent_who)

# Manufacture some incident trends.
# I'll modify the general trend and one incident type slightly
# with every iteration.
def modify_zip_incident_tendencies(originals) :
    """Apply one small random drift step to every zip code's incident tendencies.

    Each value of `originals` is `[general_rate, type_weights]` with six type
    weights. Every adjusted number moves by a random fraction of its own
    current value, so calling this once per school day compounds into a trend.

    Parameters
    ----------
    originals : dict
        Keyed by the zip codes '12345', '23456', '34567', '45678', '56789'.
        Modified in place.

    Returns
    -------
    dict
        The same `originals` object, after modification.
    """
    # Positions inside each type-weight list
    ALTERCATION, DISRUPTION, DRESS_CODE, ACADEMIC_DISHONESTY = 1, 3, 4, 5

    # General incidents will decrease; dress code infractions will increase relatively
    originals['12345'][0] -= originals['12345'][0] * (random.random()/100)
    originals['12345'][1][DRESS_CODE] += originals['12345'][1][DRESS_CODE] * (random.random()/150)
    # General incidents will increase; academic dishonesty will increase relatively
    originals['23456'][0] += originals['23456'][0] * (random.random()/200)
    originals['23456'][1][ACADEMIC_DISHONESTY] += originals['23456'][1][ACADEMIC_DISHONESTY] * (random.random()/150)
    # General incidents will decrease; altercations will decrease relatively
    originals['34567'][0] -= originals['34567'][0] * (random.random()/150)
    originals['34567'][1][ALTERCATION] -= originals['34567'][1][ALTERCATION] * (random.random()/100)
    # General incidents will stay the same; disruption will increase relatively
    # (scaled by the disruption weight itself, like every other adjustment here)
    originals['45678'][1][DISRUPTION] += originals['45678'][1][DISRUPTION] * (random.random()/80)
    # General incidents will decrease; academic dishonesty will increase markedly
    originals['56789'][0] -= originals['56789'][0] * (random.random()/150)
    originals['56789'][1][ACADEMIC_DISHONESTY] += originals['56789'][1][ACADEMIC_DISHONESTY] * (random.random()/20)
    # Return the resulting object, which becomes the new base truth for the zip codes
    return(originals)

# Day by day
for day in school_days :
    # Modify the zipcode's weights very slightly every day to simulate trends
    zip_incident_tendencies = modify_zip_incident_tendencies(zip_incident_tendencies)
    # Students with an absence row today. A day without any absence has no
    # entry in absence_dict, so fall back to an empty set.
    # NOTE(review): absence_dict is built from every absence row, including
    # 'tardy', so tardy students are treated as not at school - confirm intent.
    absent_today = absence_dict.get(day, set())
    # Distance back to the previous school day: Friday (3 days) when today
    # is a Monday, otherwise yesterday.
    day_delta = 3 if day.weekday() == 0 else 1
    # Student by student
    for in_student, in_zipcode, in_school in zip(students['student_id'], students['zip_code'],
                                                 students['current_school']) :
        # A student who wasn't at school today can't have an incident today
        if in_student in absent_today :
            continue
        # Likelihood that the student was involved in an incident today:
        # individual rate plus zip rate, scaled down.
        chance = (student_incident_tendencies[in_student][0] + zip_incident_tendencies[in_zipcode][0]) / 100
        # Randomly decide whether an incident happened; if not, next student.
        if random.random() >= chance :
            continue
        # We got an incident, folks!
        # Combine the student's incident-type weights with the zip's
        # incident-type weights and randomly determine what the incident was.
        type_chances = [student_incident_tendencies[in_student][1][i]
                        + zip_incident_tendencies[in_zipcode][1][i] for i in range(6)]
        incident = random.choices(incident_types, type_chances, k=1)[0]
        # To determine the action taken, we consider the school's tendencies.
        action = random.choices(actions, school_incident_tendencies[in_school][incident], k=1)[0]
        # The action is always taken today. The incident itself happened today
        # with 90% probability, otherwise on the previous school day.
        # NOTE: on the first school day the "previous school day" falls before
        # the semester starts.
        days_before_action = random.choices([0, day_delta], [9, 1], k=1)[0]
        # Done! Pack it all up and add it to the list.
        discipline_list.append({
            'student_id': in_student,
            'date_of_event': day - timedelta(days=days_before_action),
            'date_of_action': day,
            'incident_type': incident,
            'action_taken': action,
        })

In [38]:
# Putting all these events into a dataframe here will make the next
# step much easier.

# Combine the suspension incidents logged earlier (the parallel lists) with
# the incidents just generated (discipline_list), suspensions first.
# The source lists are deliberately NOT reassigned, so re-running this cell
# rebuilds the same table instead of appending the incidents a second time.
data = {
    "student_id": student_id + [item['student_id'] for item in discipline_list],
    "date_of_event": date_of_suspension_event + [item['date_of_event'] for item in discipline_list],
    "date_of_action": date_of_action + [item['date_of_action'] for item in discipline_list],
    "incident_type": incident_type + [item['incident_type'] for item in discipline_list],
    "action_taken": action_taken + [item['action_taken'] for item in discipline_list]
}

# Frame it!
discipline = pd.DataFrame(data)

# I'll also add the school code. It's redundant, but data size is not a concern
# here and it will make querying much easier.
# The merge brings the column in as "current_school"; rename it to "school_id".
discipline = discipline.merge(
    students[['student_id', 'current_school']],
    on='student_id',
    how='left'
).rename(columns={'current_school': 'school_id'})

# Check it.
discipline.head()

Unnamed: 0,student_id,date_of_event,date_of_action,incident_type,action_taken,school_id
0,BMuNUb4p,2024-08-22,2024-08-23,academic dishonesty,suspension,ajh
1,dOpE1ADr,2024-08-23,2024-08-23,academic dishonesty,suspension,chs
2,C1H6QRZU,2024-08-22,2024-08-23,altercation,suspension,chs
3,PJpoIX1S,2024-08-26,2024-08-26,academic dishonesty,suspension,chs
4,fhU7wnqy,2024-08-26,2024-08-26,theft,suspension,chs


In [39]:
# Every incident needs a text description, and they are generated with ChatGPT.
# A prompt template is filled in per incident (incident type, action taken,
# date) and sent together with a system message specific to the school; the
# reply is stored as the description.

# One system message per school, read from that school's own file.
sys_messages = dict()
for school in schools['school_id'] :
    with open(f'./prompts/discipline_description_prompts/{school}.txt', 'r') as file :
        sys_messages[school] = file.read()

# The prompt template whose placeholders are replaced for each incident.
with open('./prompts/discipline_description_prompts/prompt_body.txt', 'r') as file :
    prompt_body = file.read()

# API client
client = OpenAI(organization='ORGANIZATION', project='PROJECT')

# Generated descriptions, in the row order of `discipline`.
descriptions = []

incident_fields = zip(
    discipline['incident_type'],
    discipline['action_taken'],
    discipline['school_id'],
    discipline['date_of_event']
)
for this_incident_type, this_action_taken, this_school, date_of_this_event in incident_fields :
    # Identical prompts kept producing identical responses, so a random
    # 9-digit number (repeated three times) is inserted into each prompt to
    # jiggle the handle without steering the content.
    modifier = str(np.random.randint(100000000, 999999999)) * 3
    # Fill in the placeholders one after another, in this order.
    prompt = prompt_body
    for placeholder, replacement in (
        ("INCIDENT", this_incident_type),
        ("ACTION", this_action_taken),
        ("DATE", str(date_of_this_event)),
        ("MODIFIER", modifier),
    ) :
        prompt = prompt.replace(placeholder, replacement)
    # Query our robot overlords
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        temperature=1,
        seed=42,
        messages=[
            {"role":"system", "content":sys_messages[this_school]},
            {"role":"user", "content":prompt}
        ]
    )
    # Keep only the text of the first choice.
    descriptions.append(completion.choices[0].message.content)

# Did we get the right number of them?
len(descriptions)

848

In [40]:
# Add them to the table. `descriptions` was built in the row order of
# `discipline`, so it lines up with the rows by position.
discipline['description'] = descriptions
discipline.head()

Unnamed: 0,student_id,date_of_event,date_of_action,incident_type,action_taken,school_id,description
0,BMuNUb4p,2024-08-22,2024-08-23,academic dishonesty,suspension,ajh,Student was caught using a smartwatch to acces...
1,dOpE1ADr,2024-08-23,2024-08-23,academic dishonesty,suspension,chs,Student caught plagiarizing an essay during En...
2,C1H6QRZU,2024-08-22,2024-08-23,altercation,suspension,chs,"Altercation in cafeteria during lunch, student..."
3,PJpoIX1S,2024-08-26,2024-08-26,academic dishonesty,suspension,chs,Student caught plagiarizing an essay during En...
4,fhU7wnqy,2024-08-26,2024-08-26,theft,suspension,chs,Student was caught stealing multiple packs of ...


In [41]:
# The suspensions were added first, so the table is not in chronological
# order. It doesn't really matter, but sort by action date anyway.
discipline = discipline.sort_values(by='date_of_action').reset_index(drop=True)

# The incident_id carries no protected info, so it is simply the row's
# position in that chronological order, starting at 0.
discipline['incident_id'] = list(range(len(discipline)))

# Rearrange the columns into a more reasonable order, id first.
discipline_columns = [
    'incident_id', 'student_id', 'date_of_event', 'date_of_action',
    'incident_type', 'action_taken', 'school_id', 'description',
]
discipline = discipline[discipline_columns]

discipline.head()

Unnamed: 0,incident_id,student_id,date_of_event,date_of_action,incident_type,action_taken,school_id,description
0,0,BMuNUb4p,2024-08-22,2024-08-23,academic dishonesty,suspension,ajh,Student was caught using a smartwatch to acces...
1,1,dOpE1ADr,2024-08-23,2024-08-23,academic dishonesty,suspension,chs,Student caught plagiarizing an essay during En...
2,2,C1H6QRZU,2024-08-22,2024-08-23,altercation,suspension,chs,"Altercation in cafeteria during lunch, student..."
3,3,3A5FnVxO,2024-08-26,2024-08-26,academic dishonesty,ISS,chs,Student caught plagiarizing a research paper. ...
4,4,xARclOYM,2024-08-26,2024-08-26,altercation,warning,dhs,Description: Student engaged in a verbal alter...


In [42]:
# Write the finished discipline table to disk. The index is dropped and
# every field is quoted.
discipline.to_csv(
    '../data/discipline.csv',
    index=False,
    quoting=csv.QUOTE_ALL,
)

### Student Surveys Table

Here I will use generative AI to simulate some "street data" in the form of some fictitious student survey responses.  
  
Obviously the content of these surveys will be entirely shaped by my own bias. I'm not trying to create a truthful body of data; I'm trying to create a body of data that has the same shape and properties as real street data, to demonstrate some of the analytical tools and techniques that could be applied to it.  

Many types of surveys are possible, but here I'm including a "fresh semester" survey given to all students at the beginning of the semester to assess hopes and fears. Not all students completed it (completion rates vary by school).

The survey questions are:
1. What are you most looking forward to this semester?
2. What is one hope or goal you have for this semester?
3. What do you think will be challenging for you this semester?
4. What can the school do to help you achieve your hopes and goals?

survey_id  (int, primary key)  
student_id  (str, foreign key)  
school_id  (str, foreign key)  
source  (str)  
question_code  (str)  
question  (text)  
answer  (text)  

In [43]:
# First we will randomly select a subset of students from each school.
# This is the target number of respondents per school.
per_school_respondents = 200

# Build a lookup of school_id -> list of the student IDs enrolled there.
# Having plain lists makes the random sampling below straightforward.
students_by_school = dict()
for ss_school in schools['school_id'] :
    # A boolean mask avoids building a query string out of the school code.
    enrolled_here = students['current_school'] == ss_school
    students_by_school[ss_school] = students.loc[enrolled_here, 'student_id'].tolist()

# Finally, randomly choose `per_school_respondents` students from each school
# (without replacement). The result is a dict keyed by school code.
survey_responding_students = dict()
for ss_2school, ss_2student_list in students_by_school.items() :
    # random.sample raises ValueError when asked for more items than the list
    # holds, so a school smaller than the target simply contributes everyone.
    sample_size = min(per_school_respondents, len(ss_2student_list))
    survey_responding_students[ss_2school] = random.sample(ss_2student_list, sample_size)

In [44]:
def _read_inserts(path, keys) :
    """Read one prompt insert per key from the text file at `path`.

    Line i of the file is paired with the i-th key, and the trailing newline
    is stripped. If the file has fewer lines than there are keys, the
    remaining keys map to "" (which is what readline() returns at end of file).
    """
    with open(path, 'r') as file :
        return {key: file.readline().strip("\n") for key in keys}


# Load the system message template. Its placeholders (SCHOOL_INSERT, ZIP_INSERT, ...)
# are filled in per student further down.
with open('./prompts/student_survey_prompts/survey_system_message.txt', 'r') as file :
    raw_system_message = file.read()
# Now load the user prompt. It is sent unchanged for every student; all of the
# per-student variation goes into the system message.
with open('./prompts/student_survey_prompts/survey_prompt.txt', 'r') as file :
    prompt = file.read()

# These "inserts" will be inserted into the system message to let the bot know the student's
# school & zip code situation. The files hold one insert per line, in the same order as
# the keys they are paired with.
school_inserts = _read_inserts('./prompts/student_survey_prompts/school_inserts.txt',
                               schools['school_id'])

# Now the zip codes. We don't have a handy list of those to iterate over yet, so let's make it.
zip_codes = ['12345', '23456', '34567', '45678', '56789']
zip_inserts = _read_inserts('./prompts/student_survey_prompts/zipcode_inserts.txt', zip_codes)

# Load absence type inserts. 'none' is used for students who are rarely absent.
absence_inserts = _read_inserts('./prompts/student_survey_prompts/absence_inserts.txt',
                                absence_types)
absence_inserts['none'] = ""

# Load incident type inserts. 'none' is used for students who rarely have incidents.
incident_inserts = _read_inserts('./prompts/student_survey_prompts/incident_inserts.txt',
                                 incident_types)
incident_inserts['none'] = ""

# I've got 40 distinct personality descriptions, one per line.
# Each student will be randomly assigned one during prompting.
personality_inserts = []
with open('./prompts/student_survey_prompts/personality_inserts.txt', 'r') as file :
    for line in file :
        personality_inserts.append(line.strip("\n"))

# Make our holder lists. Each survey answer adds one entry to every list,
# so they must always stay the same length.
student_id = []
school_id = []
question = []
answer = []
# We can easily make the other lists after these are filled in below.

# We'll give each school a different (mean, std) for their GPA.
# These are entirely arbitrary.
gpa_dict = {
    'ajh':(3, 1),
    'bjh':(3.3, .5),
    'chs':(3, 1.3),
    'dhs':(3.5, .5)
}

# Now we go through all schools, getting all responses from each school in turn.
for sur_school in schools['school_id'] :
    # Load the system message insert for this school.
    school_insert = school_inserts[sur_school]
    # Now go through all responding students in that school.
    for sur_student in survey_responding_students[sur_school] :
        # Gather our necessary information from existing variables.
        # We already have the school.
        zipcode = students[students['student_id']==sur_student]['zip_code'].iloc[0]
        # Load that zipcode's insert.
        zip_insert = zip_inserts[zipcode]
        # Only include a most common absence type if the student is likely to be frequently absent.
        # Element 0 of the tendency entry is compared against the threshold; element 1 is a list
        # of weights lined up position-for-position with absence_types.
        if student_absence_tendencies[sur_student][0] > 0.015 :
            student_absence_weights = student_absence_tendencies[sur_student][1]
            most_common_absence_type = absence_types[student_absence_weights.index(max(student_absence_weights))]
        else :
            most_common_absence_type = "none"
        absence_insert = absence_inserts[most_common_absence_type]
        # Only include a most common incident type if the student is likely to encounter incidents.
        # Same layout as the absence tendencies above.
        if student_incident_tendencies[sur_student][0] > 0.015 :
            student_incident_weights = student_incident_tendencies[sur_student][1]
            most_common_incident_type = incident_types[student_incident_weights.index(max(student_incident_weights))]
        else :
            most_common_incident_type = "none"
        incident_insert = incident_inserts[most_common_incident_type]
        # Randomly assign a personality profile. Choosing from the list itself gives every
        # profile (including the first one) an equal chance and doesn't depend on the file
        # having exactly 40 lines.
        personality_insert = random.choice(personality_inserts)
        # Assign a random max length (5 to 24) for this student's responses.
        # Not everybody likes to write a lot!
        max_response_length = random.randrange(5, 25)
        # Give each student a random GPA drawn from their school's distribution,
        # rounded to two places and clamped to the range 0.1 to 4.
        gpa = round(np.random.normal(gpa_dict[sur_school][0], gpa_dict[sur_school][1]), 2)
        gpa = min(max(gpa, 0.1), 4)
        # I kept getting identical responses, which is to be expected. So I'm inserting a random
        # string of numbers into each prompt as a way to jiggle the handle without giving specific
        # guidance. Basically, forcing it to be slightly random in ways I can't predict.
        modifier = str(np.random.randint(1, 999999999)) * 5
        # Use all this info to assemble the system message.
        this_system_message = raw_system_message.replace("SCHOOL_INSERT", school_insert)
        this_system_message = this_system_message.replace("ZIP_INSERT", zip_insert)
        this_system_message = this_system_message.replace("ABSENCE_INSERT", absence_insert)
        this_system_message = this_system_message.replace("INCIDENT_INSERT", incident_insert)
        this_system_message = this_system_message.replace("PERSONALITY_INSERT", personality_insert)
        this_system_message = this_system_message.replace("GPA_INSERT", str(gpa))
        this_system_message = this_system_message.replace("MAX_LEN", str(max_response_length))
        this_system_message = this_system_message.replace("MODIFIER", modifier)

        # Query our robot overlords.
        # Since we need 4 answers, let's just ask the bot to reply with a four-key json file.
        # We'll save money on prompt tokens that way.
        # NOTE(review): assumes `client` is the OpenAI client created in an earlier cell.
        # NOTE(review): the seed is fixed, so variety between students has to come from the
        # differing system messages (MODIFIER in particular) - confirm that is intended.
        completion = client.chat.completions.create(
            model="gpt-3.5-turbo",
            temperature=1,
            seed=42,
            response_format={ "type": "json_object" },
            messages=[
                {"role":"system", "content":this_system_message},
                {"role":"user", "content":prompt}
            ]
        )
        # Extract the text part from the completion and parse it.
        response = completion.choices[0].message.content
        response = json.loads(response)
        # Pull out all four answers (keys "1" through "4") BEFORE touching the holder lists.
        # If the reply is missing a key this raises KeyError with the lists still aligned,
        # instead of leaving student_id/school_id/question one entry longer than answer.
        student_answers = [response[str(i)] for i in range(1, 5)]
        for i, answer_text in enumerate(student_answers, start=1) :
            student_id.append(sur_student)
            school_id.append(sur_school)
            # I'll manually replace these numbers with the corresponding questions later.
            question.append(i)
            answer.append(answer_text)

In [45]:
# Each question keeps a short code of its own, and the full question text
# goes into a separate column alongside it.

# The survey questions, listed in the order their numbers (1-4) refer to them.
question_texts = [
    "What are you most looking forward to this semester?",
    "What is one hope or goal you have for this semester?",
    "What do you think will be challenging for you this semester?",
    "What can the school do to help you achieve your hopes and goals?"
]

# Look up the full text for every recorded question number.
# The numbers are 1-based, hence the "- 1" when indexing the list.
questions_col = list(map(lambda number: question_texts[number - 1], question))

# Turn the bare numbers into codes of the form "0-<number>".
# Note that this rebinds `question` from ints to strings, so the lookup
# above only works the first time this cell is run.
question = [f"0-{number}" for number in question]

In [46]:
# Assemble everything into a dataframe, adding the two remaining columns
# ("survey_id" and "source"), and then save it.

# Both new columns need one entry per question/answer pair.
length = len(question)
# "survey_id" identifies a single q/a pair. A plain sequential integer is
# fine here; there is no security concern.
survey_id = list(range(length))
# "source" names the survey the question came from. Every row in this table
# comes from the same survey, so the value is simply repeated.
source = ['fresh_semester_survey'] * length

# Frame it! Columns appear in the order given here.
survey_columns = {
    'survey_id': survey_id,
    'student_id': student_id,
    'school_id': school_id,
    'source': source,
    'question_code': question,
    'question': questions_col,
    'answer': answer,
}
student_surveys = pd.DataFrame(survey_columns)

# Save it, without the index and with every field quoted.
student_surveys.to_csv(
    '../data/student_surveys.csv',
    index=False,
    quoting=csv.QUOTE_ALL,
)

### Survey Blobs Table

To make the survey answers susceptible to word-clouding in PowerBI, we'll do a little more natural language processing to them.

We'll store the result of these processes in a new table with one row per school-and-question pair (columns: school_id, question_code, word_blob), allowing us to easily filter by either in PowerBI.

In [47]:
# The blob table is "vertical" (long): one row per (school, question) pair,
# so it can be filtered by school and by question.
# Start with the two key columns, school-major and question-minor.
question_codes = ['0-1', '0-2', '0-3', '0-4']
school_id = [sb_school for sb_school in school_ids for _ in question_codes]
question_code = [sb_code for _ in school_ids for sb_code in question_codes]

# word_blobs[school][question_code] -> one string of "adjective noun" pairs.
word_blobs = dict()
for sbt_school in school_ids :
    word_blobs[sbt_school] = dict()
    from_this_school = student_surveys['school_id'] == sbt_school
    for sbt_question_code in question_codes :
        for_this_question = student_surveys['question_code'] == sbt_question_code
        # Pull this school's answers to this question out as a plain list.
        school_answers = student_surveys.loc[from_this_school & for_this_question, 'answer'].tolist()
        # Collect every adjective+noun pair found in those answers.
        pairs = []
        for sbt_answer in school_answers :
            # Run the answer through spaCy so each word carries its part of
            # speech and its place in the dependency parse.
            for token in nlp(sbt_answer) :
                if token.pos_ != 'NOUN' :
                    continue
                # Children with the "amod" dependency are adjectives modifying this noun.
                for child in token.children :
                    if child.dep_ == "amod" :
                        # Lowercase so pairs are not told apart by case alone.
                        pairs.append(f"{child.text} {token.text}".lower())
        # Each pair is followed by ", " (so the string ends with a trailing ", ").
        blobs = "".join(f"{pair}, " for pair in pairs)
        # Store the blob for this school/question and move on to the next question.
        word_blobs[sbt_school][sbt_question_code] = blobs

In [48]:
# Flatten the dict of dicts into a single list of blob strings. Walking the
# outer dict (schools) and then each inner dict (question codes) in insertion
# order gives the same school-major order as the school_id / question_code lists.
final_blobs = [
    blob_text
    for blobs_by_question in word_blobs.values()
    for blob_text in blobs_by_question.values()
]

# Line the three parallel lists up as the columns of the final table.
blob_columns = {
    "school_id": school_id,
    "question_code": question_code,
    "word_blob": final_blobs,
}
student_survey_blobs = pd.DataFrame(blob_columns)

student_survey_blobs.head()

Unnamed: 0,school_id,question_code,word_blob
0,ajh,0-1,"advanced concepts, hard work, new subjects, cl..."
1,ajh,0-2,"reliable resource, personal interests, extracu..."
2,ajh,0-3,"academic responsibilities, busy times, cultura..."
3,ajh,0-4,"additional resources, flexible deadlines, pers..."
4,bjh,0-1,"new topics, new friends, new people, new extra..."


In [49]:
# Write the blob table to disk, without the index and with every field quoted.
student_survey_blobs.to_csv(
    '../data/student_survey_blobs.csv',
    index=False,
    quoting=csv.QUOTE_ALL,
)