# Mock data generation

In [497]:
import pandas as pd
import numpy as np
import random
from collections import Counter

In [498]:
# Seed BOTH generators used in this notebook: the stdlib `random` module
# (shuffles, lotteries, school picks) and NumPy's global generator
# (np.random.choice for motivation levels, np.random.normal for outcomes).
# Seeding only `random` leaves the NumPy draws different on every run.
random.seed(60)
np.random.seed(60)

# Generate a dataset (without error)

## Setting

- 2 years (2015, 2016)
- 3 grades each year (1, 2, 3)
- 2000 students each grade
- Each student ranks 5 schools
- 10 schools
    - 1-6: traditional
        - 1-2: non-lottery
        - 3-6: lottery
    - 7-8: charter
        - lottery
    - 9-10: innovation
        - lottery
- Capacity at each school: random between 180-200
- 4 default tie-breaker indices (1 - 4)
    - Lottery: 1 - 2
    - Non-lottery: 3 - 4
        - unfair coin toss (favor high motivation students)
            - 3: 0.8
            - 4: 0.7
- 2 tie-breaker student groups (0, 1)
    - 3 advantage multipliers (0.9, 0.8, 0.7); one is drawn at random per school/year/grade and multiplies the tie-breaker of student group 1 (a lower tie-breaker is better)
- Priority: random among (0,1,2) (0: 5%)
    - Priority = 0: guaranteed assignment
- Outcome 1,2
    - math (continuous)
    - read (continuous)
- Covariate 1,2,3
    - IQ (integer, drawn uniformly from 0-100)
    - gender (categorical (integer))
    - race (categorical (string))

## Parameters

In [499]:
# ======== Years & Grades ========

# 1. number of years (cohort years start at 2015)
num_years = 2
years = list(range(2015, 2015 + num_years))

# 2. number of grades (numbered from 1)
num_grades = 3
grades = list(range(1, num_grades + 1))

# ======== Students ========

# 3. number of students in each (year, grade) cohort
num_students_per_grade = 2000

# Total student pool: IDs 1..N in random order, so that cohorts cut from
# this list later receive arbitrary IDs.
num_total_students = num_students_per_grade * len(grades) * len(years)
all_students = list(range(1, num_total_students + 1))
random.shuffle(all_students)

# 4. share of high-motivation students
p_high_motivation = 0.5

# 5. share of low-motivation students (the complement)
p_low_motivation = 1 - p_high_motivation

#========Schools===========

# 6. number of schools
    # School IDs run 1..num_schools and are typed in ID order:
    # traditional first, then charter, then innovation (see type_dict).
    # Should equal num_trad + num_char + num_inno.
num_schools = 10

# 7. number of traditional schools
num_trad = 6

# 8. number of charter schools
num_char = 2

# 9. number of innovation schools
    # NOTE(review): type_dict derives the innovation range from num_schools,
    # not from this value -- keep the three counts consistent by hand.
num_inno = 2

# 10. number of non-lottery schools       
    # School ID (1 ~ num_non_lottery) will be non-lottery schools
num_non_lottery = 2

# 11. number of schools each student ranks
    # The generation loop keeps picking until exactly this many schools are ranked.
max_num_schools_per_student = 5

# 12. lower bound of school capacity (inclusive; drawn with random.randint)
capa_low = 180

# 13. upper bound of school capacity (inclusive)
capa_up = 200

#========Choice===========

# Each pick of a school uses two coin tosses:
    # toss 1 (p_*_char): try a charter school first?
    # toss 2 (p_*_inno): prefer innovation over traditional?
    # When the preferred type has no unpicked school left, the pick falls through
    # to the next type in the order selected by the two tosses.

# 14. choice probabilities of a high-motivation student
p_high_char = 0.7       # prob. a high-motivation student tries a charter school first
p_high_inno = 0.7       # prob. a high-motivation student prefers innovation over traditional

# 15. choice probabilities of a low-motivation student
p_low_char = 0.3       # prob. a low-motivation student tries a charter school first
p_low_inno = 0.7       # prob. a low-motivation student prefers innovation over traditional

#========Priority===========

# 16. number of priorities (excluding 0 (guaranteed assignment))
    # Regular priorities are drawn uniformly from 1..num_priority.
num_priority = 2

# 17. probability that a (student, school) application gets priority = 0
    # Drawn independently for every application row, not once per student.
p_guarantee = 0.05

#========Tie-breaker===========

# 18. number of different lottery tie-breakers (indices 1..num_lot_tie)
num_lot_tie = 2

# 19. number of different non-lottery tie-breakers
    # (indices num_lot_tie+1 .. num_lot_tie+num_nonlot_tie)
num_nonlot_tie = 2

# number of different tie breakers
num_tie_breakers = num_lot_tie + num_nonlot_tie

# 20. probability of choosing a highly motivated student when forming a ranking for non-lottery tie-breaker
    # One entry per non-lottery tie-breaker, in index order.
    # With num_nonlot_tie = 2 only the first two entries are read; the third is unused.
p_nonlot_high = [0.8, 0.7, 0.6]

#========Advantage===========

# 21. extent of advantage given to favored groups
    # Multipliers applied to the default tie-breaker of student group 1
    # (group 0 uses 1). Smaller multiplier = smaller tie-breaker = better rank.
    # NOTE(review): the name is misspelled ("advatage"); it is referenced under
    # this spelling when advantage_dict is built, so rename both together.
advatage = [0.9, 0.8, 0.7]

#========Enrollment===========

# 22. probability of enrollment at assigned school
    # Otherwise the student enrolls at a random other school from their list.
p_enroll_assign = 0.9

#========Output===========
# All the output variable follows a normal distribution

# baseline output      
# baseline = low motivation students who enrolled at traditional schools
# NOTE(review): in the visible cells the outcomes are drawn from N(0, 1) and the
# effects below are added in SD units; base_mean is never used and base_sd only
# appears in the unexecuted "favored students" cell -- confirm whether the
# outcomes were meant to be rescaled to N(base_mean, base_sd).

# 23. baseline mean
base_mean = 500

# 24. baseline standard deviation
base_sd = 100

# endogeneity

# 25. high motivation effect
    # high motivation students achieve 0.4 SD higher score compared to low motivation students
endo_high = 0.4       

# treatment effect

# 26. innovation effect
    # Students enrolled at an innovation school achieve 0.4 SD higher score compared to traditional school
treat_inno = 0.4  

# 27. charter effect
    # Students enrolled at a charter school achieve 0.4 SD higher score compared to traditional school
treat_char = 0.4  

# 28. advantage effect
    # Favored students achieve 0.2 SD higher score compared to non-favored students
endo_adv = 0.2

## Assign student-specific information

In [500]:
# year and grade
# year -> grade -> list of student IDs in that cohort.
# Cohorts are consecutive, non-overlapping slices of the shuffled student pool.
# Slicing replaces the earlier `[x for x in temp if x not in allocate]` filter,
# which rescanned the whole pool for every cohort; the result is identical
# because every ID in all_students is unique.
year_grade_dict = {}
cohort_start = 0
for y in years:
    year_grade_dict[y] = {}
    for g in grades:
        cohort_end = cohort_start + num_students_per_grade
        year_grade_dict[y][g] = all_students[cohort_start:cohort_end]
        cohort_start = cohort_end

# motivation level
# mot_dict: student ID -> motivation level (0 = low, 1 = high),
# drawn independently per student with P(high) = p_high_motivation.
motivation_probs = [1 - p_high_motivation, p_high_motivation]
mot_dict = {
    student: np.random.choice([0, 1], p = motivation_probs)
    for student in range(1, num_total_students + 1)
}

# mot_dict2: motivation level -> list of student IDs (ascending) at that level
mot_dict2 = {
    level: [student for student in range(1, num_total_students + 1)
            if mot_dict[student] == level]
    for level in [0, 1]
}

# lottery / non-lottery indicator dictionary
# tie-breaker index -> 0 for a lottery tie-breaker (indices 1..num_lot_tie),
# 1 for a non-lottery tie-breaker (the remaining indices)
tie_dict = {
    tie_index: (0 if tie_index <= num_lot_tie else 1)
    for tie_index in range(1, num_tie_breakers + 1)
}
            
# lottery tie-breaker value
# student ID -> lottery tie-breaker index -> uniform draw from [0, 1]
lot_dict = {
    student: {
        tie_index: random.uniform(0, 1)
        for tie_index in range(1, num_lot_tie + 1)
    }
    for student in range(1, num_total_students + 1)
}
        
# non-lottery tie-breaker value
# year -> grade -> non-lottery tie-breaker index -> ordered list of student IDs.
# Each ordering is built by repeated biased coin tosses: with probability
# p_nonlot_high[k] the next slot goes to a random remaining high-motivation
# student, otherwise to a random remaining low-motivation student; once one
# group is exhausted the other group fills the remaining slots.
nonlot_dict = {}
for y in years:
    nonlot_dict[y] = {}
    for g in grades:
        cohort = year_grade_dict[y][g]
        orderings = {}

        nonlot_indices = range(num_lot_tie + 1, num_tie_breakers + 1)
        for k, n in enumerate(nonlot_indices):
            p_high_first = p_nonlot_high[k]
            high = [s for s in cohort if mot_dict[s] == 1]
            low = [s for s in cohort if mot_dict[s] == 0]

            ordering = []
            while high or low:
                # Toss the coin first, then fall back to the other group
                # only if the favoured group is already empty.
                if random.uniform(0, 1) < p_high_first:
                    pool = high if high else low
                else:
                    pool = low if low else high
                picked = random.choice(pool)
                pool.remove(picked)
                ordering.append(picked)

            orderings[n] = ordering

        nonlot_dict[y][g] = orderings

# student ID -> non-lottery tie-breaker index -> tie-breaker value, where the
# value is the student's position in the cohort ordering divided by the cohort
# size (so values lie in [0, 1) and smaller = earlier in the ordering).
# Positions are read off with a single enumerate() pass per ordering instead of
# calling list.index() for every student, which was quadratic in cohort size.
nonlot_dict2 = {}
for y in years:
    for g in grades:
        # Create the per-student dicts in cohort order (same key order as before).
        for student in year_grade_dict[y][g]:
            nonlot_dict2[student] = {}
        for n in range(num_lot_tie+1, num_tie_breakers + 1):
            ordering = nonlot_dict[y][g][n]
            cohort_size = len(ordering)
            for position, student in enumerate(ordering):
                nonlot_dict2[student][n] = position / cohort_size
                    
# covariate 1, 2, 3
# student ID -> (covariate 1: integer 0..100,
#                covariate 2: integer code 0..2,
#                covariate 3: race label)
race_labels = ['white', 'black', 'asian']
covariate_dict = {
    student: (random.randint(0, 100), random.randint(0, 2), random.choice(race_labels))
    for student in range(1, num_total_students + 1)
}

## Assign school-specific information

In [501]:
# types (0 = trad, 1 = charter, 2 = inno)
# type_dict: school ID -> type number. IDs are typed in order:
# the first num_trad IDs are traditional, the next num_char are charter,
# and the rest (up to num_schools) are innovation.
first_charter = num_trad + 1
first_innovation = num_trad + num_char + 1
type_dict = {}
for school in range(1, num_schools + 1):
    if 1 <= school < first_charter:
        type_dict[school] = 0
    elif first_charter <= school < first_innovation:
        type_dict[school] = 1
    elif first_innovation <= school <= num_schools:
        type_dict[school] = 2

# type_dict2: type number -> list of school IDs (ascending) of that type
type_dict2 = {
    school_type: [school for school in range(1, num_schools + 1)
                  if type_dict[school] == school_type]
    for school_type in [0, 1, 2]
}

# capacity
# school ID -> year -> grade -> number of seats, drawn uniformly from
# capa_low..capa_up (both inclusive) for every school/year/grade.
capacity_dict = {
    school: {
        y: {g: random.randint(capa_low, capa_up) for g in grades}
        for y in years
    }
    for school in range(1, num_schools + 1)
}

# Disabled experiment kept for reference: reduce charter capacity
# (schools 7 and 8) by 30 seats in every year/grade.
            
# advantage
# school ID -> year -> grade -> tie-breaker multiplier for student group 1,
# picked at random from the `advatage` list for every school/year/grade.
advantage_dict = {
    school: {
        y: {g: random.choice(advatage) for g in grades}
        for y in years
    }
    for school in range(1, num_schools + 1)
}
            
# non-Lottery
# school ID -> year -> grade -> 1 if the school is a non-lottery school
# (school IDs 1..num_non_lottery), else 0. The flag is the same for every
# year and grade of a school.
non_lottery_dict = {
    school: {
        y: {g: (1 if school <= num_non_lottery else 0) for g in grades}
        for y in years
    }
    for school in range(1, num_schools + 1)
}
            
# default tie-breaker index
# school ID -> year -> grade -> index of the tie-breaker the school uses:
# lottery schools draw from 1..num_lot_tie, non-lottery schools draw from
# num_lot_tie+1..num_tie_breakers.
default_dict = {
    school: {
        y: {
            g: (random.randint(1, num_lot_tie)
                if non_lottery_dict[school][y][g] == 0
                else random.randint(num_lot_tie + 1, num_tie_breakers))
            for g in grades
        }
        for y in years
    }
    for school in range(1, num_schools + 1)
}

# Put information into a data frame

In [502]:
# Column names of the application table, in the order the row values are
# assembled by the generation loop.
columns = [
    # who applies, when, and to which school
    'Student ID', 'Year', 'Grade', 'Choice Rank', 'School ID',
    # school attributes
    'Treatment', 'Capacity',
    # admission ordering inputs
    'Priority', 'Default Tie-breaker Index', 'Non-lottery',
    'Tie-breaker Student Group Index', 'Advantage',
    'Default Tie-breaker', 'Effective Tie-breaker',
    # outcomes and covariates
    'Outcome 1', 'Outcome 2',
    'Covariate 1', 'Covariate 2', 'Covariate 3',
]

In [503]:
# Create an empty list to store the data.
# The generation loop appends one list of values per (student, ranked school)
# pair; the list is later turned into a DataFrame using `columns`.
data = []

In [504]:
def _take_from_first_nonempty(pools):
    """Remove and return a random school ID from the first non-empty pool.

    `pools` is a sequence of lists in order of preference. The last pool is
    used unconditionally when all earlier ones are empty, so running out of
    schools altogether raises IndexError from random.choice.
    """
    pool = next((candidates for candidates in pools[:-1] if candidates), pools[-1])
    picked = random.choice(pool)
    pool.remove(picked)
    return picked


# Loop through the years, grades, and students to generate the data:
# one row per (student, ranked school) pair is appended to `data`.
count = 0  # number of (year, grade) cohorts processed
for year in years:
    for grade in grades:
        students = year_grade_dict[year][grade]
        count += 1
        for student_id in students:

            # Choice probabilities depend only on the student's motivation,
            # so look them up once per student.
            if mot_dict[student_id] == 1:
                # high-motivation students
                p_char = p_high_char
                p_inno = p_high_inno
            else:
                # low-motivation students
                p_char = p_low_char
                p_inno = p_low_inno

            # Schools not yet ranked by this student, by type
            temp_trad = type_dict2[0][:]
            temp_char = type_dict2[1][:]
            temp_inno = type_dict2[2][:]

            # pick schools (the list order is the student's preference order)
            picks = []
            while len(picks) < max_num_schools_per_student:
                # Two coin tosses per pick, always drawn in this order.
                charter_first = random.uniform(0, 1) < p_char
                inno_over_trad = random.uniform(0, 1) < p_inno

                if charter_first and inno_over_trad:
                    # 1. charter -> innovation -> traditional
                    order = (temp_char, temp_inno, temp_trad)
                elif charter_first:
                    # 2. charter -> traditional -> innovation
                    order = (temp_char, temp_trad, temp_inno)
                elif inno_over_trad:
                    # 3. innovation -> traditional -> charter
                    order = (temp_inno, temp_trad, temp_char)
                else:
                    # 4. traditional -> innovation -> charter
                    order = (temp_trad, temp_inno, temp_char)

                picks.append(_take_from_first_nonempty(order))

            # Loop through the school IDs the student picked
            for rank, school_id in enumerate(picks):
                capacity = capacity_dict[school_id][year][grade] # Seats at this school for this year/grade
                treatment = type_dict[school_id]
                non_lottery = non_lottery_dict[school_id][year][grade]

                # Random regular priority, overridden by 0 with probability p_guarantee
                priority = random.randint(1, num_priority)
                if random.uniform(0,1) < p_guarantee:
                    priority = 0

                tie_breaker_student_group_index = random.randint(0, 1) # Random tie-breaker student group index
                default_tie_breaker_index = default_dict[school_id][year][grade] # Default tie-breaker index

                # Only student group 1 gets the school's advantage multiplier
                if tie_breaker_student_group_index == 0:
                    advantage = 1
                else:
                    advantage = advantage_dict[school_id][year][grade]

                # Lottery schools use the student's lottery draw, non-lottery
                # schools use the student's relative position in the ordering
                if non_lottery == 0:
                    default_tie_breaker = lot_dict[student_id][default_tie_breaker_index]
                else:
                    default_tie_breaker = nonlot_dict2[student_id][default_tie_breaker_index]
                effective_tie_breaker = default_tie_breaker * advantage

                # Outcomes are placeholders here; they are filled in after assignment
                outcome_1 = 0
                outcome_2 = 0
                covariate_1 = covariate_dict[student_id][0]
                covariate_2 = covariate_dict[student_id][1]
                covariate_3 = covariate_dict[student_id][2]

                # Create a list of values for this row (same order as `columns`)
                row = [student_id, year, grade, rank+1, school_id, treatment, capacity, 
                       priority, default_tie_breaker_index, non_lottery, tie_breaker_student_group_index, 
                       advantage, default_tie_breaker, effective_tie_breaker,
                       outcome_1, outcome_2, covariate_1, covariate_2, covariate_3]
                data.append(row)

In [505]:
# Build the application table: one row per (student, ranked school),
# ordered by student and then by the student's preference rank.
df = (
    pd.DataFrame(data, columns = columns)
    .sort_values(['Student ID', 'Choice Rank'])
    .reset_index(drop=True)
)
df

Unnamed: 0,Student ID,Year,Grade,Choice Rank,School ID,Treatment,Capacity,Priority,Default Tie-breaker Index,Non-lottery,Tie-breaker Student Group Index,Advantage,Default Tie-breaker,Effective Tie-breaker,Outcome 1,Outcome 2,Covariate 1,Covariate 2,Covariate 3
0,1,2015,2,1,10,2,194,2,1,0,0,1.0,0.304295,0.304295,0,0,24,2,black
1,1,2015,2,2,3,0,195,1,1,0,0,1.0,0.304295,0.304295,0,0,24,2,black
2,1,2015,2,3,9,2,199,1,2,0,1,0.9,0.438436,0.394592,0,0,24,2,black
3,1,2015,2,4,1,0,180,2,4,1,1,0.9,0.981000,0.882900,0,0,24,2,black
4,1,2015,2,5,5,0,194,2,1,0,1,0.8,0.304295,0.243436,0,0,24,2,black
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59995,12000,2015,3,1,7,1,200,2,1,0,0,1.0,0.161412,0.161412,0,0,87,1,white
59996,12000,2015,3,2,8,1,188,1,2,0,0,1.0,0.378519,0.378519,0,0,87,1,white
59997,12000,2015,3,3,10,2,182,1,2,0,0,1.0,0.378519,0.378519,0,0,87,1,white
59998,12000,2015,3,4,9,2,193,1,2,0,1,0.9,0.378519,0.340667,0,0,87,1,white


# Assign students to schools by DA Algorithm

In [506]:
# Applicant rank = priority + effective tie-breaker; a LOWER value is a better
# rank when the assignment loop compares applicants for a seat.
df['Applicant Rank'] = df['Priority'].add(df['Effective Tie-breaker'])

In [507]:
# Initialise the bookkeeping columns used by the assignment loop:
# nobody has proposed or been matched yet, and every school has all its seats.
initial_state = {
    'Student Status': 'free',
    'School Status': 'free',
    'Assignment': None,
    'Enrollment': None,
}
for column_name, initial_value in initial_state.items():
    df[column_name] = initial_value
df['Temp Capacity'] = df['Capacity']

In [508]:
# Student-proposing deferred acceptance, run separately for every (year, grade).
# Row-level state in `program` (one row per student x ranked school):
#   'Student Status': 'free' (must still propose), 'matched', or 'all rejected'
#   'School Status' : 'free' (not yet proposed to) or 'propose' (already proposed to)
#   'Assignment'    : currently held school ID, copied to all rows of the student
#   'Temp Capacity' : remaining seats, copied to all rows of the school
program_list = []
for y in years:
    for g in grades:
        print ('Year:', y, 'Grade:', g)
        temp = df.loc[(df['Year'] == y) & (df['Grade'] == g)]
        program = temp.copy()

        # iterate cycle until everyone is either matched or all-rejected
        while ('free' in program['Student Status'].values):

            # Students who must propose this round, in ascending ID order.
            # The list is fixed at the start of the round: students displaced
            # during the round propose again in the next round.
            available_students = sorted(list(set(program[program['Student Status'] == 'free']['Student ID'].values)))
            # available_schools = list(set(program[program['Temp Capacity'] > 0]['School ID'].values))

            # one loop = one round
            for student_id in available_students:
                # Snapshot of this student's rows, best choice first
                student_choices = program[program['Student ID'] == student_id].sort_values(by='Choice Rank')

                # Find the student's most preferred school among the ones they haven't applied to yet
                for i, choice in student_choices.iterrows():
                    if (choice['School Status'] == 'free'):
                        school_id = choice['School ID']
                        program.loc[(program['Student ID'] == student_id) & 
                                    (program['School ID'] == school_id), 'School Status'] = 'propose'
                        break
                    # all-rejected: the last-ranked school was already proposed to
                    # (checked against the snapshot taken before this round's proposal)
                    if student_choices.iloc[-1]['School Status'] == 'propose':
                        program.loc[program['Student ID'] == student_id, 'Student Status'] = 'all rejected'

                # apply if he is not just all-rejected
                # (for an all-rejected student `school_id` and `choice` are stale values
                # from an earlier iteration, so they must not be used)
                if (program.loc[program['Student ID'] == student_id]['Student Status'].iloc[0] != 'all rejected'):

                    # If the school has available capacity, accept the student
                    if program[program['School ID'] == school_id]['Temp Capacity'].iloc[0] > 0:
                        program.loc[program['Student ID'] == student_id, 'Student Status'] = 'matched'
                        program.loc[program['Student ID'] == student_id, 'Assignment'] = school_id
                        program.loc[program['School ID'] == school_id, 'Temp Capacity'] -= 1

                    # If the school is already full, compare the applicant ranks of the already admitted students
                    else:    
                        # Rows of the students currently holding a seat at this school
                        current_students = program.loc[(program['Assignment'] == school_id) & (program['School ID'] == school_id)]
                        current_students = current_students.sort_values(by = 'Applicant Rank')

                        # A smaller Applicant Rank is better: the applicant wins only if
                        # strictly better than the worst (largest-rank) admitted student.
                        # Otherwise the applicant stays 'free' and tries the next school
                        # on their list in the following round.
                        if choice['Applicant Rank'] < current_students.iloc[-1]['Applicant Rank']:

                            # Reject the worst-ranked admitted student and accept the current student
                            # (seat count is unchanged, so 'Temp Capacity' is not touched)
                            worst_student_id = current_students.iloc[-1]['Student ID']
                            program.loc[program['Student ID'] == worst_student_id, 'Student Status'] = 'free'
                            program.loc[program['Student ID'] == worst_student_id, 'Assignment'] = None

                            program.loc[program['Student ID'] == student_id, 'Student Status'] = 'matched'
                            program.loc[program['Student ID'] == student_id, 'Assignment'] = school_id

            # show how many students are matched so far
            # (this counts ROWS with status 'matched', i.e. matched students
            # times the number of schools each student ranked)
            print(list(program['Student Status']).count('matched'))
            
        program_list.append(program)

Year: 2015 Grade: 1
5155
5830
6915
7920
9160
9475
9480
9480
9480
9480
9480
9480
9480
9480
9480
9480
9480
9480
9480
9480
9480
9480
9480
9480
9480
9480
9480
Year: 2015 Grade: 2
5370
6095
7100
8015
9060
9400
9430
9475
9500
9510
9510
9510
9510
9510
9510
9510
9510
9510
9510
9510
9510
9510
9510
Year: 2015 Grade: 3
5345
6000
6970
8030
9195
9425
9425
9425
9425
9425
9425
9425
9425
9425
9425
9425
9425
9425
9425
9425
9425
9425
9425
9425
9425
Year: 2016 Grade: 1
5465
6065
7025
8075
9155
9410
9535
9570
9580
9585
9585
9585
9585
9585
9585
9585
9585
9585
9585
9585
9585
9585
9585
9585
9585
9585
Year: 2016 Grade: 2
5275
5995
7060
7950
9165
9475
9580
9625
9650
9660
9665
9675
9675
9675
9675
9685
9685
9685
9690
9690
9690
9690
9690
9690
9690
Year: 2016 Grade: 3
5270
5885
7000
7915
9120
9410
9460
9465
9465
9465
9465
9465
9465
9465
9465
9465
9465
9465
9465
9465
9465
9465
9465
9465
9465
9465
9465
9465
9465
9465
9465
9465
9465
9465
9465
9465
9465
9465
9465
9465


In [482]:
# Stack the per-cohort assignment tables into one frame. No ignore_index is
# passed, so each row keeps the index label it had in `df`.
result = pd.concat(program_list)
result

Unnamed: 0,Student ID,Year,Grade,Choice Rank,School ID,Treatment,Capacity,Priority,Default Tie-breaker Index,Non-lottery,...,Outcome 2,Covariate 1,Covariate 2,Covariate 3,Applicant Rank,Student Status,School Status,Assignment,Enrollment,Temp Capacity
55,12,2015,1,1,8,1,186,1,2,0,...,0,92,1,black,1.335401,matched,propose,7,,0
56,12,2015,1,2,7,1,194,1,1,0,...,0,92,1,black,1.143699,matched,propose,7,,0
57,12,2015,1,3,9,2,181,1,2,0,...,0,92,1,black,1.293476,matched,free,7,,0
58,12,2015,1,4,2,0,183,2,3,1,...,0,92,1,black,2.290500,matched,free,7,,0
59,12,2015,1,5,10,2,181,1,2,0,...,0,92,1,black,1.293476,matched,free,7,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59935,11988,2016,3,1,8,1,194,1,1,0,...,0,96,2,asian,1.843271,matched,propose,4,,0
59936,11988,2016,3,2,7,1,187,2,2,0,...,0,96,2,asian,2.442774,matched,propose,4,,0
59937,11988,2016,3,3,10,2,180,2,1,0,...,0,96,2,asian,2.758944,matched,propose,4,,0
59938,11988,2016,3,4,4,0,187,1,2,0,...,0,96,2,asian,1.309942,matched,propose,4,,0


## Randomness in Enrollment

In [483]:
# For every student: enroll at the assigned school with probability
# p_enroll_assign, otherwise at a random other school from the student's list.
# The chosen school is written to all rows of the student.
for s in all_students:
    rows_of_student = result['Student ID'] == s
    student_rows = result[rows_of_student]
    assigned = student_rows['Assignment'].values[0]

    if random.uniform(0,1) < p_enroll_assign:
        enrolled = assigned
    else:
        # ranked schools other than the assigned one
        alternatives = [school for school in student_rows['School ID'] if school != assigned]
        enrolled = random.choice(alternatives)

    result.loc[rows_of_student, 'Enrollment'] = enrolled

In [484]:
result

Unnamed: 0,Student ID,Year,Grade,Choice Rank,School ID,Treatment,Capacity,Priority,Default Tie-breaker Index,Non-lottery,...,Outcome 2,Covariate 1,Covariate 2,Covariate 3,Applicant Rank,Student Status,School Status,Assignment,Enrollment,Temp Capacity
55,12,2015,1,1,8,1,186,1,2,0,...,0,92,1,black,1.335401,matched,propose,7,2,0
56,12,2015,1,2,7,1,194,1,1,0,...,0,92,1,black,1.143699,matched,propose,7,2,0
57,12,2015,1,3,9,2,181,1,2,0,...,0,92,1,black,1.293476,matched,free,7,2,0
58,12,2015,1,4,2,0,183,2,3,1,...,0,92,1,black,2.290500,matched,free,7,2,0
59,12,2015,1,5,10,2,181,1,2,0,...,0,92,1,black,1.293476,matched,free,7,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59935,11988,2016,3,1,8,1,194,1,1,0,...,0,96,2,asian,1.843271,matched,propose,4,4,0
59936,11988,2016,3,2,7,1,187,2,2,0,...,0,96,2,asian,2.442774,matched,propose,4,4,0
59937,11988,2016,3,3,10,2,180,2,1,0,...,0,96,2,asian,2.758944,matched,propose,4,4,0
59938,11988,2016,3,4,4,0,187,1,2,0,...,0,96,2,asian,1.309942,matched,propose,4,4,0


In [485]:
# Make Assignment and Enrollment dummy
# 1.0 on the row whose school is the student's assigned / enrolled school,
# 0.0 on every other row (including rows of students without an assignment).
# A column-wise comparison replaces the row-by-row .loc loop; the resulting
# float columns are the same.
result['Assignment_dummy'] = (result['Assignment'] == result['School ID']).astype(float)
result['Enrollment_dummy'] = (result['Enrollment'] == result['School ID']).astype(float)

In [486]:
# Keep only the row of each student's assigned school
is_assigned_row = result['Assignment_dummy'] == 1
temp = result[is_assigned_row]
temp

Unnamed: 0,Student ID,Year,Grade,Choice Rank,School ID,Treatment,Capacity,Priority,Default Tie-breaker Index,Non-lottery,...,Covariate 2,Covariate 3,Applicant Rank,Student Status,School Status,Assignment,Enrollment,Temp Capacity,Assignment_dummy,Enrollment_dummy
56,12,2015,1,2,7,1,194,1,1,0,...,1,black,1.143699,matched,propose,7,2,0,1.0,0.0
66,14,2015,1,2,7,1,194,1,1,0,...,2,black,1.060528,matched,propose,7,7,0,1.0,1.0
71,15,2015,1,2,5,0,196,1,2,0,...,2,black,1.678567,matched,propose,5,5,0,1.0,1.0
103,21,2015,1,4,3,0,193,2,2,0,...,1,black,2.677722,matched,propose,3,3,0,1.0,1.0
189,38,2015,1,5,6,0,192,2,1,0,...,0,asian,2.502094,matched,propose,6,6,0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59635,11928,2016,3,1,9,2,189,0,2,0,...,2,black,0.063487,matched,propose,9,9,0,1.0,1.0
59666,11934,2016,3,2,6,0,192,0,2,0,...,0,white,0.382038,matched,propose,6,6,0,1.0,1.0
59688,11938,2016,3,4,1,0,196,1,4,1,...,2,asian,1.180800,matched,propose,1,1,0,1.0,1.0
59705,11942,2016,3,1,8,1,194,1,1,0,...,0,white,1.126799,matched,propose,8,8,0,1.0,1.0


## Experiment

In [274]:
# Number of assigned-school rows per school type (0 = trad, 1 = charter, 2 = inno)
check_trad, check_char, check_inno = (
    temp[temp['Treatment'] == school_type] for school_type in (0, 1, 2)
)
for subset in (check_trad, check_char, check_inno):
    print(len(subset))

6794
2243
2302


## Treatment Effect & Endogeneity

### 1. Baseline (Low motivation + traditional)

In [487]:
# One row per student holding the two outcome columns (still the placeholder
# values at this point); used as the container for the baseline draws.
temp = result.groupby(['Student ID'])[['Outcome 1', 'Outcome 2']].first()
temp

Unnamed: 0_level_0,Outcome 1,Outcome 2
Student ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0,0
2,0,0
3,0,0
4,0,0
5,0,0
...,...,...
11996,0,0
11997,0,0
11998,0,0
11999,0,0


In [488]:
# Baseline outcomes: one standard-normal draw per student and outcome
n_students_with_rows = len(temp)
base1 = np.random.normal(loc=0, scale=1, size=n_students_with_rows)
base2 = np.random.normal(loc=0, scale=1, size=n_students_with_rows)

In [489]:
# Replace the placeholder outcomes by the baseline draws (one per student)
temp = temp.assign(**{'Outcome 1': base1, 'Outcome 2': base2})
temp

Unnamed: 0_level_0,Outcome 1,Outcome 2
Student ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,-2.877724,-0.730294
2,-0.772967,0.189505
3,0.094455,1.261416
4,-0.837246,0.413025
5,-0.062481,1.606479
...,...,...
11996,-1.187084,0.409415
11997,0.170040,-1.562898
11998,0.292887,0.562413
11999,1.425719,-1.895031


In [490]:
# Index the application rows by student so that the per-student outcomes in
# `temp` (also indexed by 'Student ID') can be aligned with them.
# The index is not unique: each student has one row per ranked school.
result2 = result.set_index('Student ID')
result2

Unnamed: 0_level_0,Year,Grade,Choice Rank,School ID,Treatment,Capacity,Priority,Default Tie-breaker Index,Non-lottery,Tie-breaker Student Group Index,...,Covariate 2,Covariate 3,Applicant Rank,Student Status,School Status,Assignment,Enrollment,Temp Capacity,Assignment_dummy,Enrollment_dummy
Student ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12,2015,1,1,8,1,186,1,2,0,1,...,1,black,1.335401,matched,propose,7,2,0,0.0,0.0
12,2015,1,2,7,1,194,1,1,0,0,...,1,black,1.143699,matched,propose,7,2,0,1.0,0.0
12,2015,1,3,9,2,181,1,2,0,1,...,1,black,1.293476,matched,free,7,2,0,0.0,0.0
12,2015,1,4,2,0,183,2,3,1,1,...,1,black,2.290500,matched,free,7,2,0,0.0,1.0
12,2015,1,5,10,2,181,1,2,0,1,...,1,black,1.293476,matched,free,7,2,0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11988,2016,3,1,8,1,194,1,1,0,0,...,2,asian,1.843271,matched,propose,4,4,0,0.0,0.0
11988,2016,3,2,7,1,187,2,2,0,0,...,2,asian,2.442774,matched,propose,4,4,0,0.0,0.0
11988,2016,3,3,10,2,180,2,1,0,1,...,2,asian,2.758944,matched,propose,4,4,0,0.0,0.0
11988,2016,3,4,4,0,187,1,2,0,1,...,2,asian,1.309942,matched,propose,4,4,0,1.0,1.0


In [491]:
# Overwrite 'Outcome 1' / 'Outcome 2' in place with the values from `temp`,
# aligned on the 'Student ID' index.
# NOTE(review): presumably each student's draw is copied to all of that
# student's rows (the index of result2 is duplicated) -- verify on the output.
result2.update(temp)
result2

Unnamed: 0_level_0,Year,Grade,Choice Rank,School ID,Treatment,Capacity,Priority,Default Tie-breaker Index,Non-lottery,Tie-breaker Student Group Index,...,Covariate 2,Covariate 3,Applicant Rank,Student Status,School Status,Assignment,Enrollment,Temp Capacity,Assignment_dummy,Enrollment_dummy
Student ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12,2015,1,1,8,1,186,1,2,0,1,...,1,black,1.335401,matched,propose,7,2,0,0.0,0.0
12,2015,1,2,7,1,194,1,1,0,0,...,1,black,1.143699,matched,propose,7,2,0,1.0,0.0
12,2015,1,3,9,2,181,1,2,0,1,...,1,black,1.293476,matched,free,7,2,0,0.0,0.0
12,2015,1,4,2,0,183,2,3,1,1,...,1,black,2.290500,matched,free,7,2,0,0.0,1.0
12,2015,1,5,10,2,181,1,2,0,1,...,1,black,1.293476,matched,free,7,2,0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11988,2016,3,1,8,1,194,1,1,0,0,...,2,asian,1.843271,matched,propose,4,4,0,0.0,0.0
11988,2016,3,2,7,1,187,2,2,0,0,...,2,asian,2.442774,matched,propose,4,4,0,0.0,0.0
11988,2016,3,3,10,2,180,2,1,0,1,...,2,asian,2.758944,matched,propose,4,4,0,0.0,0.0
11988,2016,3,4,4,0,187,1,2,0,1,...,2,asian,1.309942,matched,propose,4,4,0,1.0,1.0


In [492]:
# Drop the rows of students without an assignment or without an enrollment
result2 = result2.dropna(subset=['Assignment', 'Enrollment'])
result2

Unnamed: 0_level_0,Year,Grade,Choice Rank,School ID,Treatment,Capacity,Priority,Default Tie-breaker Index,Non-lottery,Tie-breaker Student Group Index,...,Covariate 2,Covariate 3,Applicant Rank,Student Status,School Status,Assignment,Enrollment,Temp Capacity,Assignment_dummy,Enrollment_dummy
Student ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12,2015,1,1,8,1,186,1,2,0,1,...,1,black,1.335401,matched,propose,7,2,0,0.0,0.0
12,2015,1,2,7,1,194,1,1,0,0,...,1,black,1.143699,matched,propose,7,2,0,1.0,0.0
12,2015,1,3,9,2,181,1,2,0,1,...,1,black,1.293476,matched,free,7,2,0,0.0,0.0
12,2015,1,4,2,0,183,2,3,1,1,...,1,black,2.290500,matched,free,7,2,0,0.0,1.0
12,2015,1,5,10,2,181,1,2,0,1,...,1,black,1.293476,matched,free,7,2,0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11988,2016,3,1,8,1,194,1,1,0,0,...,2,asian,1.843271,matched,propose,4,4,0,0.0,0.0
11988,2016,3,2,7,1,187,2,2,0,0,...,2,asian,2.442774,matched,propose,4,4,0,0.0,0.0
11988,2016,3,3,10,2,180,2,1,0,1,...,2,asian,2.758944,matched,propose,4,4,0,0.0,0.0
11988,2016,3,4,4,0,187,1,2,0,1,...,2,asian,1.309942,matched,propose,4,4,0,1.0,1.0


In [493]:
# Move 'Student ID' back to a regular column and give the rows a fresh
# 0..n-1 index, so the cells below can address rows by a unique label.
result3 = result2.reset_index()

### Treatment Effect & Endogeneity

In [494]:
# Treatment effects, keyed on the type of the school the student ENROLLED at
# (not the school of the row), so all rows of a student get the same shift.
# Vectorised replacement of the row-by-row .loc loop; the values added are the same.
enrolled_type = result3['Enrollment'].map(type_dict)
outcome_columns = ['Outcome 1', 'Outcome 2']

# treatment effect - charter
result3.loc[enrolled_type == 1, outcome_columns] += treat_char

# treatment effect - innovation
result3.loc[enrolled_type == 2, outcome_columns] += treat_inno

In [495]:
# Endogeneity - high motivation students
# Every row of a student whose mot_dict entry is 1 gets endo_high added to
# both outcomes. The loop variable stays named `i` because a later cell
# reads it after this loop has finished.
for i in result3.index:
    if mot_dict[result3.loc[i, 'Student ID']] != 1:
        continue
    for outcome_col in ('Outcome 1', 'Outcome 2'):
        result3.loc[i, outcome_col] += endo_high

In [None]:
# Endogeneity - favored students
# NOTE(review): this cell has no execution count, so it appears never to have
# been run as part of the saved results.
# NOTE(review): the statement is not inside a loop; `i` is whatever value the
# preceding `for i in result3.index` loop left behind, so at most one row
# would be adjusted.
# NOTE(review): type_dict is indexed by the enrolled school ID in the
# treatment-effect cell, but here it is indexed by the tie-breaker student
# group index. Presumably the group index itself was meant to be compared
# with 1 - confirm before running.
# NOTE(review): only 'Outcome 1' is changed here, unlike the cells above
# which adjust both outcomes - confirm this is intended.
if type_dict[result3.loc[i, 'Tie-breaker Student Group Index']] == 1:
    result3.loc[i, 'Outcome 1'] -= endo_adv * base_sd

# Export data

In [496]:
# Write the generated table to Excel. The row index is written as the first
# column (the import cell further down drops that column again).
result3.to_excel('mock_endo_da_0903-60.xlsx')

In [250]:
# Rows flagged Enrollment_dummy == 1, then split by the 'Treatment' code
# (0, 1, 2) of the school on that row.
temp = result3.loc[result3['Enrollment_dummy'].eq(1)]
temp0, temp1, temp2 = (temp.loc[temp['Treatment'].eq(code)] for code in (0, 1, 2))

In [251]:
# average score 1
# One line per treatment group, in the order Treatment 0, 1, 2.
for enrolled in (temp0, temp1, temp2):
    print(sum(enrolled['Outcome 1']) / len(enrolled))

0.1759043544587058
0.6739169854427438
0.5750193010264989


In [252]:
# average score 2
# One line per treatment group, in the order Treatment 0, 1, 2.
for enrolled in (temp0, temp1, temp2):
    print(sum(enrolled['Outcome 2']) / len(enrolled))

0.19636169924330318
0.6799774540588839
0.6039364439747547


## ratio check

In [253]:
# Within each enrolled group, keep the rows of students whose mot_dict
# entry equals 1.
temp0_high = temp0.loc[temp0['Student ID'].map(mot_dict).eq(1)]
temp1_high = temp1.loc[temp1['Student ID'].map(mot_dict).eq(1)]
temp2_high = temp2.loc[temp2['Student ID'].map(mot_dict).eq(1)]

In [254]:
# proportion of high motivation students
# Percentage per treatment group, in the order Treatment 0, 1, 2.
group_pairs = ((temp0_high, temp0), (temp1_high, temp1), (temp2_high, temp2))
for high, enrolled in group_pairs:
    print(len(high) / len(enrolled) * 100)

45.76851275668948
66.78981937602627
46.09650843468027


In [255]:
# Mean 'Outcome 1' among high-motivation rows, for Treatment 1 then 2.
for high in (temp1_high, temp2_high):
    print(sum(high['Outcome 1']) / len(high))

0.8170165767970493
0.7637619637835683


In [256]:
# Mean 'Outcome 2' among high-motivation rows, for Treatment 1 then 2.
for high in (temp1_high, temp2_high):
    print(sum(high['Outcome 2']) / len(high))

0.8213415035718841
0.7989384627338464


# Import data & Experiment

In [2078]:
# Load a previously exported dataset and drop its first column (the row index
# that was written out on export).
# NOTE(review): the file read here is 'mock_endo_da_0901-46.xlsx', whereas the
# export cell above writes 'mock_endo_da_0903-60.xlsx' - confirm which file is
# intended.
df = pd.read_excel('mock_endo_da_0901-46.xlsx')
df = df.iloc[:, 1:]
df

Unnamed: 0,Student ID,Year,Grade,Choice Rank,School ID,Treatment,Capacity,Priority,Default Tie-breaker Index,Non-lottery,...,Covariate 2,Covariate 3,Applicant Rank,Student Status,School Status,Assignment,Enrollment,Temp Capacity,Assignment_dummy,Enrollment_dummy
0,2,2015,1,1,7,1,180,2,1,0,...,2,asian,2.904772,matched,propose,5,5,0,0,0
1,2,2015,1,2,5,0,200,1,1,0,...,2,asian,1.904772,matched,propose,5,5,0,1,1
2,2,2015,1,3,2,0,200,1,3,1,...,2,asian,1.503000,matched,free,5,5,0,0,0
3,2,2015,1,4,8,1,183,1,1,0,...,2,asian,1.633340,matched,free,5,5,0,0,0
4,2,2015,1,5,3,0,189,1,1,0,...,2,asian,1.723818,matched,free,5,5,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56805,11987,2016,3,1,1,0,192,1,4,1,...,0,asian,1.618800,matched,propose,1,1,0,1,1
56806,11987,2016,3,2,5,0,189,1,1,0,...,0,asian,1.537450,matched,free,1,1,0,0,0
56807,11987,2016,3,3,7,1,190,1,1,0,...,0,asian,1.767785,matched,free,1,1,0,0,0
56808,11987,2016,3,4,10,2,196,2,1,0,...,0,asian,2.767785,matched,free,1,1,0,0,0


In [2132]:
# Continue with an independent copy of the in-memory result3; this rebinds df,
# replacing whatever was loaded from Excel.
df = result3.copy()

In [2133]:
# Restrict the working table to the 2015, grade 1 cohort.
in_2015 = df['Year'].eq(2015)
in_grade_1 = df['Grade'].eq(1)
df = df.loc[in_2015 & in_grade_1]
df

Unnamed: 0,Student ID,Year,Grade,Choice Rank,School ID,Treatment,Capacity,Priority,Default Tie-breaker Index,Non-lottery,...,Covariate 2,Covariate 3,Applicant Rank,Student Status,School Status,Assignment,Enrollment,Temp Capacity,Assignment_dummy,Enrollment_dummy
0,9,2015,1,1,7,1,194,0,2,0,...,0,black,0.659331,matched,propose,3,3,0,0.0,0.0
1,9,2015,1,2,8,1,196,2,1,0,...,0,black,2.243707,matched,propose,3,3,0,0.0,0.0
2,9,2015,1,3,3,0,188,2,1,0,...,0,black,2.243707,matched,propose,3,3,0,1.0,1.0
3,9,2015,1,4,2,0,182,0,4,1,...,0,black,0.300000,matched,free,3,3,0,0.0,0.0
4,9,2015,1,5,10,2,197,2,2,0,...,0,black,2.941902,matched,free,3,3,0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9475,11996,2015,1,1,3,0,188,2,1,0,...,1,black,2.330964,matched,propose,3,3,0,1.0,1.0
9476,11996,2015,1,2,10,2,197,1,2,0,...,1,black,1.459315,matched,free,3,3,0,0.0,0.0
9477,11996,2015,1,3,6,0,192,2,2,0,...,1,black,2.459315,matched,free,3,3,0,0.0,0.0
9478,11996,2015,1,4,2,0,182,1,4,1,...,1,black,1.730800,matched,free,3,3,0,0.0,0.0


In [2134]:
# Rows flagged Assignment_dummy == 1.
assign = df.loc[df['Assignment_dummy'].eq(1)]
assign

Unnamed: 0,Student ID,Year,Grade,Choice Rank,School ID,Treatment,Capacity,Priority,Default Tie-breaker Index,Non-lottery,...,Covariate 2,Covariate 3,Applicant Rank,Student Status,School Status,Assignment,Enrollment,Temp Capacity,Assignment_dummy,Enrollment_dummy
2,9,2015,1,3,3,0,188,2,1,0,...,0,black,2.243707,matched,propose,3,3,0,1.0,1.0
5,16,2015,1,1,8,1,196,1,1,0,...,1,black,1.144544,matched,propose,8,8,0,1.0,1.0
11,17,2015,1,2,2,0,182,2,4,1,...,1,asian,2.694400,matched,propose,2,2,0,1.0,1.0
19,19,2015,1,5,1,0,184,1,3,1,...,0,asian,1.102550,matched,propose,1,1,0,1.0,1.0
23,21,2015,1,4,1,0,184,0,3,1,...,1,white,0.208000,matched,propose,1,1,0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9455,11973,2015,1,1,7,1,194,1,2,0,...,0,black,1.312788,matched,propose,7,7,0,1.0,1.0
9464,11980,2015,1,5,2,0,182,2,4,1,...,0,asian,2.016000,matched,propose,2,2,0,1.0,1.0
9468,11986,2015,1,4,1,0,184,1,3,1,...,2,white,1.074900,matched,propose,1,1,0,1.0,1.0
9470,11995,2015,1,1,10,2,197,2,2,0,...,1,white,2.523837,matched,propose,10,9,0,1.0,0.0


In [2135]:
# Split the assigned rows by 'Treatment' code: 0 -> assign_trad,
# 1 -> assign_char, 2 -> assign_inno.
assign_trad, assign_char, assign_inno = (
    assign.loc[assign['Treatment'].eq(code)] for code in (0, 1, 2)
)

In [2136]:
# Assigned rows whose school is School ID 8.
assign_8 = assign.loc[assign['School ID'].eq(8)]
assign_8

Unnamed: 0,Student ID,Year,Grade,Choice Rank,School ID,Treatment,Capacity,Priority,Default Tie-breaker Index,Non-lottery,...,Covariate 2,Covariate 3,Applicant Rank,Student Status,School Status,Assignment,Enrollment,Temp Capacity,Assignment_dummy,Enrollment_dummy
5,16,2015,1,1,8,1,196,1,1,0,...,1,black,1.144544,matched,propose,8,8,0,1.0,1.0
200,283,2015,1,1,8,1,196,1,1,0,...,2,black,1.146959,matched,propose,8,7,0,1.0,0.0
230,320,2015,1,1,8,1,196,1,1,0,...,0,asian,1.278229,matched,propose,8,8,0,1.0,1.0
265,359,2015,1,1,8,1,196,1,1,0,...,1,black,1.337810,matched,propose,8,8,0,1.0,1.0
320,412,2015,1,1,8,1,196,1,1,0,...,2,white,1.032766,matched,propose,8,8,0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9210,11649,2015,1,1,8,1,196,2,1,0,...,1,white,2.065734,matched,propose,8,8,0,1.0,1.0
9220,11658,2015,1,1,8,1,196,2,1,0,...,0,asian,2.400850,matched,propose,8,8,0,1.0,1.0
9260,11695,2015,1,1,8,1,196,1,1,0,...,1,black,1.566596,matched,propose,8,8,0,1.0,1.0
9305,11778,2015,1,1,8,1,196,1,1,0,...,1,asian,1.015035,matched,propose,8,8,0,1.0,1.0


In [2137]:
# Distinct choice ranks that occur among the rows assigned to school 8.
set(assign_8['Choice Rank'])

{1, 2, 3}

In [2138]:
# Rows where school 8 is listed as the first choice.
first_choice = df['Choice Rank'].eq(1)
school_8 = df['School ID'].eq(8)
apply_8 = df.loc[first_choice & school_8]
apply_8

Unnamed: 0,Student ID,Year,Grade,Choice Rank,School ID,Treatment,Capacity,Priority,Default Tie-breaker Index,Non-lottery,...,Covariate 2,Covariate 3,Applicant Rank,Student Status,School Status,Assignment,Enrollment,Temp Capacity,Assignment_dummy,Enrollment_dummy
5,16,2015,1,1,8,1,196,1,1,0,...,1,black,1.144544,matched,propose,8,8,0,1.0,1.0
15,19,2015,1,1,8,1,196,2,1,0,...,0,asian,2.774742,matched,propose,1,1,0,0.0,0.0
20,21,2015,1,1,8,1,196,2,1,0,...,1,white,2.633375,matched,propose,1,1,0,0.0,0.0
30,43,2015,1,1,8,1,196,0,1,0,...,0,asian,0.617525,matched,propose,2,10,0,0.0,0.0
60,72,2015,1,1,8,1,196,2,1,0,...,1,black,2.800964,matched,propose,2,2,0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9385,11863,2015,1,1,8,1,196,1,1,0,...,1,asian,1.637411,matched,propose,1,1,0,0.0,0.0
9390,11865,2015,1,1,8,1,196,1,1,0,...,2,black,1.445159,matched,propose,8,8,0,1.0,1.0
9395,11871,2015,1,1,8,1,196,2,1,0,...,1,asian,2.757382,matched,propose,6,6,0,0.0,0.0
9420,11905,2015,1,1,8,1,196,2,1,0,...,1,asian,2.048597,matched,propose,4,4,0,0.0,0.0


In [2139]:
# Summary statistics of 'Applicant Rank' over the rows assigned to school 8.
assign_8['Applicant Rank'].describe()

count    196.000000
mean       1.612072
std        0.554503
min        0.032192
25%        1.202591
50%        1.570039
75%        2.106580
max        2.940442
Name: Applicant Rank, dtype: float64

In [2140]:
# Summary statistics of 'Applicant Rank' over the rows that list school 8 as
# first choice, for comparison with the assigned rows.
apply_8['Applicant Rank'].describe()

count    469.000000
mean       1.828198
std        0.640638
min        0.032192
25%        1.337810
50%        1.894969
75%        2.393638
max        2.968677
Name: Applicant Rank, dtype: float64

## Check if it's (partly) clean

In [15]:
# Show all rows with Priority == 0.
# NOTE(review): the saved output below lists years 2017/2018 and a different
# column set, so it was presumably produced with a different df than the
# 2015 / grade 1 subset built earlier - confirm which dataset is loaded.
df.loc[(df['Priority'] == 0)]

Unnamed: 0,Student ID,Year,Grade,Choice Rank,School ID,Treatment,Capacity,Priority,Default Tie-breaker Index,Non-lottery,...,Default Tie-breaker,Effective Tie-breaker,Assignment,Enrollment,Outcome 1,Outcome 2,Outcome 3,Covariate 1,Covariate 2,Covariate 3
0,1,2017,3,1,7,1,44,0,4,0,...,0.046638,0.041974,1,0,61,1,nocollege,84,0,asian
42,9,2018,2,3,14,0,42,0,18,1,...,0.737355,0.737355,1,0,30,2,college2,99,1,white
68,14,2017,1,4,10,1,45,0,11,1,...,0.269556,0.269556,1,0,12,0,nocollege,83,2,asian
84,17,2017,2,5,2,2,44,0,17,1,...,0.703187,0.703187,1,0,9,1,college2,55,1,black
120,25,2018,3,1,25,0,41,0,4,0,...,0.351395,0.351395,1,1,59,1,college2,51,2,black
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29880,5977,2018,3,1,3,0,45,0,16,1,...,0.025036,0.022532,1,0,90,1,college2,29,1,white
29888,5978,2018,1,4,24,2,42,0,18,1,...,0.170245,0.085123,1,0,20,2,college4,35,1,white
29916,5984,2018,2,2,21,1,44,0,10,0,...,0.503744,0.453370,0,1,87,2,college2,70,2,white
29918,5984,2018,2,4,17,0,41,0,9,0,...,0.343394,0.343394,1,0,87,2,college2,70,2,white


In [16]:
# Show all applications to school 15 in 2017 (all grades).
df.loc[(df['School ID'] == 15) & (df['Year'] == 2017)]

Unnamed: 0,Student ID,Year,Grade,Choice Rank,School ID,Treatment,Capacity,Priority,Default Tie-breaker Index,Non-lottery,...,Default Tie-breaker,Effective Tie-breaker,Assignment,Enrollment,Outcome 1,Outcome 2,Outcome 3,Covariate 1,Covariate 2,Covariate 3
4,1,2017,3,5,15,0,44,5,18,1,...,0.806738,0.403369,0,0,61,1,nocollege,84,0,asian
165,34,2017,3,1,15,0,44,0,18,1,...,0.501119,0.250559,1,0,82,2,college4,48,1,black
269,54,2017,3,5,15,0,44,4,18,1,...,0.204144,0.102072,0,0,35,2,nocollege,87,0,black
339,68,2017,1,5,15,0,44,5,9,0,...,0.836572,0.836572,0,1,33,2,college2,87,1,asian
340,69,2017,2,1,15,0,41,1,10,0,...,0.043625,0.030538,1,1,98,1,college4,66,1,white
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29725,5946,2017,3,1,15,0,44,4,18,1,...,0.367194,0.367194,1,1,3,1,college2,75,0,black
29820,5965,2017,2,1,15,0,41,2,10,0,...,0.370495,0.370495,0,0,72,0,college2,76,0,white
29861,5973,2017,2,2,15,0,41,5,10,0,...,0.791994,0.791994,0,0,62,0,nocollege,66,2,asian
29930,5987,2017,1,1,15,0,44,2,9,0,...,0.014310,0.014310,0,0,76,2,college2,75,2,black


In [17]:
# Show the 2017 rows of student 1502 (one row per ranked school).
df.loc[(df['Student ID'] == 1502) & (df['Year'] == 2017)]

Unnamed: 0,Student ID,Year,Grade,Choice Rank,School ID,Treatment,Capacity,Priority,Default Tie-breaker Index,Non-lottery,...,Default Tie-breaker,Effective Tie-breaker,Assignment,Enrollment,Outcome 1,Outcome 2,Outcome 3,Covariate 1,Covariate 2,Covariate 3
7505,1502,2017,3,1,14,2,40,3,17,1,...,0.758016,0.758016,0,0,24,1,college2,98,2,black
7506,1502,2017,3,2,18,1,41,1,20,1,...,0.4446,0.4446,0,0,24,1,college2,98,2,black
7507,1502,2017,3,3,12,1,41,5,2,0,...,0.645095,0.451566,0,0,24,1,college2,98,2,black
7508,1502,2017,3,4,11,0,45,4,9,0,...,0.231134,0.231134,1,1,24,1,college2,98,2,black
7509,1502,2017,3,5,16,0,40,1,5,0,...,0.032324,0.032324,0,0,24,1,college2,98,2,black


# 2. Create issues that result in errors / warnings

## (Erroneous cells are marked in red in the Excel file)

## (1) Inconsistency within a student

### (1.1) Inconsistent grade within a student

Student ID 1 \
Grade \
index 2

### (1.2) Inconsistent Outcomes within a student

Student ID 11 \
Outcome 1 \
index 50

### (1.3) Inconsistent Default Tie-breaker within a (Student ID, Default Tie-breaker Index) pair

In [None]:
# Show every row of student 8, the student used for issue (1.3).
df.loc[(df['Student ID'] == 8)]

Student ID 8 \
Default Tie-breaker \
index 38 and a lot more due to checkpoint #12 (correlation, orange mark in excel)

## (2) Inconsistency within a (school, year, grade)

### (2.1) Inconsistent Treatment value within a (School, Year, Grade)

In [None]:
# Show the rows of the (school 4, 2017, grade 3) cell used for issue (2.1).
df.loc[(df['School ID'] == 4) & (df['Year'] == 2017) & (df['Grade'] == 3)]

School ID 4, 25 \
Treatment \
index 13, 25

### (2.2) Inconsistent Capacity within a (School, Year, Grade)

In [None]:
# Show the rows of the (school 7, 2018, grade 2) cell used for issue (2.2).
df.loc[(df['School ID'] == 7) & (df['Year'] == 2018) & (df['Grade'] == 2)]

School ID 7 \
Capacity \
index 31

### (2.3) Inconsistent advantage within a (School, Year, Grade)

In [None]:
# Show the rows of the (school 16, 2017, grade 1) cell used for issue (2.3).
df.loc[(df['School ID'] == 16) & (df['Year'] == 2017) & (df['Grade'] == 1)]

School ID 16 \
Advantage \
index 40

### (3) A student chose multiple schools for the same rank

Student ID 2, 3 \
Choice Rank \
index 6, 7, 12, 13

### (4) A student chose the same school for multiple ranks

Student ID 3, 4 \
School ID \
index 12, 13, 15, 16

### (5) Non-consecutive Choice Rank (e.g., 1,2,4)

Student ID 5 \
Choice Rank \
index 22, 23, 24

## (6) Multiple Assignment / Enrollment

### (6.1) A student is assigned to multiple schools

Student ID 12 \
Assignment \
index 55, 56

### (6.2) A student is enrolled in multiple schools

Student ID 13 \
Enrollment \
index 60, 61

## (7) Over capacity 

### (7.1) A (school, year, grade) is assigned more students than its capacity and contains at least one student who is not guaranteed an assignment

In [None]:
# Display over_assign (defined outside this section); per the note below it
# is expected to list the over-assigned schools.
over_assign

In [None]:
# Show the rows of the (school 6, 2018, grade 3) cell used for issue (7.1).
df.loc[(df['School ID'] == 6) & (df['Year'] == 2018) & (df['Grade'] == 3)]

In [None]:
# Sum of 'Assignment' over the same (school 6, 2018, grade 3) rows.
# Presumably 'Assignment' is a 0/1 flag in this dataset, so the sum is the
# number of assigned students to compare against Capacity - confirm.
df.loc[(df['School ID'] == 6) & (df['Year'] == 2018) & (df['Grade'] == 3)]['Assignment'].sum()

Most students have non-zero priority.

Thus, Stata should return the 3 schools (6, 15, 16) in the over_assign list.

### (7.2) A (school, year, grade) enrolls more students than its capacity and contains at least one student who is not guaranteed an assignment

In [None]:
# Display over_enroll (defined outside this section); per the note below it
# is expected to list the over-enrolled schools.
over_enroll

Stata should return these two schools

### (8) A student is not assigned to a school although the student was guaranteed an assignment to that school and she was not assigned to any school she prefers to that school.

Student ID 15 \
Assignment \
index 74

### (9) A student is assigned to school s, even though (1) she prefers school s' to s, (2) her applicant position at s' was better than her position at s, (3) there were still available spots at s', and (4) she is eligible at s'

There should be a lot of cases of this. The following case should be included: \
Student ID 16 \
School ID 15 \
while she prefers schools 3 and 24, and her applicant position is better at those schools. \
Assignment \
index 78

### (10) Abnormally large value (e.g., greater than 10 × mean of that column) found in a column that is unlikely to have a huge outlier

Student ID 18 \
Grade \
index 85-89

### (11) A school uses non-lottery tie-breaker, and correlation between Priority and Tie-breaker within the (school, year, grade) approximates 1

School ID 1 \
Year 2017 \
Grade 1 \
Orange