# Mock data generation

In [356]:
import pandas as pd
import numpy as np
import random
from collections import Counter

In [357]:
random.seed(10)

# Generate a dataset (without error)

## Setting

- 2 years (2015, 2016)
- 3 grades each year (1, 2, 3)
- 1000 students each grade
- Each student ranks 5 schools
- 10 schools
    - 1-6: traditional
        - 1-2: non-lottery
        - 3-6: lottery
    - 7-8: charter
        - lottery
    - 9-10: innovation
        - lottery
- Capacity at each school: random between 100-110
- 4 default tie-breaker indices (1 - 11)
    - Lottery: 1 - 2
    - Non-lottery: 3 - 4
        - unfair coin toss (favor high motivation students)
            - 3: 0.8
            - 4: 0.7
- 2 tie-breaker student groups (0, 1)
    - 3 types of advantage (0.1, 0.3, 0.5) randomly applied to student group 1
- Priority: random among (0,1,2) (0: 5%)
    - Priority = 0: guaranteed assignment
- Outcome 1,2
    - math (continuous)
    - read (continuous)
- Covariate 1,2,3
    - IQ (continuous)
    - gender (categorical (integer))
    - race (categorical (string))

## Parameters

In [358]:
#========Years & Grades===========

# 1. number of years
num_years = 2
years = [2015 + x for x in range(num_years)]

# 2. number of grades
num_grades = 3
grades = [x for x in range(1, num_grades+1)]

#========Students===========

# 3. number of students in each (year, grade)
num_students_per_grade = 1000

# generate total student pool
num_total_students = len(years) * len(grades) * num_students_per_grade
all_students = list(range(1, num_total_students + 1))
random.shuffle(all_students)

# 4. ratio of high motivation students
p_high_motivation = 0.5

# 5. ratio of low motivation students
p_low_motivation = 1 - p_high_motivation

#========Schools===========

# 6. number of schools
num_schools = 10

# 7. number of traditional schools
num_trad = 6

# 8. number of charter schools
num_char = 2

# 9. number of innovation schools
num_inno = 2

# 10. number of non-lottery schools       
    # School ID 1 ~ num_non_lottery will be non-lottery schools
num_non_lottery = 1

# 11. max number of schools that a student can rank
max_num_schools_per_student = 5

# 12. lower bound of school capacity
capa_low = 100

# 13. upper bound of school capacity
capa_up = 110

#========Choice===========

# probability of a student ranking a charter school
    # if all the charter schools are consumed, they just choose among traditional and innovation randomly
    # regardless of these parameters

# 14. probability of a high-motivation student ranking a charter school
p_high_charter = 0.7        # prob. high motivated students pick charter

# 15. probability of a low-motivation student ranking a charter school
p_low_charter = 0.3         # prob. low motivated students pick charter

#========Priority===========

# 16. number of priorities (excluding 0 (guaranteed assignment))
num_priority = 2

# 17. proportion of students who are guaranteed assignment (priority = 0)
p_guarantee = 0.05

#========Tie-breaker===========

# 18. number of different lottery tie-breakers
num_lot_tie = 2

# 19. number of different non-lottery tie-breakers
num_nonlot_tie = 2

# number of different tie breakers
num_tie_breakers = num_lot_tie + num_nonlot_tie

# 20. probability of choosing a highly motivated student when forming a ranking for non-lottery tie-breaker
p_nonlot_high = [0.8, 0.7, 0.6]

#========Enrollment===========

# 21. probability of enrollment at assigned school
p_enroll_assign = 0.9

#========Output===========
# All the output variable follows a normal distribution

# baseline output      
# baseline = low motivation students who enrolled at traditional schools

# 22. baseline mean
base_mean = 500

# 23. baseline standard deviation
base_sd = 100

# treatment effect

# 24. high motivation effect
    # high motivation students achieve 0.2 SD higher score compared to low motivation students
treat_high = 0.2       

# endogeneity

# 25. innovation effect
    # Students enrolled an innovation school achieve 0.2 SD higher score compared to traditional school
endo_inno = 0.2   

# 26. charter effect
    # Students enrolled an charter school achieve 0.4 SD higher score compared to traditional school
endo_char = 0.4          

## Assign student-specific information

In [359]:
# year and grade
year_grade_dict = {}
temp = all_students[:]
for y in years:
    year_grade_dict[y] = {}
    for g in grades:
        allocate = temp[:num_students_per_grade]
        year_grade_dict[y][g] = allocate
        temp = [x for x in temp if x not in allocate]

# motivation level
# x = student ID, y = motivation level
mot_dict = {}
for n in range(1, num_total_students+1):
    mot_dict[n] = np.random.choice([0, 1], p = [1 - p_high_motivation, p_high_motivation])

# x = motivation level , y = student ID
mot_dict2 = {}
for n in [0, 1]:
    mot_dict2[n] = []
    for s in range(1, num_total_students+1):
        if mot_dict[s] == n:
            mot_dict2[n].append(s)

# lottery / non-lottery indicator dictionary
tie_dict = {}
for n in range(1, num_tie_breakers + 1):
    if 1 <= n <= num_lot_tie:
        tie_dict[n] = 0
    else:
        tie_dict[n] = 1
            
# lottery tie-breaker value
lot_dict = {}
for n in range(1, num_total_students+1):
    lot_dict[n] = {}
    for t in range(1, num_lot_tie+1):
        lot_dict[n][t] = random.uniform(0, 1)
        
# non-lottery tie-breaker value
# x = year & grade & (non-lottery) tie-breaker index, y = ordered list
nonlot_dict = {}
for y in years:
    nonlot_dict[y] = {}
    for g in grades:
        nonlot_dict[y][g] = {}
        students = year_grade_dict[y][g]
        
        for n in range(num_lot_tie+1, num_tie_breakers + 1):
            
            high = [s for s in students if mot_dict[s] == 1]
            low = [s for s in students if mot_dict[s] == 0]
            
            index = n - num_lot_tie - 1
            nonlot_dict[y][g][n] = []
            while (len(high) + len(low)) > 0:
                if random.uniform(0,1) < p_nonlot_high[index]:
                    if len(high) > 0:
                        choice = random.choice(high)
                        high.remove(choice)
                    else:
                        choice = random.choice(low)
                        low.remove(choice)
                    nonlot_dict[y][g][n].append(choice)
                else:
                    if len(low) > 0:
                        choice = random.choice(low)
                        low.remove(choice)
                    else:
                        choice = random.choice(high)
                        high.remove(choice)
                    nonlot_dict[y][g][n].append(choice)

# x = (non-lottery) tie-breaker index & student ID, y = (non-lottery) tie-breaker value
nonlot_dict2 = {}
for y in years:
    for g in grades:
        for i in year_grade_dict[y][g]:
            nonlot_dict2[i] = {}
            for n in range(num_lot_tie+1, num_tie_breakers + 1):
                rank = nonlot_dict[y][g][n].index(i) / len(nonlot_dict[y][g][n])
                nonlot_dict2[i][n] = rank
                    
# covariate 1, 2, 3
covariate_dict = {}
for n in range(1, num_total_students+1):
    covariate_dict[n] = (random.randint(0, 100), random.randint(0, 2), random.choice(['white', 'black', 'asian']))

## Assign school-specific information

In [360]:
# types (0 = trad, 1 = charter, 2 = inno)
# x = school ID, y = type number
type_dict = {}
for n in range(1, num_schools+1):
    if 1 <= n <= num_trad:
        type_dict[n] = 0
    elif num_trad + 1 <= n <= num_char:
        type_dict[n] = 1
    elif num_char + 1 <= n <= num_schools:
        type_dict[n] = 2
        
# x = type number, y = school ID
type_dict2 = {}
for n in [0, 1, 2]:
    type_dict2[n] = []
    for s in range(1, num_schools+1):
        if type_dict[s] == n:
            type_dict2[n].append(s)

# capacity
capacity_dict = {}
for n in range(1, num_schools+1):
    capacity_dict[n] = {}
    for y in years:
        capacity_dict[n][y] = {}
        for g in grades:
            capacity_dict[n][y][g] = random.randint(capa_low, capa_up)

# test 08-05 (1 non-lottery school case)
capacity_dict[1][2015][1] = 80
capacity_dict[1][2015][2] = 80
capacity_dict[1][2015][3] = 80
capacity_dict[1][2016][1] = 80
capacity_dict[1][2016][2] = 80
capacity_dict[1][2016][3] = 80
            
# treatment
treatment_dict = {}
for n in range(1, num_schools+1):
    treatment_dict[n] = {}
    for y in years:
        treatment_dict[n][y] = {}
        for g in grades:
            treatment_dict[n][y][g] = random.randint(0, 2)
            
# advantage
advantage_dict = {}
for n in range(1, num_schools+1):
    advantage_dict[n] = {}
    for y in years:
        advantage_dict[n][y] = {}
        for g in grades:
            advantage_dict[n][y][g] = random.choice([0.9, 0.8, 0.7])
            
# non-Lottery
non_lottery_dict = {}
for n in range(1, num_schools+1):
    non_lottery_dict[n] = {}
    for y in years:
        non_lottery_dict[n][y] = {}
        for g in grades:
            if 1 <= n <= num_non_lottery:
                non_lottery_dict[n][y][g] = 1
            else:
                non_lottery_dict[n][y][g] = 0
            
# default tie-breaker index
default_dict = {}
for n in range(1, num_schools+1):
    default_dict[n] = {}
    for y in years:
        default_dict[n][y] = {}
        for g in grades:
            if non_lottery_dict[n][y][g] == 0:
                default_dict[n][y][g] = random.randint(1, num_lot_tie)
            else:
                default_dict[n][y][g] = random.randint(num_lot_tie + 1, num_tie_breakers)

# Put information into a data frame

In [361]:
# Define the column names
columns = ['Student ID', 'Year', 'Grade', 'Choice Rank', 'School ID', 'Treatment', 'Capacity', 
           'Priority', 'Default Tie-breaker Index', 'Non-lottery', 'Tie-breaker Student Group Index', 
           'Advantage', 'Default Tie-breaker', 'Effective Tie-breaker',
           'Outcome 1', 'Outcome 2', 'Covariate 1', 'Covariate 2', 'Covariate 3']

In [362]:
# Create an empty list to store the data
data = []

In [363]:
# Loop through the years, grades, and schools to generate the data
count = 0
for year in years:
    for grade in grades:
        students = year_grade_dict[year][grade]
        count += 1
        for student_index in students:
            student_id = student_index
            school_ids = list(range(1, num_schools+1))
            
            # pick schools
            picks = []
            
            temp_trad = type_dict2[0][:]
            temp_char = type_dict2[1][:]
            temp_inno = type_dict2[2][:]
            temp_non_char = temp_trad + temp_inno
            
            while len(picks) < max_num_schools_per_student:
                if mot_dict[student_id] == 1:
                    if random.uniform(0, 1) < p_high_charter:
                        if len(temp_char) > 0:
                            choice = random.choice(temp_char)
                            temp_char.remove(choice)
                        else:
                            choice = random.choice(temp_non_char)
                            temp_non_char.remove(choice)  
                    else:
                        if len(temp_non_char) > 0:
                            choice = random.choice(temp_non_char)
                            temp_non_char.remove(choice)
                        else:
                            choice = random.choice(temp_char)
                            temp_char.remove(choice) 
                else:
                    if random.uniform(0, 1) < p_low_charter:
                        if len(temp_char) > 0:
                            choice = random.choice(temp_char)
                            temp_char.remove(choice)
                        else:
                            choice = random.choice(temp_non_char)
                            temp_non_char.remove(choice) 
                    else:
                        if len(temp_non_char) > 0:
                            choice = random.choice(temp_non_char)
                            temp_non_char.remove(choice)
                        else:
                            choice = random.choice(temp_char)
                            temp_char.remove(choice) 
                picks.append(choice)

            # Loop through the school IDs the student picked
            for rank, school_id in enumerate(picks):
                capacity = capacity_dict[school_id][year][grade] # Random capacity between 100 and 110
                treatment = treatment_dict[school_id][year][grade] # Random treatment
                non_lottery = non_lottery_dict[school_id][year][grade]
                
                priority = random.randint(1, num_priority) # Random priority ranking 
                if random.uniform(0,1) < p_guarantee:
                    priority = 0
                
                tie_breaker_student_group_index = random.randint(0, 1) # Random tie-breaker student group index
                default_tie_breaker_index = default_dict[school_id][year][grade] # Default tie-breaker index
                
                if tie_breaker_student_group_index == 0:
                    advantage = 1
                else:
                    advantage = advantage_dict[school_id][year][grade]
                
                if non_lottery == 0:
                    default_tie_breaker = lot_dict[student_id][default_tie_breaker_index]
                else:
                    default_tie_breaker = nonlot_dict2[student_id][default_tie_breaker_index]
                effective_tie_breaker = default_tie_breaker * advantage
                
                outcome_1 = 0
                outcome_2 = 0
                covariate_1 = covariate_dict[student_id][0]
                covariate_2 = covariate_dict[student_id][1]
                covariate_3 = covariate_dict[student_id][2]
                
                # Create a list of values for this row
                row = [student_id, year, grade, rank+1, school_id, treatment, capacity, 
                       priority, default_tie_breaker_index, non_lottery, tie_breaker_student_group_index, 
                       advantage, default_tie_breaker, effective_tie_breaker,
                       outcome_1, outcome_2, covariate_1, covariate_2, covariate_3]
                data.append(row)

In [364]:
df = pd.DataFrame(data, columns = columns)
df = df.sort_values(['Student ID', 'Choice Rank'])
df = df.reset_index(drop=True)
df

Unnamed: 0,Student ID,Year,Grade,Choice Rank,School ID,Treatment,Capacity,Priority,Default Tie-breaker Index,Non-lottery,Tie-breaker Student Group Index,Advantage,Default Tie-breaker,Effective Tie-breaker,Outcome 1,Outcome 2,Covariate 1,Covariate 2,Covariate 3
0,1,2015,2,1,5,2,109,2,1,0,1,0.9,0.888356,0.799521,0,0,1,2,black
1,1,2015,2,2,6,0,101,2,2,0,1,0.7,0.986513,0.690559,0,0,1,2,black
2,1,2015,2,3,10,0,103,2,1,0,1,0.8,0.888356,0.710685,0,0,1,2,black
3,1,2015,2,4,1,2,80,1,3,1,1,0.7,0.425000,0.297500,0,0,1,2,black
4,1,2015,2,5,3,2,107,1,1,0,0,1.0,0.888356,0.888356,0,0,1,2,black
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,6000,2016,1,1,5,2,109,1,1,0,1,0.9,0.694682,0.625214,0,0,46,0,white
29996,6000,2016,1,2,1,0,80,1,3,1,1,0.8,0.723000,0.578400,0,0,46,0,white
29997,6000,2016,1,3,8,0,103,1,2,0,1,0.7,0.225773,0.158041,0,0,46,0,white
29998,6000,2016,1,4,2,0,108,2,2,0,1,0.7,0.225773,0.158041,0,0,46,0,white


# Assign students to schools by DA Algorithm

In [365]:
df['Applicant Rank'] = df['Priority'] + df['Effective Tie-breaker'] 

In [366]:
# Initialize all students and schools as "free"
df['Student Status'] = 'free'
df['School Status'] = 'free'
df['Assignment'] = None
df['Enrollment'] = None
df['Temp Capacity'] = df['Capacity']

In [367]:
program_list = []
for y in years:
    for g in grades:
        print ('Year:', y, 'Grade:', g)
        temp = df.loc[(df['Year'] == y) & (df['Grade'] == g)]
        program = temp.copy()

        while 'free' in program['Student Status'].values:
            
            # Update available students and available schools
            available_students = list(set(program[program['Student Status'] == 'free']['Student ID'].values))
            available_schools = list(set(program[program['Temp Capacity'] > 0]['School ID'].values))
            
            # one loop = one round
            for student_id in available_students:
                student_choices = program[program['Student ID'] == student_id].sort_values(by='Choice Rank')

                # Find the student's most preferred school among the ones they haven't applied to yet
                for _, choice in student_choices.iterrows():
                    if (choice['School ID'] in available_schools) and (choice['School Status'] == 'free'):
                        school_id = choice['School ID']
                        program.loc[(program['Student ID'] == student_id) & 
                                    (program['School ID'] == school_id), 'School Status'] = 'propose'
                        break
            
                # If the school has available capacity, accept the student
                if program[program['School ID'] == school_id]['Temp Capacity'].iloc[0] > 0:
                    program.loc[program['Student ID'] == student_id, 'Student Status'] = 'matched'
                    program.loc[program['Student ID'] == student_id, 'Assignment'] = school_id
                    program.loc[program['School ID'] == school_id, 'Temp Capacity'] -= 1

                # If the school is already full, compare the applicant ranks of the already admitted students
                else:    
                    current_students = program.loc[(program['Assignment'] == school_id)]
                    current_students = current_students.sort_values(by = 'Applicant Rank')

                    # Check if the current student has a higher rank than the worst-ranked admitted student
                    if choice['Applicant Rank'] < current_students.iloc[-1]['Applicant Rank']:

                        # Reject the worst-ranked admitted student and accept the current student
                        worst_student_id = current_students.iloc[-1]['Student ID']
                        program.loc[program['Student ID'] == worst_student_id, 'Student Status'] = 'free'
                        program.loc[program['Student ID'] == worst_student_id, 'Assignment'] = None

                        program.loc[program['Student ID'] == student_id, 'Student Status'] = 'matched'
                        program.loc[program['Student ID'] == student_id, 'Assignment'] = school_id

            print(list(program['Student Status']).count('matched'))
        
        program_list.append(program)

Year: 2015 Grade: 1
4850
4965
5000
Year: 2015 Grade: 2
4805
4965
4995
5000
Year: 2015 Grade: 3
4735
4910
5000
Year: 2016 Grade: 1
4825
4970
5000
Year: 2016 Grade: 2
4870
5000
Year: 2016 Grade: 3
4720
4960
4990
5000


In [368]:
result = pd.concat(program_list)
result

Unnamed: 0,Student ID,Year,Grade,Choice Rank,School ID,Treatment,Capacity,Priority,Default Tie-breaker Index,Non-lottery,...,Outcome 2,Covariate 1,Covariate 2,Covariate 3,Applicant Rank,Student Status,School Status,Assignment,Enrollment,Temp Capacity
35,8,2015,1,1,4,0,106,2,1,0,...,0,54,2,black,2.989828,matched,propose,6,,0
36,8,2015,1,2,7,0,107,2,1,0,...,0,54,2,black,2.989828,matched,propose,6,,0
37,8,2015,1,3,6,1,106,2,1,0,...,0,54,2,black,2.989828,matched,propose,6,,6
38,8,2015,1,4,5,0,110,1,2,0,...,0,54,2,black,1.388933,matched,free,6,,10
39,8,2015,1,5,8,2,103,1,1,0,...,0,54,2,black,1.890846,matched,free,6,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29980,5997,2016,3,1,8,0,102,2,1,0,...,0,83,1,white,2.036750,matched,propose,8,,0
29981,5997,2016,3,2,10,0,106,1,2,0,...,0,83,1,white,1.244574,matched,free,8,,0
29982,5997,2016,3,3,4,0,105,2,2,0,...,0,83,1,white,2.244574,matched,free,8,,0
29983,5997,2016,3,4,3,0,105,2,1,0,...,0,83,1,white,2.036750,matched,free,8,,0


In [369]:
result.head(20)

Unnamed: 0,Student ID,Year,Grade,Choice Rank,School ID,Treatment,Capacity,Priority,Default Tie-breaker Index,Non-lottery,...,Outcome 2,Covariate 1,Covariate 2,Covariate 3,Applicant Rank,Student Status,School Status,Assignment,Enrollment,Temp Capacity
35,8,2015,1,1,4,0,106,2,1,0,...,0,54,2,black,2.989828,matched,propose,6,,0
36,8,2015,1,2,7,0,107,2,1,0,...,0,54,2,black,2.989828,matched,propose,6,,0
37,8,2015,1,3,6,1,106,2,1,0,...,0,54,2,black,2.989828,matched,propose,6,,6
38,8,2015,1,4,5,0,110,1,2,0,...,0,54,2,black,1.388933,matched,free,6,,10
39,8,2015,1,5,8,2,103,1,1,0,...,0,54,2,black,1.890846,matched,free,6,,0
55,12,2015,1,1,7,0,107,1,1,0,...,0,29,2,asian,1.331506,matched,propose,7,,0
56,12,2015,1,2,1,2,80,2,4,1,...,0,29,2,asian,2.364,matched,free,7,,0
57,12,2015,1,3,6,1,106,2,1,0,...,0,29,2,asian,2.331506,matched,free,7,,6
58,12,2015,1,4,10,0,106,2,1,0,...,0,29,2,asian,2.331506,matched,free,7,,3
59,12,2015,1,5,2,1,104,2,2,0,...,0,29,2,asian,2.092777,matched,free,7,,0


## Randomness in Enrollment

In [370]:
for s in all_students:
    student_choices = list(result[result['Student ID'] == s]['School ID'])
    assigned = result[result['Student ID'] == s]['Assignment'].values[0]
    not_assigned = [x for x in student_choices if x != assigned]
    
    if random.uniform(0,1) < p_enroll_assign:
        result.loc[result['Student ID'] == s, 'Enrollment'] = assigned
    else:
        result.loc[result['Student ID'] == s, 'Enrollment'] = random.choice(not_assigned)

In [371]:
result

Unnamed: 0,Student ID,Year,Grade,Choice Rank,School ID,Treatment,Capacity,Priority,Default Tie-breaker Index,Non-lottery,...,Outcome 2,Covariate 1,Covariate 2,Covariate 3,Applicant Rank,Student Status,School Status,Assignment,Enrollment,Temp Capacity
35,8,2015,1,1,4,0,106,2,1,0,...,0,54,2,black,2.989828,matched,propose,6,6,0
36,8,2015,1,2,7,0,107,2,1,0,...,0,54,2,black,2.989828,matched,propose,6,6,0
37,8,2015,1,3,6,1,106,2,1,0,...,0,54,2,black,2.989828,matched,propose,6,6,6
38,8,2015,1,4,5,0,110,1,2,0,...,0,54,2,black,1.388933,matched,free,6,6,10
39,8,2015,1,5,8,2,103,1,1,0,...,0,54,2,black,1.890846,matched,free,6,6,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29980,5997,2016,3,1,8,0,102,2,1,0,...,0,83,1,white,2.036750,matched,propose,8,8,0
29981,5997,2016,3,2,10,0,106,1,2,0,...,0,83,1,white,1.244574,matched,free,8,8,0
29982,5997,2016,3,3,4,0,105,2,2,0,...,0,83,1,white,2.244574,matched,free,8,8,0
29983,5997,2016,3,4,3,0,105,2,1,0,...,0,83,1,white,2.036750,matched,free,8,8,0


In [372]:
# Make Assignment and Enrollment dummy
for i in result.index:
    if result.loc[i, 'Assignment'] == result.loc[i, 'School ID']:
        result.loc[i, 'Assignment_dummy'] = 1
    else:
        result.loc[i, 'Assignment_dummy'] = 0
        
    if result.loc[i, 'Enrollment'] == result.loc[i, 'School ID']:
        result.loc[i, 'Enrollment_dummy'] = 1
    else:
        result.loc[i, 'Enrollment_dummy'] = 0

## Guaranteed assignment & Check Capacity

In [840]:
# Guaranteed assignment
for i in range(1, num_total_students+1):
    df_new = df.loc[df['Student ID'] == i]
    for j in df_new.index:
        if df.loc[j, 'Priority'] == 0:
            df.loc[j, 'Assignment'] = 1
            a = list(df_new.index)
            a.remove(j)
            for k in a:
                df.loc[k, 'Assignment'] = 0

In [841]:
# Capacity
over_assign = []
over_enroll = []
for n in range(1, num_schools+1):
    for y in years:
        for g in grades:
            if df.loc[(df['School ID'] == n) & (df['Year'] == y) & (df['Grade'] == g)]['Assignment'].sum() > capacity_dict[n][y][g]:
                over_assign.append((n,y,g))
            if df.loc[(df['School ID'] == n) & (df['Year'] == y) & (df['Grade'] == g)]['Enrollment'].sum() > capacity_dict[n][y][g]:
                over_enroll.append((n,y,g))

In [842]:
len(over_assign)

0

In [843]:
len(over_enroll)

0

## Treatment Effect & Endogeneity

### 1. Baseline (Low motivation + traditional)

In [373]:
temp = result.groupby(['Student ID']).first()
temp = temp[['Outcome 1', 'Outcome 2']]
temp

Unnamed: 0_level_0,Outcome 1,Outcome 2
Student ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0,0
2,0,0
3,0,0
4,0,0
5,0,0
...,...,...
5996,0,0
5997,0,0
5998,0,0
5999,0,0


In [374]:
base1 = np.random.normal(base_mean, base_sd, len(temp))
base2 = np.random.normal(base_mean, base_sd, len(temp))

In [375]:
temp['Outcome 1'] = base1
temp['Outcome 2'] = base2
temp

Unnamed: 0_level_0,Outcome 1,Outcome 2
Student ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,475.150465,655.896743
2,523.949035,486.862523
3,567.221058,549.021750
4,218.115821,622.538942
5,527.842253,500.092186
...,...,...
5996,524.041724,403.020673
5997,544.314754,419.237519
5998,579.211967,301.296821
5999,549.961928,466.112136


In [376]:
result2 = result.set_index('Student ID')
result2

Unnamed: 0_level_0,Year,Grade,Choice Rank,School ID,Treatment,Capacity,Priority,Default Tie-breaker Index,Non-lottery,Tie-breaker Student Group Index,...,Covariate 2,Covariate 3,Applicant Rank,Student Status,School Status,Assignment,Enrollment,Temp Capacity,Assignment_dummy,Enrollment_dummy
Student ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,2015,1,1,4,0,106,2,1,0,0,...,2,black,2.989828,matched,propose,6,6,0,0.0,0.0
8,2015,1,2,7,0,107,2,1,0,0,...,2,black,2.989828,matched,propose,6,6,0,0.0,0.0
8,2015,1,3,6,1,106,2,1,0,0,...,2,black,2.989828,matched,propose,6,6,6,1.0,1.0
8,2015,1,4,5,0,110,1,2,0,0,...,2,black,1.388933,matched,free,6,6,10,0.0,0.0
8,2015,1,5,8,2,103,1,1,0,1,...,2,black,1.890846,matched,free,6,6,0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5997,2016,3,1,8,0,102,2,1,0,0,...,1,white,2.036750,matched,propose,8,8,0,1.0,1.0
5997,2016,3,2,10,0,106,1,2,0,1,...,1,white,1.244574,matched,free,8,8,0,0.0,0.0
5997,2016,3,3,4,0,105,2,2,0,1,...,1,white,2.244574,matched,free,8,8,0,0.0,0.0
5997,2016,3,4,3,0,105,2,1,0,0,...,1,white,2.036750,matched,free,8,8,0,0.0,0.0


In [377]:
result2.update(temp)
result2

Unnamed: 0_level_0,Year,Grade,Choice Rank,School ID,Treatment,Capacity,Priority,Default Tie-breaker Index,Non-lottery,Tie-breaker Student Group Index,...,Covariate 2,Covariate 3,Applicant Rank,Student Status,School Status,Assignment,Enrollment,Temp Capacity,Assignment_dummy,Enrollment_dummy
Student ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,2015,1,1,4,0,106,2,1,0,0,...,2,black,2.989828,matched,propose,6,6,0,0.0,0.0
8,2015,1,2,7,0,107,2,1,0,0,...,2,black,2.989828,matched,propose,6,6,0,0.0,0.0
8,2015,1,3,6,1,106,2,1,0,0,...,2,black,2.989828,matched,propose,6,6,6,1.0,1.0
8,2015,1,4,5,0,110,1,2,0,0,...,2,black,1.388933,matched,free,6,6,10,0.0,0.0
8,2015,1,5,8,2,103,1,1,0,1,...,2,black,1.890846,matched,free,6,6,0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5997,2016,3,1,8,0,102,2,1,0,0,...,1,white,2.036750,matched,propose,8,8,0,1.0,1.0
5997,2016,3,2,10,0,106,1,2,0,1,...,1,white,1.244574,matched,free,8,8,0,0.0,0.0
5997,2016,3,3,4,0,105,2,2,0,1,...,1,white,2.244574,matched,free,8,8,0,0.0,0.0
5997,2016,3,4,3,0,105,2,1,0,0,...,1,white,2.036750,matched,free,8,8,0,0.0,0.0


In [378]:
result3 = result2.reset_index()

In [379]:
for i in result3.index:
    # high motivation students
    if mot_dict[result3.loc[i, 'Student ID']] == 1:
        result3.loc[i, 'Outcome 1'] += treat_high * base_sd
        result3.loc[i, 'Outcome 1'] += treat_high * base_sd
    
    # treatment effect - charter
    if type_dict[result3.loc[i, 'Enrollment']] == 1:
        result3.loc[i, 'Outcome 1'] += endo_char * base_sd
        result3.loc[i, 'Outcome 1'] += endo_char * base_sd
        
    # treatment effect - innovation
    if type_dict[result3.loc[i, 'Enrollment']] == 2:
        result3.loc[i, 'Outcome 1'] += endo_inno * base_sd
        result3.loc[i, 'Outcome 1'] += endo_inno * base_sd

In [380]:
result3['Outcome 1']

0        500.773848
1        500.773848
2        500.773848
3        500.773848
4        500.773848
            ...    
29995    584.314754
29996    584.314754
29997    584.314754
29998    584.314754
29999    584.314754
Name: Outcome 1, Length: 30000, dtype: float64

# Export data

In [381]:
result3.to_excel('mock_endo_da_0805-1.xlsx')

# Import data

In [303]:
df = pd.read_excel('mock_endo_da_0803-1.xlsx')
df

Unnamed: 0.1,Unnamed: 0,Student ID,Year,Grade,Choice Rank,School ID,Treatment,Capacity,Priority,Default Tie-breaker Index,...,Covariate 2,Covariate 3,Applicant Rank,Student Status,School Status,Assignment,Enrollment,Temp Capacity,Assignment_dummy,Enrollment_dummy
0,0,8,2015,1,1,4,0,107,2,1,...,2,white,2.989828,matched,propose,6,6,0,0,0
1,1,8,2015,1,2,7,2,104,2,1,...,2,white,2.989828,matched,free,6,6,0,0,0
2,2,8,2015,1,3,6,1,103,2,1,...,2,white,2.989828,matched,propose,6,6,8,1,1
3,3,8,2015,1,4,5,0,103,1,1,...,2,white,1.989828,matched,free,6,6,3,0,0
4,4,8,2015,1,5,8,1,102,1,1,...,2,white,1.692880,matched,free,6,6,5,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,29995,5997,2016,3,1,8,0,106,2,2,...,2,black,2.349392,matched,propose,8,8,0,1,1
29996,29996,5997,2016,3,2,10,2,102,1,2,...,2,black,1.244574,matched,free,8,8,3,0,0
29997,29997,5997,2016,3,3,4,1,103,2,2,...,2,black,2.279513,matched,free,8,8,4,0,0
29998,29998,5997,2016,3,4,3,2,101,2,2,...,2,black,2.349392,matched,free,8,8,0,0,0


In [307]:
test = []
for i in df.index:
    if df.loc[i, 'Non-lottery'] == 1:
        test.append(df.loc[i, 'Student ID'])
test = set(test)

In [309]:
df2 = df[df['Student ID'].isin(test)]
df2

Unnamed: 0.1,Unnamed: 0,Student ID,Year,Grade,Choice Rank,School ID,Treatment,Capacity,Priority,Default Tie-breaker Index,...,Covariate 2,Covariate 3,Applicant Rank,Student Status,School Status,Assignment,Enrollment,Temp Capacity,Assignment_dummy,Enrollment_dummy
5,5,12,2015,1,1,7,2,104,1,1,...,0,white,1.331506,matched,propose,7,7,0,1,1
6,6,12,2015,1,2,1,0,109,2,3,...,0,white,2.187200,matched,free,7,7,9,0,0
7,7,12,2015,1,3,6,1,103,2,1,...,0,white,2.331506,matched,free,7,7,8,0,0
8,8,12,2015,1,4,10,0,101,2,2,...,0,white,2.115972,matched,free,7,7,0,0,0
9,9,12,2015,1,5,2,2,109,2,2,...,0,white,2.104375,matched,free,7,7,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29975,29975,5971,2016,3,1,6,2,108,1,2,...,0,asian,1.343186,matched,propose,6,6,0,1,1
29976,29976,5971,2016,3,2,1,1,105,2,3,...,0,asian,2.009900,matched,free,6,6,0,0,0
29977,29977,5971,2016,3,3,3,2,101,2,2,...,0,asian,2.428983,matched,free,6,6,0,0,0
29978,29978,5971,2016,3,4,5,1,109,1,1,...,0,asian,1.604664,matched,free,6,6,11,0,0


In [317]:
students = set(df2['Student ID'])
for s in students:
    temp = df2[df2['Student ID'] == s]
    for i in temp.index:
        if temp.loc[i, 'Non-lottery'] == 1:
            a = i
            c = temp.loc[i, 'Temp Capacity'] 
        if temp.loc[i, 'Assignment_dummy'] == 1:
            b = i
    if (a < b) and (c > 0):
        print(s)

In [314]:
df2[(df2['Student ID'] == 1)]['Temp Capacity']

5000    14
5001     0
5002     0
5003     0
5004     8
Name: Temp Capacity, dtype: int64

## Check if it's (partly) clean

In [15]:
df.loc[(df['Priority'] == 0)]

Unnamed: 0,Student ID,Year,Grade,Choice Rank,School ID,Treatment,Capacity,Priority,Default Tie-breaker Index,Non-lottery,...,Default Tie-breaker,Effective Tie-breaker,Assignment,Enrollment,Outcome 1,Outcome 2,Outcome 3,Covariate 1,Covariate 2,Covariate 3
0,1,2017,3,1,7,1,44,0,4,0,...,0.046638,0.041974,1,0,61,1,nocollege,84,0,asian
42,9,2018,2,3,14,0,42,0,18,1,...,0.737355,0.737355,1,0,30,2,college2,99,1,white
68,14,2017,1,4,10,1,45,0,11,1,...,0.269556,0.269556,1,0,12,0,nocollege,83,2,asian
84,17,2017,2,5,2,2,44,0,17,1,...,0.703187,0.703187,1,0,9,1,college2,55,1,black
120,25,2018,3,1,25,0,41,0,4,0,...,0.351395,0.351395,1,1,59,1,college2,51,2,black
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29880,5977,2018,3,1,3,0,45,0,16,1,...,0.025036,0.022532,1,0,90,1,college2,29,1,white
29888,5978,2018,1,4,24,2,42,0,18,1,...,0.170245,0.085123,1,0,20,2,college4,35,1,white
29916,5984,2018,2,2,21,1,44,0,10,0,...,0.503744,0.453370,0,1,87,2,college2,70,2,white
29918,5984,2018,2,4,17,0,41,0,9,0,...,0.343394,0.343394,1,0,87,2,college2,70,2,white


In [16]:
df.loc[(df['School ID'] == 15) & (df['Year'] == 2017)]

Unnamed: 0,Student ID,Year,Grade,Choice Rank,School ID,Treatment,Capacity,Priority,Default Tie-breaker Index,Non-lottery,...,Default Tie-breaker,Effective Tie-breaker,Assignment,Enrollment,Outcome 1,Outcome 2,Outcome 3,Covariate 1,Covariate 2,Covariate 3
4,1,2017,3,5,15,0,44,5,18,1,...,0.806738,0.403369,0,0,61,1,nocollege,84,0,asian
165,34,2017,3,1,15,0,44,0,18,1,...,0.501119,0.250559,1,0,82,2,college4,48,1,black
269,54,2017,3,5,15,0,44,4,18,1,...,0.204144,0.102072,0,0,35,2,nocollege,87,0,black
339,68,2017,1,5,15,0,44,5,9,0,...,0.836572,0.836572,0,1,33,2,college2,87,1,asian
340,69,2017,2,1,15,0,41,1,10,0,...,0.043625,0.030538,1,1,98,1,college4,66,1,white
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29725,5946,2017,3,1,15,0,44,4,18,1,...,0.367194,0.367194,1,1,3,1,college2,75,0,black
29820,5965,2017,2,1,15,0,41,2,10,0,...,0.370495,0.370495,0,0,72,0,college2,76,0,white
29861,5973,2017,2,2,15,0,41,5,10,0,...,0.791994,0.791994,0,0,62,0,nocollege,66,2,asian
29930,5987,2017,1,1,15,0,44,2,9,0,...,0.014310,0.014310,0,0,76,2,college2,75,2,black


In [17]:
df.loc[(df['Student ID'] == 1502) & (df['Year'] == 2017)]

Unnamed: 0,Student ID,Year,Grade,Choice Rank,School ID,Treatment,Capacity,Priority,Default Tie-breaker Index,Non-lottery,...,Default Tie-breaker,Effective Tie-breaker,Assignment,Enrollment,Outcome 1,Outcome 2,Outcome 3,Covariate 1,Covariate 2,Covariate 3
7505,1502,2017,3,1,14,2,40,3,17,1,...,0.758016,0.758016,0,0,24,1,college2,98,2,black
7506,1502,2017,3,2,18,1,41,1,20,1,...,0.4446,0.4446,0,0,24,1,college2,98,2,black
7507,1502,2017,3,3,12,1,41,5,2,0,...,0.645095,0.451566,0,0,24,1,college2,98,2,black
7508,1502,2017,3,4,11,0,45,4,9,0,...,0.231134,0.231134,1,1,24,1,college2,98,2,black
7509,1502,2017,3,5,16,0,40,1,5,0,...,0.032324,0.032324,0,0,24,1,college2,98,2,black


# 2. Create issues that result in errors / warnings

## (erroneous cells are marked  red in the excel file)

## (1) Inconsistency within a student

### (1.1) Inconsistent grade within a student

Student ID 1 \
Grade \
index 2

### (1.2) Inconsistent Outcomes within a student

Student ID 11 \
Outcome 1 \
index 50

### (1.3) Inconsistent Default Tie-breaker within a (Student ID, Default Tie-breaker Index) pair

In [None]:
df.loc[(df['Student ID'] == 8)]

Student ID 8 \
Default Tie-breaker \
index 38 and a lot more due to checkpoint #12 (correlation, orange mark in excel)

## (2) Inconsistency within a (school, year, grade)

### (2.1) Inconsistent Treatment value within a (School, Year, Grade)

In [None]:
df.loc[(df['School ID'] == 4) & (df['Year'] == 2017) & (df['Grade'] == 3)]

School ID 4, 25 \
Treatment \
index 13, 25

### (2.2) Inconsistent Capacity within a (School, Year, Grade)

In [None]:
df.loc[(df['School ID'] == 7) & (df['Year'] == 2018) & (df['Grade'] == 2)]

School ID 7 \
Capacity \
index 31

### (2.3) Inconsistent advantage within a (School, Year, Grade) pair

In [None]:
df.loc[(df['School ID'] == 16) & (df['Year'] == 2017) & (df['Grade'] == 1)]

School ID 16 \
Advantage \
index 40

### (3) A student chose multiple schools for the same rank

Student ID 2, 3 \
Choice Rank \
index 6, 7, 12, 13

### (4) A student chose the same school for multiple ranks

Student ID 3, 4 \
School ID \
index 12, 13, 15, 16

### (5) Inconsecutive Choice Rank (e.g., 1,2,4)

Student ID 5 \
Choice Rank \
index 22, 23, 24

## (6) Multiple Assignment / Enrollment

### (6.1) A student is assigned to multiple schools

Student ID 12 \
Assignment \
index 55, 56

### (6.2) A student is enrolled in multiple schools

Student ID 13 \
Enrollment \
index 60, 61

## (7) Over capacity 

### (7.1) A (school, year, grade) is assigned with more students than its capacity and contains at least one student who is not guaranteed an assignment

In [None]:
over_assign

In [None]:
df.loc[(df['School ID'] == 6) & (df['Year'] == 2018) & (df['Grade'] == 3)]

In [None]:
df.loc[(df['School ID'] == 6) & (df['Year'] == 2018) & (df['Grade'] == 3)]['Assignment'].sum()

Most of people have non-zero priority.

Thus, Stata should return the 3 schools (6, 15, 16) in the over_assign list.

### (7.2) A (school, year, grade) pair is enrolled by more students than its capacity and contains at least one student who is not guaranteed an assignment

In [None]:
over_enroll

Stata should return these two schools

### (8) A student is not assigned to a school although the student was guaranteed an assignment to that school and she was not assigned to any school she prefers to that school.

Student ID 15 \
Assignment \
index 74

### (9) A student is assigned to school s, even though (1) she prefers school s' to s, (2) her applicant position at s' was better than her position at s, (3) there were still available spots at s', (4) and she is eligible at s'

There should be a lot of cases of this. The following case should be included: \
Student ID 16 \
School ID 15 \
while she prefers school 3, 24 and her applicant position is better at those schools. \
Assignment \
index 78

### (10) Abnormally large value (e.g., greater than 10 × mean of that column) found in a column that is unlikely to have a huge outlier

Student ID 18 \
Grade \
index 85-89

### (11) A school uses non-lottery tie-breaker, and correlation between Priority and Tie-breaker within the (school, year, grade) approximates 1

School ID 1 \
Year 2017 \
Grade 1 \
Orange