## NCES District Level Cleaning Notebook

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


#pd.options.display.max_rows = 4000
#pd.options.display.max_columns = 200

In [3]:
#re-name the columns for simplicity and merging later
district_columns21 = ['Agency Name', \
                    'State', \
                    'NCES Agency ID', \
                    'County Name',\
                    'County #', \
                    'Total Public Schools 2020-2021', \
                    'Total Operational Public Schools 2020-2021', \
                    'State Agency ID', \
                    'Total Students 2020-2021', \
                    'FTE Teachers 2020-2021', \
                    'Total Staff 2020-2021']

# read in the csv file and treat –, ‡ as NaN
# we will replace † with 0
df_district21 = pd.read_csv('nces data/nces_district_2021.csv',names = district_columns21, na_values=["–", "‡"])
df_district21.replace('†', 0, inplace=True)

In [4]:
df_district21['District #'] = df_district21['State Agency ID'].str[3:]

In [5]:
df_district21.dropna(subset = ["State"], inplace = True)
df_district21 = df_district21.drop(index=3)
df_district21.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1235 entries, 4 to 1238
Data columns (total 12 columns):
Agency Name                                   1235 non-null object
State                                         1235 non-null object
NCES Agency ID                                1235 non-null object
County Name                                   1235 non-null object
County #                                      1235 non-null object
Total Public Schools 2020-2021                1235 non-null object
Total Operational Public Schools 2020-2021    1235 non-null object
State Agency ID                               1235 non-null object
Total Students 2020-2021                      1234 non-null object
FTE Teachers 2020-2021                        1232 non-null object
Total Staff 2020-2021                         1232 non-null object
District #                                    1235 non-null object
dtypes: object(12)
memory usage: 125.4+ KB


In [6]:
num_cols21 = ['NCES Agency ID', \
              'County #', \
              'Total Public Schools 2020-2021', \
              'Total Operational Public Schools 2020-2021', \
              'Total Students 2020-2021', \
              'FTE Teachers 2020-2021', \
              'Total Staff 2020-2021']

for col in num_cols21:
    df_district21[col] = df_district21[col].astype(str)
    df_district21[col] = df_district21[col].map(lambda x: x.lstrip('="').rstrip('"'))
    df_district21[col] = df_district21[col].astype(float)



In [7]:
df_district21.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1235 entries, 4 to 1238
Data columns (total 12 columns):
Agency Name                                   1235 non-null object
State                                         1235 non-null object
NCES Agency ID                                1235 non-null float64
County Name                                   1235 non-null object
County #                                      1235 non-null float64
Total Public Schools 2020-2021                1235 non-null float64
Total Operational Public Schools 2020-2021    1235 non-null float64
State Agency ID                               1235 non-null object
Total Students 2020-2021                      1234 non-null float64
FTE Teachers 2020-2021                        1232 non-null float64
Total Staff 2020-2021                         1232 non-null float64
District #                                    1235 non-null object
dtypes: float64(7), object(5)
memory usage: 125.4+ KB


In [8]:
df_district21['% Operational Schools 2020-2021'] = df_district21['Total Operational Public Schools 2020-2021'] / df_district21['Total Public Schools 2020-2021']

In [9]:
#df_district21.to_csv('nces_district21r.csv', index = None)

# remove instances with 0 students and
# remove instances with no district #
no_students = df_district21[df_district21['Total Students 2020-2021'] == 0]
df_district21.drop(no_students.index, inplace=True)
no_schools = df_district21[df_district21['Total Operational Public Schools 2020-2021'] == 0]
df_district21.drop(no_schools.index, inplace=True)
df_district21.dropna(subset = ['State Agency ID'], inplace = True)
df_district21

Unnamed: 0,Agency Name,State,NCES Agency ID,County Name,County #,Total Public Schools 2020-2021,Total Operational Public Schools 2020-2021,State Agency ID,Total Students 2020-2021,FTE Teachers 2020-2021,Total Staff 2020-2021,District #,% Operational Schools 2020-2021
4,A W BROWN LEADERSHIP ACADEMY,Texas,4800095.0,Dallas County,48113.0,2.0,2.0,TX-057816,1383.0,76.79,170.87,057816,1.000000
5,A+ ACADEMY,Texas,4800203.0,Dallas County,48113.0,2.0,2.0,TX-057829,1456.0,92.87,213.39,057829,1.000000
6,A+ UNLIMITED POTENTIAL,TEXAS,4801453.0,Harris County,48201.0,1.0,1.0,TX-101871,152.0,6.91,13.61,101871,1.000000
7,ABBOTT ISD,Texas,4807380.0,Hill County,48217.0,2.0,1.0,TX-109901,274.0,25.13,41.72,109901,0.500000
8,ABERNATHY ISD,Texas,4807410.0,Hale County,48189.0,4.0,3.0,TX-095901,810.0,72.55,124.29,095901,0.750000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1234,YORKTOWN ISD,Texas,4846650.0,DeWitt County,48123.0,3.0,3.0,TX-062904,524.0,51.22,91.83,062904,1.000000
1235,YSLETA ISD,Texas,4846680.0,El Paso County,48141.0,60.0,58.0,TX-071905,38390.0,2642.51,6119.42,071905,0.966667
1236,ZAPATA COUNTY ISD,Texas,4846710.0,Zapata County,48505.0,6.0,6.0,TX-253901,3486.0,227.95,519.55,253901,1.000000
1237,ZAVALLA ISD,Texas,4846740.0,Angelina County,48005.0,3.0,2.0,TX-003906,300.0,32.14,59.02,003906,0.666667


In [10]:
df_district19 = pd.read_csv('nces data/nces_district_2019.csv', header = 3, na_values=["–", "‡"])

In [11]:
#re-name the columns for simplicity and merging later
district_columns19 = ['Agency Name', \
                    'State', \
                    'NCES Agency ID', \
                    'County Name', \
                    'County #', \
                    'Total Public Schools 2018-2019', \
                    'Total Operational Public Schools 2018-2019', \
                    'State Agency ID', \
                    'Total Students 2018-2019', \
                    'FTE Teachers 2018-2019', \
                    'Total Staff 2018-2019']

# read in the csv file and treat –, ‡ as NaN
# we will replace † with 0
df_district19 = pd.read_csv('nces data/nces_district_2019.csv', header = 3, names = district_columns19, na_values=["–", "‡"])
df_district19.replace('†', 0, inplace=True)
df_district19.dropna(subset = ["State"], inplace = True)
df_district19


Unnamed: 0,Agency Name,State,NCES Agency ID,County Name,County #,Total Public Schools 2018-2019,Total Operational Public Schools 2018-2019,State Agency ID,Total Students 2018-2019,FTE Teachers 2018-2019,Total Staff 2018-2019
0,A W BROWN LEADERSHIP ACADEMY,Texas,4800095.0,Dallas County,48113.0,2,2,TX-057816,2084,115.60,218.40
1,A+ ACADEMY,Texas,4800203.0,Dallas County,48113.0,2,2,TX-057829,1409,86.58,192.28
2,A+ UNLIMITED POTENTIAL,TEXAS,4801453.0,Harris County,48201.0,2,2,TX-101871,180,7.67,10.34
3,ABBOTT ISD,Texas,4807380.0,Hill County,48217.0,2,1,TX-109901,277,24.25,40.00
4,ABERNATHY ISD,Texas,4807410.0,Hale County,48189.0,4,3,TX-095901,780,68.00,124.97
...,...,...,...,...,...,...,...,...,...,...,...
1225,YORKTOWN ISD,Texas,4846650.0,DeWitt County,48123.0,3,3,TX-062904,546,43.19,85.28
1226,YSLETA ISD,Texas,4846680.0,El Paso County,48141.0,61,61,TX-071905,41064,2747.52,6272.21
1227,ZAPATA COUNTY ISD,Texas,4846710.0,Zapata County,48505.0,6,6,TX-253901,3522,226.78,490.74
1228,ZAVALLA ISD,Texas,4846740.0,Angelina County,48005.0,3,2,TX-003906,360,34.35,59.76


In [12]:
df_district19['District #'] = df_district19['State Agency ID'].str[3:]


In [13]:
#Had to additionally convert NCES Agency ID and County # in order to merge 19 & 21

num_cols19 = ['NCES Agency ID', \
              'County #', \
              'Total Public Schools 2018-2019', \
              'Total Operational Public Schools 2018-2019', \
                'Total Students 2018-2019', \
                'FTE Teachers 2018-2019', \
                'Total Staff 2018-2019']

for col in num_cols19:
    df_district19[col] = df_district19[col].astype(str)
    df_district19[col] = df_district19[col].map(lambda x: x.lstrip('="').rstrip('"'))
    df_district19[col] = df_district19[col].astype(float)

In [14]:
df_district19#.info()

Unnamed: 0,Agency Name,State,NCES Agency ID,County Name,County #,Total Public Schools 2018-2019,Total Operational Public Schools 2018-2019,State Agency ID,Total Students 2018-2019,FTE Teachers 2018-2019,Total Staff 2018-2019,District #
0,A W BROWN LEADERSHIP ACADEMY,Texas,4800095.0,Dallas County,48113.0,2.0,2.0,TX-057816,2084.0,115.60,218.40,057816
1,A+ ACADEMY,Texas,4800203.0,Dallas County,48113.0,2.0,2.0,TX-057829,1409.0,86.58,192.28,057829
2,A+ UNLIMITED POTENTIAL,TEXAS,4801453.0,Harris County,48201.0,2.0,2.0,TX-101871,180.0,7.67,10.34,101871
3,ABBOTT ISD,Texas,4807380.0,Hill County,48217.0,2.0,1.0,TX-109901,277.0,24.25,40.00,109901
4,ABERNATHY ISD,Texas,4807410.0,Hale County,48189.0,4.0,3.0,TX-095901,780.0,68.00,124.97,095901
...,...,...,...,...,...,...,...,...,...,...,...,...
1225,YORKTOWN ISD,Texas,4846650.0,DeWitt County,48123.0,3.0,3.0,TX-062904,546.0,43.19,85.28,062904
1226,YSLETA ISD,Texas,4846680.0,El Paso County,48141.0,61.0,61.0,TX-071905,41064.0,2747.52,6272.21,071905
1227,ZAPATA COUNTY ISD,Texas,4846710.0,Zapata County,48505.0,6.0,6.0,TX-253901,3522.0,226.78,490.74,253901
1228,ZAVALLA ISD,Texas,4846740.0,Angelina County,48005.0,3.0,2.0,TX-003906,360.0,34.35,59.76,003906


In [15]:
df_district19['% Operational Schools 2018-2019'] = df_district19['Total Operational Public Schools 2018-2019'] / df_district19['Total Public Schools 2018-2019']
df_district19

Unnamed: 0,Agency Name,State,NCES Agency ID,County Name,County #,Total Public Schools 2018-2019,Total Operational Public Schools 2018-2019,State Agency ID,Total Students 2018-2019,FTE Teachers 2018-2019,Total Staff 2018-2019,District #,% Operational Schools 2018-2019
0,A W BROWN LEADERSHIP ACADEMY,Texas,4800095.0,Dallas County,48113.0,2.0,2.0,TX-057816,2084.0,115.60,218.40,057816,1.000000
1,A+ ACADEMY,Texas,4800203.0,Dallas County,48113.0,2.0,2.0,TX-057829,1409.0,86.58,192.28,057829,1.000000
2,A+ UNLIMITED POTENTIAL,TEXAS,4801453.0,Harris County,48201.0,2.0,2.0,TX-101871,180.0,7.67,10.34,101871,1.000000
3,ABBOTT ISD,Texas,4807380.0,Hill County,48217.0,2.0,1.0,TX-109901,277.0,24.25,40.00,109901,0.500000
4,ABERNATHY ISD,Texas,4807410.0,Hale County,48189.0,4.0,3.0,TX-095901,780.0,68.00,124.97,095901,0.750000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1225,YORKTOWN ISD,Texas,4846650.0,DeWitt County,48123.0,3.0,3.0,TX-062904,546.0,43.19,85.28,062904,1.000000
1226,YSLETA ISD,Texas,4846680.0,El Paso County,48141.0,61.0,61.0,TX-071905,41064.0,2747.52,6272.21,071905,1.000000
1227,ZAPATA COUNTY ISD,Texas,4846710.0,Zapata County,48505.0,6.0,6.0,TX-253901,3522.0,226.78,490.74,253901,1.000000
1228,ZAVALLA ISD,Texas,4846740.0,Angelina County,48005.0,3.0,2.0,TX-003906,360.0,34.35,59.76,003906,0.666667


In [16]:

# remove instances with 0 students and
# remove instances with no district #
no_students = df_district19[df_district19['Total Students 2018-2019'] == 0]
df_district19.drop(no_students.index, inplace=True)
no_schools = df_district19[df_district19['Total Operational Public Schools 2018-2019'] == 0]
df_district19.drop(no_schools.index, inplace=True)
df_district19.dropna(subset = ['State Agency ID'], inplace = True)

In [17]:
df_district1921 = pd.concat([df_district19, df_district21], axis=0, sort=False)

In [18]:
df_district1921#.info()

Unnamed: 0,Agency Name,State,NCES Agency ID,County Name,County #,Total Public Schools 2018-2019,Total Operational Public Schools 2018-2019,State Agency ID,Total Students 2018-2019,FTE Teachers 2018-2019,Total Staff 2018-2019,District #,% Operational Schools 2018-2019,Total Public Schools 2020-2021,Total Operational Public Schools 2020-2021,Total Students 2020-2021,FTE Teachers 2020-2021,Total Staff 2020-2021,% Operational Schools 2020-2021
0,A W BROWN LEADERSHIP ACADEMY,Texas,4800095.0,Dallas County,48113.0,2.0,2.0,TX-057816,2084.0,115.60,218.40,057816,1.00,,,,,,
1,A+ ACADEMY,Texas,4800203.0,Dallas County,48113.0,2.0,2.0,TX-057829,1409.0,86.58,192.28,057829,1.00,,,,,,
2,A+ UNLIMITED POTENTIAL,TEXAS,4801453.0,Harris County,48201.0,2.0,2.0,TX-101871,180.0,7.67,10.34,101871,1.00,,,,,,
3,ABBOTT ISD,Texas,4807380.0,Hill County,48217.0,2.0,1.0,TX-109901,277.0,24.25,40.00,109901,0.50,,,,,,
4,ABERNATHY ISD,Texas,4807410.0,Hale County,48189.0,4.0,3.0,TX-095901,780.0,68.00,124.97,095901,0.75,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1234,YORKTOWN ISD,Texas,4846650.0,DeWitt County,48123.0,,,TX-062904,,,,062904,,3.0,3.0,524.0,51.22,91.83,1.000000
1235,YSLETA ISD,Texas,4846680.0,El Paso County,48141.0,,,TX-071905,,,,071905,,60.0,58.0,38390.0,2642.51,6119.42,0.966667
1236,ZAPATA COUNTY ISD,Texas,4846710.0,Zapata County,48505.0,,,TX-253901,,,,253901,,6.0,6.0,3486.0,227.95,519.55,1.000000
1237,ZAVALLA ISD,Texas,4846740.0,Angelina County,48005.0,,,TX-003906,,,,003906,,3.0,2.0,300.0,32.14,59.02,0.666667


In [19]:
#re-name the columns for simplicity and merging later
grade_columns = ['Agency Name', \
                'State', \
                'Grade 3 2020-2021', \
                'Grade 3 2018-2019', \
                'Grade 4 2020-2021', \
                'Grade 4 2018-2019', \
                'Grade 5 2020-2021', \
                'Grade 5 2018-2019', \
                'Grade 6 2020-2021', \
                'Grade 6 2018-2019', \
                'Grade 7 2020-2021', \
                'Grade 7 2018-2019', \
                'Grade 8 2020-2021', \
                'Grade 8 2018-2019', \
                'State Agency ID 2020-2021', \
                'State Agency ID 2018-2019']

# read in the csv file and treat –, ‡ as NaN
# we will replace † with 0
df_grades = pd.read_csv('nces data/nces_grades.csv', header = 3, names = grade_columns, na_values=["–", "‡", "†"])
#df_grades.replace('†', 0, inplace=True)
#df_grades.dropna(inplace=True)
df_grades['State Agency ID 2020-2021'].fillna(df_grades['State Agency ID 2018-2019'], inplace = True)
df_grades['State Agency ID 2018-2019'].fillna(df_grades['State Agency ID 2020-2021'], inplace = True)
df_grades['District #'] = df_grades['State Agency ID 2020-2021'].str[3:]

In [20]:
df_grades.dropna(subset = ['State Agency ID 2018-2019', 'State Agency ID 2020-2021'], inplace = True)
df_grades.drop(columns = ['State Agency ID 2018-2019', 'State Agency ID 2020-2021'], inplace = True)

In [21]:
grades = ['Grade 3 2020-2021', \
                'Grade 3 2018-2019', \
                'Grade 4 2020-2021', \
                'Grade 4 2018-2019', \
                'Grade 5 2020-2021', \
                'Grade 5 2018-2019', \
                'Grade 6 2020-2021', \
                'Grade 6 2018-2019', \
                'Grade 7 2020-2021', \
                'Grade 7 2018-2019', \
                'Grade 8 2020-2021', \
                'Grade 8 2018-2019']

for col in grades:
    df_grades[col] = df_grades[col].astype(str)
    df_grades[col] = df_grades[col].map(lambda x: x.lstrip('="').rstrip('"'))
    df_grades[col] = df_grades[col].astype(float)
    

In [22]:
df_grades.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19789 entries, 0 to 19788
Data columns (total 15 columns):
Agency Name          19789 non-null object
State                19789 non-null object
Grade 3 2020-2021    15911 non-null float64
Grade 3 2018-2019    16080 non-null float64
Grade 4 2020-2021    15897 non-null float64
Grade 4 2018-2019    16092 non-null float64
Grade 5 2020-2021    15951 non-null float64
Grade 5 2018-2019    16154 non-null float64
Grade 6 2020-2021    16071 non-null float64
Grade 6 2018-2019    16278 non-null float64
Grade 7 2020-2021    15781 non-null float64
Grade 7 2018-2019    16068 non-null float64
Grade 8 2020-2021    15749 non-null float64
Grade 8 2018-2019    16018 non-null float64
District #           19789 non-null object
dtypes: float64(12), object(3)
memory usage: 2.4+ MB


In [23]:
df_merge = pd.merge(df_district1921, df_grades, how = "left", on = ["Agency Name", "District #", "State"])

In [24]:
df_1921 = pd.read_csv('nces data/nces_campus_1921.csv',header = 3, na_values=["–", "‡", "†"])

In [25]:
df_1921.dropna(subset = ["State Name [Public School] Latest available year"], inplace = True)

In [26]:
df_1921

Unnamed: 0,School Name,State Name [Public School] Latest available year,State Agency ID [Public School] 2020-21,State Agency ID [Public School] 2018-19,Virtual School Status (SY 2016-17 onward) [Public School] 2020-21,Virtual School Status (SY 2016-17 onward) [Public School] 2018-19,School-wide Title I [Public School] 2020-21,School-wide Title I [Public School] 2018-19,Title I Eligible School [Public School] 2020-21,Title I Eligible School [Public School] 2018-19,...,Black or African American Students [Public School] 2020-21,Black or African American Students [Public School] 2018-19,White Students [Public School] 2020-21,White Students [Public School] 2018-19,Nat. Hawaiian or Other Pacific Isl. Students [Public School] 2020-21,Nat. Hawaiian or Other Pacific Isl. Students [Public School] 2018-19,Two or More Races Students [Public School] 2020-21,Two or More Races Students [Public School] 2018-19,Total Race/Ethnicity [Public School] 2020-21,Total Race/Ethnicity [Public School] 2018-19
0,21ST CENTURY EARLY LEARNING FOUNDATIONS ACADEMY,TEXAS,TX-108913,TX-108913,,NOTVIRTUAL,1-Yes,1-Yes,1-Yes,1-Yes,...,"=""0""","=""0""",2,11,"=""0""","=""0""","=""0""","=""0""",424,514
1,3D ACADEMY,Texas,TX-108902,TX-108902,,NOTVIRTUAL,1-Yes,1-Yes,1-Yes,1-Yes,...,"=""0""","=""0""","=""0""","=""0""","=""0""","=""0""","=""0""","=""0""",127,158
2,3RD H S,Texas,TX-043914,TX-043914,,,,,,,...,,,,,,,,,,
3,7TH H S - NORTHEAST,Texas,TX-246909,TX-246909,,,,,,,...,,,,,,,,,,
4,A & M CONS H S,Texas,TX-021901,TX-021901,,NOTVIRTUAL,2-No,2-No,1-Yes,1-Yes,...,256,252,988,878,4,"=""0""",60,64,1897,1763
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9701,ZEPHYR SCHOOL,Texas,TX-025906,TX-025906,,NOTVIRTUAL,1-Yes,1-Yes,1-Yes,1-Yes,...,1,1,162,177,"=""0""","=""0""",14,10,199,218
9702,ZILKER EL,Texas,TX-227901,TX-227901,,NOTVIRTUAL,,,2-No,2-No,...,6,7,291,315,1,"=""0""",21,28,443,524
9703,ZUE S BALES INT,Texas,TX-084911,TX-084911,,NOTVIRTUAL,1-Yes,1-Yes,1-Yes,1-Yes,...,12,8,402,428,"=""0""",1,15,26,599,632
9704,ZUNDELOWITZ EL,TEXAS,TX-243905,TX-243905,,NOTVIRTUAL,1-Yes,1-Yes,1-Yes,1-Yes,...,38,38,86,123,"=""0""","=""0""",34,33,512,566


In [27]:
df_2921 = df_1921.drop(columns = ['Virtual School Status (SY 2016-17 onward) [Public School] 2020-21', \
                         'Virtual School Status (SY 2016-17 onward) [Public School] 2018-19', \
                        'American Indian/Alaska Native Students [Public School] 2020-21',\
                       'American Indian/Alaska Native Students [Public School] 2018-19',\
                        'Two or More Races Students [Public School] 2020-21',\
                       'Two or More Races Students [Public School] 2018-19', \
                       'Nat. Hawaiian or Other Pacific Isl. Students [Public School] 2020-21',\
                       'Nat. Hawaiian or Other Pacific Isl. Students [Public School] 2018-19',\
                        'Title I School Status [Public School] 2020-21',\
                        'Title I School Status [Public School] 2018-19'], inplace = True)

In [28]:
new_columns  ={'State Name [Public School] Latest available year' : 'State Name',\
               'State Agency ID [Public School] 2020-21': 'State Agency ID 2020-2021',\
               'State Agency ID [Public School] 2018-19':'State Agency ID 2018-2019',\
               'Free Lunch Eligible [Public School] 2020-21': 'Free Lunch 2020-2021',\
               'Free Lunch Eligible [Public School] 2018-19':'Free Lunch 2018-2019',\
               'Reduced-price Lunch Eligible Students [Public School] 2020-21': 'Reduced-price Lunch 2020-2021', \
               'Reduced-price Lunch Eligible Students [Public School] 2018-19': 'Reduced-price Lunch 2018-2019',\
               'Asian or Asian/Pacific Islander Students [Public School] 2020-21': 'Asian or Asian/Pacific Islander Students 2020-2021',\
               'Asian or Asian/Pacific Islander Students [Public School] 2018-19':'Asian or Asian/Pacific Islander Students 2018-2019',\
               'Hispanic Students [Public School] 2020-21': 'Hispanic Students 2020-2021',\
               'Hispanic Students [Public School] 2018-19' : 'Hispanic Students 2018-2019',\
               'Black or African American Students [Public School] 2020-21' : 'Black or African American Students 2020-2021',\
               'Black or African American Students [Public School] 2018-19': 'Black or African American Students 2018-2019',\
               'White Students [Public School] 2020-21': 'White Students 2020-2021',\
               'White Students [Public School] 2018-19': 'White Students 2018-2019',\
               'Total Race/Ethnicity [Public School] 2020-21' : 'Total Race/Ethnicity 2020-2021',\
               'Total Race/Ethnicity [Public School] 2018-19' : 'Total Race/Ethnicity 2018-2019',\
                'School-wide Title I [Public School] 2020-21': 'School-wide Title I 2020-2021', \
                'School-wide Title I [Public School] 2018-19':'School-wide Title I 2018-2019', \
                'Title I Eligible School [Public School] 2020-21':'Title I Eligible School 2020-2021', \
                'Title I Eligible School [Public School] 2018-19':'Title I Eligible School 2018-2019'}
    
df_1921.rename(columns = new_columns, inplace = True)

In [29]:
columns = ['Title I Eligible School 2020-2021','Title I Eligible School 2018-2019', 'School-wide Title I 2018-2019', 'School-wide Title I 2020-2021']
for col in columns:
    df_1921[col] = df_1921[col].astype(str)
    df_1921[col] = df_1921[col].map(lambda x: x.lstrip('1-').rstrip('"'))
    df_1921[col] = df_1921[col].map(lambda x: x.lstrip('2-').rstrip('"'))

df_1921=df_1921.replace(to_replace = 'Yes', value="1")
df_1921=df_1921.replace(to_replace = 'nan', value="0")
df_1921=df_1921.replace(to_replace = 'No', value="0")
df_1921

Unnamed: 0,School Name,State Name,State Agency ID 2020-2021,State Agency ID 2018-2019,School-wide Title I 2020-2021,School-wide Title I 2018-2019,Title I Eligible School 2020-2021,Title I Eligible School 2018-2019,Free Lunch 2020-2021,Free Lunch 2018-2019,...,Asian or Asian/Pacific Islander Students 2020-2021,Asian or Asian/Pacific Islander Students 2018-2019,Hispanic Students 2020-2021,Hispanic Students 2018-2019,Black or African American Students 2020-2021,Black or African American Students 2018-2019,White Students 2020-2021,White Students 2018-2019,Total Race/Ethnicity 2020-2021,Total Race/Ethnicity 2018-2019
0,21ST CENTURY EARLY LEARNING FOUNDATIONS ACADEMY,TEXAS,TX-108913,TX-108913,1,1,1,1,423.0,512,...,"=""0""","=""0""",422,503,"=""0""","=""0""",2,11,424,514
1,3D ACADEMY,Texas,TX-108902,TX-108902,1,1,1,1,121.0,152,...,"=""0""","=""0""",127,158,"=""0""","=""0""","=""0""","=""0""",127,158
2,3RD H S,Texas,TX-043914,TX-043914,0,0,0,0,,,...,,,,,,,,,,
3,7TH H S - NORTHEAST,Texas,TX-246909,TX-246909,0,0,0,0,,,...,,,,,,,,,,
4,A & M CONS H S,Texas,TX-021901,TX-021901,0,0,1,1,496.0,533,...,135,134,449,427,256,252,988,878,1897,1763
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9701,ZEPHYR SCHOOL,Texas,TX-025906,TX-025906,1,1,1,1,104.0,119,...,"=""0""","=""0""",22,30,1,1,162,177,199,218
9702,ZILKER EL,Texas,TX-227901,TX-227901,0,0,0,0,78.0,72,...,7,10,116,163,6,7,291,315,443,524
9703,ZUE S BALES INT,Texas,TX-084911,TX-084911,1,1,1,1,102.0,87,...,18,24,152,143,12,8,402,428,599,632
9704,ZUNDELOWITZ EL,TEXAS,TX-243905,TX-243905,1,1,1,1,403.0,463,...,3,1,347,366,38,38,86,123,512,566


In [30]:
df_1921['School-wide Title I 2018-2019'].astype(float)
df_1921['School-wide Title I 2020-2021'].astype(float)
df_1921['Title I Eligible School 2018-2019'].astype(float)
df_1921['Title I Eligible School 2020-2021'].astype(float)

0       1.0
1       1.0
2       0.0
3       0.0
4       1.0
       ... 
9701    1.0
9702    0.0
9703    1.0
9704    1.0
9705    1.0
Name: Title I Eligible School 2020-2021, Length: 9706, dtype: float64

In [31]:
no_id = df_1921[df_1921['State Agency ID 2020-2021'] == 0]
no_id

Unnamed: 0,School Name,State Name,State Agency ID 2020-2021,State Agency ID 2018-2019,School-wide Title I 2020-2021,School-wide Title I 2018-2019,Title I Eligible School 2020-2021,Title I Eligible School 2018-2019,Free Lunch 2020-2021,Free Lunch 2018-2019,...,Asian or Asian/Pacific Islander Students 2020-2021,Asian or Asian/Pacific Islander Students 2018-2019,Hispanic Students 2020-2021,Hispanic Students 2018-2019,Black or African American Students 2020-2021,Black or African American Students 2018-2019,White Students 2020-2021,White Students 2018-2019,Total Race/Ethnicity 2020-2021,Total Race/Ethnicity 2018-2019


In [32]:
df_1921['State Agency ID 2020-2021'].fillna(df_1921['State Agency ID 2018-2019'], inplace = True)
df_1921['State Agency ID 2018-2019'].fillna(df_1921['State Agency ID 2020-2021'], inplace = True)

In [33]:
df_1921['State Agency ID 2020-2021'].value_counts(dropna = False)

TX-101912    282
TX-057905    249
TX-220905    150
TX-227901    136
TX-015915    130
            ... 
TX-019910      1
TX-101878      1
TX-049906      1
TX-028906      1
TX-112907      1
Name: State Agency ID 2020-2021, Length: 1219, dtype: int64

In [34]:
df_1921['State Agency ID 2018-2019'].value_counts(dropna = False)

TX-101912    282
TX-057905    249
TX-220905    150
TX-227901    136
TX-015915    130
            ... 
TX-019910      1
TX-101878      1
TX-049906      1
TX-028906      1
TX-112907      1
Name: State Agency ID 2018-2019, Length: 1219, dtype: int64

In [35]:
df_1921.dropna(subset = ['State Agency ID 2018-2019', 'State Agency ID 2020-2021'], inplace = True)

In [36]:
num_cols = ['Free Lunch 2020-2021', \
            'Free Lunch 2018-2019', \
            'Reduced-price Lunch 2020-2021', \
            'Reduced-price Lunch 2018-2019', \
            'Title I Eligible School 2020-2021',\
            'Title I Eligible School 2018-2019',\
            'School-wide Title I 2020-2021', \
            'School-wide Title I 2018-2019', \
            'Asian or Asian/Pacific Islander Students 2020-2021', \
            'Asian or Asian/Pacific Islander Students 2018-2019', \
            'Hispanic Students 2020-2021', \
            'Hispanic Students 2018-2019', \
            'Black or African American Students 2020-2021', \
            'Black or African American Students 2018-2019', \
            'White Students 2020-2021', \
            'White Students 2018-2019', \
            'Total Race/Ethnicity 2020-2021', \
            'Total Race/Ethnicity 2018-2019']

for col in num_cols:
    df_1921[col] = df_1921[col].astype(str)
    df_1921[col] = df_1921[col].map(lambda x: x.lstrip('="').rstrip('"'))
    df_1921[col] = df_1921[col].astype(float)

In [37]:
df_1921['District #'] = df_1921['State Agency ID 2020-2021'].str[3:]
df_1921['District #'].value_counts(dropna = False)


101912    282
057905    249
220905    150
227901    136
015915    130
         ... 
090905      1
025904      1
144903      1
180903      1
111801      1
Name: District #, Length: 1219, dtype: int64

In [38]:
df_1921['School-wide Title I 2018-2019'].value_counts()

1.0    6998
0.0    2708
Name: School-wide Title I 2018-2019, dtype: int64

In [39]:
df_1921['School-wide Title I 2018-2019'].value_counts()

1.0    6998
0.0    2708
Name: School-wide Title I 2018-2019, dtype: int64

In [40]:
df_1921['Title I Eligible School 2020-2021'].value_counts()

1.0    7418
0.0    2288
Name: Title I Eligible School 2020-2021, dtype: int64

In [41]:
df_1921['Title I Eligible School 2018-2019'].value_counts()

1.0    7309
0.0    2397
Name: Title I Eligible School 2018-2019, dtype: int64

In [42]:
#df_1921.fillna(0, inplace = True)

In [43]:
no_students = df_1921[df_1921['Total Race/Ethnicity 2020-2021'] == 0]

no_students = no_students[no_students['Total Race/Ethnicity 2018-2019'] == 0]

df_1921.drop(no_students.index, inplace=True)

In [44]:
df_1921.groupby('District #').agg('sum')

Unnamed: 0_level_0,School-wide Title I 2020-2021,School-wide Title I 2018-2019,Title I Eligible School 2020-2021,Title I Eligible School 2018-2019,Free Lunch 2020-2021,Free Lunch 2018-2019,Reduced-price Lunch 2020-2021,Reduced-price Lunch 2018-2019,Asian or Asian/Pacific Islander Students 2020-2021,Asian or Asian/Pacific Islander Students 2018-2019,Hispanic Students 2020-2021,Hispanic Students 2018-2019,Black or African American Students 2020-2021,Black or African American Students 2018-2019,White Students 2020-2021,White Students 2018-2019,Total Race/Ethnicity 2020-2021,Total Race/Ethnicity 2018-2019
District #,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
001902,2.0,2.0,2.0,3.0,159.0,181.0,43.0,59.0,3.0,3.0,52.0,38.0,22.0,20.0,431.0,478.0,535.0,564.0
001903,4.0,4.0,4.0,4.0,515.0,660.0,56.0,67.0,2.0,4.0,137.0,142.0,53.0,68.0,951.0,981.0,1200.0,1255.0
001904,3.0,3.0,3.0,3.0,390.0,392.0,38.0,40.0,9.0,8.0,62.0,66.0,79.0,66.0,586.0,629.0,769.0,804.0
001906,2.0,2.0,2.0,2.0,142.0,137.0,26.0,38.0,1.0,0.0,45.0,51.0,24.0,31.0,254.0,268.0,339.0,366.0
001907,6.0,6.0,6.0,6.0,2574.0,2316.0,152.0,228.0,26.0,31.0,1445.0,1398.0,883.0,902.0,894.0,936.0,3400.0,3393.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
252902,1.0,1.0,1.0,1.0,113.0,115.0,18.0,13.0,0.0,0.0,47.0,43.0,0.0,0.0,156.0,159.0,210.0,206.0
252903,3.0,3.0,3.0,3.0,351.0,404.0,46.0,59.0,4.0,4.0,259.0,264.0,17.0,15.0,405.0,370.0,707.0,677.0
253901,6.0,6.0,6.0,6.0,3067.0,2989.0,0.0,0.0,3.0,4.0,3461.0,3492.0,0.0,1.0,21.0,23.0,3486.0,3522.0
254901,5.0,5.0,5.0,5.0,1531.0,1630.0,0.0,0.0,3.0,4.0,1810.0,1902.0,9.0,13.0,12.0,7.0,1838.0,1936.0


In [45]:
df_merge = pd.merge(df_merge, df_1921, how="outer", on= ["District #"])

#district_1921.nunique()

In [46]:
df_merge

Unnamed: 0,Agency Name,State,NCES Agency ID,County Name,County #,Total Public Schools 2018-2019,Total Operational Public Schools 2018-2019,State Agency ID,Total Students 2018-2019,FTE Teachers 2018-2019,...,Asian or Asian/Pacific Islander Students 2020-2021,Asian or Asian/Pacific Islander Students 2018-2019,Hispanic Students 2020-2021,Hispanic Students 2018-2019,Black or African American Students 2020-2021,Black or African American Students 2018-2019,White Students 2020-2021,White Students 2018-2019,Total Race/Ethnicity 2020-2021,Total Race/Ethnicity 2018-2019
0,A W BROWN LEADERSHIP ACADEMY,Texas,4800095.0,Dallas County,48113.0,2.0,2.0,TX-057816,2084.0,115.60,...,2.0,1.0,27.0,47.0,720.0,1109.0,1.0,6.0,769.0,1193.0
1,A W BROWN LEADERSHIP ACADEMY,Texas,4800095.0,Dallas County,48113.0,2.0,2.0,TX-057816,2084.0,115.60,...,1.0,0.0,25.0,42.0,577.0,842.0,2.0,1.0,614.0,891.0
2,A W BROWN LEADERSHIP ACADEMY,Texas,4800095.0,Dallas County,48113.0,,,TX-057816,,,...,2.0,1.0,27.0,47.0,720.0,1109.0,1.0,6.0,769.0,1193.0
3,A W BROWN LEADERSHIP ACADEMY,Texas,4800095.0,Dallas County,48113.0,,,TX-057816,,,...,1.0,0.0,25.0,42.0,577.0,842.0,2.0,1.0,614.0,891.0
4,A+ ACADEMY,Texas,4800203.0,Dallas County,48113.0,2.0,2.0,TX-057829,1409.0,86.58,...,5.0,6.0,743.0,730.0,29.0,38.0,17.0,44.0,797.0,823.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19236,,,,,,,,,,,...,,,,,,,,,,
19237,,,,,,,,,,,...,,,,,,,,,,
19238,,,,,,,,,,,...,,,,,,,,,,
19239,,,,,,,,,,,...,,,,,,,,,,


In [47]:
df_merge = df_merge.drop(columns = ['State Agency ID 2018-2019','State Agency ID 2020-2021','School Name', 'State Name' ])

In [48]:
df_merge = df_merge[df_merge['State'].notna()]
df_merge.to_csv('DATA_NCES_DISTRICT.csv', index = None)

In [49]:
del df_merge