# NCES DISTRICT LEVEL CLEANING NOTEBOOK

**v2 UPDATE**  
- Add `Locale [District]` to 'nces_district_2019.csv' and 'nces_district_2021.csv'

***Import libraries:***

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


pd.options.display.max_rows = 4000
pd.options.display.max_columns = 200

Let's make sure we have the data in this folder, in order to read it in properly...

In [2]:
ls

DATA_NCES_DISTRICT.csv           nces_district_cleaning_v2.ipynb
DATA_NCES_DISTRICT_v3.csv        nces_district_cleaning_v3.ipynb
nces_campus_1921.csv             nces_grades.csv
nces_district_2019.csv           [34mold[m[m/
nces_district_2021.csv


In [3]:
# read in the csv file and treat –, ‡ as NaN
df_district21 = pd.read_csv('nces_district_2021.csv', header = 3, na_values=["–", "‡"])

In [4]:
print('Shape:', df_district21.shape)
df_district21.head()

Shape: (1239, 12)


Unnamed: 0,Agency Name,State Name [District] Latest available year,Agency ID - NCES Assigned [District] Latest available year,County Name [District] 2020-21,County Number [District] 2020-21,Total Number Operational Schools [Public School] 2020-21,Total Number of Public Schools [Public School] 2020-21,State Agency ID [District] 2020-21,Total Students All Grades (Excludes AE) [District] 2020-21,Full-Time Equivalent (FTE) Teachers [District] 2020-21,Total Staff [District] 2020-21,Locale [District] 2020-21
0,A W BROWN LEADERSHIP ACADEMY,Texas,4800095.0,Dallas County,48113.0,2,2,TX-057816,1383,76.79,170.87,11-City: Large
1,A+ ACADEMY,Texas,4800203.0,Dallas County,48113.0,2,2,TX-057829,1456,92.87,213.39,11-City: Large
2,A+ UNLIMITED POTENTIAL,TEXAS,4801453.0,Harris County,48201.0,1,1,TX-101871,152,6.91,13.61,11-City: Large
3,ABBOTT ISD,Texas,4807380.0,Hill County,48217.0,1,2,TX-109901,274,25.13,41.72,42-Rural: Distant
4,ABERNATHY ISD,Texas,4807410.0,Hale County,48189.0,3,4,TX-095901,810,72.55,124.29,31-Town: Fringe


In [5]:
list(df_district21.columns)

['Agency Name',
 'State Name [District] Latest available year',
 'Agency ID - NCES Assigned [District] Latest available year',
 'County Name [District] 2020-21',
 'County Number [District] 2020-21',
 'Total Number Operational Schools [Public School] 2020-21',
 'Total Number of Public Schools [Public School] 2020-21',
 'State Agency ID [District] 2020-21',
 'Total Students All Grades (Excludes AE) [District] 2020-21',
 'Full-Time Equivalent (FTE) Teachers [District] 2020-21',
 'Total Staff [District] 2020-21',
 'Locale [District] 2020-21']

In [6]:
#re-name the columns for simplicity and merging later
district_columns21 = ['Agency Name', \
                    'State', \
                    'NCES Agency ID', \
                    'County Name', \
                    'County #', \
                    'Total Operational Public Schools 2020-2021', \
                    'Total Public Schools 2020-2021', \
                    'State Agency ID', \
                    'Total Students 2020-2021', \
                    'FTE Teachers 2020-2021', \
                    'Total Staff 2020-2021',
                     #v2 update
                     'Locale 2020-2021']

# read in the csv file and treat –, ‡ as NaN
# we will replace † with 0
df_district21 = pd.read_csv('nces_district_2021.csv', header = 3, names = district_columns21, na_values=["–", "‡"])
df_district21.replace('†', 0, inplace=True)

In [7]:
df_district21.head()

Unnamed: 0,Agency Name,State,NCES Agency ID,County Name,County #,Total Operational Public Schools 2020-2021,Total Public Schools 2020-2021,State Agency ID,Total Students 2020-2021,FTE Teachers 2020-2021,Total Staff 2020-2021,Locale 2020-2021
0,A W BROWN LEADERSHIP ACADEMY,Texas,4800095.0,Dallas County,48113.0,2,2,TX-057816,1383,76.79,170.87,11-City: Large
1,A+ ACADEMY,Texas,4800203.0,Dallas County,48113.0,2,2,TX-057829,1456,92.87,213.39,11-City: Large
2,A+ UNLIMITED POTENTIAL,TEXAS,4801453.0,Harris County,48201.0,1,1,TX-101871,152,6.91,13.61,11-City: Large
3,ABBOTT ISD,Texas,4807380.0,Hill County,48217.0,1,2,TX-109901,274,25.13,41.72,42-Rural: Distant
4,ABERNATHY ISD,Texas,4807410.0,Hale County,48189.0,3,4,TX-095901,810,72.55,124.29,31-Town: Fringe


In [8]:
df_district21['District #'] = df_district21['State Agency ID'].str[3:]

In [9]:
df_district21.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1239 entries, 0 to 1238
Data columns (total 13 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   Agency Name                                 1239 non-null   object 
 1   State                                       1235 non-null   object 
 2   NCES Agency ID                              1235 non-null   float64
 3   County Name                                 1235 non-null   object 
 4   County #                                    1235 non-null   float64
 5   Total Operational Public Schools 2020-2021  1235 non-null   object 
 6   Total Public Schools 2020-2021              1235 non-null   object 
 7   State Agency ID                             1235 non-null   object 
 8   Total Students 2020-2021                    1234 non-null   object 
 9   FTE Teachers 2020-2021                      1232 non-null   object 
 10  Total Staff 

In [10]:
num_cols21 = ['Total Public Schools 2020-2021', \
                    'Total Operational Public Schools 2020-2021', \
                    'Total Students 2020-2021', \
                    'FTE Teachers 2020-2021', \
                    'Total Staff 2020-2021']

for col in num_cols21:
    df_district21[col] = df_district21[col].astype(str)
    df_district21[col] = df_district21[col].map(lambda x: x.lstrip('="').rstrip('"'))
    df_district21[col] = df_district21[col].astype(float)

In [11]:
df_district21.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1239 entries, 0 to 1238
Data columns (total 13 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   Agency Name                                 1239 non-null   object 
 1   State                                       1235 non-null   object 
 2   NCES Agency ID                              1235 non-null   float64
 3   County Name                                 1235 non-null   object 
 4   County #                                    1235 non-null   float64
 5   Total Operational Public Schools 2020-2021  1235 non-null   float64
 6   Total Public Schools 2020-2021              1235 non-null   float64
 7   State Agency ID                             1235 non-null   object 
 8   Total Students 2020-2021                    1234 non-null   float64
 9   FTE Teachers 2020-2021                      1232 non-null   float64
 10  Total Staff 

In [12]:
# df_district21['% Operational Schools 2020-2021'] = df_district21['Total Operational Public Schools 2020-2021'] / df_district21['Total Public Schools 2020-2021']

In [13]:
#df_district21.to_csv('nces_district21r.csv', index = None)

# drop redundant columns
df_district21.drop(columns = ['County Name'], inplace = True)

# remove instances with 0 students and
# remove instances with no district #
no_students = df_district21[df_district21['Total Students 2020-2021'] == 0]
df_district21.drop(no_students.index, inplace=True)
no_schools = df_district21[df_district21['Total Operational Public Schools 2020-2021'] == 0]
df_district21.drop(no_schools.index, inplace=True)
df_district21.dropna(subset = ['State Agency ID'], inplace = True)


In [14]:
df_district21.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1207 entries, 0 to 1234
Data columns (total 12 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   Agency Name                                 1207 non-null   object 
 1   State                                       1207 non-null   object 
 2   NCES Agency ID                              1207 non-null   float64
 3   County #                                    1207 non-null   float64
 4   Total Operational Public Schools 2020-2021  1207 non-null   float64
 5   Total Public Schools 2020-2021              1207 non-null   float64
 6   State Agency ID                             1207 non-null   object 
 7   Total Students 2020-2021                    1207 non-null   float64
 8   FTE Teachers 2020-2021                      1204 non-null   float64
 9   Total Staff 2020-2021                       1204 non-null   float64
 10  Locale 2020-

In [15]:
# remove instances with no FTE Teachers #

no_teachers = df_district21[df_district21['FTE Teachers 2020-2021'].isna()]
df_district21.drop(no_teachers.index, inplace=True)

In [16]:
df_district21.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1204 entries, 0 to 1234
Data columns (total 12 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   Agency Name                                 1204 non-null   object 
 1   State                                       1204 non-null   object 
 2   NCES Agency ID                              1204 non-null   float64
 3   County #                                    1204 non-null   float64
 4   Total Operational Public Schools 2020-2021  1204 non-null   float64
 5   Total Public Schools 2020-2021              1204 non-null   float64
 6   State Agency ID                             1204 non-null   object 
 7   Total Students 2020-2021                    1204 non-null   float64
 8   FTE Teachers 2020-2021                      1204 non-null   float64
 9   Total Staff 2020-2021                       1204 non-null   float64
 10  Locale 2020-

In [17]:
# read in the csv file and treat –, ‡ as NaN
df_district19 = pd.read_csv('nces_district_2019.csv', header = 3, na_values=["–", "‡"])

In [18]:
print('Shape:', df_district19.shape)
df_district19.head()

Shape: (1234, 12)


Unnamed: 0,Agency Name,State Name [District] Latest available year,Agency ID - NCES Assigned [District] Latest available year,County Name [District] 2018-19,County Number [District] 2018-19,Total Number Operational Schools [Public School] 2018-19,Total Number of Public Schools [Public School] 2018-19,State Agency ID [District] 2018-19,Total Students All Grades (Excludes AE) [District] 2018-19,Full-Time Equivalent (FTE) Teachers [District] 2018-19,Total Staff [District] 2018-19,Locale [District] 2018-19
0,A W BROWN LEADERSHIP ACADEMY,Texas,4800095.0,Dallas County,48113.0,2,2,TX-057816,2084,115.6,218.4,11-City: Large
1,A+ ACADEMY,Texas,4800203.0,Dallas County,48113.0,2,2,TX-057829,1409,86.58,192.28,11-City: Large
2,A+ UNLIMITED POTENTIAL,TEXAS,4801453.0,Harris County,48201.0,2,2,TX-101871,180,7.67,10.34,11-City: Large
3,ABBOTT ISD,Texas,4807380.0,Hill County,48217.0,1,2,TX-109901,277,24.25,40.0,42-Rural: Distant
4,ABERNATHY ISD,Texas,4807410.0,Hale County,48189.0,3,4,TX-095901,780,68.0,124.97,31-Town: Fringe


In [19]:
list(df_district19.columns)

['Agency Name',
 'State Name [District] Latest available year',
 'Agency ID - NCES Assigned [District] Latest available year',
 'County Name [District] 2018-19',
 'County Number [District] 2018-19',
 'Total Number Operational Schools [Public School] 2018-19',
 'Total Number of Public Schools [Public School] 2018-19',
 'State Agency ID [District] 2018-19',
 'Total Students All Grades (Excludes AE) [District] 2018-19',
 'Full-Time Equivalent (FTE) Teachers [District] 2018-19',
 'Total Staff [District] 2018-19',
 'Locale [District] 2018-19']

In [20]:
#re-name the columns for simplicity and merging later
district_columns19 = ['Agency Name', \
                    'State', \
                    'NCES Agency ID', \
                    'County Name', \
                    'County #', \
                    'Total Operational Public Schools 2018-2019', \
                    'Total Public Schools 2018-2019', \
                    'State Agency ID', \
                    'Total Students 2018-2019', \
                    'FTE Teachers 2018-2019', \
                    'Total Staff 2018-2019',
                     #v2 update
                    'Locale 2018-2019']

# read in the csv file and treat –, ‡ as NaN
# we will replace † with 0
df_district19 = pd.read_csv('nces_district_2019.csv', header = 3, names = district_columns19, na_values=["–", "‡"])
df_district19.replace('†', 0, inplace=True)

In [21]:
df_district19['District #'] = df_district19['State Agency ID'].str[3:]

In [22]:
num_cols19 = ['Total Public Schools 2018-2019', \
                    'Total Operational Public Schools 2018-2019', \
                    'Total Students 2018-2019', \
                    'FTE Teachers 2018-2019', \
                    'Total Staff 2018-2019']

for col in num_cols19:
    df_district19[col] = df_district19[col].astype(str)
    df_district19[col] = df_district19[col].map(lambda x: x.lstrip('="').rstrip('"'))
    df_district19[col] = df_district19[col].astype(float)

In [23]:
# df_district19['% Operational Schools 2018-2019'] = df_district19['Total Operational Public Schools 2018-2019'] / df_district19['Total Public Schools 2018-2019']

In [24]:
# drop redundant columns
df_district19.drop(columns = ['County Name'], inplace = True)

# remove instances with 0 students and
# remove instances with no district #
no_students = df_district19[df_district19['Total Students 2018-2019'] == 0]
df_district19.drop(no_students.index, inplace=True)
no_schools = df_district19[df_district19['Total Operational Public Schools 2018-2019'] == 0]
df_district19.drop(no_schools.index, inplace=True)
df_district19.dropna(subset = ['State Agency ID'], inplace = True)



In [25]:
df_district19.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1204 entries, 0 to 1229
Data columns (total 12 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   Agency Name                                 1204 non-null   object 
 1   State                                       1204 non-null   object 
 2   NCES Agency ID                              1204 non-null   float64
 3   County #                                    1204 non-null   float64
 4   Total Operational Public Schools 2018-2019  1204 non-null   float64
 5   Total Public Schools 2018-2019              1204 non-null   float64
 6   State Agency ID                             1204 non-null   object 
 7   Total Students 2018-2019                    1204 non-null   float64
 8   FTE Teachers 2018-2019                      1203 non-null   float64
 9   Total Staff 2018-2019                       1201 non-null   float64
 10  Locale 2018-

In [26]:
df_district19.dropna(inplace=True)
df_district19.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1200 entries, 0 to 1229
Data columns (total 12 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   Agency Name                                 1200 non-null   object 
 1   State                                       1200 non-null   object 
 2   NCES Agency ID                              1200 non-null   float64
 3   County #                                    1200 non-null   float64
 4   Total Operational Public Schools 2018-2019  1200 non-null   float64
 5   Total Public Schools 2018-2019              1200 non-null   float64
 6   State Agency ID                             1200 non-null   object 
 7   Total Students 2018-2019                    1200 non-null   float64
 8   FTE Teachers 2018-2019                      1200 non-null   float64
 9   Total Staff 2018-2019                       1200 non-null   float64
 10  Locale 2018-

In [27]:
# read in the csv file and treat –, ‡ as NaN
df_grades = pd.read_csv('nces_grades.csv', header = 3, na_values=["–", "‡"])

In [28]:
list(df_grades.columns)

['Agency Name',
 'State Name [District] Latest available year',
 'Grade 3 Students [District] 2020-21',
 'Grade 3 Students [District] 2018-19',
 'Grade 4 Students [District] 2020-21',
 'Grade 4 Students [District] 2018-19',
 'Grade 5 Students [District] 2020-21',
 'Grade 5 Students [District] 2018-19',
 'Grade 6 Students [District] 2020-21',
 'Grade 6 Students [District] 2018-19',
 'Grade 7 Students [District] 2020-21',
 'Grade 7 Students [District] 2018-19',
 'Grade 8 Students [District] 2020-21',
 'Grade 8 Students [District] 2018-19',
 'State Agency ID [District] 2020-21',
 'State Agency ID [District] 2018-19',
 'Grades 1-8 Students [District] 2020-21',
 'Grades 1-8 Students [District] 2018-19',
 'Grades 9-12 Students [District] 2020-21',
 'Grades 9-12 Students [District] 2018-19',
 'Prekindergarten Students [District] 2020-21',
 'Prekindergarten Students [District] 2018-19',
 'Kindergarten Students [District] 2020-21',
 'Kindergarten Students [District] 2018-19',
 'Grade 1 Students

In [29]:
#re-name the columns for simplicity and merging later
grade_columns = ['Agency Name', \
                'State', \
                'Grade 3 2020-2021', \
                'Grade 3 2018-2019', \
                'Grade 4 2020-2021', \
                'Grade 4 2018-2019', \
                'Grade 5 2020-2021', \
                'Grade 5 2018-2019', \
                'Grade 6 2020-2021', \
                'Grade 6 2018-2019', \
                'Grade 7 2020-2021', \
                'Grade 7 2018-2019', \
                'Grade 8 2020-2021', \
                'Grade 8 2018-2019', \
                'State Agency ID 2020-2021', \
                'State Agency ID 2018-2019',
                # added v2
                'Grades 1-8 2020-2021',
                'Grades 1-8 2018-2019',
                'Grades 9-12 2020-2021',
                'Grades 9-12 2018-2019',
                'Prek 2020-2021',
                'Prek 2018-2019',
                'K 2020-2021',
                'K 2018-2019',
                'Grade 1 2020-2021',
                'Grade 1 2018-2019',
                'Grade 2 2020-2021',
                'Grade 2 2018-2019',
                'Grade 9 2020-2021',
                'Grade 9 2018-2019',
                'Grade 10 2020-2021',
                'Grade 10 2018-2019',
                'Grade 11 2020-2021',
                'Grade 11 2018-2019',
                'Grade 12 2020-2021',
                'Grade 12 2018-2019']

# read in the csv file and treat –, ‡ as NaN
# we will replace † with 0
df_grades = pd.read_csv('nces_grades.csv', header = 3, names = grade_columns, na_values=["–", "‡", "†"])
#df_grades.replace('†', 0, inplace=True)
#df_grades.dropna(inplace=True)
df_grades['State Agency ID 2020-2021'].fillna(df_grades['State Agency ID 2018-2019'], inplace = True)
df_grades['State Agency ID 2018-2019'].fillna(df_grades['State Agency ID 2020-2021'], inplace = True)
df_grades['District #'] = df_grades['State Agency ID 2020-2021'].str[3:]

In [30]:
df_grades.dropna(subset = ['State Agency ID 2018-2019', 'State Agency ID 2020-2021'], inplace = True)
df_grades.drop(columns = ['State Agency ID 2018-2019', 'State Agency ID 2020-2021', 'Agency Name', 'State'], inplace = True)

In [31]:
df_grades.isna().sum()

Grade 3 2020-2021         77
Grade 3 2018-2019         75
Grade 4 2020-2021         78
Grade 4 2018-2019         75
Grade 5 2020-2021         75
Grade 5 2018-2019         75
Grade 6 2020-2021         77
Grade 6 2018-2019         75
Grade 7 2020-2021         90
Grade 7 2018-2019         89
Grade 8 2020-2021         93
Grade 8 2018-2019         91
Grades 1-8 2020-2021      53
Grades 1-8 2018-2019      55
Grades 9-12 2020-2021    141
Grades 9-12 2018-2019    142
Prek 2020-2021           142
Prek 2018-2019           150
K 2020-2021               77
K 2018-2019               82
Grade 1 2020-2021         73
Grade 1 2018-2019         76
Grade 2 2020-2021         75
Grade 2 2018-2019         76
Grade 9 2020-2021        142
Grade 9 2018-2019        143
Grade 10 2020-2021       145
Grade 10 2018-2019       148
Grade 11 2020-2021       147
Grade 11 2018-2019       152
Grade 12 2020-2021       151
Grade 12 2018-2019       154
District #                 0
dtype: int64

In [32]:
df_grades[df_grades['Grade 3 2018-2019']==0]

Unnamed: 0,Grade 3 2020-2021,Grade 3 2018-2019,Grade 4 2020-2021,Grade 4 2018-2019,Grade 5 2020-2021,Grade 5 2018-2019,Grade 6 2020-2021,Grade 6 2018-2019,Grade 7 2020-2021,Grade 7 2018-2019,Grade 8 2020-2021,Grade 8 2018-2019,Grades 1-8 2020-2021,Grades 1-8 2018-2019,Grades 9-12 2020-2021,Grades 9-12 2018-2019,Prek 2020-2021,Prek 2018-2019,K 2020-2021,K 2018-2019,Grade 1 2020-2021,Grade 1 2018-2019,Grade 2 2020-2021,Grade 2 2018-2019,Grade 9 2020-2021,Grade 9 2018-2019,Grade 10 2020-2021,Grade 10 2018-2019,Grade 11 2020-2021,Grade 11 2018-2019,Grade 12 2020-2021,Grade 12 2018-2019,District #


In [33]:
# grades = ['Grade 3 2020-2021', \
#                 'Grade 3 2018-2019', \
#                 'Grade 4 2020-2021', \
#                 'Grade 4 2018-2019', \
#                 'Grade 5 2020-2021', \
#                 'Grade 5 2018-2019', \
#                 'Grade 6 2020-2021', \
#                 'Grade 6 2018-2019', \
#                 'Grade 7 2020-2021', \
#                 'Grade 7 2018-2019', \
#                 'Grade 8 2020-2021', \
#                 'Grade 8 2018-2019']

for col in df_grades.columns:
    if col != 'District #':
        df_grades[col] = df_grades[col].astype(str)
        df_grades[col] = df_grades[col].map(lambda x: x.lstrip('="').rstrip('"'))
        df_grades[col] = df_grades[col].astype(float)

In [34]:
df_grades.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1240 entries, 0 to 1239
Data columns (total 33 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Grade 3 2020-2021      1163 non-null   float64
 1   Grade 3 2018-2019      1165 non-null   float64
 2   Grade 4 2020-2021      1162 non-null   float64
 3   Grade 4 2018-2019      1165 non-null   float64
 4   Grade 5 2020-2021      1165 non-null   float64
 5   Grade 5 2018-2019      1165 non-null   float64
 6   Grade 6 2020-2021      1163 non-null   float64
 7   Grade 6 2018-2019      1165 non-null   float64
 8   Grade 7 2020-2021      1150 non-null   float64
 9   Grade 7 2018-2019      1151 non-null   float64
 10  Grade 8 2020-2021      1147 non-null   float64
 11  Grade 8 2018-2019      1149 non-null   float64
 12  Grades 1-8 2020-2021   1187 non-null   float64
 13  Grades 1-8 2018-2019   1185 non-null   float64
 14  Grades 9-12 2020-2021  1099 non-null   float64
 15  Grad

Merge

In [35]:
df_district19

Unnamed: 0,Agency Name,State,NCES Agency ID,County #,Total Operational Public Schools 2018-2019,Total Public Schools 2018-2019,State Agency ID,Total Students 2018-2019,FTE Teachers 2018-2019,Total Staff 2018-2019,Locale 2018-2019,District #
0,A W BROWN LEADERSHIP ACADEMY,Texas,4800095.0,48113.0,2.0,2.0,TX-057816,2084.0,115.6,218.4,11-City: Large,57816
1,A+ ACADEMY,Texas,4800203.0,48113.0,2.0,2.0,TX-057829,1409.0,86.58,192.28,11-City: Large,57829
2,A+ UNLIMITED POTENTIAL,TEXAS,4801453.0,48201.0,2.0,2.0,TX-101871,180.0,7.67,10.34,11-City: Large,101871
3,ABBOTT ISD,Texas,4807380.0,48217.0,1.0,2.0,TX-109901,277.0,24.25,40.0,42-Rural: Distant,109901
4,ABERNATHY ISD,Texas,4807410.0,48189.0,3.0,4.0,TX-095901,780.0,68.0,124.97,31-Town: Fringe,95901
5,ABILENE ISD,Texas,4807440.0,48441.0,30.0,31.0,TX-221901,16645.0,1090.71,2481.36,12-City: Mid-size,221901
6,ACADEMY FOR ACADEMIC EXCELLENCE,Texas,4800093.0,48113.0,5.0,5.0,TX-057814,505.0,54.96,93.57,11-City: Large,57814
7,ACADEMY ISD,Texas,4807470.0,48027.0,5.0,6.0,TX-014901,1652.0,113.6,214.33,41-Rural: Fringe,14901
8,ACADEMY OF ACCELERATED LEARNING INC,Texas,4800032.0,48201.0,1.0,1.0,TX-101810,742.0,40.41,66.59,11-City: Large,101810
9,ACADEMY OF DALLAS,Texas,4800090.0,48113.0,1.0,1.0,TX-057810,487.0,27.08,57.34,11-City: Large,57810


In [36]:
df_merge = pd.merge(df_district19, df_district21, how="inner", on= ["State Agency ID", \
                                                                    "District #", \
                                                                    "Agency Name", \
                                                                    "State", \
                                                                    "NCES Agency ID", \
                                                                    "County #"])

In [37]:
print('Shape:', df_merge.shape)
df_merge.info()

Shape: (1190, 18)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1190 entries, 0 to 1189
Data columns (total 18 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   Agency Name                                 1190 non-null   object 
 1   State                                       1190 non-null   object 
 2   NCES Agency ID                              1190 non-null   float64
 3   County #                                    1190 non-null   float64
 4   Total Operational Public Schools 2018-2019  1190 non-null   float64
 5   Total Public Schools 2018-2019              1190 non-null   float64
 6   State Agency ID                             1190 non-null   object 
 7   Total Students 2018-2019                    1190 non-null   float64
 8   FTE Teachers 2018-2019                      1190 non-null   float64
 9   Total Staff 2018-2019                       1190 non-null   float64

In [38]:
# df_merge = df_merge.groupby('State Agency ID').agg('max')

# df_merge.info()

In [39]:
# df_merge.dropna(inplace=True)
# df_merge.info()

In [40]:
#df_merge.to_csv('DATA_NCES_DISTRICT.csv', index = None)

In [41]:
df_1921 = pd.read_csv('nces_campus_1921.csv',  header = 3, na_values=["–", "‡"])

In [42]:
print('Shape:', df_1921.shape)

Shape: (9710, 32)


In [43]:
list(df_1921.columns)

['School Name',
 'State Name [Public School] Latest available year',
 'State Agency ID [Public School] 2020-21',
 'State Agency ID [Public School] 2018-19',
 'Virtual School Status (SY 2016-17 onward) [Public School] 2020-21',
 'Virtual School Status (SY 2016-17 onward) [Public School] 2018-19',
 'School-wide Title I [Public School] 2020-21',
 'School-wide Title I [Public School] 2018-19',
 'Title I Eligible School [Public School] 2020-21',
 'Title I Eligible School [Public School] 2018-19',
 'Title I School Status [Public School] 2020-21',
 'Title I School Status [Public School] 2018-19',
 'Free Lunch Eligible [Public School] 2020-21',
 'Free Lunch Eligible [Public School] 2018-19',
 'Reduced-price Lunch Eligible Students [Public School] 2020-21',
 'Reduced-price Lunch Eligible Students [Public School] 2018-19',
 'American Indian/Alaska Native Students [Public School] 2020-21',
 'American Indian/Alaska Native Students [Public School] 2018-19',
 'Asian or Asian/Pacific Islander Student

In [44]:
col_names = ['School Name', \
            'State', \
            'State Agency ID 2020-2021', \
            'State Agency ID 2018-2019', \
            'Virtual Status 2020-2021', \
            'Virtual Status 2018-2019', \
            'School-wide Title I 2020-2021', \
            'School-wide Title I 2018-2019', \
            'Title I Eligible School 2020-2021', \
            'Title I Eligible School 2018-2019', \
            'Title I School Status 2020-2021', \
            'Title I School Status 2018-2019', \
            'Free Lunch 2020-2021', \
            'Free Lunch 2018-2019', \
            'Reduced-price Lunch 2020-2021', \
            'Reduced-price Lunch 2018-2019', \
            'American Indian/Alaska Native Students 2020-2021', \
            'American Indian/Alaska Native Students 2018-2019', \
            'Asian or Asian/Pacific Islander Students 2020-2021', \
            'Asian or Asian/Pacific Islander Students 2018-2019', \
            'Hispanic Students 2020-2021', \
            'Hispanic Students 2018-2019', \
            'Black or African American Students 2020-2021', \
            'Black or African American Students 2018-2019', \
            'White Students 2020-2021', \
            'White Students 2018-2019', \
            'Nat. Hawaiian or Other Pacific Isl. Students 2020-2021', \
            'Nat. Hawaiian or Other Pacific Isl. Students 2018-2019', \
            'Two or More Races Students 2020-2021', \
            'Two or More Races Students 2018-2019', \
            'Total Race/Ethnicity 2020-2021', \
            'Total Race/Ethnicity 2018-2019']

In [45]:
# read in the csv file and treat –, ‡ as NaN
df_1921 = pd.read_csv('nces_campus_1921.csv', header = 3, names = col_names, na_values=["–", "‡", "†"])

In [46]:
#df_1921.replace('†', 0, inplace=True)

df_1921['State Agency ID 2020-2021'].value_counts(dropna = False)

TX-101912    277
TX-057905    246
NaN          243
TX-220905    150
TX-227901    135
TX-015915    130
TX-108807    111
TX-015907    100
TX-071902     98
TX-101907     90
TX-079907     87
TX-043910     85
TX-220901     79
TX-101902     79
TX-015910     75
TX-101914     75
TX-057909     74
TX-043905     74
TX-101917     67
TX-061902     67
TX-170902     65
TX-246909     65
TX-014906     61
TX-071905     60
TX-178904     58
TX-227820     58
TX-031901     57
TX-188901     56
TX-057916     56
TX-071909     53
TX-101915     53
TX-240903     53
TX-101913     52
TX-101920     51
TX-152901     50
TX-057914     50
TX-246913     50
TX-084910     48
TX-220908     48
TX-108909     46
TX-101903     46
TX-108904     46
TX-061901     44
TX-101919     44
TX-079901     44
TX-068901     44
TX-220907     43
TX-057803     43
TX-057910     41
TX-165901     41
TX-057903     41
TX-108912     39
TX-046902     39
TX-221801     38
TX-057912     38
TX-072801     36
TX-220902     35
TX-020901     35
TX-061911     

In [47]:
df_1921['State Agency ID 2018-2019'].value_counts(dropna = False)

NaN          287
TX-101912    282
TX-057905    243
TX-220905    149
TX-227901    133
TX-015915    126
TX-071902    102
TX-015907    101
TX-101902     97
TX-101907     89
TX-043910     85
TX-079907     81
TX-220901     79
TX-015910     75
TX-101914     75
TX-108807     75
TX-057909     74
TX-043905     73
TX-101917     70
TX-061902     68
TX-246909     64
TX-014906     62
TX-170902     62
TX-071905     61
TX-031901     61
TX-178904     59
TX-057916     56
TX-188901     56
TX-101915     53
TX-240903     53
TX-101913     52
TX-071909     51
TX-101920     51
TX-152901     51
TX-057914     51
TX-227820     51
TX-246913     49
TX-084910     48
TX-108909     46
TX-101903     46
TX-079901     46
TX-108904     46
TX-220908     45
TX-068901     44
TX-061901     44
TX-101919     43
TX-220907     42
TX-057910     41
TX-057903     41
TX-165901     39
TX-108912     39
TX-221801     38
TX-057912     38
TX-057803     38
TX-046902     36
TX-072801     35
TX-220902     35
TX-020901     34
TX-043907     

In [48]:
no_id = df_1921[df_1921['State Agency ID 2020-2021'] == 0]
no_id

Unnamed: 0,School Name,State,State Agency ID 2020-2021,State Agency ID 2018-2019,Virtual Status 2020-2021,Virtual Status 2018-2019,School-wide Title I 2020-2021,School-wide Title I 2018-2019,Title I Eligible School 2020-2021,Title I Eligible School 2018-2019,Title I School Status 2020-2021,Title I School Status 2018-2019,Free Lunch 2020-2021,Free Lunch 2018-2019,Reduced-price Lunch 2020-2021,Reduced-price Lunch 2018-2019,American Indian/Alaska Native Students 2020-2021,American Indian/Alaska Native Students 2018-2019,Asian or Asian/Pacific Islander Students 2020-2021,Asian or Asian/Pacific Islander Students 2018-2019,Hispanic Students 2020-2021,Hispanic Students 2018-2019,Black or African American Students 2020-2021,Black or African American Students 2018-2019,White Students 2020-2021,White Students 2018-2019,Nat. Hawaiian or Other Pacific Isl. Students 2020-2021,Nat. Hawaiian or Other Pacific Isl. Students 2018-2019,Two or More Races Students 2020-2021,Two or More Races Students 2018-2019,Total Race/Ethnicity 2020-2021,Total Race/Ethnicity 2018-2019


In [49]:
df_1921['State Agency ID 2020-2021'].fillna(df_1921['State Agency ID 2018-2019'], inplace = True)
df_1921['State Agency ID 2018-2019'].fillna(df_1921['State Agency ID 2020-2021'], inplace = True)

In [50]:
df_1921['State Agency ID 2020-2021'].value_counts(dropna = False)

TX-101912    282
TX-057905    249
TX-220905    150
TX-227901    136
TX-015915    130
TX-108807    111
TX-071902    105
TX-015907    104
TX-101902     97
TX-101907     90
TX-079907     87
TX-043910     85
TX-220901     80
TX-015910     76
TX-101914     76
TX-043905     74
TX-057909     74
TX-101917     70
TX-061902     69
TX-170902     65
TX-246909     65
TX-071905     63
TX-014906     63
TX-031901     61
TX-227820     59
TX-178904     59
TX-057916     56
TX-188901     56
TX-240903     53
TX-101915     53
TX-071909     53
TX-152901     52
TX-101913     52
TX-057914     51
TX-101920     51
TX-246913     50
TX-079901     49
TX-084910     48
TX-220908     48
TX-108904     46
TX-108909     46
TX-101903     46
TX-061901     44
TX-101919     44
TX-068901     44
TX-057803     44
TX-220907     43
TX-165901     42
TX-057910     42
TX-057903     41
TX-221801     41
TX-046902     39
TX-108912     39
TX-057912     38
TX-072801     38
TX-020901     36
TX-220902     35
TX-227904     34
TX-061911     

In [51]:
df_1921['State Agency ID 2018-2019'].value_counts(dropna = False)

TX-101912    282
TX-057905    249
TX-220905    150
TX-227901    136
TX-015915    130
TX-108807    111
TX-071902    105
TX-015907    104
TX-101902     97
TX-101907     90
TX-079907     87
TX-043910     85
TX-220901     80
TX-015910     76
TX-101914     76
TX-043905     74
TX-057909     74
TX-101917     70
TX-061902     69
TX-170902     65
TX-246909     65
TX-071905     63
TX-014906     63
TX-031901     61
TX-227820     59
TX-178904     59
TX-057916     56
TX-188901     56
TX-240903     53
TX-101915     53
TX-071909     53
TX-152901     52
TX-101913     52
TX-057914     51
TX-101920     51
TX-246913     50
TX-079901     49
TX-084910     48
TX-220908     48
TX-108904     46
TX-108909     46
TX-101903     46
TX-061901     44
TX-101919     44
TX-068901     44
TX-057803     44
TX-220907     43
TX-165901     42
TX-057910     42
TX-057903     41
TX-221801     41
TX-046902     39
TX-108912     39
TX-057912     38
TX-072801     38
TX-020901     36
TX-220902     35
TX-227904     34
TX-061911     

In [52]:
df_1921.dropna(subset = ['State Agency ID 2018-2019', 'State Agency ID 2020-2021'], inplace = True)

In [53]:
df_1921

Unnamed: 0,School Name,State,State Agency ID 2020-2021,State Agency ID 2018-2019,Virtual Status 2020-2021,Virtual Status 2018-2019,School-wide Title I 2020-2021,School-wide Title I 2018-2019,Title I Eligible School 2020-2021,Title I Eligible School 2018-2019,Title I School Status 2020-2021,Title I School Status 2018-2019,Free Lunch 2020-2021,Free Lunch 2018-2019,Reduced-price Lunch 2020-2021,Reduced-price Lunch 2018-2019,American Indian/Alaska Native Students 2020-2021,American Indian/Alaska Native Students 2018-2019,Asian or Asian/Pacific Islander Students 2020-2021,Asian or Asian/Pacific Islander Students 2018-2019,Hispanic Students 2020-2021,Hispanic Students 2018-2019,Black or African American Students 2020-2021,Black or African American Students 2018-2019,White Students 2020-2021,White Students 2018-2019,Nat. Hawaiian or Other Pacific Isl. Students 2020-2021,Nat. Hawaiian or Other Pacific Isl. Students 2018-2019,Two or More Races Students 2020-2021,Two or More Races Students 2018-2019,Total Race/Ethnicity 2020-2021,Total Race/Ethnicity 2018-2019
0,21ST CENTURY EARLY LEARNING FOUNDATIONS ACADEMY,TEXAS,TX-108913,TX-108913,,NOTVIRTUAL,1-Yes,1-Yes,1-Yes,1-Yes,4-Title I schoolwide eligible school-No program,4-Title I schoolwide eligible school-No program,423.0,512,1,"=""0""","=""0""","=""0""","=""0""","=""0""",422,503,"=""0""","=""0""",2,11,"=""0""","=""0""","=""0""","=""0""",424,514
1,3D ACADEMY,Texas,TX-108902,TX-108902,,NOTVIRTUAL,1-Yes,1-Yes,1-Yes,1-Yes,5-Title I schoolwide school,5-Title I schoolwide school,121.0,152,2,"=""0""","=""0""","=""0""","=""0""","=""0""",127,158,"=""0""","=""0""","=""0""","=""0""","=""0""","=""0""","=""0""","=""0""",127,158
2,3RD H S,Texas,TX-043914,TX-043914,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,7TH H S - NORTHEAST,Texas,TX-246909,TX-246909,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,A & M CONS H S,Texas,TX-021901,TX-021901,,NOTVIRTUAL,2-No,2-No,1-Yes,1-Yes,1-Title I targeted assistance eligible school-...,1-Title I targeted assistance eligible school-...,496.0,533,83,91,5,8,135,134,449,427,256,252,988,878,4,"=""0""",60,64,1897,1763
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9701,ZEPHYR SCHOOL,Texas,TX-025906,TX-025906,,NOTVIRTUAL,1-Yes,1-Yes,1-Yes,1-Yes,5-Title I schoolwide school,5-Title I schoolwide school,104.0,119,15,15,"=""0""","=""0""","=""0""","=""0""",22,30,1,1,162,177,"=""0""","=""0""",14,10,199,218
9702,ZILKER EL,Texas,TX-227901,TX-227901,,NOTVIRTUAL,,,2-No,2-No,6-Not a Title I school,6-Not a Title I school,78.0,72,4,22,1,1,7,10,116,163,6,7,291,315,1,"=""0""",21,28,443,524
9703,ZUE S BALES INT,Texas,TX-084911,TX-084911,,NOTVIRTUAL,1-Yes,1-Yes,1-Yes,1-Yes,5-Title I schoolwide school,5-Title I schoolwide school,102.0,87,4,16,"=""0""",2,18,24,152,143,12,8,402,428,"=""0""",1,15,26,599,632
9704,ZUNDELOWITZ EL,TEXAS,TX-243905,TX-243905,,NOTVIRTUAL,1-Yes,1-Yes,1-Yes,1-Yes,5-Title I schoolwide school,5-Title I schoolwide school,403.0,463,27,47,4,5,3,1,347,366,38,38,86,123,"=""0""","=""0""",34,33,512,566


In [54]:
df_1921['Virtual Status 2020-2021'].value_counts(dropna = False)

NaN            9697
FULLVIRTUAL       9
Name: Virtual Status 2020-2021, dtype: int64

In [55]:
df_1921['Virtual Status 2018-2019'].value_counts(dropna = False)

NOTVIRTUAL     8971
NaN             727
FULLVIRTUAL       8
Name: Virtual Status 2018-2019, dtype: int64

In [56]:
for col in col_names:
    df_1921[col] = df_1921[col].astype(str)
    df_1921[col] = df_1921[col].map(lambda x: x.lstrip('="').rstrip('"'))

In [57]:
num_cols = ['Free Lunch 2020-2021', \
            'Free Lunch 2018-2019', \
            'Reduced-price Lunch 2020-2021', \
            'Reduced-price Lunch 2018-2019', \
            'American Indian/Alaska Native Students 2020-2021', \
            'American Indian/Alaska Native Students 2018-2019', \
            'Asian or Asian/Pacific Islander Students 2020-2021', \
            'Asian or Asian/Pacific Islander Students 2018-2019', \
            'Hispanic Students 2020-2021', \
            'Hispanic Students 2018-2019', \
            'Black or African American Students 2020-2021', \
            'Black or African American Students 2018-2019', \
            'White Students 2020-2021', \
            'White Students 2018-2019', \
            'Nat. Hawaiian or Other Pacific Isl. Students 2020-2021', \
            'Nat. Hawaiian or Other Pacific Isl. Students 2018-2019', \
            'Two or More Races Students 2020-2021', \
            'Two or More Races Students 2018-2019', \
            'Total Race/Ethnicity 2020-2021', \
            'Total Race/Ethnicity 2018-2019']

for col in num_cols:
    df_1921[col] = df_1921[col].astype(float)

In [58]:
df_1921.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9706 entries, 0 to 9705
Data columns (total 32 columns):
 #   Column                                                  Non-Null Count  Dtype  
---  ------                                                  --------------  -----  
 0   School Name                                             9706 non-null   object 
 1   State                                                   9706 non-null   object 
 2   State Agency ID 2020-2021                               9706 non-null   object 
 3   State Agency ID 2018-2019                               9706 non-null   object 
 4   Virtual Status 2020-2021                                9706 non-null   object 
 5   Virtual Status 2018-2019                                9706 non-null   object 
 6   School-wide Title I 2020-2021                           9706 non-null   object 
 7   School-wide Title I 2018-2019                           9706 non-null   object 
 8   Title I Eligible School 2020-2021     

In [59]:
df_1921['District #'] = df_1921['State Agency ID 2020-2021'].str[3:]
df_1921['District #'].value_counts(dropna = False)

101912    282
057905    249
220905    150
227901    136
015915    130
108807    111
071902    105
015907    104
101902     97
101907     90
079907     87
043910     85
220901     80
101914     76
015910     76
057909     74
043905     74
101917     70
061902     69
246909     65
170902     65
071905     63
014906     63
031901     61
227820     59
178904     59
188901     56
057916     56
240903     53
071909     53
101915     53
101913     52
152901     52
101920     51
057914     51
246913     50
079901     49
084910     48
220908     48
108904     46
108909     46
101903     46
101919     44
057803     44
061901     44
068901     44
220907     43
165901     42
057910     42
057903     41
221801     41
046902     39
108912     39
072801     38
057912     38
020901     36
220902     35
220916     34
061911     34
227904     34
057848     34
108906     33
043907     33
031903     33
123910     32
015916     32
101911     31
221901     31
015904     31
235902     30
240901     30
220918

In [60]:
df_1921.drop(columns = ['American Indian/Alaska Native Students 2020-2021', \
                        'American Indian/Alaska Native Students 2018-2019', \
                        'Nat. Hawaiian or Other Pacific Isl. Students 2020-2021', \
                        'Nat. Hawaiian or Other Pacific Isl. Students 2018-2019', \
                        'Two or More Races Students 2020-2021', \
                        'Two or More Races Students 2018-2019', \
                        'State', \
                        'State Agency ID 2020-2021', \
                        'State Agency ID 2018-2019'], inplace = True)

In [61]:
df_1921['School-wide Title I 2020-2021'].value_counts(dropna = False)

1-Yes    7102
nan      2288
2-No      316
Name: School-wide Title I 2020-2021, dtype: int64

In [62]:
df_1921['Title I Eligible School 2020-2021'].value_counts(dropna = False)

1-Yes    7418
2-No     1584
nan       704
Name: Title I Eligible School 2020-2021, dtype: int64

In [63]:
df_1921['Title I School Status 2020-2021'].value_counts(dropna = False).sort_index()

1-Title I targeted assistance eligible school-No program              263
2-Title I targeted assistance school                                   53
3-Title I schoolwide eligible-Title I targeted assistance program      34
4-Title I schoolwide eligible school-No program                       863
5-Title I schoolwide school                                          6205
6-Not a Title I school                                               1584
nan                                                                   704
Name: Title I School Status 2020-2021, dtype: int64

In [64]:
df_1921['School-wide Title I 2018-2019'].value_counts(dropna = False)

1-Yes    6998
nan      2397
2-No      311
Name: School-wide Title I 2018-2019, dtype: int64

In [65]:
df_1921['Title I Eligible School 2018-2019'].value_counts(dropna = False)

1-Yes    7309
2-No     1670
nan       727
Name: Title I Eligible School 2018-2019, dtype: int64

In [66]:
df_1921['Title I School Status 2018-2019'].value_counts(dropna = False).sort_index()

1-Title I targeted assistance eligible school-No program              244
2-Title I targeted assistance school                                   67
3-Title I schoolwide eligible-Title I targeted assistance program      30
4-Title I schoolwide eligible school-No program                       895
5-Title I schoolwide school                                          6073
6-Not a Title I school                                               1670
nan                                                                   727
Name: Title I School Status 2018-2019, dtype: int64

In [67]:
# df_1921.loc[(df_1921["Title I Eligible School 2020-2021"] == "nan") & (df_1921["Title I Eligible School 2018-2019"] == "2-No"), "Title I Eligible School 2020-2021"] = "2-No"
# df_1921.loc[(df_1921["Title I Eligible School 2020-2021"] == "nan") & (df_1921["Title I Eligible School 2018-2019"] == "1-Yes"), "Title I Eligible School 2020-2021"] = "1-Yes"

In [68]:
# df_1921['Title I Eligible School 2020-2021'].value_counts(dropna = False)

In [69]:
# df_1921.loc[(df_1921["School-wide Title I 2020-2021"] == "nan") & (df_1921["Title I Eligible School 2020-2021"] == "2-No"), "School-wide Title I 2020-2021"] = "2-No"
# df_1921.loc[(df_1921["School-wide Title I 2020-2021"] == "nan") & (df_1921["Title I Eligible School 2020-2021"] == "1-Yes"), "School-wide Title I 2020-2021"] = "1-Yes"

# df_1921.loc[(df_1921["School-wide Title I 2020-2021"] == "nan") & (df_1921["School-wide Title I 2018-2019"] == "2-No"), "School-wide Title I 2020-2021"] = "2-No"
# df_1921.loc[(df_1921["School-wide Title I 2020-2021"] == "nan") & (df_1921["School-wide Title I 2018-2019"] == "1-Yes"), "School-wide Title I 2020-2021"] = "1-Yes"

In [70]:
# df_1921['School-wide Title I 2020-2021'].value_counts(dropna = False)

In [71]:
df_1921.fillna(0, inplace = True)

In [72]:
no_students = df_1921[df_1921['Total Race/Ethnicity 2020-2021'] == 0]

no_students = no_students[no_students['Total Race/Ethnicity 2018-2019'] == 0]

df_1921.drop(no_students.index, inplace=True)

#no_students.describe()

In [73]:
df_1921['School-wide Title I 2020-2021'].value_counts(dropna = False)

1-Yes    7099
nan      1728
2-No      316
Name: School-wide Title I 2020-2021, dtype: int64

In [74]:
df_1921['Title I Eligible School 2020-2021'].value_counts(dropna = False)

1-Yes    7415
2-No     1469
nan       259
Name: Title I Eligible School 2020-2021, dtype: int64

In [75]:
df_1921

Unnamed: 0,School Name,Virtual Status 2020-2021,Virtual Status 2018-2019,School-wide Title I 2020-2021,School-wide Title I 2018-2019,Title I Eligible School 2020-2021,Title I Eligible School 2018-2019,Title I School Status 2020-2021,Title I School Status 2018-2019,Free Lunch 2020-2021,Free Lunch 2018-2019,Reduced-price Lunch 2020-2021,Reduced-price Lunch 2018-2019,Asian or Asian/Pacific Islander Students 2020-2021,Asian or Asian/Pacific Islander Students 2018-2019,Hispanic Students 2020-2021,Hispanic Students 2018-2019,Black or African American Students 2020-2021,Black or African American Students 2018-2019,White Students 2020-2021,White Students 2018-2019,Total Race/Ethnicity 2020-2021,Total Race/Ethnicity 2018-2019,District #
0,21ST CENTURY EARLY LEARNING FOUNDATIONS ACADEMY,,NOTVIRTUAL,1-Yes,1-Yes,1-Yes,1-Yes,4-Title I schoolwide eligible school-No program,4-Title I schoolwide eligible school-No program,423.0,512.0,1.0,0.0,0.0,0.0,422.0,503.0,0.0,0.0,2.0,11.0,424.0,514.0,108913
1,3D ACADEMY,,NOTVIRTUAL,1-Yes,1-Yes,1-Yes,1-Yes,5-Title I schoolwide school,5-Title I schoolwide school,121.0,152.0,2.0,0.0,0.0,0.0,127.0,158.0,0.0,0.0,0.0,0.0,127.0,158.0,108902
4,A & M CONS H S,,NOTVIRTUAL,2-No,2-No,1-Yes,1-Yes,1-Title I targeted assistance eligible school-...,1-Title I targeted assistance eligible school-...,496.0,533.0,83.0,91.0,135.0,134.0,449.0,427.0,256.0,252.0,988.0,878.0,1897.0,1763.0,021901
5,A & M CONSOLIDATED MIDDLE,,NOTVIRTUAL,1-Yes,1-Yes,1-Yes,1-Yes,4-Title I schoolwide eligible school-No program,4-Title I schoolwide eligible school-No program,279.0,246.0,26.0,32.0,22.0,52.0,196.0,198.0,109.0,102.0,328.0,328.0,689.0,706.0,021901
6,A B DUNCAN COLLEGIATE EL,,NOTVIRTUAL,1-Yes,1-Yes,1-Yes,1-Yes,5-Title I schoolwide school,5-Title I schoolwide school,261.0,353.0,6.0,2.0,1.0,0.0,281.0,359.0,12.0,14.0,37.0,52.0,334.0,428.0,077901
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9701,ZEPHYR SCHOOL,,NOTVIRTUAL,1-Yes,1-Yes,1-Yes,1-Yes,5-Title I schoolwide school,5-Title I schoolwide school,104.0,119.0,15.0,15.0,0.0,0.0,22.0,30.0,1.0,1.0,162.0,177.0,199.0,218.0,025906
9702,ZILKER EL,,NOTVIRTUAL,,,2-No,2-No,6-Not a Title I school,6-Not a Title I school,78.0,72.0,4.0,22.0,7.0,10.0,116.0,163.0,6.0,7.0,291.0,315.0,443.0,524.0,227901
9703,ZUE S BALES INT,,NOTVIRTUAL,1-Yes,1-Yes,1-Yes,1-Yes,5-Title I schoolwide school,5-Title I schoolwide school,102.0,87.0,4.0,16.0,18.0,24.0,152.0,143.0,12.0,8.0,402.0,428.0,599.0,632.0,084911
9704,ZUNDELOWITZ EL,,NOTVIRTUAL,1-Yes,1-Yes,1-Yes,1-Yes,5-Title I schoolwide school,5-Title I schoolwide school,403.0,463.0,27.0,47.0,3.0,1.0,347.0,366.0,38.0,38.0,86.0,123.0,512.0,566.0,243905


In [76]:
aus_isd = df_1921[df_1921['District #'] == '227901']

In [77]:
aus_isd

Unnamed: 0,School Name,Virtual Status 2020-2021,Virtual Status 2018-2019,School-wide Title I 2020-2021,School-wide Title I 2018-2019,Title I Eligible School 2020-2021,Title I Eligible School 2018-2019,Title I School Status 2020-2021,Title I School Status 2018-2019,Free Lunch 2020-2021,Free Lunch 2018-2019,Reduced-price Lunch 2020-2021,Reduced-price Lunch 2018-2019,Asian or Asian/Pacific Islander Students 2020-2021,Asian or Asian/Pacific Islander Students 2018-2019,Hispanic Students 2020-2021,Hispanic Students 2018-2019,Black or African American Students 2020-2021,Black or African American Students 2018-2019,White Students 2020-2021,White Students 2018-2019,Total Race/Ethnicity 2020-2021,Total Race/Ethnicity 2018-2019,District #
99,AKINS H S,,NOTVIRTUAL,1-Yes,1-Yes,1-Yes,1-Yes,4-Title I schoolwide eligible school-No program,4-Title I schoolwide eligible school-No program,1521.0,1449.0,22.0,291.0,56.0,71.0,2167.0,2153.0,166.0,192.0,346.0,324.0,2810.0,2802.0,227901
151,ALLISON EL,,NOTVIRTUAL,1-Yes,1-Yes,1-Yes,1-Yes,5-Title I schoolwide school,5-Title I schoolwide school,377.0,466.0,3.0,7.0,5.0,1.0,391.0,474.0,9.0,27.0,4.0,10.0,409.0,512.0,227901
183,ALTERNATIVE LEARNING CENTER,,NOTVIRTUAL,,,2-No,2-No,6-Not a Title I school,6-Not a Title I school,0.0,59.0,0.0,2.0,0.0,1.0,3.0,59.0,2.0,9.0,1.0,11.0,6.0,83.0,227901
243,ANDERSON H S,,NOTVIRTUAL,,,2-No,2-No,6-Not a Title I school,6-Not a Title I school,494.0,472.0,37.0,74.0,175.0,207.0,672.0,688.0,116.0,134.0,1154.0,1092.0,2241.0,2226.0,227901
251,ANDREWS EL,,NOTVIRTUAL,1-Yes,1-Yes,1-Yes,1-Yes,5-Title I schoolwide school,5-Title I schoolwide school,322.0,313.0,1.0,0.0,21.0,10.0,250.0,284.0,47.0,45.0,11.0,10.0,333.0,353.0,227901
421,AUSTIN H S,,NOTVIRTUAL,,,2-No,2-No,6-Not a Title I school,6-Not a Title I school,527.0,519.0,24.0,72.0,54.0,51.0,827.0,922.0,88.0,97.0,1270.0,1163.0,2349.0,2328.0,227901
422,AUSTIN ISD CHILD DEVELOPMENT CENTER,,NOTVIRTUAL,,,2-No,2-No,6-Not a Title I school,6-Not a Title I school,221.0,146.0,3.0,0.0,3.0,2.0,161.0,88.0,47.0,46.0,13.0,9.0,230.0,149.0,227901
430,AUSTIN ST HOSPITAL,,NOTVIRTUAL,,,2-No,2-No,6-Not a Title I school,6-Not a Title I school,9.0,7.0,0.0,0.0,1.0,0.0,3.0,7.0,2.0,3.0,9.0,16.0,15.0,26.0,227901
468,BAILEY MIDDLE,,NOTVIRTUAL,,,2-No,2-No,6-Not a Title I school,6-Not a Title I school,293.0,278.0,12.0,52.0,27.0,32.0,442.0,500.0,46.0,46.0,354.0,363.0,923.0,1003.0,227901
476,BALDWIN EL,,NOTVIRTUAL,,,2-No,2-No,6-Not a Title I school,6-Not a Title I school,102.0,87.0,1.0,17.0,94.0,84.0,161.0,175.0,12.0,11.0,367.0,465.0,675.0,783.0,227901


In [78]:
df_1921.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9143 entries, 0 to 9705
Data columns (total 24 columns):
 #   Column                                              Non-Null Count  Dtype  
---  ------                                              --------------  -----  
 0   School Name                                         9143 non-null   object 
 1   Virtual Status 2020-2021                            9143 non-null   object 
 2   Virtual Status 2018-2019                            9143 non-null   object 
 3   School-wide Title I 2020-2021                       9143 non-null   object 
 4   School-wide Title I 2018-2019                       9143 non-null   object 
 5   Title I Eligible School 2020-2021                   9143 non-null   object 
 6   Title I Eligible School 2018-2019                   9143 non-null   object 
 7   Title I School Status 2020-2021                     9143 non-null   object 
 8   Title I School Status 2018-2019                     9143 non-null   object 
 9

**Title 1 Cleaning**

In [79]:
cols = ['Virtual Status 2020-2021',
'Virtual Status 2018-2019',
'School-wide Title I 2020-2021',
'School-wide Title I 2018-2019',
'Title I Eligible School 2020-2021',
'Title I Eligible School 2018-2019',
'Title I School Status 2020-2021',
'Title I School Status 2018-2019']

for col in cols: 
    print(df_1921[col].value_counts().sort_index(), '\n')

FULLVIRTUAL       9
nan            9134
Name: Virtual Status 2020-2021, dtype: int64 

FULLVIRTUAL       8
NOTVIRTUAL     8858
nan             277
Name: Virtual Status 2018-2019, dtype: int64 

1-Yes    7099
2-No      316
nan      1728
Name: School-wide Title I 2020-2021, dtype: int64 

1-Yes    6989
2-No      311
nan      1843
Name: School-wide Title I 2018-2019, dtype: int64 

1-Yes    7415
2-No     1469
nan       259
Name: Title I Eligible School 2020-2021, dtype: int64 

1-Yes    7300
2-No     1566
nan       277
Name: Title I Eligible School 2018-2019, dtype: int64 

1-Title I targeted assistance eligible school-No program              263
2-Title I targeted assistance school                                   53
3-Title I schoolwide eligible-Title I targeted assistance program      34
4-Title I schoolwide eligible school-No program                       860
5-Title I schoolwide school                                          6205
6-Not a Title I school                              

In [80]:
df_1921_t1 = df_1921[['District #', 
                     'School-wide Title I 2020-2021',
                    'School-wide Title I 2018-2019',
                    'Title I Eligible School 2020-2021',
                    'Title I Eligible School 2018-2019']]
df_1921_t1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9143 entries, 0 to 9705
Data columns (total 5 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   District #                         9143 non-null   object
 1   School-wide Title I 2020-2021      9143 non-null   object
 2   School-wide Title I 2018-2019      9143 non-null   object
 3   Title I Eligible School 2020-2021  9143 non-null   object
 4   Title I Eligible School 2018-2019  9143 non-null   object
dtypes: object(5)
memory usage: 428.6+ KB


In [81]:
cols_title1 = ['School-wide Title I 2020-2021',
                'School-wide Title I 2018-2019',
                'Title I Eligible School 2020-2021',
                'Title I Eligible School 2018-2019']

for col in cols_title1: 
    df_1921_t1[col] = df_1921_t1[col].apply(lambda x: 1 if x == '1-Yes' else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_1921_t1[col] = df_1921_t1[col].apply(lambda x: 1 if x == '1-Yes' else 0)


In [82]:
for col in cols_title1: 
    print(df_1921_t1[col].value_counts().sort_index(), '\n')

0    2044
1    7099
Name: School-wide Title I 2020-2021, dtype: int64 

0    2154
1    6989
Name: School-wide Title I 2018-2019, dtype: int64 

0    1728
1    7415
Name: Title I Eligible School 2020-2021, dtype: int64 

0    1843
1    7300
Name: Title I Eligible School 2018-2019, dtype: int64 



In [83]:
district_1921_t1=df_1921_t1.groupby(['District #'], as_index=True).agg({
                                                            'School-wide Title I 2020-2021': 'sum',
                                                            'School-wide Title I 2018-2019': 'sum',
                                                            'Title I Eligible School 2020-2021': 'sum',
                                                            'Title I Eligible School 2018-2019': 'sum'})
district_1921_t1.rename(columns={'District #': 'district_count'}, inplace=True)
district_1921_t1.reset_index(inplace=True)
district_1921_t1.head()

Unnamed: 0,District #,School-wide Title I 2020-2021,School-wide Title I 2018-2019,Title I Eligible School 2020-2021,Title I Eligible School 2018-2019
0,1902,2,2,2,3
1,1903,4,4,4,4
2,1904,3,3,3,3
3,1906,2,2,2,2
4,1907,6,6,6,6


In [84]:
district_1921_t1['District #'] = district_1921_t1['District #'].astype(str)
print(district_1921_t1.info())
district_1921_t1.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1212 entries, 0 to 1211
Data columns (total 5 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   District #                         1212 non-null   object
 1   School-wide Title I 2020-2021      1212 non-null   int64 
 2   School-wide Title I 2018-2019      1212 non-null   int64 
 3   Title I Eligible School 2020-2021  1212 non-null   int64 
 4   Title I Eligible School 2018-2019  1212 non-null   int64 
dtypes: int64(4), object(1)
memory usage: 47.5+ KB
None


Unnamed: 0,District #,School-wide Title I 2020-2021,School-wide Title I 2018-2019,Title I Eligible School 2020-2021,Title I Eligible School 2018-2019
0,1902,2,2,2,3
1,1903,4,4,4,4
2,1904,3,3,3,3
3,1906,2,2,2,2
4,1907,6,6,6,6


In [85]:
district_1921 = df_1921.groupby('District #').agg('sum')

In [86]:
district_1921.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1212 entries, 001902 to 254902
Data columns (total 14 columns):
 #   Column                                              Non-Null Count  Dtype  
---  ------                                              --------------  -----  
 0   Free Lunch 2020-2021                                1212 non-null   float64
 1   Free Lunch 2018-2019                                1212 non-null   float64
 2   Reduced-price Lunch 2020-2021                       1212 non-null   float64
 3   Reduced-price Lunch 2018-2019                       1212 non-null   float64
 4   Asian or Asian/Pacific Islander Students 2020-2021  1212 non-null   float64
 5   Asian or Asian/Pacific Islander Students 2018-2019  1212 non-null   float64
 6   Hispanic Students 2020-2021                         1212 non-null   float64
 7   Hispanic Students 2018-2019                         1212 non-null   float64
 8   Black or African American Students 2020-2021        1212 non-null   float64


In [87]:
district_1921.reset_index(inplace = True)

district_1921

Unnamed: 0,District #,Free Lunch 2020-2021,Free Lunch 2018-2019,Reduced-price Lunch 2020-2021,Reduced-price Lunch 2018-2019,Asian or Asian/Pacific Islander Students 2020-2021,Asian or Asian/Pacific Islander Students 2018-2019,Hispanic Students 2020-2021,Hispanic Students 2018-2019,Black or African American Students 2020-2021,Black or African American Students 2018-2019,White Students 2020-2021,White Students 2018-2019,Total Race/Ethnicity 2020-2021,Total Race/Ethnicity 2018-2019
0,1902,159.0,181.0,43.0,59.0,3.0,3.0,52.0,38.0,22.0,20.0,431.0,478.0,535.0,564.0
1,1903,515.0,660.0,56.0,67.0,2.0,4.0,137.0,142.0,53.0,68.0,951.0,981.0,1200.0,1255.0
2,1904,390.0,392.0,38.0,40.0,9.0,8.0,62.0,66.0,79.0,66.0,586.0,629.0,769.0,804.0
3,1906,142.0,137.0,26.0,38.0,1.0,0.0,45.0,51.0,24.0,31.0,254.0,268.0,339.0,366.0
4,1907,2574.0,2316.0,152.0,228.0,26.0,31.0,1445.0,1398.0,883.0,902.0,894.0,936.0,3400.0,3393.0
5,1908,913.0,836.0,15.0,97.0,6.0,9.0,333.0,383.0,268.0,260.0,675.0,799.0,1352.0,1518.0
6,1909,130.0,161.0,22.0,25.0,2.0,3.0,31.0,26.0,3.0,8.0,315.0,354.0,366.0,406.0
7,2901,1740.0,1604.0,183.0,163.0,11.0,17.0,2850.0,2915.0,33.0,36.0,1149.0,1257.0,4128.0,4319.0
8,3801,433.0,443.0,73.0,78.0,11.0,15.0,298.0,278.0,148.0,154.0,501.0,541.0,996.0,1022.0
9,3902,1277.0,1428.0,172.0,181.0,30.0,21.0,761.0,756.0,206.0,224.0,1694.0,1864.0,2773.0,2959.0


In [88]:
df_1921['District #'] = df_1921['District #'].astype(float)

district_1921.nunique()

District #                                            1212
Free Lunch 2020-2021                                   853
Free Lunch 2018-2019                                   852
Reduced-price Lunch 2020-2021                          340
Reduced-price Lunch 2018-2019                          363
Asian or Asian/Pacific Islander Students 2020-2021     215
Asian or Asian/Pacific Islander Students 2018-2019     217
Hispanic Students 2020-2021                            790
Hispanic Students 2018-2019                            782
Black or African American Students 2020-2021           421
Black or African American Students 2018-2019           418
White Students 2020-2021                               784
White Students 2018-2019                               790
Total Race/Ethnicity 2020-2021                         971
Total Race/Ethnicity 2018-2019                         988
dtype: int64

In [89]:
df_merge = pd.merge(df_merge, district_1921, how="left", on= ["District #"])

df_merge.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1190 entries, 0 to 1189
Data columns (total 32 columns):
 #   Column                                              Non-Null Count  Dtype  
---  ------                                              --------------  -----  
 0   Agency Name                                         1190 non-null   object 
 1   State                                               1190 non-null   object 
 2   NCES Agency ID                                      1190 non-null   float64
 3   County #                                            1190 non-null   float64
 4   Total Operational Public Schools 2018-2019          1190 non-null   float64
 5   Total Public Schools 2018-2019                      1190 non-null   float64
 6   State Agency ID                                     1190 non-null   object 
 7   Total Students 2018-2019                            1190 non-null   float64
 8   FTE Teachers 2018-2019                              1190 non-null   float64
 9

Title 1 merging

In [90]:
cols_merge = ['District #',
              'School-wide Title I 2020-2021',
              'School-wide Title I 2018-2019',
              'Title I Eligible School 2020-2021',
              'Title I Eligible School 2018-2019']
df_merge = pd.merge(df_merge, district_1921_t1[cols_merge], how="left", on= "District #")
df_merge.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1190 entries, 0 to 1189
Data columns (total 36 columns):
 #   Column                                              Non-Null Count  Dtype  
---  ------                                              --------------  -----  
 0   Agency Name                                         1190 non-null   object 
 1   State                                               1190 non-null   object 
 2   NCES Agency ID                                      1190 non-null   float64
 3   County #                                            1190 non-null   float64
 4   Total Operational Public Schools 2018-2019          1190 non-null   float64
 5   Total Public Schools 2018-2019                      1190 non-null   float64
 6   State Agency ID                                     1190 non-null   object 
 7   Total Students 2018-2019                            1190 non-null   float64
 8   FTE Teachers 2018-2019                              1190 non-null   float64
 9

In [91]:
df_merge = pd.merge(df_merge, df_grades, how="left", on= "District #")

In [92]:
df_merge.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1190 entries, 0 to 1189
Data columns (total 68 columns):
 #   Column                                              Non-Null Count  Dtype  
---  ------                                              --------------  -----  
 0   Agency Name                                         1190 non-null   object 
 1   State                                               1190 non-null   object 
 2   NCES Agency ID                                      1190 non-null   float64
 3   County #                                            1190 non-null   float64
 4   Total Operational Public Schools 2018-2019          1190 non-null   float64
 5   Total Public Schools 2018-2019                      1190 non-null   float64
 6   State Agency ID                                     1190 non-null   object 
 7   Total Students 2018-2019                            1190 non-null   float64
 8   FTE Teachers 2018-2019                              1190 non-null   float64
 9

In [93]:
locale_diff = ~(df_merge['Locale 2018-2019'] == df_merge['Locale 2020-2021'])
df_merge.loc[locale_diff[locale_diff].index][['Locale 2018-2019', 'Locale 2020-2021']]

Unnamed: 0,Locale 2018-2019,Locale 2020-2021
36,31-Town: Fringe,41-Rural: Fringe
50,11-City: Large,21-Suburb: Large
126,41-Rural: Fringe,22-Suburb: Mid-size
204,12-City: Mid-size,41-Rural: Fringe
326,13-City: Small,12-City: Mid-size
327,41-Rural: Fringe,32-Town: Distant
336,41-Rural: Fringe,31-Town: Fringe
366,41-Rural: Fringe,32-Town: Distant
403,21-Suburb: Large,13-City: Small
417,21-Suburb: Large,13-City: Small


In [94]:
df_merge.drop(columns=['Locale 2018-2019', 'State Agency ID'], inplace=True)
df_merge.rename(columns = {'Locale 2020-2021': 'Locale'}, inplace = True)

In [95]:
total_diff_2019 = df_merge['Total Students 2018-2019'] - df_merge['Total Race/Ethnicity 2018-2019']
print(total_diff_2019.value_counts())

0.0    1190
dtype: int64


In [96]:
total_diff_2021 = df_merge['Total Students 2020-2021'] - df_merge['Total Race/Ethnicity 2020-2021']
print(total_diff_2021.value_counts())

df_merge[total_diff_2021==302]

0.0      1189
302.0       1
dtype: int64


Unnamed: 0,Agency Name,State,NCES Agency ID,County #,Total Operational Public Schools 2018-2019,Total Public Schools 2018-2019,Total Students 2018-2019,FTE Teachers 2018-2019,Total Staff 2018-2019,District #,Total Operational Public Schools 2020-2021,Total Public Schools 2020-2021,Total Students 2020-2021,FTE Teachers 2020-2021,Total Staff 2020-2021,Locale,Free Lunch 2020-2021,Free Lunch 2018-2019,Reduced-price Lunch 2020-2021,Reduced-price Lunch 2018-2019,Asian or Asian/Pacific Islander Students 2020-2021,Asian or Asian/Pacific Islander Students 2018-2019,Hispanic Students 2020-2021,Hispanic Students 2018-2019,Black or African American Students 2020-2021,Black or African American Students 2018-2019,White Students 2020-2021,White Students 2018-2019,Total Race/Ethnicity 2020-2021,Total Race/Ethnicity 2018-2019,School-wide Title I 2020-2021,School-wide Title I 2018-2019,Title I Eligible School 2020-2021,Title I Eligible School 2018-2019,Grade 3 2020-2021,Grade 3 2018-2019,Grade 4 2020-2021,Grade 4 2018-2019,Grade 5 2020-2021,Grade 5 2018-2019,Grade 6 2020-2021,Grade 6 2018-2019,Grade 7 2020-2021,Grade 7 2018-2019,Grade 8 2020-2021,Grade 8 2018-2019,Grades 1-8 2020-2021,Grades 1-8 2018-2019,Grades 9-12 2020-2021,Grades 9-12 2018-2019,Prek 2020-2021,Prek 2018-2019,K 2020-2021,K 2018-2019,Grade 1 2020-2021,Grade 1 2018-2019,Grade 2 2020-2021,Grade 2 2018-2019,Grade 9 2020-2021,Grade 9 2018-2019,Grade 10 2020-2021,Grade 10 2018-2019,Grade 11 2020-2021,Grade 11 2018-2019,Grade 12 2020-2021,Grade 12 2018-2019
1128,WATER VALLEY ISD,Texas,4844710.0,48451.0,7.0,8.0,336.0,33.82,54.36,226905,4.0,7.0,313.0,32.18,53.41,42-Rural: Distant,104.0,130.0,37.0,44.0,0.0,1.0,5.0,78.0,1.0,7.0,4.0,233.0,11.0,336.0,1,2,1,2,29.0,30.0,21.0,29.0,24.0,31.0,27.0,30.0,33.0,20.0,32.0,23.0,194.0,212.0,91.0,95.0,4.0,17.0,24.0,12.0,17.0,27.0,11.0,22.0,20.0,27.0,24.0,24.0,29.0,16.0,18.0,28.0


In [97]:
index_drop = df_merge[total_diff_2021==302].index
index_drop

Int64Index([1128], dtype='int64')

In [98]:
df_merge.drop(index_drop, inplace=True)
df_merge.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1189 entries, 0 to 1189
Data columns (total 66 columns):
 #   Column                                              Non-Null Count  Dtype  
---  ------                                              --------------  -----  
 0   Agency Name                                         1189 non-null   object 
 1   State                                               1189 non-null   object 
 2   NCES Agency ID                                      1189 non-null   float64
 3   County #                                            1189 non-null   float64
 4   Total Operational Public Schools 2018-2019          1189 non-null   float64
 5   Total Public Schools 2018-2019                      1189 non-null   float64
 6   Total Students 2018-2019                            1189 non-null   float64
 7   FTE Teachers 2018-2019                              1189 non-null   float64
 8   Total Staff 2018-2019                               1189 non-null   float64
 9

In [99]:
df_merge.to_csv('DATA_NCES_DISTRICT.csv', index = None)

In [100]:
population = df_merge[["District #", "Agency Name", "Total Public Schools 2018-2019", "Total Students 2018-2019", "Total Public Schools 2020-2021", "Total Students 2020-2021"]]

population

Unnamed: 0,District #,Agency Name,Total Public Schools 2018-2019,Total Students 2018-2019,Total Public Schools 2020-2021,Total Students 2020-2021
0,57816,A W BROWN LEADERSHIP ACADEMY,2.0,2084.0,2.0,1383.0
1,57829,A+ ACADEMY,2.0,1409.0,2.0,1456.0
2,101871,A+ UNLIMITED POTENTIAL,2.0,180.0,1.0,152.0
3,109901,ABBOTT ISD,2.0,277.0,2.0,274.0
4,95901,ABERNATHY ISD,4.0,780.0,4.0,810.0
5,221901,ABILENE ISD,31.0,16645.0,30.0,15680.0
6,57814,ACADEMY FOR ACADEMIC EXCELLENCE,5.0,505.0,4.0,308.0
7,14901,ACADEMY ISD,6.0,1652.0,6.0,1726.0
8,101810,ACADEMY OF ACCELERATED LEARNING INC,1.0,742.0,1.0,851.0
9,57810,ACADEMY OF DALLAS,1.0,487.0,1.0,341.0


In [101]:
cap = df_merge.loc[(population['Total Students 2018-2019']>1000)]

In [102]:
cap.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 570 entries, 0 to 1187
Data columns (total 66 columns):
 #   Column                                              Non-Null Count  Dtype  
---  ------                                              --------------  -----  
 0   Agency Name                                         570 non-null    object 
 1   State                                               570 non-null    object 
 2   NCES Agency ID                                      570 non-null    float64
 3   County #                                            570 non-null    float64
 4   Total Operational Public Schools 2018-2019          570 non-null    float64
 5   Total Public Schools 2018-2019                      570 non-null    float64
 6   Total Students 2018-2019                            570 non-null    float64
 7   FTE Teachers 2018-2019                              570 non-null    float64
 8   Total Staff 2018-2019                               570 non-null    float64
 9 