# STAAR DISTRICT LEVEL CLEANING NOTEBOOK

**v2 UPDATE**  
- Downloaded raw data from [STAAR Aggregate Level Data for 2018-2019](https://tea.texas.gov/student-assessment/testing/staar/staar-aggregate-data-for-2018-2019) and [STAAR Aggregate Level Data for 2020-2021](https://tea.texas.gov/student-assessment/testing/staar/staar-aggregate-data-for-2020-2021)

***Import libraries:***

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Raise pandas' display limits so the wide, merged STAAR frames and their
# per-column summaries are shown in full instead of being truncated.
pd.set_option("display.max_rows", 4000)
pd.set_option("display.max_columns", 200)

Let's confirm that the data files are in this folder so that they can be read in properly.

In [2]:
ls

DATA_STAAR_DISTRICT_2019.csv      staar_g5_district21.dat
DATA_STAAR_DISTRICT_2021.csv      staar_g6_district19.dat
staar_district_cleaning_v2.ipynb  staar_g6_district21.dat
staar_g3_district19.dat           staar_g7_district19.dat
staar_g3_district21.dat           staar_g7_district21.dat
staar_g4_district19.dat           staar_g8_district19.dat
staar_g4_district21.dat           staar_g8_district21.dat
staar_g5_district19.dat


Data collected from [names link](https://tea.texas.gov/student-assessment/testing/staar/staar-variables-formats-and-descriptions) and [data link](https://tea.texas.gov/student-assessment/testing/staar/staar-aggregate-data)

Rename the raw downloaded files to the `staar_g<grade>_district<yy>.dat` naming pattern

In [3]:
import os

In [4]:
# Rename the raw downloads for grades 3-8 to the
# `staar_g<grade>_district<yy>.dat` pattern. A source file that does not exist
# (for example because it was already renamed on an earlier run) is skipped.
# NOTE(review): the 2019 source prefix starts with an underscore while the 2021
# prefix does not -- confirm both match the actual names of the downloads.
raw_prefix_and_year = (('_dfy19e', '19'), ('dfy21e', '21'))
for grade in range(3, 9):
    for raw_prefix, year in raw_prefix_and_year:
        source_path = f'{raw_prefix}{grade}.dat'
        target_path = f'staar_g{grade}_district{year}.dat'
        if os.path.exists(source_path):
            os.rename(source_path, target_path)

***Data Reading:***

In [5]:
# Columns kept from every raw STAAR file: the district id plus one column per
# subject prefix / measure suffix / student-group code combination, named
# `<subject>_<group>_<measure>`.
# NOTE(review): presumably r/m are reading/math and rs/d are the average score
# and the student count -- confirm against the TEA variable-format document.
cols_staar = ['DISTRICT'] + [
    f'{subject}_{group}_{measure}'
    for subject in ('r', 'm')
    for measure in ('rs', 'd')
    for group in ('all', 'ti1y', 'ecoy', 'eco1', 'etha', 'ethh', 'ethb', 'ethw', 'spey')
]

In [6]:
# Load the 2019 district-level file of every grade (3-8), keeping only the
# columns listed in `cols_staar`.
# NOTE(review): if DISTRICT codes carry leading zeros, default dtype inference
# will drop them -- confirm whether the id should be read as a string.
(df_19_g3, df_19_g4, df_19_g5,
 df_19_g6, df_19_g7, df_19_g8) = [
    pd.read_csv(f'staar_g{grade}_district19.dat', usecols=cols_staar)
    for grade in range(3, 9)
]

In [None]:
# Load the 2021 district-level file of every grade (3-8), keeping only the
# columns listed in `cols_staar`.
# NOTE(review): if DISTRICT codes carry leading zeros, default dtype inference
# will drop them -- confirm whether the id should be read as a string.
(df_21_g3, df_21_g4, df_21_g5,
 df_21_g6, df_21_g7, df_21_g8) = [
    pd.read_csv(f'staar_g{grade}_district21.dat', usecols=cols_staar)
    for grade in range(3, 9)
]

In [None]:
# Print the (rows, columns) shape of each 2019 grade frame.
for grade, frame in zip(range(3, 9),
                        (df_19_g3, df_19_g4, df_19_g5, df_19_g6, df_19_g7, df_19_g8)):
    print(f'df_19_g{grade} shape:', frame.shape)

In [None]:
# Print the (rows, columns) shape of each 2021 grade frame.
for grade, frame in zip(range(3, 9),
                        (df_21_g3, df_21_g4, df_21_g5, df_21_g6, df_21_g7, df_21_g8)):
    print(f'df_21_g{grade} shape:', frame.shape)

**Adding a grade-and-year suffix to every column name**

In [None]:
# Tag every column of each 2019 grade frame with `_g<grade>_2019` so the
# columns stay distinguishable once the grades are merged side by side.
(df_19_g3, df_19_g4, df_19_g5,
 df_19_g6, df_19_g7, df_19_g8) = [
    frame.add_suffix(f'_g{grade}_2019')
    for grade, frame in zip(range(3, 9),
                            (df_19_g3, df_19_g4, df_19_g5, df_19_g6, df_19_g7, df_19_g8))
]

In [None]:
# Tag every column of each 2021 grade frame with `_g<grade>_2021` so the
# columns stay distinguishable once the grades are merged side by side.
(df_21_g3, df_21_g4, df_21_g5,
 df_21_g6, df_21_g7, df_21_g8) = [
    frame.add_suffix(f'_g{grade}_2021')
    for grade, frame in zip(range(3, 9),
                            (df_21_g3, df_21_g4, df_21_g5, df_21_g6, df_21_g7, df_21_g8))
]

In [None]:
# The suffix was applied to the district id column as well; give it the same
# name ('District #') in every 2019 frame so it can serve as the merge key.
for grade, frame in zip(range(3, 9),
                        (df_19_g3, df_19_g4, df_19_g5, df_19_g6, df_19_g7, df_19_g8)):
    frame.rename(columns={f'DISTRICT_g{grade}_2019': 'District #'}, inplace=True)

In [None]:
# The suffix was applied to the district id column as well; give it the same
# name ('District #') in every 2021 frame so it can serve as the merge key.
for grade, frame in zip(range(3, 9),
                        (df_21_g3, df_21_g4, df_21_g5, df_21_g6, df_21_g7, df_21_g8)):
    frame.rename(columns={f'DISTRICT_g{grade}_2021': 'District #'}, inplace=True)

**Checking the number of NA (missing) values**

In [None]:
# Count missing district ids in each 2019 grade frame (grades 3 through 8, in order).
for frame in (df_19_g3, df_19_g4, df_19_g5, df_19_g6, df_19_g7, df_19_g8):
    print(frame['District #'].isna().sum())

In [None]:
# Count missing district ids in each 2021 grade frame (grades 3 through 8, in order).
for frame in (df_21_g3, df_21_g4, df_21_g5, df_21_g6, df_21_g7, df_21_g8):
    print(frame['District #'].isna().sum())

In [None]:
# Per-column count of missing values for each 2019 grade frame, each preceded
# by the frame's name.
for grade, frame in zip(range(3, 9),
                        (df_19_g3, df_19_g4, df_19_g5, df_19_g6, df_19_g7, df_19_g8)):
    print(f'\ndf_19_g{grade}\n', frame.isna().sum())

In [None]:
# Per-column count of missing values for each 2021 grade frame, each preceded
# by the frame's name.
for grade, frame in zip(range(3, 9),
                        (df_21_g3, df_21_g4, df_21_g5, df_21_g6, df_21_g7, df_21_g8)):
    print(f'\ndf_21_g{grade}\n', frame.isna().sum())

**Merging the per-grade dataframes into one dataframe per year (2019 and 2021)**

2019

In [None]:
# Outer-join the six 2019 grade frames on the shared "District #" key, so a
# district that is absent from some grade files is kept with NaN for those
# grades.
# The whole fold lives in one cell and always restarts from df_19_g3: re-running
# it can never merge a grade into df_2019 a second time (which, with one merge
# per cell, duplicated that grade's columns with _x / _y suffixes).
df_2019 = df_19_g3
for grade_frame in (df_19_g4, df_19_g5, df_19_g6, df_19_g7, df_19_g8):
    df_2019 = pd.merge(df_2019, grade_frame, how="outer", on="District #")

In [None]:
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
df_2019.info()

In [None]:
# Missing values per column of the merged 2019 frame.
df_2019.isnull().sum()

2021

In [None]:
# Outer-join the six 2021 grade frames on the shared "District #" key, so a
# district that is absent from some grade files is kept with NaN for those
# grades.
# The whole fold lives in one cell and always restarts from df_21_g3: re-running
# it can never merge a grade into df_2021 a second time (which, with one merge
# per cell, duplicated that grade's columns with _x / _y suffixes).
df_2021 = df_21_g3
for grade_frame in (df_21_g4, df_21_g5, df_21_g6, df_21_g7, df_21_g8):
    df_2021 = pd.merge(df_2021, grade_frame, how="outer", on="District #")

In [None]:
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
df_2021.info()

In [None]:
# Missing values per column of the merged 2021 frame.
df_2021.isnull().sum()

**Saving as csv**

In [None]:
# Write the merged 2019 frame to CSV without the row index.
df_2019.to_csv('DATA_STAAR_DISTRICT_2019.csv', index=False)

In [None]:
# Write the merged 2021 frame to CSV without the row index.
df_2021.to_csv('DATA_STAAR_DISTRICT_2021.csv', index=False)

**=========== Cleaning Done ===========**

In [None]:
# Lake Travis ISD, grade 3: average STAAR score per student group, one bar
# series per year/subject combination.
# NOTE(review): the 0 entries presumably mark groups with no reported score -- confirm.
plotdata = pd.DataFrame({
    "2021 Reading":[1391.7, 1544.6, 1452.7, 1497.5, 1497.6, 1518.75, 1496.7],
    "2019 Reading":[1468.5, 1629.25, 1478.2, 0, 1523, 1545, 1521.83],
    "2021 Math":[1378.3, 1583.6, 1463, 1439.5, 1480.3, 1517.75, 1481.1],
    "2019 Math":[1465.3, 1667.5, 1478.5, 0, 1546.2, 1584.5, 1542.5]
    }, 
    index=['Free Lunch', 'Asian', 'Hispanic', 'Black', 'White', 'Two or More Races', 'All Students']
)
# Create the figure at the requested size *before* plotting and draw onto its
# axes. Calling plt.figure(figsize=...) after the plot only opened a second,
# empty figure and left the chart at the default size.
fig, ax = plt.subplots(figsize=(25, 40))
plotdata.plot(kind="barh", ax=ax)
ax.set_title("Lake Travis ISD Grade 3 STAAR Scores")
ax.set_xlabel("Average STAAR Score")
ax.set_ylabel("")
plt.show()

In [None]:
# Austin ISD, grade 3: average STAAR score per student group, one bar series
# per year/subject combination.
plotdata = pd.DataFrame({
    "2021 Reading":[1358.8, 1475.2, 1379.3, 1339.2, 1529.4, 1515.1, 1408.5],
    "2019 Reading":[1397.75, 1539.4, 1429, 1372, 1539.9, 1551.3, 1447],
    "2021 Math":[1329.9, 1516.6, 1341.9, 1299.5, 1465.3, 1475, 1368.6],
    "2019 Math":[1431.2, 1586.6, 1464.1, 1390, 1551, 1558.5, 1480]
    }, 
    index=['Free Lunch', 'Asian', 'Hispanic', 'Black', 'White', 'Two or More Races', 'All Students']
)
# Create the figure at the requested size *before* plotting and draw onto its
# axes. Calling plt.figure(figsize=...) after the plot only opened a second,
# empty figure and left the chart at the default size.
fig, ax = plt.subplots(figsize=(25, 40))
plotdata.plot(kind="barh", ax=ax)
ax.set_title("Austin ISD Grade 3 STAAR Scores")
ax.set_xlabel("Average STAAR Score")
ax.set_ylabel("")
plt.show()

In [None]:
# Lake Travis ISD, grade 8: average STAAR score per student group, one bar
# series per year/subject combination.
# NOTE(review): the 0 entries presumably mark groups with no reported score -- confirm.
plotdata = pd.DataFrame({
    "2021 Reading":[0, 1783, 1678.3, 0, 1723.3, 1740, 1716.3],
    "2019 Reading":[0, 1845.5, 1760.5, 1668, 1784, 1792, 1781],
    "2021 Math":[0, 1863.3, 1696.3, 1676, 1750.7, 1757, 1745.7],
    "2019 Math":[0, 1938, 1778.5, 1737, 1807, 1847, 1813]
    }, 
    index=['Free Lunch', 'Asian', 'Hispanic', 'Black', 'White', 'Two or More Races', 'All Students']
)
# Create the figure at the requested size *before* plotting and draw onto its
# axes. Calling plt.figure(figsize=...) after the plot only opened a second,
# empty figure and left the chart at the default size.
fig, ax = plt.subplots(figsize=(25, 40))
plotdata.plot(kind="barh", ax=ax)
ax.set_title("Lake Travis ISD Grade 8 STAAR Scores")
ax.set_xlabel("Average STAAR Score")
ax.set_ylabel("")
plt.show()

In [None]:
# Austin ISD, grade 8: average STAAR score per student group, one bar series
# per year/subject combination.
plotdata = pd.DataFrame({
    "2021 Reading":[1576.3, 1741, 1593, 1542.6, 1733.5, 1735.3, 1616.6],
    "2019 Reading":[1632.7, 1776.3, 1648.7, 1622.4, 1730.1, 1779.2, 1670.9],
    "2021 Math":[1548.6, 1728, 1564.9, 1534.6, 1673.3, 1697.3, 1590],
    "2019 Math":[1649.3, 1817, 1658.3, 1638.9, 1738.7, 1769.9, 1677.7]
    }, 
    index=['Free Lunch', 'Asian', 'Hispanic', 'Black', 'White', 'Two or More Races', 'All Students']
)
# Create the figure at the requested size *before* plotting and draw onto its
# axes. Calling plt.figure(figsize=...) after the plot only opened a second,
# empty figure and left the chart at the default size.
fig, ax = plt.subplots(figsize=(25, 40))
plotdata.plot(kind="barh", ax=ax)
ax.set_title("Austin ISD Grade 8 STAAR Scores")
ax.set_xlabel("Average STAAR Score")
ax.set_ylabel("")
plt.show()

In [None]:
# 2021 reading comparison between the two districts: one row per district, one
# bar series per student group.
# NOTE(review): the 0 entries presumably mark groups with no reported score -- confirm.
plotdata = pd.DataFrame({
    "Free Lunch":[0,1576.3],
    "Asian":[1783,1741],
    "Hispanic":[1678.3,1593],
    "Black":[0,1542.6],
    "White":[1723.3, 1733.5],
    "Two or More Races":[1740, 1735.3],
    "All Students":[1716.3, 1616.6]
    }, 
    index=['Lake Travis', 'Austin']
)
# Create the figure at the requested size *before* plotting and draw onto its
# axes. Calling plt.figure(figsize=...) after the plot only opened a second,
# empty figure and left the chart at the default size.
fig, ax = plt.subplots(figsize=(25, 40))
plotdata.plot(kind="barh", ax=ax)
# The values above are the Grade 8 "2021 Reading" figures used in the
# per-district Grade 8 charts of this notebook, so the title says Grade 8
# (it previously said Grade 3).
ax.set_title("2021 Lake Travis ISD vs Austin ISD - Grade 8 Reading STAAR Scores")
ax.set_xlabel("Average STAAR Score")
ax.set_ylabel("")
plt.show()

In [None]:
# 2019 grade 8 reading comparison: one bar series per district, one row per
# student group.
# NOTE(review): the 0 entry presumably marks a group with no reported score -- confirm.
plotdata = pd.DataFrame({
    "Austin":[1632.7, 1776.3, 1648.7, 1622.4, 1730.1, 1779.2, 1670.9],
    "Lake Travis":[0, 1845.5, 1760.5, 1668, 1784, 1792, 1781]
    }, 
    index=['Free Lunch', 'Asian', 'Hispanic', 'Black', 'White', 'Two or More Races', 'All Students']
)
# Create the figure at the requested size *before* plotting and draw onto its
# axes. Calling plt.figure(figsize=...) after the plot only opened a second,
# empty figure and left the chart at the default size.
fig, ax = plt.subplots(figsize=(25, 40))
plotdata.plot(kind="barh", ax=ax)
ax.set_title("2019 Austin ISD vs Lake Travis ISD - Reading Grade 8 STAAR Scores")
ax.set_xlabel("Average STAAR Score")
ax.set_ylabel("")
plt.show()

In [None]:
# 2019 grade 8 math comparison: one bar series per district, one row per
# student group.
# NOTE(review): the 0 entry presumably marks a group with no reported score -- confirm.
plotdata = pd.DataFrame({
    "Austin":[1649.3, 1817, 1658.3, 1638.9, 1738.7, 1769.9, 1677.7],
    "Lake Travis":[0, 1938, 1778.5, 1737, 1807, 1847, 1813]
    }, 
    index=['Free Lunch', 'Asian', 'Hispanic', 'Black', 'White', 'Two or More Races', 'All Students']
)
# Create the figure at the requested size *before* plotting and draw onto its
# axes. Calling plt.figure(figsize=...) after the plot only opened a second,
# empty figure and left the chart at the default size.
fig, ax = plt.subplots(figsize=(25, 40))
plotdata.plot(kind="barh", ax=ax)
ax.set_title("2019 Austin ISD vs Lake Travis ISD - Math Grade 8 STAAR Scores")
ax.set_xlabel("Average STAAR Score")
ax.set_ylabel("")
plt.show()