# Create NCDPI 2018-2019 Raw Datasets
### This program downloads all original datasets from www.ncpublicschools.org and saves them as .csv files. These data files are used to create all the flattened and machine learning datasets within the NCEA repository.

1. This notebook downloads raw datasets directly from NCDPI specific URLs.
2. Each raw dataset is filtered by school year and saved in the original layout as a .csv file.
3. For consistency, both the Year and School code fields are renamed to "year" and "agency_code" in all files.
4. All masking is removed from raw data fields using the following code: replace({"*":0, ">95":100, "<5":0, "<10":5 })
5. All * or carriage returns are removed from column names.
6. All raw datasets created by this program are used to create the "flattened" and "machine learning" Public School datasets.

In [72]:
#import required Libraries
import pandas as pd
import numpy as np

#**********************************************************************************
# Set the following variables before running this code!!!
#**********************************************************************************

#Location where copies of the raw data files will be downloaded and saved as csv files.
#The value must end with a path separator: the cells below build every path by plain
#string concatenation (for example dataDir + 'SRC_Datasets/').
#'C:/Users/Jake/Documents/GitHub/EducationDataNC/2018/Raw Datasets/'
dataDir = 'D:/BenepactLLC/Belk/NC_Report_Card_Data/2020/January 2020/2019/Raw Datasets/'

#All raw data files are filtered for the year below
#NOTE(review): the cells shown in this notebook filter each rcd file on its own most
#recent year and only reference schoolYear in a commented-out line - confirm whether
#this setting is still used anywhere.
schoolYear = 2019

## Download and Save Copy of the Original SRC Data

In [73]:
import urllib.request
import os 

#Download and save an original copy of the raw SRC data
url="https://files.nc.gov/dpi/src_datasets.zip"
zipFilePath = dataDir + 'src-datasets.zip'

#Comment out the next line after downloading the original data one time!
#urllib.request.urlretrieve(url, zipFilePath)

import zipfile

#Extract the zip file and all school datasets to the SRC_Datasets/ folder under dataDir.
#The with-block closes the archive even when extraction raises part way through.
with zipfile.ZipFile(zipFilePath, 'r') as zip_ref:
    zip_ref.extractall(dataDir + 'SRC_Datasets/')

#Remove corrupt file
os.remove(dataDir + 'SRC_Datasets/' + "rcd_acc_pc.xlsx")
#Remove Lookup Tables
os.remove(dataDir + 'SRC_Datasets/' + "rcd_ap_crs_list.xlsx")
os.remove(dataDir + 'SRC_Datasets/' + "rcd_cte_enrollment_cluster.xlsx")

In [74]:
import pandas.io.formats.excel

##Drop the corrupt record from rcd_sat.xlsx and write the workbook back in place
filePath = dataDir + 'SRC_Datasets/' + 'rcd_sat.xlsx'
rcdSat = pd.read_excel(filePath, dtype={'agency_code': object})

#Keep every row except the one whose year cell holds the corrupt text
validYearRows = rcdSat['year'] != '/*20'
rcdSat = rcdSat.loc[validYearRows]

#Save file without the bold column headings
pandas.io.formats.excel.header_style = None
rcdSat.to_excel(filePath, index=False)

# Get the Most Recent Year of Data from Each File

In [75]:
#Use ntpath.basename to get a filename from a filepath
import ntpath

def CleanUpRcdFiles(filePath):
    """Convert one rcd .xlsx workbook into a .csv holding only its latest year.

    Reads the workbook at filePath, keeps the rows whose 'year' equals the
    largest year found in the file, blanks out cells that are exactly '*',
    and writes the result to dataDir + 'SRC_Datasets/' under the same base
    name with a .csv extension. Prints the output file name and the year kept.

    Args:
        filePath: Path of the source workbook. It must contain a 'year'
            column that can be read as int.

    Returns:
        None. The function works through its side effects (file write, print).

    Note:
        Relies on the notebook-level variable dataDir for the output folder.
    """
    #Swap only the extension. str.replace('xlsx','csv') would also rewrite an
    #'xlsx' appearing inside the base name and would leave other extensions as is.
    baseName = ntpath.splitext(ntpath.basename(filePath))[0]
    fileName = baseName + '.csv'

    schFile = pd.read_excel(filePath, dtype={'agency_code': object, 'year':int})
    maxYear = schFile['year'].max()

    #Filter records for the most recent year present in this particular file
    schFile = schFile[schFile['year'] == maxYear]

    #Remove state and district level summary records
    #schFile = schFile[(schFile['agency_code'] != 'NC-SEA') & (schFile['agency_code'].str.contains("LEA") == False)]

    #Blank out cells whose whole value is the '*' masking character
    schFile = schFile.replace({'*':''})
    schFile.to_csv(dataDir + 'SRC_Datasets/' + fileName, sep=',', index=False)

    print(fileName + ', Max Year: ' + str(maxYear))

In [76]:
#Use wildcards to find files in a directory
import glob
import os
#Get a list of all rcd*.xlsx file paths extracted for the 2019 download
rcdFiles = glob.glob(dataDir + 'SRC_Datasets/' + 'rcd*.xlsx')

#The message prints dataDir; CleanUpRcdFiles writes the csv files to dataDir + 'SRC_Datasets/'
print('Saving Files to: ' + dataDir + '\n')

#Convert each workbook to csv, then delete the workbook
for filePth in rcdFiles:
    fileName = ntpath.basename(filePth)
    #if fileName != 'rcd_code_desc.csv': --No rcd_code_desc in the 2019 rcd files
    CleanUpRcdFiles(filePth)
    #Remove the old xlsx file since we converted it to csv
    if os.path.exists(filePth):
        os.remove(filePth)

Saving Files to: D:/BenepactLLC/Belk/NC_Report_Card_Data/2020/January 2020/2019/Raw Datasets/

rcd_acc_aapart.csv, Max Year: 2019
rcd_acc_act.csv, Max Year: 2019
rcd_acc_awa.csv, Max Year: 2019
rcd_acc_cgr.csv, Max Year: 2019
rcd_acc_eds.csv, Max Year: 2019
rcd_acc_eg.csv, Max Year: 2019
rcd_acc_elp.csv, Max Year: 2019
rcd_acc_essa_desig.csv, Max Year: 2019
rcd_acc_gp.csv, Max Year: 2019
rcd_acc_irm.csv, Max Year: 2019
rcd_acc_lowperf.csv, Max Year: 2019
rcd_acc_ltg.csv, Max Year: 2019
rcd_acc_ltg_detail.csv, Max Year: 2019
rcd_acc_mcr.csv, Max Year: 2019
rcd_acc_part.csv, Max Year: 2019
rcd_acc_part_detail.csv, Max Year: 2019
rcd_acc_rta.csv, Max Year: 2019
rcd_acc_spg1.csv, Max Year: 2017
rcd_acc_spg2.csv, Max Year: 2019
rcd_acc_wk.csv, Max Year: 2019
rcd_ap.csv, Max Year: 2019
rcd_charter.csv, Max Year: 2019
rcd_courses2.csv, Max Year: 2019
rcd_cte_concentrators.csv, Max Year: 2019
rcd_cte_credentials.csv, Max Year: 2019
rcd_cte_endorsement.csv, Max Year: 2019
rcd_cte_enrollment.csv

# Flatten the Raw Data Files
### This section reads raw data files directly from the \\Raw Datasets folder and flattens each file.
1. Each agency_code could represent National, State, District, or School Campus level data.
2. This code creates new data columns using pivots until there is only one record per agency_code.
3. Percentage fields are typically used for pivot values in cases where count, denominators, or percentages are available.  

In [77]:
def PivotCsv(dataDir, fileName, pivValues, pivIndex, pivColumns, colSuffix):
    """Read a csv file and pivot it to one row per pivIndex value.

    Args:
        dataDir: Folder holding the file. It is concatenated directly with
            fileName, so it must end with a path separator.
        fileName: Name of the csv file to read.
        pivValues: Column name, or list of column names, to aggregate.
        pivIndex: Name of the column that becomes the row key. It is read
            as object so codes keep their leading zeros.
        pivColumns: Column name(s) whose values become new column headings.
        colSuffix: Text appended to every generated column name.

    Returns:
        DataFrame with pivIndex as an ordinary first column followed by one
        flat, string-named column per pivoted combination.

    Note:
        pd.pivot_table is called without aggfunc, so its default aggregation
        is applied when several rows share an index/column combination.
    """
    pivFile = pd.read_csv(dataDir + fileName, low_memory=False, dtype={pivIndex: object})

    pivFile = pd.pivot_table(pivFile, values=pivValues,index=pivIndex,columns=pivColumns)

    def _flatName(col):
        #MultiIndex labels arrive as tuples; a single-level label is a scalar and
        #must not be iterated, otherwise a string is joined character by character.
        parts = col if isinstance(col, tuple) else (col,)
        return '_'.join(str(i) for i in parts) + colSuffix

    #Collapse the (possibly multi-level) column labels into flat names
    pivFile.columns = [_flatName(col) for col in pivFile.columns]

    #Make our index a column for merges later
    pivFile.reset_index(level=0, inplace=True)
    return pivFile

### Use table pivots to flatten each dataset
* Each dataset is converted to one record per agency code.

In [78]:
#Folder holding the per-file csv versions written by the conversion step
srcDir = dataDir + 'SRC_Datasets/'

#Each DataFrame below is named after its source csv file (without the extension).
#The save loop later in this notebook looks the DataFrames up by those file names,
#so keep variable names and file names in sync.

#Pivot File - rcd_161 - Missing in 2019
#rcd_161 = PivotCsv(srcDir, 'rcd_161.csv',['ccc_pct'],'agency_code', ['status','subgroup'],'_161')

#Pivot File - rcd_acc_aapart
rcd_acc_aapart = PivotCsv(srcDir, 'rcd_acc_aapart.csv',['pct'],'agency_code', ['subject','grade'],'_AAPART')

#Pivot File - rcd_acc_act
rcd_acc_act = PivotCsv(srcDir, 'rcd_acc_act.csv',['pct'],'agency_code', ['subject','subgroup'],'_ACT')

#Pivot File - rcd_acc_awa
rcd_acc_awa = PivotCsv(srcDir, 'rcd_acc_awa.csv',['pct'],'agency_code', ['subgroup'],'_AWA')

#Pivot File - rcd_acc_cgr
rcd_acc_cgr = PivotCsv(srcDir, 'rcd_acc_cgr.csv',['pct'],'agency_code', ['cgr_type', 'subgroup'],'_CGR')

#File - rcd_acc_eds (no pivot, keep only the key and pct_eds)
rcd_acc_eds = pd.read_csv(srcDir + 'rcd_acc_eds.csv', low_memory=False, dtype={'agency_code': object})
rcd_acc_eds = rcd_acc_eds[['agency_code', 'pct_eds']]

#Pivot File - rcd_acc_elp
rcd_acc_elp = PivotCsv(srcDir, 'rcd_acc_elp.csv',['pct'],'agency_code', ['subgroup'],'_ELP')

#File - rcd_acc_essa_desig
rcd_acc_essa_desig = pd.read_csv(srcDir + 'rcd_acc_essa_desig.csv', low_memory=False, dtype={'agency_code': object})
rcd_acc_essa_desig.drop(['year'], axis=1, inplace=True)

#File - rcd_acc_gp
rcd_acc_gp = pd.read_csv(srcDir + 'rcd_acc_gp.csv', low_memory=False, dtype={'agency_code': object})
rcd_acc_gp.drop(['year'], axis=1, inplace=True)

#Pivot File - rcd_acc_irm
#Note: the suffix 'gr_irm' has no leading underscore, unlike the other suffixes in this cell
rcd_acc_irm = PivotCsv(srcDir, 'rcd_acc_irm.csv',['pct_prof'],'agency_code', ['grade'],'gr_irm')

#File - rcd_acc_lowperf (no pivot, keep only the key and the three low-performing flags)
rcd_acc_lowperf = pd.read_csv(srcDir + 'rcd_acc_lowperf.csv', low_memory=False, dtype={'agency_code': object})
rcd_acc_lowperf = rcd_acc_lowperf[['agency_code', 'lp_school','rlp_school','clpc_school']]

#Pivot File - rcd_acc_ltg
rcd_acc_ltg = PivotCsv(srcDir, 'rcd_acc_ltg.csv',['pct_met'],'agency_code', ['target'],'_LTG')

#File - rcd_acc_ltg_detail
rcd_acc_ltg_detail = pd.read_csv(srcDir + 'rcd_acc_ltg_detail.csv', low_memory=False, dtype={'agency_code': object})
rcd_acc_ltg_detail.drop(['year'], axis=1, inplace=True)

#Pivot File - rcd_acc_mcr
rcd_acc_mcr = PivotCsv(srcDir, 'rcd_acc_mcr.csv',['pct'],'agency_code', ['subgroup'],'_MCR')

#Pivot File - rcd_acc_part
rcd_acc_part = PivotCsv(srcDir, 'rcd_acc_part.csv',['pct_met'],'agency_code', ['target'],'_PART')

#Pivot File - rcd_acc_part_detail
rcd_acc_part_detail = PivotCsv(srcDir, 'rcd_acc_part_detail.csv',['pct'],'agency_code', ['target','subgroup'],'_PART_DET')

#Pivot File - rcd_acc_pc - WARNING 3323 columns!!! - Missing in 2019
#rcd_acc_pc = PivotCsv(srcDir, 'rcd_acc_pc.csv',['pct'],'agency_code', ['standard','subject','grade','subgroup'],'_PC')

#Pivot File - rcd_acc_rta
rcd_acc_rta = PivotCsv(srcDir, 'rcd_acc_rta.csv',['pct'],'agency_code', ['metric'],'_RTA')

#File - rcd_acc_spg1
#Note: the conversion output recorded in this notebook reports Max Year 2017 for this file
rcd_acc_spg1 = pd.read_csv(srcDir + 'rcd_acc_spg1.csv', low_memory=False, dtype={'agency_code': object})
rcd_acc_spg1.drop(['year'], axis=1, inplace=True)

#Pivot File - rcd_acc_spg2
#NOTE(review): PivotCsv aggregates with the pivot_table default; confirm that any
#non-numeric fields in this list (eg_status and spg_grade look like text) survive the pivot.
pivVals = ['aaa_score','awa_score','cgrs_score','elp_score','mcr_score','scgs_score','bi_score',
           'ach_score','eg_status','eg_score','spg_score','spg_grade']
           
rcd_acc_spg2 = PivotCsv(srcDir, 'rcd_acc_spg2.csv',pivVals,'agency_code', ['subgroup'],'_SPG2')

#Pivot File - rcd_acc_wk
rcd_acc_wk = PivotCsv(srcDir, 'rcd_acc_wk.csv',['pct'],'agency_code', ['subgroup'],'_WK')

#File - rcd_adm - Missing in 2019
#rcd_adm = pd.read_csv(srcDir + 'rcd_adm.csv', low_memory=False, dtype={'agency_code': object})
#rcd_adm.drop(['year','category_code'], axis=1, inplace=True)

#File - rcd_ap
#Found 0 duplicate agency_codes in this file, no pivot 
rcd_ap = pd.read_csv(srcDir + 'rcd_ap.csv', low_memory=False, dtype={'agency_code': object})
rcd_ap.drop(['year','category_code'], axis=1, inplace=True)

#File - rcd_arts - Missing in 2019
#rcd_arts = pd.read_csv(srcDir + 'rcd_arts.csv', low_memory=False, dtype={'agency_code': object})
#rcd_arts.drop(['year'], axis=1, inplace=True)

#File - rcd_att - Missing in 2019
#Found 0 duplicate agency_codes in this file, no pivot 
#rcd_att = pd.read_csv(srcDir + 'rcd_att.csv', low_memory=False, dtype={'agency_code': object})
#rcd_att.drop(['year','category_code'], axis=1, inplace=True)

#Pivot File - rcd_charter
rcd_charter = PivotCsv(srcDir, 'rcd_charter.csv',['pct_enrolled'],'agency_code', ['home_lea','subgroup'],'_CHARTER')

#Pivot File - rcd_chronic_absent - Missing in 2019
#rcd_chronic_absent = PivotCsv(srcDir, 'rcd_chronic_absent.csv',['pct'],'agency_code', ['subgroup'],'_CHRON_ABSENT')

#Pivot File - rcd_college - Missing in 2019
#rcd_college = PivotCsv(srcDir, 'rcd_college.csv',['pct_enrolled'],'agency_code', ['status','subgroup'],'_COLLEGE')

#File - rcd_courses1 - 2017 DATA - Missing in 2019
#Found 0 duplicate agency_codes in this file, no pivot 
#rcd_courses1 = pd.read_csv(srcDir + 'rcd_courses1.csv', low_memory=False, dtype={'agency_code': object})
#rcd_courses1.drop(['year','category_code'], axis=1, inplace=True)

#Pivot File - rcd_courses2
rcd_courses2 = PivotCsv(srcDir, 'rcd_courses2.csv',['pct_ap','pct_ccp','pct_ib'],'agency_code', ['category_code','subgroup'],
                        '_COURSES2')

#Pivot File - rcd_cte_concentrators (no suffix is appended to the generated column names)
rcd_cte_concentrators = PivotCsv(srcDir, 'rcd_cte_concentrators.csv',['num_concentrators'],'agency_code',
                                 ['career_cluster'],'')

#File - rcd_cte_credentials
rcd_cte_credentials = pd.read_csv(srcDir + 'rcd_cte_credentials.csv', low_memory=False, dtype={'agency_code': object})
rcd_cte_credentials.drop(['year'], axis=1, inplace=True)

#File - rcd_cte_endorsement
rcd_cte_endorsement = pd.read_csv(srcDir + 'rcd_cte_endorsement.csv', low_memory=False, dtype={'agency_code': object})
rcd_cte_endorsement.drop(['year'], axis=1, inplace=True)

#File - rcd_cte_enrollment
rcd_cte_enrollment = pd.read_csv(srcDir + 'rcd_cte_enrollment.csv', low_memory=False, dtype={'agency_code': object})
#Copy pct into cte_enrollment_pct and drop the original, leaving the value under the more specific name
rcd_cte_enrollment['cte_enrollment_pct'] = rcd_cte_enrollment['pct'] 
rcd_cte_enrollment.drop(['year','pct'], axis=1, inplace=True)

#File - rcd_dlmi
rcd_dlmi = pd.read_csv(srcDir + 'rcd_dlmi.csv', low_memory=False, dtype={'agency_code': object})
rcd_dlmi.drop(['year'], axis=1, inplace=True)

#Pivot File - rcd_effectiveness - 2017 Data - Missing in 2019
#rcd_effectiveness = PivotCsv(srcDir, 'rcd_effectiveness.csv',['pct_rating'],'agency_code', ['ee_standard','ee_rating'],'')

#File - rcd_esea_att - 2015 DATA - Missing in 2019
#Found 0 duplicate agency_codes in this file, no pivot 
#rcd_esea_att = pd.read_csv(srcDir + 'rcd_esea_att.csv', low_memory=False, dtype={'agency_code': object})
#rcd_esea_att.drop(['year','category_code'], axis=1, inplace=True)

#Pivot File - rcd_experience- Missing in 2019
#expPivColumns = ['pct_experience_0','pct_experience_10','pct_experience_4',
#                 'pct_adv_degree','pct_turnover','total_class_teach','avg_class_teach']
#rcd_experience = PivotCsv(srcDir, 'rcd_experience.csv',expPivColumns,'agency_code', ['staff'],'Exp')

#File - !!!DISTRICT LEVEL DATA!!! - Missing in 2019
#rcd_funds = pd.read_csv(srcDir + 'rcd_funds.csv', low_memory=False, dtype={'agency_code': object})
#rcd_funds.drop(['year'], axis=1, inplace=True)

#Pivot File - rcd_hqt - !!!2016 DATA!!! - Missing in 2019 
#rcd_hqt = PivotCsv(srcDir, 'rcd_hqt.csv',['highqual_class_pct'],'agency_code', ['category_code'],'')

#File - rcd_ib
#Found 0 duplicate agency_codes in this file at school level, no pivot 
rcd_ib = pd.read_csv(srcDir + 'rcd_ib.csv', low_memory=False, dtype={'agency_code': object})
rcd_ib.drop(['year','category_code'], axis=1, inplace=True)

#Pivot File - rcd_improvement - Missing in 2019
#rcd_improvement = PivotCsv(srcDir, 'rcd_improvement.csv',['amount'],'agency_code', ['strategy'],'_Improve_Amt')

#File - rcd_inc1
#Found 0 duplicate agency_codes in this file at school level, no pivot 
rcd_inc1 = pd.read_csv(srcDir + 'rcd_inc1.csv', low_memory=False, dtype={'agency_code': object})
rcd_inc1.drop(['year','category_code'], axis=1, inplace=True)

#Pivot File - rcd_inc2 (no suffix is appended to the generated column names)
pivFields = ['iss_per1000','sts_per1000','lts_per1000',
             'exp_per1000','act_per1000','bha_per1000',
             'rpt_per1000','arr_per1000']
rcd_inc2 = PivotCsv(srcDir, 'rcd_inc2.csv',pivFields,'agency_code', ['subgroup'],'')

#File - rcd_licenses  - Missing in 2019
#Found 0 duplicate agency_codes in this file at school level, no pivot 
#rcd_licenses = pd.read_csv(srcDir + 'rcd_licenses.csv', low_memory=False, dtype={'agency_code': object})
#rcd_licenses.drop(['year','category_code'], axis=1, inplace=True)

#File - rcd_location
#Found 0 duplicate agency_codes in this file at school level, no pivot 
rcd_location = pd.read_csv(srcDir + 'rcd_location.csv', low_memory=False, dtype={'agency_code': object})
rcd_location.drop(['year'], axis=1, inplace=True)


#Pivot File - rcd_naep !!!NATIONAL & STATE LEVEL DATA!!! - Missing in 2019
#pivCols = ['grade','naep_subject','subgroup','Proficiency_level']
#rcd_naep = PivotCsv(srcDir, 'rcd_naep.csv',['percent_proficient'],'agency_code', pivCols,'_NAEP')

#File - rcd_nbpts - Missing in 2019
#Found 0 duplicate agency_codes in this file at school level, no pivot 
#rcd_nbpts = pd.read_csv(srcDir + 'rcd_nbpts.csv', low_memory=False, dtype={'agency_code': object})
#rcd_nbpts.drop(['year','category_code','total_nbpts_num'], axis=1, inplace=True)

#Pivot File - rcd_pk_enroll - Missing in 2019
#rcd_pk_enroll = PivotCsv(srcDir, 'rcd_pk_enroll.csv',['pct'],'agency_code', ['subgroup'],'_PK_ENROLL')

#Pivot File - rcd_prin_demo - !!! District Level Data !!!
rcd_prin_demo = PivotCsv(srcDir, 'rcd_prin_demo.csv',['pct_prin_demo'],'agency_code', ['subgroup'],'')

#File - rcd_readiness
#Found 0 duplicate agency_codes in this file at school level, no pivot 
rcd_readiness = pd.read_csv(srcDir + 'rcd_readiness.csv', low_memory=False, dtype={'agency_code': object})
rcd_readiness.drop(['year','category_code'], axis=1, inplace=True)

#Pivot File - rcd_sar - Missing in 2019
#rcd_sar = PivotCsv(srcDir, 'rcd_sar.csv',['avg_size'],'agency_code', ['grade_eoc'],'_SAR')

#File - rcd_sat
#Found 0 duplicate agency_codes in this file at school level, no pivot 
rcd_sat = pd.read_csv(srcDir + 'rcd_sat.csv', low_memory=False, dtype={'agency_code': object})
rcd_sat.drop(['year','category_code'], axis=1, inplace=True)

#File - rcd_welcome
rcd_welcome = pd.read_csv(srcDir + 'rcd_welcome.csv', low_memory=False, dtype={'agency_code': object})
rcd_welcome.drop(['year'], axis=1, inplace=True)

#############  New files added for 2019 data #############
# rcd_ap_crs_list and rcd_cte_enrollment_cluster are look up files and excluded from processing
# (their .xlsx files are deleted right after the zip is extracted)

#Pivot File - rcd_acc_eg (no suffix is appended to the generated column names)
rcd_acc_eg = PivotCsv(srcDir, 'rcd_acc_eg.csv',['eg_index','eg_score'],'agency_code', ['subject','subgroup'],'')

In [79]:
#Collect the path of every rcd*.csv file in the source folder for the 2019 download
rcdFiles = glob.glob(srcDir  + 'rcd*.csv')

#Reduce each path to its bare file name with the .csv extension removed
rcdFileNames = [ntpath.splitext(ntpath.basename(csvPath))[0] for csvPath in rcdFiles]

# Save All Flattened Files to \\Raw Datasets Directory
**This code saves all the flattened file versions as .csv files in \\Raw Datasets\Flattened Datasets\\**

In [80]:
#Write every flattened DataFrame to dataDir + 'Flattened Datasets/' and report its row count.
#Each entry of rcdFileNames must match the name of a DataFrame variable defined above.
print('Saving Flattened Versions and Record Counts for the Following Raw Data Files: \n')
for fileName in rcdFileNames:
    #Look the DataFrame up by name once. The name comes from a file on disk, so a
    #dictionary lookup is used instead of eval(), which would execute it as code.
    flatFile = globals()[fileName]
    flatFile.to_csv(dataDir + 'Flattened Datasets/' + fileName + '.csv', sep=',', index=False)
    print(fileName + ', ' + str(len(flatFile.index)))
    

Saving Flattened Versions and Record Counts for the Following Raw Data Files: 

rcd_acc_aapart, 57
rcd_acc_act, 743
rcd_acc_awa, 699
rcd_acc_cgr, 743
rcd_acc_eds, 2770
rcd_acc_eg, 2548
rcd_acc_elp, 1887
rcd_acc_essa_desig, 2654
rcd_acc_gp, 596
rcd_acc_irm, 1279
rcd_acc_lowperf, 2769
rcd_acc_ltg, 2526
rcd_acc_ltg_detail, 2666
rcd_acc_mcr, 726
rcd_acc_part, 2538
rcd_acc_part_detail, 2538
rcd_acc_rta, 1578
rcd_acc_spg1, 2584
rcd_acc_spg2, 2544
rcd_acc_wk, 520
rcd_ap, 578
rcd_charter, 270
rcd_courses2, 595
rcd_cte_concentrators, 553
rcd_cte_credentials, 533
rcd_cte_endorsement, 529
rcd_cte_enrollment, 1189
rcd_dlmi, 2818
rcd_ib, 51
rcd_inc1, 3097
rcd_inc2, 2719
rcd_location, 2770
rcd_prin_demo, 116
rcd_readiness, 1308
rcd_sat, 694
rcd_welcome, 1222


## Download and Save Copy of the Original Statistical Profiles Data

In [81]:
#Statistical Profiles - Student Body Racial Compositions at the School Level
import io
import requests

url='http://apps.schools.nc.gov/ords/f?p=145:221::CSV::::'
statProfPath = dataDir + 'SRC_Datasets/' + 'ec_pupils.csv'

#Passing this URL directly into pd.read_csv() threw HTTP errors - This is my workaround
response = requests.get(url)
#Stop on an HTTP error status instead of parsing an error page as if it were the csv data
response.raise_for_status()
s = response.content
ec_pupils = pd.read_csv(io.StringIO(s.decode('utf-8')), low_memory=False
                        , dtype={'LEA': object,'School': object})

#Rename year for consistency
ec_pupils.rename({'Year':'year'}, axis=1, inplace=True)

#Create agency_code from LEA and School code as an index
ec_pupils['agency_code'] = ec_pupils['LEA'] + ec_pupils['School']

#Filtering to a single school year is disabled; the latest row per agency_code is kept below instead
#ec_pupils = ec_pupils[ec_pupils.year == schoolYear]

#Some schools are missing race data.  Get the most recent year of data available for each agency code
#(sort by year within agency_code, then keep the last row of each agency_code)
ec_pupils = ec_pupils.sort_values(by=['agency_code', 'year'])
ec_pupils = ec_pupils.drop_duplicates(subset=["agency_code"], keep="last")

#Save the original data to the source datasets folder
ec_pupils.to_csv(statProfPath, sep=',', index=False)

#Get data for the most recent school year
#CleanUpRcdFiles(statProfPath)

## Create Flattened Statistical Profiles with Racial Composition Percentages

In [82]:
#***********************************************************************
# Statistical Profiles - Student Body Racial Compositions at the School Level Reshape
#
# Statistical Profiles data are already one record per public school but must be converted to percentages
# Creates a new dataset - ec_pupils_pct.csv
#
#***********************************************************************

#Statistical Profiles - Student Body Racial Compositions at the School Level
ec_pupils = pd.read_csv(statProfPath, low_memory=False, dtype={'agency_code': object})

#Create Racial Composition summary variables
ec_pupils['Indian'] = ec_pupils['Indian Male'] + ec_pupils['Indian Female']
ec_pupils['Asian'] = ec_pupils['Asian Male'] + ec_pupils['Asian Female']
ec_pupils['Hispanic'] = ec_pupils['Hispanic Male'] + ec_pupils['Hispanic Female']
ec_pupils['Black'] = ec_pupils['Black Male'] + ec_pupils['Black Female']
ec_pupils['White'] = ec_pupils['White Male'] + ec_pupils['White Female']
ec_pupils['Pacific Island'] = ec_pupils['Pacific Island Male'] + ec_pupils['Pacific Island Female']
ec_pupils['Two or  More'] = ec_pupils['Two or  More Male'] + ec_pupils['Two or  More Female']

#The original total field is corrupted with non-printable characters and will not convert to int or float 
ec_pupils.drop(['Total'], axis=1, inplace=True)
#Create a new totals field by summing race composition fields
ec_pupils['Total'] = ec_pupils['Indian'] + ec_pupils['Asian'] + \
                     ec_pupils['Hispanic'] + ec_pupils['Black'] + \
                     ec_pupils['White'] + ec_pupils['Pacific Island'] + ec_pupils['Two or  More']
#Convert Totals to float64 for division later
ec_pupils['Total'] = ec_pupils['Total'].astype(np.float64)

#Create Minority summary variables 
ec_pupils['Minority Male'] = ec_pupils['Indian Male'] + ec_pupils['Asian Male'] \
                           + ec_pupils['Hispanic Male'] + ec_pupils['Black Male'] \
                           + ec_pupils['Pacific Island Male'] + ec_pupils['Two or  More Male'] 
ec_pupils['Minority Female'] = ec_pupils['Indian Female'] + ec_pupils['Asian Female'] \
                           + ec_pupils['Hispanic Female'] + ec_pupils['Black Female'] \
                           + ec_pupils['Pacific Island Female'] + ec_pupils['Two or  More Female']
ec_pupils['Minority'] = ec_pupils['Minority Male'] + ec_pupils['Minority Female']

#Create Student Body Racial Composition PERCENTAGES at the School Level
ec_pupils_pct = pd.DataFrame({'agency_code'   : ec_pupils['agency_code']
                            , 'School Name' : ec_pupils['___School Name___']
                            , 'IndianPct'   : ec_pupils['Indian'] / ec_pupils['Total']  
                            , 'AsianPct'    : ec_pupils['Asian'] / ec_pupils['Total']
                            , 'HispanicPct' : ec_pupils['Hispanic'] / ec_pupils['Total']
                            , 'BlackPct'    : ec_pupils['Black'] / ec_pupils['Total']
                            , 'WhitePct'    : ec_pupils['White'] / ec_pupils['Total']
                            , 'PacificIslandPct': ec_pupils['Pacific Island'] / ec_pupils['Total']
                            , 'TwoOrMorePct': ec_pupils['Two or  More'] / ec_pupils['Total']
                            , 'MinorityPct' : ec_pupils['Minority'] / ec_pupils['Total']
                            
                              
                            , 'IndianMalePct'   : ec_pupils['Indian Male'] / ec_pupils['Total']  
                            , 'AsianMalePct'    : ec_pupils['Asian Male'] / ec_pupils['Total']
                            , 'HispanicMalePct' : ec_pupils['Hispanic Male'] / ec_pupils['Total']
                            , 'BlackMalePct'    : ec_pupils['Black Male'] / ec_pupils['Total']
                            , 'WhiteMalePct'    : ec_pupils['White Male'] / ec_pupils['Total']
                            , 'PacificIslandMalePct': ec_pupils['Pacific Island Male'] / ec_pupils['Total']
                            , 'TwoOrMoreMalePct': ec_pupils['Two or  More Male'] / ec_pupils['Total']  
                            , 'MinorityMalePct' : ec_pupils['Minority Male'] / ec_pupils['Total']
                                                          
                            , 'IndianFemalePct'   : ec_pupils['Indian Female'] / ec_pupils['Total']  
                            , 'AsianFemalePct'    : ec_pupils['Asian Female'] / ec_pupils['Total']
                            , 'HispanicFemalePct' : ec_pupils['Hispanic Female'] / ec_pupils['Total']
                            , 'BlackFemalePct'    : ec_pupils['Black Female'] / ec_pupils['Total']
                            , 'WhiteFemalePct'    : ec_pupils['White Female'] / ec_pupils['Total']
                            , 'MinorityFemalePct' : ec_pupils['Minority Female'] / ec_pupils['Total'] 
                            , 'PacificIslandFemalePct': ec_pupils['Pacific Island Female'] / ec_pupils['Total']
                            , 'TwoOrMoreFemalePct': ec_pupils['Two or  More Female'] / ec_pupils['Total']
                             })

#Save the flattened racial composition percentage data to disk 
ec_pupils_pct.to_csv(dataDir + 'Flattened Datasets/' + 'ec_pupils_pct.csv', sep=',', index=False)

#Print file details
print('Saving Flattened Versions and Record Counts for the Following Raw Data Files: \n')
print('ec_pupils_pct' + ', ' + str(len(ec_pupils_pct.index)))

Saving Flattened Versions and Record Counts for the Following Raw Data Files: 

ec_pupils_pct, 2569


## Create rcd_pk_enroll.csv Counts Flattened File
* The rcd_pk_enroll.csv percentages always = 1 for the _ALL subgroup (only shows distribution of race)
* Adding actual PK enrollment counts to track PK enrollment growth. 

In [83]:
#Pivot File - rcd_pk_enroll - Missing in 2019
#rcd_pk_enroll_ct = PivotCsv(dataDir, 'SRC_Datasets/rcd_pk_enroll.csv',['count'],'agency_code', ['subgroup'],'_PK_ENROLL')

#rcd_pk_enroll_ct.to_csv(dataDir + 'Flattened Datasets/' + 'rcd_pk_enroll_ct.csv', sep=',', index=False)
#print('rcd_pk_enroll_ct' + ', ' + str(rcd_pk_enroll_ct.index))