# Create NCDPI 2017-2018 Raw Datasets
### This program downloads all original datasets from www.ncpublicschools.org and saves them as .csv files. These data files are used to create all the flattened and machine learning datasets within the NCEA repository.

1. This notebook downloads raw datasets directly from NCDPI specific URLs.
2. Each raw dataset is filtered to the most recent school year found in that file and saved in the original layout as a .csv file.
3. For consistency, both the Year and School code fields are renamed to "year" and "unit_code" in all files.
4. All masking is removed from raw data fields using the following code: replace({"*":0, ">95":100, "<5":0, "<10":5 })
5. All * or carriage returns are removed from column names.
6. Duplicate column names in accDrillDown files are renamed to include _Ct at the end for all count fields.
7. All raw datasets created by this program are used to create the "flattened" and "machine learning" Public School datasets.

In [7]:
#Run this to add the correct packages path to your Jupyter environment, if it is missing.
#import sys
#sys.path.append('C:/Users/Jake/Anaconda2/envs/example_env/Lib/site-packages')

In [8]:
#import required Libraries
import pandas as pd
import numpy as np

#**********************************************************************************
# Set the following variables before running this code!!!
#**********************************************************************************

#Location where copies of the raw data files will be downloaded and saved as csv files.
#File paths are built by plain string concatenation (dataDir + fileName), so keep the trailing slash.
#Example: 'C:/Users/Jake/Documents/GitHub/EducationDataNC/2018/Raw Datasets/'
dataDir = 'D:/BenepactLLC/Belk/NC_Report_Card_Data/2019/April 2019/2018/Raw Datasets/'

#School year this download is intended for.
#NOTE(review): schoolYear is not referenced in the cells visible here; CleanUpRcdFiles
#filters each file on its own maximum year instead - confirm whether it is still needed.
schoolYear = 2018

## Download and Save Copy of the Original Data

In [9]:
import urllib.request
import zipfile

#Download and save an original copy of the raw data
url="http://www.ncpublicschools.org/docs/src/researchers/src-datasets.zip"
zipFilePath = dataDir + 'src-datasets.zip'
urllib.request.urlretrieve(url, zipFilePath)

#Extract the zip file and all school datasets to the //Raw Datasets/ folder.
#The with-block closes the archive even when extraction raises.
with zipfile.ZipFile(zipFilePath, 'r') as zip_ref:
    zip_ref.extractall(dataDir)

# Get the Most Recent Year of Data from Each File

In [10]:
#Use ntpath.basename to get a filename from a filepath
import ntpath

def CleanUpRcdFiles(filePath):
    """Keep only the latest-year rows of one raw file and save them under dataDir.

    The file at filePath is read with agency_code kept as text, reduced to the
    rows whose 'year' equals the largest year in the file, and written to
    dataDir under the same base file name. Cells equal to '*' are blanked.
    A one-line summary (file name and max year) is printed.
    """
    rawData = pd.read_csv(filePath, dtype={'agency_code': object}, low_memory=False)

    #Filter records for the most recent year
    latestYear = rawData['year'].max()
    latestRows = rawData[rawData['year'] == latestYear]

    #State and district level summary records are kept; the filter below is disabled.
    #schFile = schFile[(schFile['agency_code'] != 'NC-SEA') & (schFile['agency_code'].str.contains("LEA") == False)]

    #Remove * character from any fields.
    cleaned = latestRows.replace({'*':''})

    #ntpath.basename returns the file name portion of the path
    outName = ntpath.basename(filePath)
    cleaned.to_csv(dataDir + outName, sep=',', index=False)

    print(outName + ', Max Year: ' + str(latestYear))

In [12]:
#Use wildcards to find files in a directory
import glob

#Collect every rcd*.csv file extracted into the SRC_Datasets/ subfolder
rcdFiles = glob.glob(dataDir + 'SRC_Datasets/' + 'rcd*.csv')

print('Saving Files to: ' + dataDir + '\n')

#Clean up every file except the rcd_code_desc.csv lookup file
for filePth in rcdFiles:
    fileName = ntpath.basename(filePth)
    if fileName == 'rcd_code_desc.csv':
        continue
    CleanUpRcdFiles(filePth)

Saving Files to: D:/BenepactLLC/Belk/NC_Report_Card_Data/2019/April 2019/2018/Raw Datasets/

rcd_161.csv, Max Year: 2012
rcd_acc_aapart.csv, Max Year: 2018
rcd_acc_act.csv, Max Year: 2018
rcd_acc_awa.csv, Max Year: 2018
rcd_acc_cgr.csv, Max Year: 2018
rcd_acc_eds.csv, Max Year: 2018
rcd_acc_elp.csv, Max Year: 2018
rcd_acc_essa_desig.csv, Max Year: 2018
rcd_acc_gp.csv, Max Year: 2018
rcd_acc_irm.csv, Max Year: 2018
rcd_acc_lowperf.csv, Max Year: 2018
rcd_acc_ltg.csv, Max Year: 2018
rcd_acc_ltg_detail.csv, Max Year: 2018
rcd_acc_mcr.csv, Max Year: 2018
rcd_acc_part.csv, Max Year: 2018
rcd_acc_part_detail.csv, Max Year: 2018
rcd_acc_pc.csv, Max Year: 2018
rcd_acc_rta.csv, Max Year: 2018
rcd_acc_spg1.csv, Max Year: 2017
rcd_acc_spg2.csv, Max Year: 2018
rcd_acc_wk.csv, Max Year: 2018
rcd_adm.csv, Max Year: 2018
rcd_ap.csv, Max Year: 2018
rcd_arts.csv, Max Year: 2018
rcd_att.csv, Max Year: 2018
rcd_charter.csv, Max Year: 2018
rcd_chronic_absent.csv, Max Year: 2018.0
rcd_college.csv, Max Year

In [13]:
#Remove comma from amount field in rcd_improvement
rcd_improvement = pd.read_csv(dataDir + 'rcd_improvement.csv', low_memory=False, dtype={'agency_code': object})
#Cast to str first: when no value contains a comma (e.g. this cell is re-run on the
#already cleaned file) pandas reads 'amount' as numeric and the .str accessor would raise.
rcd_improvement['amount'] = rcd_improvement['amount'].astype(str).str.replace(',', '').astype(float)
rcd_improvement.to_csv(dataDir + 'rcd_improvement.csv', sep=',', index=False)

# Flatten the Raw Data Files
### This section reads raw data files directly from the \\Raw Datasets folder and flattens each file.

# A List of All Files Processed

In [15]:
#Find every cleaned rcd*.csv file saved directly under dataDir
rcdFiles = glob.glob(dataDir + 'rcd*.csv')

#Dataset name = base file name with the 4-character '.csv' extension removed
rcdFileNames = [csvName[:-4] for csvName in map(ntpath.basename, rcdFiles)]

#print('A List of File Names and Record Counts for Processing:\n')

#for fileName in rcdFileNames:
#    print(fileName + ', ' + str(len(eval(fileName).index)) )

# Flatten All Files 
**This section uses table pivots and other cleanup to reduce each file to 1 record per agency code.**
1. Each agency_code could represent National, State, District, Or School Campus level data.

In [16]:
def PivotCsv(dataDir, fileName, pivValues, pivIndex, pivColumns, colSuffix):
    """Read a csv file and pivot it to one row per pivIndex value.

    Parameters:
        dataDir    - folder prefix; the file is read from dataDir + fileName.
        fileName   - csv file name.
        pivValues  - value column name, or list of value column names.
        pivIndex   - column that becomes the row key (read as text).
        pivColumns - column name, or list of column names, whose values
                     become the new columns.
        colSuffix  - text appended to each new column name when the pivot
                     yields single-level columns. It is not appended when the
                     pivot yields multi-level columns (unchanged from the
                     previous behavior of this function).

    Returns:
        DataFrame with flat string column names and pivIndex as a regular
        column. Values are aggregated with pivot_table's default aggregation.
    """
    pivFile = pd.read_csv(dataDir + fileName, low_memory=False, dtype={pivIndex: object})

    #Treat the pivot columns as labels rather than numbers
    pivFile[pivColumns] = pivFile[pivColumns].astype(object)

    pivFile = pd.pivot_table(pivFile, values=pivValues, index=pivIndex, columns=pivColumns)

    #Decide how to flatten from the pivot result itself, not from len(pivColumns):
    #a list of values with a single pivot column also produces multi-level columns,
    #and len() of a plain string column name counts characters, not columns.
    if isinstance(pivFile.columns, pd.MultiIndex):
        #concatenate multiindex column names using a list comprehension.
        pivFile.columns = ['_'.join(str(i) for i in col) for col in pivFile.columns]
    else:
        pivFile.columns = [str(col) + colSuffix for col in pivFile.columns]

    #Make our index a column for merges later
    pivFile.reset_index(level=0, inplace=True)
    return pivFile

In [18]:
#Flatten the rcd_161 and rcd_acc_* files. Files marked "Pivot File" go through PivotCsv;
#files marked "File" are read as-is and only have columns selected or dropped.

#Pivot File - rcd_161 
rcd_161 = PivotCsv(dataDir, 'rcd_161.csv','ccc_pct','agency_code', ['status','subgroup'],'_CCC_PCT')

#Pivot File - rcd_acc_aapart 
rcd_acc_aapart = PivotCsv(dataDir, 'rcd_acc_aapart.csv','pct','agency_code', ['subject','grade'],'_AAPART_PCT')

#Pivot File - rcd_acc_act 
rcd_acc_act = PivotCsv(dataDir, 'rcd_acc_act.csv','pct','agency_code', ['subject','subgroup'],'_ACT_PCT')

#Pivot File - rcd_acc_awa 
rcd_acc_awa = PivotCsv(dataDir, 'rcd_acc_awa.csv','pct','agency_code', ['subgroup'],'_AWA_PCT')

#Pivot File - rcd_acc_cgr
rcd_acc_cgr = PivotCsv(dataDir, 'rcd_acc_cgr.csv','pct','agency_code', ['cgr_type', 'subgroup'],'_CGR_PCT')

#File - rcd_acc_eds
rcd_acc_eds = pd.read_csv(dataDir + 'rcd_acc_eds.csv', low_memory=False, dtype={'agency_code': object})
rcd_acc_eds = rcd_acc_eds[['agency_code', 'pct_eds']]

#Pivot File - rcd_acc_elp
rcd_acc_elp = PivotCsv(dataDir, 'rcd_acc_elp.csv','pct','agency_code', ['subgroup'],'_ELP_PCT')

#File - rcd_acc_essa_desig
rcd_acc_essa_desig = pd.read_csv(dataDir + 'rcd_acc_essa_desig.csv', low_memory=False, dtype={'agency_code': object})
rcd_acc_essa_desig.drop(['year'], axis=1, inplace=True)

#File - rcd_acc_gp
rcd_acc_gp = pd.read_csv(dataDir + 'rcd_acc_gp.csv', low_memory=False, dtype={'agency_code': object})
rcd_acc_gp.drop(['year'], axis=1, inplace=True)

#Pivot File - rcd_acc_irm
rcd_acc_irm = PivotCsv(dataDir, 'rcd_acc_irm.csv','pct_prof','agency_code', ['grade'],'gr_irm_pct_prof')

#File - rcd_acc_lowperf
rcd_acc_lowperf = pd.read_csv(dataDir + 'rcd_acc_lowperf.csv', low_memory=False, dtype={'agency_code': object})
rcd_acc_lowperf = rcd_acc_lowperf[['agency_code', 'lp_school','rlp_school','clpc_school']]

#Pivot File - rcd_acc_ltg
rcd_acc_ltg = PivotCsv(dataDir, 'rcd_acc_ltg.csv','pct_met','agency_code', ['target'],'_LTG_PCT_MET')

#File - rcd_acc_ltg_detail
rcd_acc_ltg_detail = pd.read_csv(dataDir + 'rcd_acc_ltg_detail.csv', low_memory=False, dtype={'agency_code': object})
rcd_acc_ltg_detail.drop(['year'], axis=1, inplace=True)

#Pivot File - rcd_acc_mcr
rcd_acc_mcr = PivotCsv(dataDir, 'rcd_acc_mcr.csv','pct','agency_code', ['subgroup'],'_MCR_PCT')

#Pivot File - rcd_acc_part
rcd_acc_part = PivotCsv(dataDir, 'rcd_acc_part.csv','pct_met','agency_code', ['target'],'_PART_PCT_MET')

#Pivot File - rcd_acc_part_detail
rcd_acc_part_detail = PivotCsv(dataDir, 'rcd_acc_part_detail.csv','pct','agency_code', ['target','subgroup'],'_PART_DET_PCT')

#Pivot File - rcd_acc_pc - WARNING 3323 columns!!! 
rcd_acc_pc = PivotCsv(dataDir, 'rcd_acc_pc.csv','pct','agency_code', ['standard','subject','grade','subgroup'],'_PC_PCT')

#Pivot File - rcd_acc_rta
rcd_acc_rta = PivotCsv(dataDir, 'rcd_acc_rta.csv','pct','agency_code', ['metric'],'_RTA_PCT')

#File - rcd_acc_spg1
rcd_acc_spg1 = pd.read_csv(dataDir + 'rcd_acc_spg1.csv', low_memory=False, dtype={'agency_code': object})
rcd_acc_spg1.drop(['year'], axis=1, inplace=True)

#Pivot File - rcd_acc_spg2 (several value columns pivoted by subgroup)
#NOTE(review): the '_MCR_PCT' suffix looks copied from the rcd_acc_mcr call above;
#confirm which suffix, if any, is intended for the spg2 columns.
pivVals = ['aaa_score','awa_score','cgrs_score','elp_score','mcr_score','scgs_score','bi_score',
           'ach_score','eg_status','eg_score','spg_score','spg_grade']
           
rcd_acc_spg2 = PivotCsv(dataDir, 'rcd_acc_spg2.csv',pivVals,'agency_code', ['subgroup'],'_MCR_PCT')

#Pivot File - rcd_acc_wk
rcd_acc_wk = PivotCsv(dataDir, 'rcd_acc_wk.csv','pct','agency_code', ['subgroup'],'_WK_PCT')

#Flatten rcd_adm through rcd_dlmi. Files marked "File" are read as-is with the
#year (and, where listed, category_code) columns dropped; "Pivot File" entries use PivotCsv.

#File - rcd_adm
rcd_adm = pd.read_csv(dataDir + 'rcd_adm.csv', low_memory=False, dtype={'agency_code': object})
rcd_adm.drop(['year','category_code'], axis=1, inplace=True)

#File - rcd_ap
#Found 0 duplicate agency_codes in this file, no pivot 
rcd_ap = pd.read_csv(dataDir + 'rcd_ap.csv', low_memory=False, dtype={'agency_code': object})
rcd_ap.drop(['year','category_code'], axis=1, inplace=True)

#File - rcd_arts
rcd_arts = pd.read_csv(dataDir + 'rcd_arts.csv', low_memory=False, dtype={'agency_code': object})
rcd_arts.drop(['year'], axis=1, inplace=True)

#File - rcd_att
#Found 0 duplicate agency_codes in this file, no pivot 
rcd_att = pd.read_csv(dataDir + 'rcd_att.csv', low_memory=False, dtype={'agency_code': object})
rcd_att.drop(['year','category_code'], axis=1, inplace=True)

#Pivot File - rcd_charter
rcd_charter = PivotCsv(dataDir, 'rcd_charter.csv','pct_enrolled','agency_code', ['home_lea','subgroup'],'_CHARTER_PCT')

#Pivot File - rcd_chronic_absent
rcd_chronic_absent = PivotCsv(dataDir, 'rcd_chronic_absent.csv','pct','agency_code', ['subgroup'],'_CHRON_ABSENT_PCT')

#Pivot File - rcd_college
rcd_college = PivotCsv(dataDir, 'rcd_college.csv','pct_enrolled','agency_code', ['status','subgroup'],'_COLLEGE_PCT')

#File - rcd_courses1 - 2017 DATA
#Found 0 duplicate agency_codes in this file, no pivot 
rcd_courses1 = pd.read_csv(dataDir + 'rcd_courses1.csv', low_memory=False, dtype={'agency_code': object})
rcd_courses1.drop(['year','category_code'], axis=1, inplace=True)

#Pivot File - rcd_courses2
rcd_courses2 = PivotCsv(dataDir, 'rcd_courses2.csv',['pct_ap','pct_ccp','pct_ib'],'agency_code', ['category_code','subgroup'],'_COURSES2')

#Pivot File - rcd_cte_concentrators
rcd_cte_concentrators = PivotCsv(dataDir, 'rcd_cte_concentrators.csv','num_concentrators','agency_code', ['career_cluster'],'_CTE_CONCENTRATORS')

#File - rcd_cte_credentials
rcd_cte_credentials = pd.read_csv(dataDir + 'rcd_cte_credentials.csv', low_memory=False, dtype={'agency_code': object})
rcd_cte_credentials.drop(['year'], axis=1, inplace=True)

#File - rcd_cte_endorsement
rcd_cte_endorsement = pd.read_csv(dataDir + 'rcd_cte_endorsement.csv', low_memory=False, dtype={'agency_code': object})
rcd_cte_endorsement.drop(['year'], axis=1, inplace=True)

#File - rcd_cte_enrollment
#The generic 'pct' column is copied to 'cte_enrollment_pct' and then dropped (a rename)
rcd_cte_enrollment = pd.read_csv(dataDir + 'rcd_cte_enrollment.csv', low_memory=False, dtype={'agency_code': object})
rcd_cte_enrollment['cte_enrollment_pct'] = rcd_cte_enrollment['pct'] 
rcd_cte_enrollment.drop(['year','pct'], axis=1, inplace=True)

#File - rcd_dlmi
rcd_dlmi = pd.read_csv(dataDir + 'rcd_dlmi.csv', low_memory=False, dtype={'agency_code': object})
rcd_dlmi.drop(['year'], axis=1, inplace=True)

#Flatten rcd_effectiveness through rcd_naep. Files marked "File" are read as-is with the
#year (and, where listed, category_code) columns dropped; "Pivot File" entries use PivotCsv.

#Pivot File - rcd_effectiveness - 2017 Data
rcd_effectiveness = PivotCsv(dataDir, 'rcd_effectiveness.csv',['pct_rating'],'agency_code', ['ee_standard','ee_rating'],'')

#File - rcd_esea_att - 2015 DATA
#Found 0 duplicate agency_codes in this file, no pivot 
rcd_esea_att = pd.read_csv(dataDir + 'rcd_esea_att.csv', low_memory=False, dtype={'agency_code': object})
rcd_esea_att.drop(['year','category_code'], axis=1, inplace=True)

#Pivot File - rcd_experience
#NOTE: despite its name, expPivColumns holds the value columns; the pivot column is 'staff'
expPivColumns = ['pct_experience_0','pct_experience_10','pct_experience_4',
                 'pct_adv_degree','pct_turnover','total_class_teach','avg_class_teach']
rcd_experience = PivotCsv(dataDir, 'rcd_experience.csv',expPivColumns,'agency_code', 'staff','Exp')

#File - rcd_funds - !!!DISTRICT LEVEL DATA!!!
rcd_funds = pd.read_csv(dataDir + 'rcd_funds.csv', low_memory=False, dtype={'agency_code': object})
rcd_funds.drop(['year'], axis=1, inplace=True)

#File - rcd_hqt - 2016 DATA
#Found 0 duplicate agency_codes in this file at school level, no pivot 
rcd_hqt = pd.read_csv(dataDir + 'rcd_hqt.csv', low_memory=False, dtype={'agency_code': object})
rcd_hqt.drop(['year','category_code'], axis=1, inplace=True)

#File - rcd_ib
#Found 0 duplicate agency_codes in this file at school level, no pivot 
rcd_ib = pd.read_csv(dataDir + 'rcd_ib.csv', low_memory=False, dtype={'agency_code': object})
rcd_ib.drop(['year','category_code'], axis=1, inplace=True)

#Pivot File - rcd_improvement
rcd_improvement = PivotCsv(dataDir, 'rcd_improvement.csv','amount','agency_code', ['strategy'],'_Improve_Amt')

#File - rcd_inc1
#Found 0 duplicate agency_codes in this file at school level, no pivot 
rcd_inc1 = pd.read_csv(dataDir + 'rcd_inc1.csv', low_memory=False, dtype={'agency_code': object})
rcd_inc1.drop(['year','category_code'], axis=1, inplace=True)

#Pivot File - rcd_inc2 (several per-1000 value columns pivoted by subgroup)
pivFields = ['iss_per1000','sts_per1000','lts_per1000',
             'exp_per1000','crime_per1000','blhr_per1000',
             'rplw_per1000','arre_per1000']
rcd_inc2 = PivotCsv(dataDir, 'rcd_inc2.csv',pivFields,'agency_code', 'subgroup','')

#File - rcd_licenses
#Found 0 duplicate agency_codes in this file at school level, no pivot 
rcd_licenses = pd.read_csv(dataDir + 'rcd_licenses.csv', low_memory=False, dtype={'agency_code': object})
rcd_licenses.drop(['year','category_code'], axis=1, inplace=True)

#File - rcd_location
#Found 0 duplicate agency_codes in this file at school level, no pivot 
rcd_location = pd.read_csv(dataDir + 'rcd_location.csv', low_memory=False, dtype={'agency_code': object})
rcd_location.drop(['year'], axis=1, inplace=True)


#Pivot File - rcd_naep !!!NATIONAL & STATE LEVEL DATA!!!
pivCols = ['grade','naep_subject','subgroup','Proficiency_level']
rcd_naep = PivotCsv(dataDir, 'rcd_naep.csv','percent_proficient','agency_code', pivCols,'')

#Flatten rcd_nbpts through rcd_welcome. Files marked "File" are read as-is with the
#listed columns dropped; "Pivot File" entries use PivotCsv.

#File - rcd_nbpts
#Found 0 duplicate agency_codes in this file at school level, no pivot 
rcd_nbpts = pd.read_csv(dataDir + 'rcd_nbpts.csv', low_memory=False, dtype={'agency_code': object})
rcd_nbpts.drop(['year','category_code','total_nbpts_num'], axis=1, inplace=True)

#Pivot File - rcd_pk_enroll
rcd_pk_enroll = PivotCsv(dataDir, 'rcd_pk_enroll.csv','pct','agency_code', ['subgroup'],'_PK_ENROLL_PCT')

#Pivot File - rcd_prin_demo - !!! District Level Data !!!
rcd_prin_demo = PivotCsv(dataDir, 'rcd_prin_demo.csv','pct_prin_demo','agency_code', ['subgroup'],'_PCT_PRIN_DEMO')

#File - rcd_readiness
#Found 0 duplicate agency_codes in this file at school level, no pivot 
rcd_readiness = pd.read_csv(dataDir + 'rcd_readiness.csv', low_memory=False, dtype={'agency_code': object})
rcd_readiness.drop(['year','category_code'], axis=1, inplace=True)

#Pivot File - rcd_sar
rcd_sar = PivotCsv(dataDir, 'rcd_sar.csv','avg_size','agency_code', ['grade_eoc'],'_SAR_AVG_SIZE')

#File - rcd_sat
#Found 0 duplicate agency_codes in this file at school level, no pivot 
rcd_sat = pd.read_csv(dataDir + 'rcd_sat.csv', low_memory=False, dtype={'agency_code': object})
rcd_sat.drop(['year','category_code'], axis=1, inplace=True)

#File - rcd_welcome
rcd_welcome = pd.read_csv(dataDir + 'rcd_welcome.csv', low_memory=False, dtype={'agency_code': object})
rcd_welcome.drop(['year'], axis=1, inplace=True)

# Save All Flattened Files to \\Raw Datasets Directory
**This code saves all the flattened file versions as .csv files in the \\Raw Datasets\ directory.**

In [22]:
print('Saving Flattened Versions and Record Counts for the Following Raw Data Files: \n')
#Each name in rcdFileNames comes from a file found on disk. Look the matching
#DataFrame up by name in the notebook namespace instead of passing the name to
#eval(), so a file name is never executed as Python code.
flattenedFrames = globals()
for fileName in rcdFileNames:
    if fileName not in flattenedFrames:
        #Same exception type the eval() lookup raised for a missing variable
        raise NameError("name '" + fileName + "' is not defined; no flattened DataFrame exists for this file")
    flatFile = flattenedFrames[fileName]
    #Overwrites the raw copy of the file in dataDir with its flattened version
    flatFile.to_csv(dataDir + fileName + '.csv', sep=',', index=False)
    print(fileName + ', ' + str(len(flatFile.index)))
    

Saving Flattened Versions and Record Counts for the Following Raw Data Files: 

rcd_161, 644
rcd_acc_aapart, 53
rcd_acc_act, 742
rcd_acc_awa, 688
rcd_acc_cgr, 738
rcd_acc_eds, 2761
rcd_acc_elp, 1809
rcd_acc_essa_desig, 2645
rcd_acc_gp, 658
rcd_acc_irm, 1276
rcd_acc_lowperf, 2760
rcd_acc_ltg, 2461
rcd_acc_ltg_detail, 2631
rcd_acc_mcr, 717
rcd_acc_part, 2527
rcd_acc_part_detail, 2527
rcd_acc_pc, 2697
rcd_acc_rta, 1576
rcd_acc_spg1, 2584
rcd_acc_spg2, 2538
rcd_acc_wk, 517
rcd_adm, 3197
rcd_ap, 563
rcd_arts, 2509
rcd_att, 3115
rcd_charter, 241
rcd_chronic_absent, 2719
rcd_college, 690
rcd_courses1, 773
rcd_courses2, 636
rcd_cte_concentrators, 492
rcd_cte_credentials, 436
rcd_cte_endorsement, 537
rcd_cte_enrollment, 1184
rcd_dlmi, 2723
rcd_effectiveness, 2724
rcd_esea_att, 2700
rcd_experience, 2758
rcd_funds, 292
rcd_hqt, 3073
rcd_ib, 51
rcd_improvement, 19
rcd_inc1, 3097
rcd_inc2, 2792
rcd_licenses, 3121
rcd_location, 2759
rcd_naep, 2
rcd_nbpts, 3124
rcd_pk_enroll, 988
rcd_prin_demo, 116
r