This notebook reads two US Department of Education public data sets and merges them into a single data set with an inner join, using the NCES School ID (NCESSCH) as the join key. The two data sets are:

https://www2.ed.gov/about/offices/list/ocr/docs/crdc-2015-16.html

https://www2.ed.gov/about/inits/ed/edfacts/data-files/acgr-sch-sy2015-16.csv

In [1]:
import pandas as pd
import numpy as np

In [60]:
# Load the EDFacts ACGR school-level file (SY 2015-16) and preview the first rows.
# NOTE(review): this is an absolute, machine-specific path; consider a configurable
# data directory so the notebook can run elsewhere.
grad_csv_path = '/Users/flatironschool/BootCamp/AbsenteeismProject/data/acgr-sch-sy2015-16.csv'
grad = pd.read_csv(grad_csv_path)
grad.head()

Unnamed: 0,STNAM,FIPST,LEAID,LEANM,NCESSCH,SCHNAM,ALL_COHORT_1516,ALL_RATE_1516,MAM_COHORT_1516,MAM_RATE_1516,...,MTR_RATE_1516,MWH_COHORT_1516,MWH_RATE_1516,CWD_COHORT_1516,CWD_RATE_1516,ECD_COHORT_1516,ECD_RATE_1516,LEP_COHORT_1516,LEP_RATE_1516,DATE_CUR
0,ALABAMA,1,100005,Albertville City,10000500871,Albertville High Sch,296,92,2,PS,...,.,193,90-94,18,60-79,108,80-84,9,GE50,01JUL17
1,ALABAMA,1,100006,Marshall County,10000600872,Asbury Sch,67,GE95,2,PS,...,.,47,GE90,7,GE50,54,GE90,2,PS,01JUL17
2,ALABAMA,1,100006,Marshall County,10000600878,Douglas High Sch,153,85-89,.,.,...,.,116,85-89,13,GE50,107,85-89,1,PS,01JUL17
3,ALABAMA,1,100006,Marshall County,10000600883,Kate D Smith DAR High Sch,120,80-84,.,.,...,.,118,80-84,16,21-39,57,60-69,.,.,01JUL17
4,ALABAMA,1,100006,Marshall County,10000601585,Brindlee Mt High Sch,94,85-89,2,PS,...,PS,87,85-89,15,GE50,56,80-89,.,.,01JUL17


In [61]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns of grad.
grad.describe()

Unnamed: 0,FIPST,LEAID,NCESSCH,ALL_COHORT_1516
count,23090.0,23090.0,23090.0,23090.0
mean,27.680424,2779916.0,277991600000.0,158.25929
std,15.959959,1595166.0,159516600000.0,179.91495
min,1.0,100005.0,10000500000.0,1.0
25%,12.0,1201980.0,120198000000.0,31.0
50%,27.0,2723535.0,272353500000.0,87.0
75%,40.0,4025388.0,402538800000.0,235.0
max,59.0,5900197.0,590019700000.0,4713.0


In [73]:
# Column names, non-null counts and dtypes for grad.
# NOTE(review): this cell's execution count (73) is later than the string-cast cell (72),
# so the saved output presumably reflects NCESSCH after conversion — re-run in order to confirm.
grad.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23090 entries, 0 to 23089
Data columns (total 27 columns):
STNAM              23090 non-null object
FIPST              23090 non-null int64
LEAID              23090 non-null int64
LEANM              23090 non-null object
NCESSCH            23090 non-null object
SCHNAM             23090 non-null object
ALL_COHORT_1516    23090 non-null int64
ALL_RATE_1516      23090 non-null object
MAM_COHORT_1516    19434 non-null object
MAM_RATE_1516      19434 non-null object
MAS_COHORT_1516    20357 non-null object
MAS_RATE_1516      20357 non-null object
MBL_COHORT_1516    21736 non-null object
MBL_RATE_1516      21736 non-null object
MHI_COHORT_1516    22034 non-null object
MHI_RATE_1516      22034 non-null object
MTR_COHORT_1516    20705 non-null object
MTR_RATE_1516      20705 non-null object
MWH_COHORT_1516    22716 non-null object
MWH_RATE_1516      22716 non-null object
CWD_COHORT_1516    22656 non-null object
CWD_RATE_1516      22656 non-null

In [72]:
# The NCES School ID should be a 12-digit identifier; for the first ten states
# alphabetically it begins with a zero.
# Cast the column to string so that the leading zeros can be added afterwards.
grad = grad.astype({'NCESSCH': str})

In [83]:
# Left-pad every NCES School ID with zeros to the full 12 characters.
# The vectorised .str accessor replaces the per-element lambda, and the unused
# module-level `schID` assignment is dropped (the lambda parameter shadowed it,
# so it never influenced the result).
# The astype(str) keeps this cell correct even if the column is still numeric.
grad['NCESSCH'] = grad['NCESSCH'].astype(str).str.zfill(12)

In [85]:
# Show the last five rows of grad.
grad.tail()

Unnamed: 0,STNAM,FIPST,LEAID,LEANM,NCESSCH,SCHNAM,ALL_COHORT_1516,ALL_RATE_1516,MAM_COHORT_1516,MAM_RATE_1516,...,MTR_RATE_1516,MWH_COHORT_1516,MWH_RATE_1516,CWD_COHORT_1516,CWD_RATE_1516,ECD_COHORT_1516,ECD_RATE_1516,LEP_COHORT_1516,LEP_RATE_1516,DATE_CUR
23085,WYOMING,56,5605820,Washakie County School District #2,560582000393,Ten Sleep K-12,10,GE50,1,PS,...,,9,GE50,1,PS,4,PS,.,.,01JUL17
23086,WYOMING,56,5605830,Teton County School District #1,560583000335,Jackson Hole High School,127,GE95,2,PS,...,PS,87,GE95,19,GE80,42,GE90,17,GE80,01JUL17
23087,WYOMING,56,5605830,Teton County School District #1,560583000512,Summit High School,18,GE80,1,PS,...,PS,10,GE50,8,GE50,8,GE50,1,PS,01JUL17
23088,WYOMING,56,5606090,Weston County School District #7,560609000401,Upton High School,21,GE80,1,PS,...,.,19,GE80,5,PS,11,GE50,.,.,01JUL17
23089,WYOMING,56,5606240,Washakie County School District #1,560624000343,Worland High School,105,75-79,.,.,...,PS,78,75-79,16,60-79,48,60-69,5,PS,01JUL17


In [66]:
# Show the first five rows of grad.
# NOTE(review): this cell's execution count (66) is lower than the ID-padding cells
# above (72 and 83), so the saved output may predate the zero-padding — re-run the
# notebook top to bottom to confirm.
grad.head()

Unnamed: 0,STNAM,FIPST,LEAID,LEANM,NCESSCH,SCHNAM,ALL_COHORT_1516,ALL_RATE_1516,MAM_COHORT_1516,MAM_RATE_1516,...,MTR_RATE_1516,MWH_COHORT_1516,MWH_RATE_1516,CWD_COHORT_1516,CWD_RATE_1516,ECD_COHORT_1516,ECD_RATE_1516,LEP_COHORT_1516,LEP_RATE_1516,DATE_CUR
0,ALABAMA,1,100005,Albertville City,10000500871,Albertville High Sch,296,92,2,PS,...,.,193,90-94,18,60-79,108,80-84,9,GE50,01JUL17
1,ALABAMA,1,100006,Marshall County,10000600872,Asbury Sch,67,GE95,2,PS,...,.,47,GE90,7,GE50,54,GE90,2,PS,01JUL17
2,ALABAMA,1,100006,Marshall County,10000600878,Douglas High Sch,153,85-89,.,.,...,.,116,85-89,13,GE50,107,85-89,1,PS,01JUL17
3,ALABAMA,1,100006,Marshall County,10000600883,Kate D Smith DAR High Sch,120,80-84,.,.,...,.,118,80-84,16,21-39,57,60-69,.,.,01JUL17
4,ALABAMA,1,100006,Marshall County,10000601585,Brindlee Mt High Sch,94,85-89,2,PS,...,PS,87,85-89,15,GE50,56,80-89,.,.,01JUL17


In [103]:
# Load the CRDC 2015-16 school-level data file and preview the last rows.
# NOTE(review): this is an absolute, machine-specific path; consider a configurable
# data directory so the notebook can run elsewhere.
# NOTE(review): the 'unicode_escape' codec also interprets backslash sequences found
# in the data — confirm that a plain single-byte encoding was not what was intended.
ocr_csv_path = '/Users/flatironschool/BootCamp/AbsenteeismProject/data/2015-16-crdc-data/Data_Files_and_Layouts/CRDC_2015_16_School_Data.csv'
ocr = pd.read_csv(ocr_csv_path, encoding='unicode_escape')
ocr.tail()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,LEA_STATE,LEA_STATE_NAME,LEAID,LEA_NAME,SCHID,SCH_NAME,COMBOKEY,JJ,SCH_GRADE_PS,SCH_GRADE_KG,...,SCH_FTESERVICES_PSY,SCH_FTESERVICES_SOC,SCH_JJTYPE,SCH_JJSYDAYS,SCH_JJHOURS,SCH_JJPART_LT15,SCH_JJPART_15T30,SCH_JJPART_31T90,SCH_JJPART_91T180,SCH_JJPART_OV180
96355,WY,WYOMING,5680250,Region V BOCES,48,C-Bar-V Ranch,568025000000.0,No,No,No,...,2.5,3.5,-9,-9,-9,-9,-9,-9,-9,-9
96356,WY,WYOMING,5680251,Wyoming Department of Family Services,534,Wyoming Girls School,568025000000.0,Yes,No,No,...,0.0,1.0,Post,224,27,2,5,8,17,50
96357,WY,WYOMING,5680251,Wyoming Department of Family Services,538,Wyoming Boys School,568025000000.0,Yes,No,No,...,0.0,0.0,Post,180,30,17,11,68,86,8
96358,WY,WYOMING,5680252,Youth Emergency Services Inc. - Administration...,350,Youth Emergency Services Inc.,568025000000.0,No,No,No,...,0.0,1.0,-9,-9,-9,-9,-9,-9,-9,-9
96359,WY,WYOMING,5680254,Saint Stephen's Indian School Admin Office,549,Saint Stephen's Indian School,568025000000.0,No,No,Yes,...,0.0,0.0,-9,-9,-9,-9,-9,-9,-9,-9


In [88]:
# Summarise ocr: number of rows and columns, dtype breakdown and memory usage.
ocr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96360 entries, 0 to 96359
Columns: 1836 entries, LEA_STATE to SCH_JJPART_OV180
dtypes: float64(31), int64(1740), object(65)
memory usage: 1.3+ GB


In [105]:
# COMBOKEY has in some cases been converted to scientific notation (it was
# already an object column), so it cannot be used as the school ID directly.
# It seemed easier to combine LEAID and SCHID to reproduce the NCESSCH id.
# Both LEAID and SCHID need their leading zeros added back in.
# Here: LEAID as a string, left-padded with zeros to 7 characters.
# The unused `distID` assignment is dropped (the lambda parameter shadowed it),
# and the vectorised .str accessor replaces the per-element lambda.
ocr['districtID'] = ocr['LEAID'].astype(str).str.zfill(7)

In [107]:
# SCHID as a string, left-padded with zeros to 5 characters.
# The unused `schID2` assignment is dropped (the lambda parameter shadowed it),
# and the vectorised .str accessor replaces the per-element lambda.
ocr['IDSCH'] = ocr['SCHID'].astype(str).str.zfill(5)

In [109]:
# Concatenate the padded district ID and the padded school ID (in that order)
# to form the NCESSCH key used for the merge.
ocr['NCESSCH'] = ocr['districtID'].str.cat(ocr['IDSCH'])

In [112]:
# Inner join: keep only the schools whose NCESSCH appears in both grad and ocr.
# Columns present in both frames (e.g. LEAID) receive pandas' default _x/_y suffixes.
combined_data = pd.merge(grad, ocr, on='NCESSCH', how='inner')

In [113]:
# Show the first five rows of the merged frame.
combined_data.head()

Unnamed: 0,STNAM,FIPST,LEAID_x,LEANM,NCESSCH,SCHNAM,ALL_COHORT_1516,ALL_RATE_1516,MAM_COHORT_1516,MAM_RATE_1516,...,SCH_JJTYPE,SCH_JJSYDAYS,SCH_JJHOURS,SCH_JJPART_LT15,SCH_JJPART_15T30,SCH_JJPART_31T90,SCH_JJPART_91T180,SCH_JJPART_OV180,districtID,IDSCH
0,ALABAMA,1,100005,Albertville City,10000500871,Albertville High Sch,296,92,2,PS,...,-9,-9,-9,-9,-9,-9,-9,-9,100005,871
1,ALABAMA,1,100006,Marshall County,10000600872,Asbury Sch,67,GE95,2,PS,...,-9,-9,-9,-9,-9,-9,-9,-9,100006,872
2,ALABAMA,1,100006,Marshall County,10000600878,Douglas High Sch,153,85-89,.,.,...,-9,-9,-9,-9,-9,-9,-9,-9,100006,878
3,ALABAMA,1,100006,Marshall County,10000600883,Kate D Smith DAR High Sch,120,80-84,.,.,...,-9,-9,-9,-9,-9,-9,-9,-9,100006,883
4,ALABAMA,1,100006,Marshall County,10000601585,Brindlee Mt High Sch,94,85-89,2,PS,...,-9,-9,-9,-9,-9,-9,-9,-9,100006,1585


In [114]:
# Summarise the merged frame: number of rows and columns, dtype breakdown and memory usage.
combined_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21867 entries, 0 to 21866
Columns: 1865 entries, STNAM to IDSCH
dtypes: float64(31), int64(1743), object(91)
memory usage: 311.3+ MB


In [115]:
# Show the last five rows of the merged frame.
combined_data.tail()

Unnamed: 0,STNAM,FIPST,LEAID_x,LEANM,NCESSCH,SCHNAM,ALL_COHORT_1516,ALL_RATE_1516,MAM_COHORT_1516,MAM_RATE_1516,...,SCH_JJTYPE,SCH_JJSYDAYS,SCH_JJHOURS,SCH_JJPART_LT15,SCH_JJPART_15T30,SCH_JJPART_31T90,SCH_JJPART_91T180,SCH_JJPART_OV180,districtID,IDSCH
21862,WYOMING,56,5605820,Washakie County School District #2,560582000393,Ten Sleep K-12,10,GE50,1,PS,...,-9,-9,-9,-9,-9,-9,-9,-9,5605820,393
21863,WYOMING,56,5605830,Teton County School District #1,560583000335,Jackson Hole High School,127,GE95,2,PS,...,-9,-9,-9,-9,-9,-9,-9,-9,5605830,335
21864,WYOMING,56,5605830,Teton County School District #1,560583000512,Summit High School,18,GE80,1,PS,...,-9,-9,-9,-9,-9,-9,-9,-9,5605830,512
21865,WYOMING,56,5606090,Weston County School District #7,560609000401,Upton High School,21,GE80,1,PS,...,-9,-9,-9,-9,-9,-9,-9,-9,5606090,401
21866,WYOMING,56,5606240,Washakie County School District #1,560624000343,Worland High School,105,75-79,.,.,...,-9,-9,-9,-9,-9,-9,-9,-9,5606240,343


In [116]:
# Write the merged frame to a CSV file in the current working directory.
# NOTE(review): with the default settings the row index is written as an unnamed
# first column, and NCESSCH will be parsed back as a number (losing its leading
# zeros) unless the reader requests a string dtype — confirm downstream handling.
output_csv_path = 'grad_data.csv'
combined_data.to_csv(output_csv_path)