In [2]:
import pandas as pd

In [27]:
pubschls = "pubschls.csv"
public_schools = pd.read_csv(pubschls, low_memory=False)

In [7]:
public_schools.columns

Index(['CDSCode', 'NCESDist', 'NCESSchool', 'StatusType', 'County', 'District',
       'School', 'Street', 'StreetAbr', 'City', 'Zip', 'State', 'MailStreet',
       'MailStrAbr', 'MailCity', 'MailZip', 'MailState', 'Phone', 'Ext',
       'WebSite', 'OpenDate', 'ClosedDate', 'Charter', 'CharterNum',
       'FundingType', 'DOC', 'DOCType', 'SOC', 'SOCType', 'EdOpsCode',
       'EdOpsName', 'EILCode', 'EILName', 'GSoffered', 'GSserved', 'Virtual',
       'Magnet', 'YearRoundYN', 'FederalDFCDistrictID', 'Latitude',
       'Longitude', 'AdmFName1', 'AdmLName1', 'AdmEmail1', 'AdmFName2',
       'AdmLName2', 'AdmEmail2', 'AdmFName3', 'AdmLName3', 'AdmEmail3',
       'LastUpDate'],
      dtype='object')

In [31]:
public_schools["School"].count()

18104

In [32]:
public_schools["LastUpDate"]

0        5/31/2019
1         9/1/2015
2         7/1/2019
3         7/1/2015
4        2/13/2019
           ...    
18100    6/24/1999
18101    6/24/1999
18102     7/2/2013
18103    2/13/2019
18104          NaN
Name: LastUpDate, Length: 18105, dtype: object

In [37]:
public_schools['time'] = pd.to_datetime(public_schools['LastUpDate'])

In [39]:
public_schools['year'] = public_schools['time'].dt.year 

In [42]:
# Number of public schools information last 4 year
len(public_schools.loc[public_schools["year"] > 2016])

11881

In [46]:
ps_since2016 = public_schools.loc[public_schools["year"] > 2016]

In [47]:
ps_since2016["StatusType"].unique()

array(['Active', 'Closed', 'Merged', 'Pending'], dtype=object)

In [62]:
# Closed, merged and nan rows should be cleaned/We will work on Active/Merged schools

ps_since2016.loc[(ps_since2016["StatusType"] == "Active") & (ps_since2016["StatusType"] == "Merged")]


Unnamed: 0,CDSCode,NCESDist,NCESSchool,StatusType,County,District,School,Street,StreetAbr,City,...,AdmEmail1,AdmFName2,AdmLName2,AdmEmail2,AdmFName3,AdmLName3,AdmEmail3,LastUpDate,time,year


In [66]:
active_ps = ps_since2016[ps_since2016["StatusType"].isin(["Active", "Merged"])]
active_ps["School"].count()

11478

In [67]:
active_ps["Magnet"].unique()

array(['No Data', 'N', 'Y'], dtype=object)

In [68]:
# https://www.cde.ca.gov/sp/eo/mt/index.asp
# Magnet schools focus on a special area of study, such as science, the performing arts, or career education.
# A magnet school is an entire school with a special focus.
# We can see caasps results in specific area and compare with other public schools

# Number of public schools are in Magnet Program
len(active_ps.loc[active_ps["Magnet"] == "Y"])

#Can we see which are that magnet school focuses?

531

In [69]:
# Number of public schools are Charter school
len(active_ps.loc[active_ps["Charter"] == "Y"])


1305

In [70]:
# Number of Public Schools use Year Round Education Program
# We can compare success of schools uses not-common year round program and normal school calendar. 
# https://www.cde.ca.gov/ls/fa/yr/guide.asp

len(active_ps.loc[active_ps["YearRoundYN"] == "Y"])

737

In [71]:
# We can compare success of directly funded and locally funded public schools
# and see the difference between funded schools and no data/nan/not in CS funding models columns

active_ps["FundingType"].unique()

array(['No Data', 'Directly funded', 'Locally funded'], dtype=object)

In [74]:
# EILCode represent school types, if we limit the study among high schools.
active_ps["EILCode"].unique()

array(['No Data', 'HS', 'ELEM', 'INTMIDJR', 'ELEMHIGH', 'PS', 'A', 'UG'],
      dtype=object)

In [80]:
active_ps["EILName"].unique()

array(['No Data', 'High School', 'Elementary',
       'Intermediate/Middle/Junior High', 'Elementary-High Combination',
       'Preschool', 'Adult', 'Ungraded'], dtype=object)

In [86]:
public_schools20162020 = active_ps[["CDSCode", "StatusType", "County", "District", "School","EILName", "OpenDate",
                                 "Charter","Magnet", "YearRoundYN", "FundingType"]]
public_schools20162020.head(100)

Unnamed: 0,CDSCode,StatusType,County,District,School,EILName,OpenDate,Charter,Magnet,YearRoundYN,FundingType
0,01100170000000,Active,Alameda,Alameda County Office of Education,No Data,No Data,No Data,No Data,No Data,No Data,No Data
2,01100170112607,Active,Alameda,Alameda County Office of Education,Envision Academy for Arts & Technology,High School,8/28/2006,Y,N,N,Directly funded
4,01100170123968,Active,Alameda,Alameda County Office of Education,Community School for Creative Education,Elementary,8/22/2011,Y,N,N,Directly funded
5,01100170124172,Active,Alameda,Alameda County Office of Education,Yu Ming Charter,Elementary,8/9/2011,Y,N,N,Directly funded
6,01100170125567,Active,Alameda,Alameda County Office of Education,Urban Montessori Charter,Elementary,8/27/2012,Y,N,N,Directly funded
...,...,...,...,...,...,...,...,...,...,...,...
179,01611760128298,Active,Alameda,Fremont Unified,Glankler Early Learning Center,Elementary,7/1/2013,N,N,N,No Data
180,01611760130062,Active,Alameda,Fremont Unified,American High,High School,7/1/1972,N,N,N,No Data
181,01611760130138,Active,Alameda,Fremont Unified,Robertson High (Continuation),High School,7/1/1980,N,N,N,No Data
182,01611760130435,Active,Alameda,Fremont Unified,Vista Alternative,High School,7/1/1980,N,N,N,No Data


In [90]:

public_schools20162020.to_csv(r'C:\Users\Rabia\Desktop\public_schools20162020.csv', index=False) 

In [83]:
path = "sb_ca2016v4.csv"
a = pd.read_csv(path)

In [95]:
a.columns

Index(['County Code', 'District Code', 'School Code', 'Filler', 'Test Year',
       'Subgroup ID', 'Test Type', 'Total Tested At Entity Level',
       'Total Tested with Scores', 'Grade', 'Test Id',
       'CAASPP Reported Enrollment', 'Students Tested', 'Mean Scale Score',
       'Percentage Standard Exceeded', 'Percentage Standard Met',
       'Percentage Standard Met and Above', 'Percentage Standard Nearly Met',
       'Percentage Standard Not Met', 'Students with Scores',
       'Area 1 Percentage Above Standard', 'Area 1 Percentage Near Standard',
       'Area 1 Percentage Below Standard', 'Area 2 Percentage Above Standard',
       'Area 2 Percentage Near Standard', 'Area 2 Percentage Below Standard',
       'Area 3 Percentage Above Standard', 'Area 3 Percentage Near Standard',
       'Area 3 Percentage Below Standard', 'Area 4 Percentage Above Standard',
       'Area 4 Percentage Near Standard', 'Area 4 Percentage Below Standard',
       'CDSCode'],
      dtype='object')

In [96]:
a.head()

Unnamed: 0,County Code,District Code,School Code,Filler,Test Year,Subgroup ID,Test Type,Total Tested At Entity Level,Total Tested with Scores,Grade,...,Area 2 Percentage Above Standard,Area 2 Percentage Near Standard,Area 2 Percentage Below Standard,Area 3 Percentage Above Standard,Area 3 Percentage Near Standard,Area 3 Percentage Below Standard,Area 4 Percentage Above Standard,Area 4 Percentage Near Standard,Area 4 Percentage Below Standard,CDSCode
0,0,0,0,,2019,1,B,3165580.0,3162910.0,3,...,20.83,49.66,29.5,21.89,61.25,16.86,24.14,48.16,27.7,0
1,0,0,0,,2019,3,B,1616938.0,1615465.0,3,...,17.93,49.03,33.04,20.66,60.54,18.81,22.66,46.69,30.65,0
2,0,0,0,,2019,4,B,1548642.0,1547445.0,3,...,23.85,50.32,25.83,23.17,61.99,14.84,25.68,49.69,24.63,0
3,0,0,0,,2019,6,B,2651676.0,2649540.0,3,...,25.12,51.92,22.96,26.17,61.23,12.6,28.93,49.45,21.62,0
4,0,0,0,,2019,7,B,132465.0,132405.0,3,...,42.74,47.23,10.03,42.25,53.14,4.6,47.75,43.21,9.04,0


In [94]:
path = "combined_data.csv"
combined_data = pd.read_csv(path)
combined_data.columns

Index(['CDSCode', 'CAASPP Reported Enrollment', 'Students Tested',
       'Percentage Standard Exceeded', 'Percentage Standard Met',
       'Percentage Standard Met and Above', 'Percentage Standard Nearly Met',
       'Percentage Standard Not Met', 'Students with Scores', 'School',
       'County', 'Charter', 'Magnet', 'EILCode'],
      dtype='object')

Unnamed: 0,County Code,District Code,School Code,Filler,Test Year,Subgroup ID,Test Type,Total Tested At Entity Level,Total Tested with Scores,Grade,...,Area 2 Percentage Above Standard,Area 2 Percentage Near Standard,Area 2 Percentage Below Standard,Area 3 Percentage Above Standard,Area 3 Percentage Near Standard,Area 3 Percentage Below Standard,Area 4 Percentage Above Standard,Area 4 Percentage Near Standard,Area 4 Percentage Below Standard,CDSCode
0,0,0,0,,2019,1,B,3165580.0,3162910.0,3,...,20.83,49.66,29.5,21.89,61.25,16.86,24.14,48.16,27.7,0
1,0,0,0,,2019,3,B,1616938.0,1615465.0,3,...,17.93,49.03,33.04,20.66,60.54,18.81,22.66,46.69,30.65,0
2,0,0,0,,2019,4,B,1548642.0,1547445.0,3,...,23.85,50.32,25.83,23.17,61.99,14.84,25.68,49.69,24.63,0
3,0,0,0,,2019,6,B,2651676.0,2649540.0,3,...,25.12,51.92,22.96,26.17,61.23,12.6,28.93,49.45,21.62,0
4,0,0,0,,2019,7,B,132465.0,132405.0,3,...,42.74,47.23,10.03,42.25,53.14,4.6,47.75,43.21,9.04,0
