In [189]:
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
import re
import os
import glob

In [3]:
## First, get list of staff codes

In [16]:
# Fetch the staff-by-race/gender report page from profiles.doe.mass.edu.
# Its <option> elements are parsed in the following cells.
resp = requests.get(
    "http://profiles.doe.mass.edu/state_report/teacherbyracegender.aspx",
    timeout=60,  # do not let a stalled connection hang the kernel forever
)
# Stop here on an HTTP error rather than parsing an error page into `soup`.
resp.raise_for_status()
html = resp.content
soup = bs(html, "html.parser")

In [18]:
# Collect every <option> element found anywhere on the page.
options = soup.find_all("option")


# Spot-check index 18: the slices taken in the next cell start the job codes here.
options[18]["value"], options[18].text

('1100', 'All')

In [19]:
# Positions inside the page's flat <option> list. They mirror the page layout
# at scrape time and must be re-checked if the page's menus ever change.
JOB_OPTIONS_START = 18       # first job-class <option>, i.e. ('1100', 'All')
YEAR_OPTIONS = slice(4, 14)  # <option>s whose values are later sent as `fycode`

# Parallel lists: the value attribute and the visible text of every <option>.
value = [option["value"] for option in options]
jobs = [option.text for option in options]

job_codes = value[JOB_OPTIONS_START:]
jobs = jobs[JOB_OPTIONS_START:]

# Lookup from job-class code to its displayed job title.
job_classes = dict(zip(job_codes, jobs))

years = value[YEAR_OPTIONS]

In [21]:
# Display the job-class code -> job title lookup.
job_classes

{'1100': 'All',
 '1200': 'Superintendent of Schools',
 '1201': 'Assistant/Associate/ Vice Superintendents',
 '1202': 'School Business Official',
 '1205': 'Other District Wide Administrators',
 '1210': 'Supervisor/Director of Guidance',
 '1211': 'Supervisor/Director of Pupil Personnel',
 '1212': 'Special Education Administrator',
 '1213': 'Supervisor/Director/Coordinator: Arts',
 '1214': 'Supervisor/Director/Coordinator of Assessment',
 '1215': 'Supervisor/Director/Coordinator of Curriculum',
 '1216': 'Supervisor/Director/Coordinator: English Language Learner',
 '1217': 'Supervisor/Director/Coordinator: English',
 '1218': 'Supervisor/Director/Coordinator: Foreign Language',
 '1219': 'Supervisor/Director/Coordinator: History/Social Studies',
 '1220': 'Supervisor/Director/Coordinator: Library/Media',
 '1221': 'Supervisor/Director/Coordinator: Mathematics',
 '1222': 'Supervisor/Director/Coordinator: Reading',
 '1223': 'Supervisor/Director/Coordinator: Science',
 '1224': 'Supervisor/Directo

In [None]:
## Write all staff race-ethnicity files

In [82]:
# Export URL template; the two {} slots take the year code and the job-class code.
EXPORT_URL = 'http://profiles.doe.mass.edu/state_report/teacherbyracegender.aspx?ctl00$ContentPlaceHolder1$fycode={}&export_excel=yes&ctl00$ContentPlaceHolder1$displayType=NUM&ctl00$ContentPlaceHolder1$jobClass={}&ctl00$ContentPlaceHolder1$reportType=SCHOOL'

# Download one school-level export per (year, job code) pair into the working directory.
for year in years:
    for code in job_codes:
        url = EXPORT_URL.format(year, code)
        filename = "staff_" + "raceeth_" + str(year) + "_" + str(code) + ".xls"

        # Request before opening the file so a failed download leaves no empty file behind.
        export_resp = requests.get(url, timeout=120)
        if not export_resp.ok:
            print("Skipping {}: HTTP {}".format(filename, export_resp.status_code))
            continue

        # Write the raw bytes: text mode would re-encode the body with the
        # platform default encoding, which can fail or corrupt characters.
        with open(filename, 'wb') as output:
            output.write(export_resp.content)

In [None]:
## Bind all xls files together (2010 to 2017), one year at a time. The cells below process 2011.

In [174]:
def clean_file(file):
    """Load one downloaded report export and tag its rows with the source file.

    The second table parsed from ``file`` is treated as the data table and its
    first row as the column names. A ``Source`` column holding the file path
    without its extension is added, so later cells can recover the year and
    job code from the file name.

    Parameters
    ----------
    file : str
        Path to a downloaded export, read with ``pandas.read_html``.

    Returns
    -------
    pandas.DataFrame or None
        The data rows with named columns and a ``Source`` column, with
        ``Org Code`` left as an ordinary column. ``None`` is returned, after
        printing "No data in file", when the file has no usable data table.
    """
    try:
        tables = pd.read_html(file)
    except ValueError:
        # pandas raises ValueError when the document contains no tables at all.
        print("No data in file")
        return None

    # The data lives in the second parsed table; without it there is nothing to clean.
    if len(tables) < 2 or tables[1].empty:
        print("No data in file")
        return None

    raw = tables[1]
    # Copy so that adding a column below never writes into a view of `raw`.
    data = raw.iloc[1:].copy()
    data.columns = raw.iloc[0]

    # Exports without results lack the 'Org Code' header; treat them as empty.
    if 'Org Code' not in data.columns:
        print("No data in file")
        return None

    data["Source"] = os.path.splitext(file)[0]
    return data

In [None]:
# Preview which downloaded 2011 exports match the file-name pattern.
all_files = glob.glob("staff_raceeth_2011_*.xls")
all_files

In [None]:
# Parse every 2011 export, keeping only the files that actually held data.
# Sorting makes the row order of the combined frame reproducible, because
# glob returns matches in arbitrary order.
all_files = sorted(glob.glob("staff_raceeth_2011_*.xls"))
df_list = []

for path in all_files:
    print(path)
    cleaned = clean_file(path)
    # clean_file returns None for exports without data; leave those out.
    if cleaned is not None:
        df_list.append(cleaned)

In [181]:
# Stack the per-file frames collected in df_list into a single DataFrame.
df = pd.concat(df_list)

In [182]:
## Add additional columns and export to csv
# "Source" is the file name without extension, ending in "<year>_<job code>":
# the last 4 characters are the job code, and the 4 characters before the
# separating underscore are taken as the year.
df = df.assign(
    **{
        'Job Code': df["Source"].str.slice(-4),
        'Year': df["Source"].str.slice(-9, -5),
    }
).set_index('Org Code')

In [None]:
# Display the combined frame to eyeball the derived 'Job Code' and 'Year' columns.
df

In [184]:
# Save the per-job-code rows for 2011 to the working directory.
df.to_csv("staff_raceeth_jobcodes_2011.csv")

In [185]:
# The job group is the first two characters of the job code.
df['Job Group'] = df['Job Code'].str.slice(0, 2)

In [None]:
# Columns that hold counts; they are converted to numbers before being summed.
num_cols = [
    "African American (# )",
    "Asian (# )",
    "Hispanic (# )",
    "White (# )",
    "Native Hawaiian, Pacific Islander (# )",
    "Native American (# )",
    "Multi-Race, Non-Hispanic (# )",
    "FTE Count",
]
df[num_cols] = df[num_cols].apply(pd.to_numeric)

# Add up the counts of all job codes that share a job group, per year and school.
df_grouped = (
    df
    .groupby(['Job Group', 'Year', "Org Code", "SCHOOL"])[num_cols]
    .sum()
)

df_grouped

In [187]:
# Flatten the group keys back into columns, index by school code, and fold the
# three smallest categories into a single "Other (# )" column.
df_grouped = (
    df_grouped
    .reset_index()
    .set_index('Org Code')
    .assign(
        **{
            "Other (# )": lambda frame: (
                frame["Native Hawaiian, Pacific Islander (# )"]
                + frame["Native American (# )"]
                + frame["Multi-Race, Non-Hispanic (# )"]
            )
        }
    )
    .drop(
        columns=[
            'Native Hawaiian, Pacific Islander (# )',
            'Native American (# )',
            'Multi-Race, Non-Hispanic (# )',
        ]
    )
)

In [188]:
# Save the per-job-group totals for 2011 to the working directory.
df_grouped.to_csv("staff_raceeth_jobcategories_2011.csv")

In [None]:
## bind grouped staff csvs together and join with job code category names

In [286]:
# Yearly job-category files for 2011 through 2015. glob returns matches in
# arbitrary order, so sort them to make the combined row order reproducible.
filenames = sorted(glob.glob("staff_raceeth_jobcategories_201[1-5].csv"))
dfs = [pd.read_csv(filename) for filename in filenames]

# Concatenate all data into one DataFrame
job_categories = pd.concat(dfs)
job_categories

Unnamed: 0,Org Code,Job Group,Year,SCHOOL,African American (# ),Asian (# ),Hispanic (# ),White (# ),FTE Count,Other (# )
0,10003,11,2011,Abington - Abington ECC,1.0,0.0,0.0,68.6,69.6,0.0
1,10015,11,2011,Abington - Woodsdale,2.0,0.0,0.0,37.8,39.8,0.0
2,10405,11,2011,Abington - Frolio Middle School,0.0,0.0,0.0,29.5,29.5,0.0
3,10505,11,2011,Abington - Abington High,1.0,0.0,1.0,49.6,51.6,0.0
4,20010,11,2011,Acton - Merriam,0.0,2.0,0.0,80.5,83.5,1.0
5,20015,11,2011,Acton - McCarthy-Towne,0.0,0.0,0.0,58.8,58.8,0.0
6,20020,11,2011,Acton - Douglas,0.0,0.2,0.0,48.3,48.5,0.0
7,20025,11,2011,Acton - Gates,0.0,1.5,0.0,47.9,49.4,0.0
8,20030,11,2011,Acton - Luther Conant,0.0,2.2,0.0,56.0,58.2,0.0
9,30025,11,2011,Acushnet - Acushnet Elementary Sch,0.0,0.0,0.0,60.3,60.3,0.0


In [287]:
## Create df that matches job group code to the job category
# NOTE(review): the labels in `jobs` are paired with the codes in `jcl` purely
# by position, and `jcl` comes out of a set, whose iteration order is an
# implementation detail. The pairing is kept exactly as written because the
# intended code -> label mapping cannot be recovered from this notebook.
# TODO: replace it with an explicit {job group code: label} dict after
# verifying each code.
jcl = list(set(job_categories['Job Group'].tolist()))
jobs = ["Student Support", "Related Service Providers", "Paraprofessionals", "All Staff", "District Leaders", "School Leaders", "Health", "Teachers", "Administrative Support"]

# zip() would silently truncate on a length mismatch, and the later merge
# would then drop every row of an unlabeled job group. Fail loudly instead.
if len(jcl) != len(jobs):
    raise ValueError(
        "Found {} distinct job groups but {} labels: {}".format(len(jcl), len(jobs), jcl)
    )

job_code_df = pd.DataFrame({"Job Group": jcl, "Job Category": jobs})

In [288]:
# Attach the readable job category to every row via its job group code.
job_categories = job_categories.merge(job_code_df, on="Job Group")

In [289]:
# Index by school code so "Org Code" becomes the leading column of the export.
job_categories = job_categories.set_index("Org Code")

In [291]:
# Save the combined 2011-2015 job-category table to the working directory.
job_categories.to_csv("staff_raceeth_jobcategories_2011_2015.csv")