Goal: Download all student mobility data for all schools in the past 6 years and create a master csv.

In [16]:
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
import re
import glob
import os

The export button on the mobility report reveals the download URL, including where the year argument goes. We loop over the years and save each year's data to its own .xls file.

In [None]:
years = [2010, 2011, 2012, 2013, 2014, 2015, 2016]

# Export URL for the school-level mobility report; `{}` is the year slot.
# Kept as a literal string (not a `params=` dict) so the `$` and `:` characters
# are sent exactly as they appear here.
url_template = 'http://profiles.doe.mass.edu/state_report/mobilityrates.aspx?&export_excel=yes&ctl00$ContentPlaceHolder1$cohortYear={}&ctl00$ContentPlaceHolder1$reportType=SCHOOL&ctl00$ContentPlaceHolder1$rateType=4-Year:REG&ctl00$ContentPlaceHolder1$studentGroup=ALL'

for year in years:
    url = url_template.format(year)
    print(url)
    filename = "student_mobility_{}.xls".format(year)

    # Fetch first, and fail loudly on an HTTP error, so that an error page or a
    # hung connection never ends up saved on disk as if it were a data file.
    response = requests.get(url, timeout=60)
    response.raise_for_status()

    # Explicit encoding: the platform default is not UTF-8 everywhere, and
    # school names may contain non-ASCII characters.
    with open(filename, 'w', encoding='utf-8') as output:
        output.write(response.text)

The function below converts the .xls files to dataframes. (Despite the extension, the downloaded files are parsed as HTML tables with `pd.read_html`.) Files from 2010 to 2015 are passed through the function and added to a master dataframe.

In [None]:
def clean_file(file):
    """Parse one downloaded mobility export into a DataFrame.

    The file is read with ``pd.read_html`` and the table at index 1 is used.
    Its first row is promoted to column headers and the remaining rows are
    returned as data, keeping their original row labels.

    Parameters
    ----------
    file : str
        Path to the downloaded export, e.g. ``"student_mobility_2010.xls"``.
        The last four characters before the extension are taken as the year.

    Returns
    -------
    pandas.DataFrame
        The data rows plus two added columns: ``Source`` (the path without
        its extension) and ``Year`` (the last four characters of ``Source``,
        as a string).

    Raises
    ------
    IndexError
        If the file contains fewer than two HTML tables.
    """
    tables = pd.read_html(file)
    raw = tables[1]

    # Copy the data rows before adding columns: assigning onto a bare slice
    # triggers SettingWithCopyWarning and would tie the result to `raw`.
    df = raw.iloc[1:].copy()
    df.columns = raw.iloc[0]

    file_name = os.path.splitext(file)[0]
    df["Source"] = file_name
    df["Year"] = file_name[-4:]
    return df

# Smoke-test the parser on a single year; .head() keeps the preview short
# instead of rendering the whole frame.
clean_file("student_mobility_2010.xls").head()

In [19]:
# glob returns matches in arbitrary order; sort so the row order of the master
# frame is the same on every run and every machine.
all_files = sorted(glob.glob("student_mobility_201[0-5].xls"))

# Fail with an actionable message instead of pd.concat's generic
# "No objects to concatenate" when the download cell has not been run.
if not all_files:
    raise FileNotFoundError(
        "No files matching student_mobility_201[0-5].xls were found in the "
        "working directory; run the download cell first."
    )

# One cleaned frame per yearly export, stacked into a single frame.
df_list = [clean_file(path) for path in all_files]

df = pd.concat(df_list)

Rename OrgCode to match other files and set index before writing the master file to a csv. 

In [20]:
# Align the organisation-code header with the spelling used in the other files.
df = df.rename({'OrgCode': 'Org Code'}, axis='columns')
df

Unnamed: 0,Org Name,Org Code,Churn Enrollment,% Churn,% Intake,Stability Enrollment,% Stability,Source,Year
1,Abby Kelley Foster Charter Public (District) -...,04450105,1465,7.2,4.4,1435,94.7,student_mobility_2010,2010
2,Abington - Abington ECC,00010003,562,11.0,8.2,536,93.3,student_mobility_2010,2010
3,Abington - Abington High,00010505,593,10.8,5.9,577,91.7,student_mobility_2010,2010
4,Abington - Center,00010005,250,10.8,7.2,240,92.9,student_mobility_2010,2010
5,Abington - Frolio Middle School,00010405,415,6.8,4.3,408,94.9,student_mobility_2010,2010
6,Abington - Woodsdale,00010015,486,5.6,3.9,481,95.4,student_mobility_2010,2010
7,Academy Of the Pacific Rim Charter Public (Dis...,04120530,487,4.7,1.9,485,95.7,student_mobility_2010,2010
8,Acton - Douglas,00020020,506,5.1,1.8,499,96.2,student_mobility_2010,2010
9,Acton - Gates,00020025,502,5.2,2.8,491,97.0,student_mobility_2010,2010
10,Acton - Luther Conant,00020030,508,2.6,1.8,500,99.0,student_mobility_2010,2010


In [22]:
# Only set the index if it is not already in place, so this cell can be re-run
# without a KeyError ('Org Code' stops being a column after the first run).
if df.index.name != 'Org Code':
    df = df.set_index('Org Code')

# Write the combined frame, with 'Org Code' as the first column, to the master CSV.
df.to_csv("student_mobility_all.csv")